GNAT vs Matlab - operation on multidimensional complex matrices

comp.lang.ada
 help / color / mirror / Atom feed

* GNAT vs Matlab - operation on   multidimensional complex matrices
@ 2020-03-23 23:16 darek
  2020-03-24  2:07 ` johnscpg
                   ` (6 more replies)
  0 siblings, 7 replies; 17+ messages in thread
From: darek @ 2020-03-23 23:16 UTC (permalink / raw)


Hi Everyone, 
 I am working on radar signal processing algorithms that use multidimensional complex arrays. 

To my surprise, the performance of some Matlab functions is much better than compiled Ada code. 

Let's start with a simple problem of computing sum of all elements in a 3D real and complex array. 

The Ada code is as follows:

with Ada.Text_IO;
with Ada.Real_Time; use Ada.Real_Time;
with Ada.Unchecked_Deallocation;

with Ada.Numerics.Long_Complex_Types;  use Ada.Numerics.Long_Complex_Types;

with Ada.Text_IO.Complex_IO;

procedure TestSpeed is
   

   
   package TIO renames Ada.Text_IO;
   
   package CTIO is new Ada.Text_IO.Complex_IO(Ada.Numerics.Long_Complex_Types);
   
   subtype mReal is Long_Float;
   
   
   NumIteration : constant := 1_000;
   NumChannels  : constant  := 64;
   NumRanges    : constant  := 400; 
   NumAngles    : constant  := 30;
   
   type tCubeReal is array (1..NumChannels, 1..NumAngles, 1..NumRanges) of mReal;
   type tCubeRealAcc is access all tCubeReal;
   --for tCubeReal'Alignment use 8;
   
   type tCubeComplex is array (1..NumChannels, 1..NumAngles, 1..NumRanges) of Complex;
   type tCubeComplexAcc is access all tCubeComplex;
   --for tCubeComplex'Alignment use 16;
   
   RealCubeAcc : tCubeRealAcc;
   SReal       : mReal;
   
   ComplexCubeAcc : tCubeComplexAcc;
   SComplex    : Complex;

   
   procedure Free is new Ada.Unchecked_Deallocation(tCubeReal, tCubeRealAcc);
   procedure Free is new Ada.Unchecked_Deallocation(tCubeComplex, tCubeComplexAcc);
   
   --| -------------------------------------------------------------------------
   procedure SpeedSumRealCube (NumIteration : Integer; Mtx : in  tCubeReal;  S: out mReal) is
      
      Ts   : Time;
      TEx  : Time_Span; 
      t    : mReal;
   begin
      Ts := Clock;
      S := 0.0;
      for k in 1..NumIteration loop
         for m  in Mtx'Range(1) loop
            for n in   Mtx'Range(2) loop
               for p in   Mtx'Range(3) loop
                  S := S + Mtx(m, n, p);
               end loop;
            end loop;
         end loop;      
      end loop;

      TEx :=  Clock - Ts;
      TIO.New_Line;
      TIO.Put_Line("Computation time:" & Duration'Image(To_Duration(TEx)));
      t := mReal(To_Duration(TEx))/mReal(NumIteration);
      TIO.Put_Line("Computation time per iteration:" & t'Image);
   end SpeedSumRealCube;
   
   --| -------------------------------------------------------------------------
   
   procedure SpeedSumComplexCube (NumIteration : Integer; Mtx : in  tCubeComplex;  S:  out Complex) is
      
      Ts   : Time;
      TEx  : Time_Span; 
      t    : mReal;
   begin
      Ts := Clock;     
      S := 0.0  + i* 0.0; 
      for k in 1..NumIteration loop
         for m  in Mtx'Range(1) loop
            for n in    Mtx'Range(2) loop
               for p in   Mtx'Range(3) loop
                  S := S + Mtx(m, n, p);
               end loop;
            end loop;
         end loop;      
      end loop;
      TEx :=  Clock - Ts;
      TIO.New_Line;
      TIO.Put_Line("Computation time:" & Duration'Image(To_Duration(TEx)));
      t := mReal(To_Duration(TEx))/mReal(NumIteration);
      TIO.Put_Line("Computation time per iteration:" & t'Image);
   end SpeedSumComplexCube;
   
   --| -------------------------------------------------------------------------
   
begin
   TIO.Put_Line("Real cube");
   TIO.Put_Line("Real type size is:" & Integer(mReal'Size)'Image);
   RealCubeAcc := new tCubeReal;
   RealCubeAcc.all := (others =>(others =>( others => 1.0)));
   SpeedSumRealCube(NumIteration => NumIteration,
                    Mtx           => RealCubeAcc.all,
                    S            => SReal);
   
   TIO.Put_Line("Sum is:" & SReal'Image);
   
   TIO.Put_Line("Complex cube");
   TIO.Put_Line("Complex type size is:" & Integer(Complex'Size)'Image);
   ComplexCubeAcc := new tCubeComplex;
   ComplexCubeAcc.all := (others =>(others =>( others => 1.0 + i * 1.0)));
   SpeedSumComplexCube(NumIteration => NumIteration,
                       Mtx          => ComplexCubeAcc.all,
                       S            => SComplex);
   
   TIO.Put("Sum is:"); CTIO.Put(SComplex);
   Free(ComplexCubeAcc);
   Free(RealCubeAcc);   
end TestSpeed;

1. Compiled with:  gcc version 9.2.0 (tdm64-1) ( and gnat community edition 2019), with the -O2 optimisation level. 
2. CPU: AMD64 Family 23 Model 24 Stepping 1  CPU0      2300           AMD Ryzen 7 3750H with Radeon Vega Mobile Gfx
3. Win10 64bit 


The results of the program execution:

Computation time: 0.616710300
Computation time per iteration: 6.16710300000000E-04
Sum is: 7.68000000000000E+08
Complex cube
Complex type size is: 128

Computation time: 3.707091000
Computation time per iteration: 3.70709100000000E-03
Sum is:( 7.68000000000000E+08, 7.68000000000000E+08)


The executable produced by the gcc provide with the gnat community edition gave very similar results.

More interesting part - the Matlab code.

Matlab version : Matlab 2019b, 64bit

function [] = TestSpeed 

NumIterations = 1000;
NumChannels = 64;  
NumRanges  = 400; 
NumAngles = 30;

%--| real matrix 

ReMtx = ones(NumChannels, NumAngles, NumRanges);

tic
SReal =  ComputeSum(NumIterations, ReMtx);
TExR = toc;%cputime-ts;
fprintf('TExe:%f, sum real=%f\n', TExR, SReal);
%--| complex matrix
CplxMtx = complex(ReMtx, ReMtx);
%ts = cputime;
tic
SCplx = ComputeSum(NumIterations, CplxMtx);
TExC = toc; %cputime - ts;
fprintf('TExe:%f, sum complex= <%f,%f> \n', TExC, real(SCplx), imag(SCplx));
fprintf('Complex operations are %f times slower\n', TExC/TExR);
end % function


function [S] = ComputeSum(NumIterations, Mtx)
 S = 0;
 for i = 1:NumIterations   
    S = S + sum(sum(sum(Mtx)));  
 end % for  
end % function 

The results of the program execution:

TExe:0.260718, sum real=768000000.000000
TExe:0.789778, sum complex= <768000000.000000,768000000.000000> 
Complex operations are 3.029242 times slower


What is wrong with my code ? Is it the Ada compiler doing bad job here?
 
If you look at Matlab code, on average the computation that use complex  addition are ~3 times slower than for the real numbers.

In the case of Ada code, the complex operations are ~ 6 times slower that for the real numbers. 

Did I miss something somewhere ? Any ideas how I can improve the performance of the Ada program (different array layout, magic pragmas, or magic compiler switches) ?

It seems that Matlab is performing really well here ...

Any suggestions are  very welcome.

Regards,
  Darek  







^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
@ 2020-03-24  2:07 ` johnscpg
  2020-03-24  6:40 ` J-P. Rosen
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: johnscpg @ 2020-03-24  2:07 UTC (permalink / raw)



> magic compiler switches

On my machine 

   gnatmake -O3 -march=native -gnatnp -ffast-math 

helps. You can also try -funroll-loops, but it didn't help when I tried it.


  

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on multidimensional complex matrices
  2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
  2020-03-24  2:07 ` johnscpg
@ 2020-03-24  6:40 ` J-P. Rosen
  2020-03-24  9:37 ` Jeffrey R. Carter
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: J-P. Rosen @ 2020-03-24  6:40 UTC (permalink / raw)


Le 24/03/2020 à 00:16, darek a écrit :
> Hi Everyone, 
>  I am working on radar signal processing algorithms that use multidimensional complex arrays. 
> 
> To my surprise, the performance of some Matlab functions is much better than compiled Ada code. 
> 
[...]

Two remarks:
1) why do you need pointers? Ada is very good at using pointer only when
absolutely necessary, which is good for safety and efficiency.

2) you are comparing a hand-written, naive[1] algorithm with a
specialized function that may well be written in assembly language,
making use of special SIMD instructions. I would say that Ada does not
behave that bad...

[1] Naive in that you simply use the "obvious" loops. If you want
maximum efficiency, a lot of tricky factors come into play. For example,
changing the order of the nesting of the loops changes the spread of
memory accesses, which can (or not) have a dramatic effect on
efficiency, given the various caches, whether successive memory accesses
belong to the same memory bank, etc. High efficiency mathematical
libraries are designed to consider these effects.

-- 
J-P. Rosen
Adalog
2 rue du Docteur Lombard, 92441 Issy-les-Moulineaux CEDEX
Tel: +33 1 45 29 21 52, Fax: +33 1 45 29 25 00
http://www.adalog.fr

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on multidimensional complex matrices
  2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
  2020-03-24  2:07 ` johnscpg
  2020-03-24  6:40 ` J-P. Rosen
@ 2020-03-24  9:37 ` Jeffrey R. Carter
  2020-03-24 14:55   ` darek
  2020-03-24 15:44 ` johnscpg
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 17+ messages in thread
From: Jeffrey R. Carter @ 2020-03-24  9:37 UTC (permalink / raw)


On 3/24/20 12:16 AM, darek wrote:
> 
> In the case of Ada code, the complex operations are ~ 6 times slower that for the real numbers.

With FSF GNAT 9.2.1 on Linux, compiled with -O2 -gnatn -fstack-check, on my 
machine (Intel Core i7-6700HQ 2.6 GHz) the complex time is about 1.14 times the 
real time. With -O3 -gnatpn, about 1.13 times.

-- 
Jeff Carter
"Now look, Col. Batguano, if that really is your name."
Dr. Strangelove
31

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on multidimensional complex matrices
  2020-03-24  9:37 ` Jeffrey R. Carter
@ 2020-03-24 14:55   ` darek
  0 siblings, 0 replies; 17+ messages in thread
From: darek @ 2020-03-24 14:55 UTC (permalink / raw)


Hi Jeff, 
Thanks for taking time to play with the code. I did an experiment. I have also on this machine Linux running in the BBOx (OpenSuse Leap 15.1 64bit). 

The execution time on the same hardware with the gcc version 8.3.1 20190518 (for GNAT Community 2019 20190517) (GCC)  and the compiler options:
   -ffast-math -funroll-loops -O3 -march=native -gnatn  

is: 
Real cube
Real type size is: 64

Computation time: 0.376393396
Computation time per iteration: 3.76393396000000E-04
Sum is: 7.68000000000000E+08
Complex cube
Complex type size is: 128

Computation time: 0.848316916
Computation time per iteration: 8.48316916000000E-04
Sum is:( 7.68000000000000E+08, 7.68000000000000E+08)


and again, compiling and running on Win10:

Real cube
Real type size is: 64

Computation time: 0.406157900
Computation time per iteration: 4.06157900000000E-04
Sum is: 7.68000000000000E+08
Complex cube
Complex type size is: 128

Computation time: 0.807358900
Computation time per iteration: 8.07358900000000E-04
Sum is:( 7.68000000000000E+08, 7.68000000000000E+08)


So, not bad at all, and my guess is that some thing could be improved. 


BTW, I think the the gprof is missing on GNAT CE on Win10/64bit (Linux distro has it). 



Regards,
  Darek 

P.S. Did anybody tried to use the FFTW with an old Ada binding (Linux/Win10). It  partially works but I have some strange issues with it (depend on the platform either single precision or double precision is not working). 

On Tuesday, 24 March 2020 10:37:29 UTC+1, Jeffrey R. Carter  wrote:
> On 3/24/20 12:16 AM, darek wrote:
> > 
> > In the case of Ada code, the complex operations are ~ 6 times slower that for the real numbers.
> 
> With FSF GNAT 9.2.1 on Linux, compiled with -O2 -gnatn -fstack-check, on my 
> machine (Intel Core i7-6700HQ 2.6 GHz) the complex time is about 1.14 times the 
> real time. With -O3 -gnatpn, about 1.13 times.
> 
> -- 
> Jeff Carter
> "Now look, Col. Batguano, if that really is your name."
> Dr. Strangelove
> 31

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
                   ` (2 preceding siblings ...)
  2020-03-24  9:37 ` Jeffrey R. Carter
@ 2020-03-24 15:44 ` johnscpg
  2020-03-31 17:25 ` Shark8
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: johnscpg @ 2020-03-24 15:44 UTC (permalink / raw)



Hi Darek,

Your latest runtimes agree with mine. I ran on a 1 GHz cpu. Your Ryzen 3750H
is iiuc 2.3 GHz. If I divide my runtimes by 2.3 I get pretty much the same
as yours.

The remaining mystery is matlab's sum of the Real array, which matlab runs
in 1/3 the time of the Complex array, even though it does 1/2 as many
operations. I wonder if matlab is running the program on multiple cores.
Here's what www.mathworks.com says:

https://www.mathworks.com/matlabcentral/answers/350324-is-multi-threading-possible-in-matlab

   "Many commands, e.g. sum is multi-threaded internally already: beyond
    a specific data size, the work is distributed to multiple threads,
    which are processed in parallel."

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
                   ` (3 preceding siblings ...)
  2020-03-24 15:44 ` johnscpg
@ 2020-03-31 17:25 ` Shark8
  2020-03-31 19:20   ` Simon Wright
  2020-04-01 11:01   ` darek
  2020-03-31 20:05 ` Shark8
  2020-06-08 17:42 ` Shark8
  6 siblings, 2 replies; 17+ messages in thread
From: Shark8 @ 2020-03-31 17:25 UTC (permalink / raw)


On Monday, March 23, 2020 at 5:16:20 PM UTC-6, darek wrote:
> Hi Everyone, 
>  I am working on radar signal processing algorithms that use multidimensional complex arrays. 

Ok then; the following should have convention Fortran IIRC:

type tCubeReal is array (1..NumChannels, 1..NumAngles, 1..NumRanges) of mReal
  with Convention => Fortran;

That doesn't matter much, for a speed-test, but given that you mention RADAR data it's probably best to mention it now, before you frustrate yourself by accidentally mixing up column- and row-major formats.

> 
> To my surprise, the performance of some Matlab functions is much better than compiled Ada code. 
> 
>    procedure SpeedSumRealCube (NumIteration : Integer; Mtx : in  tCubeReal;  S: out mReal) is

Don't read from S, it's an OUT parameter, and it's best to treat it that way.
(I'm not sure that it would get in the way of optimization, but could.)

>       for k in 1..NumIteration loop
>          for m  in Mtx'Range(1) loop
>             for n in   Mtx'Range(2) loop
>                for p in   Mtx'Range(3) loop
>                   S := S + Mtx(m, n, p);
>                end loop;
>             end loop;
>          end loop;
>       end loop;

The above could be replaced, in Ada2012 with:

Function Summation( Input : in  tCubeReal ) return Complex is
Begin
  Return Result : Complex := (0.0 + i*0.0) do
    For Element of Input loop
      Result:= Result + Element;
    End loop;
  End return;
End Summation;

If you turn it into a generic, you could use it for both Complex and mReal; plus it more accurately mirrors your Matlab code.
-------------------------------------------------------------------------

Is there a reason for the access-types?
If not, I would recommend getting rid of them.


>
> 1. Compiled with:  gcc version 9.2.0 (tdm64-1) ( and gnat community edition 2019), with the -O2 optimisation level. 
> 2. CPU: AMD64 Family 23 Model 24 Stepping 1  CPU0      2300           AMD Ryzen 7 3750H with Radeon Vega Mobile Gfx
> 3. Win10 64bit 
> 
> 
> The results of the program execution:
> 
> Computation time: 0.616710300
> Computation time per iteration: 6.16710300000000E-04
> Sum is: 7.68000000000000E+08
> Complex cube
> Complex type size is: 128
> 
> Computation time: 3.707091000
> Computation time per iteration: 3.70709100000000E-03
> Sum is:( 7.68000000000000E+08, 7.68000000000000E+08)
> 
> 
> The executable produced by the gcc provide with the gnat community edition gave very similar results.
> 
> More interesting part - the Matlab code.
> The results of the program execution:
> 
> TExe:0.260718, sum real=768000000.000000
> TExe:0.789778, sum complex= <768000000.000000,768000000.000000> 
> Complex operations are 3.029242 times slower
> 
> 
> What is wrong with my code? Is it the Ada compiler doing bad job here?
> It seems that Matlab is performing really well here ...

IIRC Matlab has a *LOT* of optimization for Complex numbers/operations, but you're right in that these are surprising.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-31 17:25 ` Shark8
@ 2020-03-31 19:20   ` Simon Wright
  2020-03-31 19:54     ` Shark8
  2020-04-29 21:08     ` vincent.diemunsch
  2020-04-01 11:01   ` darek
  1 sibling, 2 replies; 17+ messages in thread
From: Simon Wright @ 2020-03-31 19:20 UTC (permalink / raw)


Shark8 <onewingedshark@gmail.com> writes:

> Ok then; the following should have convention Fortran IIRC:
>
> type tCubeReal is array (1..NumChannels, 1..NumAngles, 1..NumRanges) of mReal
>   with Convention => Fortran;

I wondered about this; the code ran considerably slower with Convention
=> Fortran.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-31 19:20   ` Simon Wright
@ 2020-03-31 19:54     ` Shark8
  2020-04-29 21:08     ` vincent.diemunsch
  1 sibling, 0 replies; 17+ messages in thread
From: Shark8 @ 2020-03-31 19:54 UTC (permalink / raw)


On Tuesday, March 31, 2020 at 1:20:45 PM UTC-6, Simon Wright wrote:
> Shark8 writes:
> 
> > Ok then; the following should have convention Fortran IIRC:
> >
> > type tCubeReal is array (1..NumChannels, 1..NumAngles, 1..NumRanges) of mReal
> >   with Convention => Fortran;
> 
> I wondered about this; the code ran considerably slower with Convention
> => Fortran.

Interesting / unexpected.
I wonder... could it be that the CPU is itself optimized for row-major access?
It also looks like the For/Of loop takes longer than indexes on nested-for.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
                   ` (4 preceding siblings ...)
  2020-03-31 17:25 ` Shark8
@ 2020-03-31 20:05 ` Shark8
  2020-03-31 20:51   ` Jeffrey R. Carter
  2020-06-08 17:42 ` Shark8
  6 siblings, 1 reply; 17+ messages in thread
From: Shark8 @ 2020-03-31 20:05 UTC (permalink / raw)


IIRC, one method to ensure proper cleanup w/o using Unchecked Deallocation is to use Generics:

    Generic
        Type T is private;
        Type IndexA is (<>);
        Type IndexB is (<>);
        Type IndexC is (<>);
        Type Cubic is Array(IndexA, IndexB, indexC) of T;
        Default : in T;
    Package Test_Data is
        Type Access_Cubic is not null access all Cubic;
        
        Access_Data : Constant Access_Cubic := new Cubic'(others => (others => (others => Default)));
        Data        : Cubic renames Access_Data.all;
    End Test_Data;

Now you can do something like:

    COMPLEX_CUBE_TEST:
    Declare
        Package Test_Set is new Test_Data(
           T       => Complex,
           IndexA  => Channel,
           IndexB  => Angle,
           IndexC  => Range_T,
           Cubic   => tCubeComplex,
           Default => (Re => 1.0, Im => 1.0)
          );
        Complex_Cube : tCubeComplex renames Test_Set.Data;
        SComplex     : Complex;
    Begin
        TIO.Put_Line("Complex cube");
        TIO.Put_Line("Complex type size is:" & Integer'Image(Complex'Size));
        
        SpeedSumComplexCube(
           Mtx          => Complex_Cube,
           S            => SComplex,
           Iterations   => NumIteration
          );
        
        TIO.Put("Sum is:"); CTIO.Put(SComplex); TIO.New_Line;
    End COMPLEX_CUBE_TEST;

and upon exit of the COMPLEX_CUBE_TEST block your test-data is cleaned up when Test_Set is destroyed.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on multidimensional complex matrices
  2020-03-31 20:05 ` Shark8
@ 2020-03-31 20:51   ` Jeffrey R. Carter
  0 siblings, 0 replies; 17+ messages in thread
From: Jeffrey R. Carter @ 2020-03-31 20:51 UTC (permalink / raw)


On 3/31/20 10:05 PM, Shark8 wrote:
> 
> and upon exit of the COMPLEX_CUBE_TEST block your test-data is cleaned up when Test_Set is destroyed.

This is only required if Storage_Size is specified for the access type; see ARM 
13.11.

-- 
Jeff Carter
"What's special about Agile is that it's a mix of the
best and the worst."
Bertrand Meyer
148

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-31 17:25 ` Shark8
  2020-03-31 19:20   ` Simon Wright
@ 2020-04-01 11:01   ` darek
  2020-04-01 15:01     ` J-P. Rosen
  1 sibling, 1 reply; 17+ messages in thread
From: darek @ 2020-04-01 11:01 UTC (permalink / raw)


Thanks for all your posts. At this moment I use the access types to avoid an overhead associated with allocation, and initialization of these arrays (I know that the overhead is small but later on when I move from 4 to 16 real channels every millisecond counts :)). This is a MIMO system so the number of virtual channels depends on the number of Tx's and Rx's.  Now, the data is streamed from the device  at 62Mbs (4 x Tx, 4 x Rx => 16 virtual channels) but in the future the data will arrive at ~ 250Mbs (4 x Tx, 16 x Rx => 64 virtual channels).  For each virtual channel you need to compute 2D complex FFT (so-called range-Doppler surface), and then combine them (digital beamforming).  Lots of number crunching here, and lots of fun ...
 
I will try to decompose the processing chain to handle even and odd channels separately, and try to run the computations in parallel on different CPUs (I never use these capabilities, so I am expecting some issues there). 

Regards,
  Darek



On Tuesday, 31 March 2020 19:25:22 UTC+2, Shark8  wrote:
> On Monday, March 23, 2020 at 5:16:20 PM UTC-6, darek wrote:
> > Hi Everyone, 
> >  I am working on radar signal processing algorithms that use multidimensional complex arrays. 
> 
> Ok then; the following should have convention Fortran IIRC:
> 
> type tCubeReal is array (1..NumChannels, 1..NumAngles, 1..NumRanges) of mReal
>   with Convention => Fortran;
> 
> That doesn't matter much, for a speed-test, but given that you mention RADAR data it's probably best to mention it now, before you frustrate yourself by accidentally mixing up column- and row-major formats.
> 
> > 
> > To my surprise, the performance of some Matlab functions is much better than compiled Ada code. 
> > 
> >    procedure SpeedSumRealCube (NumIteration : Integer; Mtx : in  tCubeReal;  S: out mReal) is
> 
> Don't read from S, it's an OUT parameter, and it's best to treat it that way.
> (I'm not sure that it would get in the way of optimization, but could.)
> 
> >       for k in 1..NumIteration loop
> >          for m  in Mtx'Range(1) loop
> >             for n in   Mtx'Range(2) loop
> >                for p in   Mtx'Range(3) loop
> >                   S := S + Mtx(m, n, p);
> >                end loop;
> >             end loop;
> >          end loop;
> >       end loop;
> 
> The above could be replaced, in Ada2012 with:
> 
> Function Summation( Input : in  tCubeReal ) return Complex is
> Begin
>   Return Result : Complex := (0.0 + i*0.0) do
>     For Element of Input loop
>       Result:= Result + Element;
>     End loop;
>   End return;
> End Summation;
> 
> If you turn it into a generic, you could use it for both Complex and mReal; plus it more accurately mirrors your Matlab code.
> -------------------------------------------------------------------------
> 
> Is there a reason for the access-types?
> If not, I would recommend getting rid of them.
> 
> 
> >
> > 1. Compiled with:  gcc version 9.2.0 (tdm64-1) ( and gnat community edition 2019), with the -O2 optimisation level. 
> > 2. CPU: AMD64 Family 23 Model 24 Stepping 1  CPU0      2300           AMD Ryzen 7 3750H with Radeon Vega Mobile Gfx
> > 3. Win10 64bit 
> > 
> > 
> > The results of the program execution:
> > 
> > Computation time: 0.616710300
> > Computation time per iteration: 6.16710300000000E-04
> > Sum is: 7.68000000000000E+08
> > Complex cube
> > Complex type size is: 128
> > 
> > Computation time: 3.707091000
> > Computation time per iteration: 3.70709100000000E-03
> > Sum is:( 7.68000000000000E+08, 7.68000000000000E+08)
> > 
> > 
> > The executable produced by the gcc provide with the gnat community edition gave very similar results.
> > 
> > More interesting part - the Matlab code.
> > The results of the program execution:
> > 
> > TExe:0.260718, sum real=768000000.000000
> > TExe:0.789778, sum complex= <768000000.000000,768000000.000000> 
> > Complex operations are 3.029242 times slower
> > 
> > 
> > What is wrong with my code? Is it the Ada compiler doing bad job here?
> > It seems that Matlab is performing really well here ...
> 
> IIRC Matlab has a *LOT* of optimization for Complex numbers/operations, but you're right in that these are surprising.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on multidimensional complex matrices
  2020-04-01 11:01   ` darek
@ 2020-04-01 15:01     ` J-P. Rosen
  2020-04-01 16:39       ` darek
  0 siblings, 1 reply; 17+ messages in thread
From: J-P. Rosen @ 2020-04-01 15:01 UTC (permalink / raw)


Le 01/04/2020 à 13:01, darek a écrit :
> Thanks for all your posts. At this moment I use the access types to
> avoid an overhead associated with allocation, and initialization of
> these arrays

Could you elaborate on this? Did you measure? Normally, accessing a
statically allocated array is faster...

-- 
J-P. Rosen
Adalog
2 rue du Docteur Lombard, 92441 Issy-les-Moulineaux CEDEX
Tel: +33 1 45 29 21 52, Fax: +33 1 45 29 25 00
http://www.adalog.fr

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on multidimensional complex matrices
  2020-04-01 15:01     ` J-P. Rosen
@ 2020-04-01 16:39       ` darek
  2020-04-01 17:15         ` J-P. Rosen
  0 siblings, 1 reply; 17+ messages in thread
From: darek @ 2020-04-01 16:39 UTC (permalink / raw)

Hi Jean-Pierre, 
 No, I did not evaluate it. This how I look at it:some of the functions (i.e.  FFT) (that have some local complex arrays inside), I need to call  ~320k-400k times per second (depends on the settings, and this is for the  simpler 4x4 MIMO). So, even if a single allocation cost some microseconds, then the total time is substantial. 

The development of the code is in an early stage, and I do (almost) everything to avoid using C :) :)  in this project (just kidding). 

Regards,
   Darek 

On Wednesday, 1 April 2020 17:01:12 UTC+2, J-P. Rosen  wrote:
> Le 01/04/2020 à 13:01, darek a écrit :
> > Thanks for all your posts. At this moment I use the access types to
> > avoid an overhead associated with allocation, and initialization of
> > these arrays
> 
> Could you elaborate on this? Did you measure? Normally, accessing a
> statically allocated array is faster...
> 
> -- 
> J-P. Rosen
> Adalog
> 2 rue du Docteur Lombard, 92441 Issy-les-Moulineaux CEDEX
> Tel: +33 1 45 29 21 52, Fax: +33 1 45 29 25 00
> http://www.adalog.fr

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on multidimensional complex matrices
  2020-04-01 16:39       ` darek
@ 2020-04-01 17:15         ` J-P. Rosen
  0 siblings, 0 replies; 17+ messages in thread
From: J-P. Rosen @ 2020-04-01 17:15 UTC (permalink / raw)


Le 01/04/2020 à 18:39, darek a écrit :
> No, I did not evaluate it. This how I look at it:some of the
> functions (i.e.  FFT) (that have some local complex arrays inside), I
> need to call  ~320k-400k times per second (depends on the settings,
> and this is for the  simpler 4x4 MIMO). So, even if a single
> allocation cost some microseconds, then the total time is
> substantial.

I still fail to see how using pointers would save you anything... Unless
you are assuming that arrays are passed by copy, which would be
extremely unlikely in your case!

Note: compilers are free to pass arrays by copy or by reference, this is
intended to let them choose the most efficient method.
-- 
J-P. Rosen
Adalog
2 rue du Docteur Lombard, 92441 Issy-les-Moulineaux CEDEX
Tel: +33 1 45 29 21 52, Fax: +33 1 45 29 25 00
http://www.adalog.fr

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-31 19:20   ` Simon Wright
  2020-03-31 19:54     ` Shark8
@ 2020-04-29 21:08     ` vincent.diemunsch
  1 sibling, 0 replies; 17+ messages in thread
From: vincent.diemunsch @ 2020-04-29 21:08 UTC (permalink / raw)

Le mardi 31 mars 2020 21:20:45 UTC+2, Simon Wright a écrit :
> Shark8 <onewingedshark@gmail.com> writes:
> 
> > Ok then; the following should have convention Fortran IIRC:
> >
> > type tCubeReal is array (1..NumChannels, 1..NumAngles, 1..NumRanges) of mReal
> >   with Convention => Fortran;
> 
> I wondered about this; the code ran considerably slower with Convention
> => Fortran.

In Row Major (Convention Fortran) one needs to iterate on the first dimension, then the second and finally the third, to stay on contiguous regions in memory (and hence in the cache). So the loop must put the first dimension in the center :

for p in M’Range(3) loop
  for n in M’Range (2) loop
     for m in M’Range (1) loop
         ... M(m, n, p)

This is somehow counter-intuitive. But Row Major (like Little-endian with which it is closely related) has more properties (like extracting a submatrix or conforming to classical linear algebra) and is therefore preferred by scientists.

Regards,

Vincent

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: GNAT vs Matlab - operation on   multidimensional complex matrices
  2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
                   ` (5 preceding siblings ...)
  2020-03-31 20:05 ` Shark8
@ 2020-06-08 17:42 ` Shark8
  6 siblings, 0 replies; 17+ messages in thread
From: Shark8 @ 2020-06-08 17:42 UTC (permalink / raw)


Cleaning out my computer, I found this working-file; it might be of interesti to you -- it shows "going all-in" with Generics:
--------------------------------------------------------------
With
Ada.Real_Time,
Ada.Containers.Indefinite_Holders,
Ada.Numerics.Long_Complex_Types,
Ada.Exceptions,
Ada.Text_IO.Complex_IO;

Procedure Demo is
    package TIO renames Ada.Text_IO;
    package CTIO is new TIO.Complex_IO(Ada.Numerics.Long_Complex_Types);

    subtype mReal   is Long_Float;
    subtype Complex is Ada.Numerics.Long_Complex_Types.Complex;


    NumIteration : constant  := 1_000;
    NumChannels  : constant  := 64;
    NumRanges    : constant  := 400;
    NumAngles    : constant  := 30;


    Type Channel is range 1..NumChannels;
    Type Angle   is range 1..NumAngles;
    Type Range_T is range 1..NumRanges;

    type tCubeReal    is array (Channel, Angle, Range_T) of mReal;
    type tCubeComplex is array (Channel, Angle, Range_T) of Complex;

    Generic
        Type T is private;
        Type IndexA is (<>);
        Type IndexB is (<>);
        Type IndexC is (<>);
        Type Cubic is Array(IndexA, IndexB, indexC) of T;
        Zero : in T;
        with Function "+"(Left, Right: T) return T is <>;
    Function Summation( Input : Cubic ) return T
      with Inline;

    Function Summation( Input : Cubic ) return T is
    Begin
        Return Result : T := Zero do
            For A in IndexA loop
                For B in IndexB loop
                    For C in IndexC loop
                        Result:= Result + Input(A, B, C);
                    End loop;
                End loop;
            End loop;
        End return;
    End Summation;

    Generic
        Type Element is private;
        Type Cubic is array (Channel, Angle, Range_T) of Element;
        Zero : In Element;
        with Function "+"(Left, Right: Element) return Element is <>;
    Procedure Timing_Harness(Mtx : in Cubic;  S: out Element; Iterations : in Positive:= 1);

    Procedure Timing_Harness(Mtx : in Cubic;  S: out Element; Iterations : in Positive:= 1) is
        Function Sum is new Summation(Element, Channel, Angle, Range_T, Cubic, Zero);
        Use Ada.Real_Time;
        Start : Time renames Clock;
    Begin
        For Count in 1..Iterations Loop
            S := Sum( Mtx );
        End loop;


        METRICS:
        Declare
            Execution_Time  : Constant Time_Span:= Clock - Start;
            Execution_Image : Constant String:= Duration'Image(To_Duration(Execution_Time));
            T : Constant mReal := mReal(To_Duration(Execution_Time))/mReal(NumIteration);
        Begin
            TIO.New_Line;
            TIO.Put_Line("Computation time:" & Execution_Image );
            TIO.Put_Line("Computation time per iteration:" & mReal'Image(T));
        End METRICS;
    End Timing_Harness;


    Function Image( Input : mReal   )  return string renames mReal'Image;
    Function Image( Input : Complex )  return string is
        ('(' & mReal'Image(Input.Re) & ", " & mReal'Image(Input.Im) & ')');
    
    Generic
        Type T is private;
        Type IndexA is (<>);
        Type IndexB is (<>);
        Type IndexC is (<>);
        Type Cubic is Array(IndexA, IndexB, indexC) of T;
        Default : in T;
    Package Test_Data is
        Type Access_Cubic is not null access all Cubic;

        Access_Data : Constant Access_Cubic := new Cubic'(others => (others => (others => Default)));
        Data        : Cubic renames Access_Data.all;
    End Test_Data;

    Generic
        Type Element    is private;
        Type Cube_Array is array (Channel, Angle, Range_T) of Element;
        Default,
        Zero      : in Element;
        Test_Name : in String;
        with Function "+"(Left, Right: Element) return Element is <>;
        with Function Image( Input : Element )  return string  is <>;
    Procedure TEST;


    Procedure TEST is

        Package Test_Set is new Test_Data(
           T       => Element,
           IndexA  => Channel,
           IndexB  => Angle,
           IndexC  => Range_T,
           Cubic   => Cube_Array,
           Default => Default
          );
        
        procedure SpeedSum is new Timing_Harness(
           Element => Element,
           Cubic   => Cube_Array,
           Zero    => Zero,
           "+"     => TEST."+"
          );
        
        Cube   : Cube_Array renames Test_Set.Data;
        Result : Element;
    Begin
        TIO.Put_Line(Test_Name & " cube");
        TIO.Put_Line(Test_Name & " type size is:" & Integer'Image(Element'Size));

        SpeedSum(
           Mtx          => Cube,
           S            => Result,
           Iterations   => NumIteration
          );

        TIO.Put_Line("Sum is:" & Image(Result));
    End TEST;

Begin
    REAL_CUBE_TEST:
    Declare
        Procedure Do_Test is new Test(
           Element    => mReal,
           Cube_Array => tCubeReal,
           Default    => 1.0,
           Zero       => 0.0,
           Test_Name  => "Real"
          );
    Begin
        Do_Test;
    End REAL_CUBE_TEST;

    TIO.Put_Line( (1..20 => '-') ); -- Separator.

    COMPLEX_CUBE_TEST:
    Declare
        Procedure Do_Test is new Test(
           "+"        => Ada.Numerics.Long_Complex_Types."+",
           Element    => Complex,
           Cube_Array => tCubeComplex,
           Default    => (Re => 1.0, Im => 1.0),
           Zero       => (Re => 0.0, Im => 0.0),
           Test_Name  => "Complex"
          );
    Begin
        Do_Test;
    End COMPLEX_CUBE_TEST;



    TIO.Put_Line( (1..20 => '-') ); -- Separator.


    Ada.Text_IO.Put_Line( "Done." );
End Demo;

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-06-08 17:42 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-23 23:16 GNAT vs Matlab - operation on multidimensional complex matrices darek
2020-03-24  2:07 ` johnscpg
2020-03-24  6:40 ` J-P. Rosen
2020-03-24  9:37 ` Jeffrey R. Carter
2020-03-24 14:55   ` darek
2020-03-24 15:44 ` johnscpg
2020-03-31 17:25 ` Shark8
2020-03-31 19:20   ` Simon Wright
2020-03-31 19:54     ` Shark8
2020-04-29 21:08     ` vincent.diemunsch
2020-04-01 11:01   ` darek
2020-04-01 15:01     ` J-P. Rosen
2020-04-01 16:39       ` darek
2020-04-01 17:15         ` J-P. Rosen
2020-03-31 20:05 ` Shark8
2020-03-31 20:51   ` Jeffrey R. Carter
2020-06-08 17:42 ` Shark8

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox