Grid/benchmarks/Benchmark_wilson.cc

#include <Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

// Minimal aggregate wrapping a single value of type `d` in the public
// member `internal`.
template<typename d>
struct scal
{
  d internal;
};

  // Lookup table from direction index (0..3) to the gamma matrix for that
  // direction, in X, Y, Z, T order.
  Gamma::GammaMatrix Gmu [] = { Gamma::GammaX, Gamma::GammaY, Gamma::GammaZ, Gamma::GammaT };

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);


  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);

  int threads = GridThread::GetThreads();
  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
  std::cout << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
  std::cout << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);
  //  pRNG.SeedRandomDevice();

  LatticeFermion src   (&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeFermion    ref(&Grid);    ref=zero;
  LatticeFermion    tmp(&Grid);    tmp=zero;
  LatticeFermion    err(&Grid);    tmp=zero;
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

  double volume=1;
  for(int mu=0;mu<Nd;mu++){
    volume=volume*latt_size[mu];
  }  

  // Only one non-zero (y)
#if 0
  Umu=zero;
  Complex cone(1.0,0.0);
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
    if(0) {
      if (nn==-1) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
      else       { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
    }
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
#endif

  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  
  { // Naive wilson implementation
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
  }
  ref = -0.5*ref;
  RealD mass=0.1;
  WilsonFermion Dw(Umu,Grid,RBGrid,mass);
  
  std::cout << "Calling Dw"<<std::endl;
  int ncall=10000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.Dhop(src,result,0);
  }
  double t1=usecond();
  double flops=1344*volume*ncall;
  
  std::cout << "Called Dw"<<std::endl;
  std::cout << "norm result "<< norm2(result)<<std::endl;
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
  std::cout << "norm diff   "<< norm2(err)<<std::endl;


  //  for(int ss=0;ss<10;ss++ ){
  for(int ss=0;ss<0;ss++ ){
    for(int i=0;i<Ns;i++){
      for(int j=0;j<Nc;j++){
	ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);
	ComplexF * res_p = (ComplexF *)&result._odata[ss]()(i)(j);
	std::cout << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
      }
    }
  }

  { // Naive wilson dag implementation
    ref = zero;
    for(int mu=0;mu<Nd;mu++){

      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
  }
  ref = -0.5*ref;
  Dw.Dhop(src,result,1);
  std::cout << "Called DwDag"<<std::endl;
  std::cout << "norm result "<< norm2(result)<<std::endl;
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
  std::cout << "norm diff   "<< norm2(err)<<std::endl;

  Grid_finalize();
}
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`#include <Grid.h>`

			`using namespace std;`
			`using namespace Grid;`
			`using namespace Grid::QCD;`

			`template<class d>`
			`struct scal {`
			`d internal;`
			`};`

Reworking CSHIFT and Stencil. Implementing Wilson and discovered rework is required 2015-04-27 13:45:07 +01:00			`Gamma::GammaMatrix Gmu [] = {`
			`Gamma::GammaX,`
			`Gamma::GammaY,`
			`Gamma::GammaZ,`
			`Gamma::GammaT`
			`};`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00
			`int main (int argc, char ** argv)`
			`{`
			`Grid_init(&argc,&argv);`

Command line args and a general clean up 2015-05-11 12:43:10 +01:00
Adding a better controlled threading class, preparing to force in deterministic reduction. 2015-05-11 18:59:03 +01:00			`std::vector<int> latt_size = GridDefaultLatt();`
Change the SIMD command correctly with precision = double vs. single and connect the "Real" default precisoin to a configure flag. Have RealF, RealD and Real types, where Real is compile target dependent single/double, RealF is single and RealD is double etc.. 2015-07-01 22:45:15 +01:00			`std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());`
Adding a better controlled threading class, preparing to force in deterministic reduction. 2015-05-11 18:59:03 +01:00			`std::vector<int> mpi_layout = GridDefaultMpi();`
Schur complement based red-black inversion working 2015-05-25 13:47:12 +01:00			`GridCartesian Grid(latt_size,simd_layout,mpi_layout);`
			`GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);`
Adding a better controlled threading class, preparing to force in deterministic reduction. 2015-05-11 18:59:03 +01:00
Add messages to get the number of threads for openmp 2015-05-19 14:54:42 +01:00			`int threads = GridThread::GetThreads();`
			`std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;`
Change the SIMD command correctly with precision = double vs. single and connect the "Real" default precisoin to a configure flag. Have RealF, RealD and Real types, where Real is compile target dependent single/double, RealF is single and RealD is double etc.. 2015-07-01 22:45:15 +01:00			`std::cout << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;`
			`std::cout << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;`
			`std::cout << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;`
Add messages to get the number of threads for openmp 2015-05-19 14:54:42 +01:00
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`std::vector<int> seeds({1,2,3,4});`
			`GridParallelRNG pRNG(&Grid);`
Hand unrolled version of dslash in a separate class. Useful to compare; raises Intel compiler from 9GFlop/s to 17.5 Gflops. on ivybridge core. Raises Clang form 14.5 to 17.5 2015-05-26 19:54:03 +01:00			`pRNG.SeedFixedIntegers(seeds);`
			`// pRNG.SeedRandomDevice();`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00
Schur complement based red-black inversion working 2015-05-25 13:47:12 +01:00			`LatticeFermion src (&Grid); random(pRNG,src);`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`LatticeFermion result(&Grid); result=zero;`
			`LatticeFermion ref(&Grid); ref=zero;`
Reworking CSHIFT and Stencil. Implementing Wilson and discovered rework is required 2015-04-27 13:45:07 +01:00			`LatticeFermion tmp(&Grid); tmp=zero;`
Schur complement based red-black inversion working 2015-05-25 13:47:12 +01:00			`LatticeFermion err(&Grid); tmp=zero;`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`LatticeGaugeField Umu(&Grid); random(pRNG,Umu);`
			`std::vector<LatticeColourMatrix> U(4,&Grid);`

Benchmark wilson dhop now; 14.6GF on one core, not as fast as SU(3)xSU(3) [23GF] but still not too shabby. Disassembling output shows ugly sequences in the permute sector. Could comparatively benchmark with and without the if-else structure to see how much I'm losing. Drops to 9GF as it falls out of cache. Moving to Lebesgue ordering should help there. Substantive progress. 2015-04-29 06:50:18 +01:00			`double volume=1;`
			`for(int mu=0;mu<Nd;mu++){`
			`volume=volume*latt_size[mu];`
			`}`

Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`// Only one non-zero (y)`
Large scale change to support 5d fermion formulations. Have 5d replicated wilson with 4d gauge working and matrix regressing to Ls copies of wilson. 2015-05-31 15:09:02 +01:00			`#if 0`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`Umu=zero;`
Schur complement based red-black inversion working 2015-05-25 13:47:12 +01:00			`Complex cone(1.0,0.0);`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`for(int nn=0;nn<Nd;nn++){`
			`random(pRNG,U[nn]);`
Hand unrolled version of dslash in a separate class. Useful to compare; raises Intel compiler from 9GFlop/s to 17.5 Gflops. on ivybridge core. Raises Clang form 14.5 to 17.5 2015-05-26 19:54:03 +01:00			`if(0) {`
			`if (nn==-1) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }`
			`else { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }`
			`}`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`pokeIndex<LorentzIndex>(Umu,U[nn],nn);`
			`}`
Large scale change to support 5d fermion formulations. Have 5d replicated wilson with 4d gauge working and matrix regressing to Ls copies of wilson. 2015-05-31 15:09:02 +01:00			`#endif`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`for(int mu=0;mu<Nd;mu++){`
big commit fixing nocompiles in defective C++11 compilers (gcc, icpc). stared getting to near the bleeding edge I guess 2015-06-30 15:01:44 +01:00			`U[mu] = PeekIndex<LorentzIndex>(Umu,mu);`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`}`

Reworking CSHIFT and Stencil. Implementing Wilson and discovered rework is required 2015-04-27 13:45:07 +01:00			`{ // Naive wilson implementation`
			`ref = zero;`
			`for(int mu=0;mu<Nd;mu++){`
			`// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x`
Remove debug masking 2015-05-15 11:51:15 +01:00			`tmp = U[mu]*Cshift(src,mu,1);`
			`for(int i=0;i<ref._odata.size();i++){`
			`ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;`
Reworking CSHIFT and Stencil. Implementing Wilson and discovered rework is required 2015-04-27 13:45:07 +01:00			`}`

Remove debug masking 2015-05-15 11:51:15 +01:00			`tmp =adj(U[mu])*src;`
			`tmp =Cshift(tmp,mu,-1);`
			`for(int i=0;i<ref._odata.size();i++){`
			`ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;`
Reworking CSHIFT and Stencil. Implementing Wilson and discovered rework is required 2015-04-27 13:45:07 +01:00			`}`
			`}`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`}`
Large scale change to support 5d fermion formulations. Have 5d replicated wilson with 4d gauge working and matrix regressing to Ls copies of wilson. 2015-05-31 15:09:02 +01:00			`ref = -0.5*ref;`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`RealD mass=0.1;`
Large scale change to support 5d fermion formulations. Have 5d replicated wilson with 4d gauge working and matrix regressing to Ls copies of wilson. 2015-05-31 15:09:02 +01:00			`WilsonFermion Dw(Umu,Grid,RBGrid,mass);`
Benchmark wilson dhop now; 14.6GF on one core, not as fast as SU(3)xSU(3) [23GF] but still not too shabby. Disassembling output shows ugly sequences in the permute sector. Could comparatively benchmark with and without the if-else structure to see how much I'm losing. Drops to 9GF as it falls out of cache. Moving to Lebesgue ordering should help there. Substantive progress. 2015-04-29 06:50:18 +01:00
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`std::cout << "Calling Dw"<<std::endl;`
Hand unrolled version of dslash in a separate class. Useful to compare; raises Intel compiler from 9GFlop/s to 17.5 Gflops. on ivybridge core. Raises Clang form 14.5 to 17.5 2015-05-26 19:54:03 +01:00			`int ncall=10000;`
Benchmark wilson dhop now; 14.6GF on one core, not as fast as SU(3)xSU(3) [23GF] but still not too shabby. Disassembling output shows ugly sequences in the permute sector. Could comparatively benchmark with and without the if-else structure to see how much I'm losing. Drops to 9GF as it falls out of cache. Moving to Lebesgue ordering should help there. Substantive progress. 2015-04-29 06:50:18 +01:00			`double t0=usecond();`
			`for(int i=0;i<ncall;i++){`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`Dw.Dhop(src,result,0);`
Benchmark wilson dhop now; 14.6GF on one core, not as fast as SU(3)xSU(3) [23GF] but still not too shabby. Disassembling output shows ugly sequences in the permute sector. Could comparatively benchmark with and without the if-else structure to see how much I'm losing. Drops to 9GF as it falls out of cache. Moving to Lebesgue ordering should help there. Substantive progress. 2015-04-29 06:50:18 +01:00			`}`
			`double t1=usecond();`
Large scale change to support 5d fermion formulations. Have 5d replicated wilson with 4d gauge working and matrix regressing to Ls copies of wilson. 2015-05-31 15:09:02 +01:00			`double flops=1344*volume*ncall;`
Benchmark wilson dhop now; 14.6GF on one core, not as fast as SU(3)xSU(3) [23GF] but still not too shabby. Disassembling output shows ugly sequences in the permute sector. Could comparatively benchmark with and without the if-else structure to see how much I'm losing. Drops to 9GF as it falls out of cache. Moving to Lebesgue ordering should help there. Substantive progress. 2015-04-29 06:50:18 +01:00
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`std::cout << "Called Dw"<<std::endl;`
			`std::cout << "norm result "<< norm2(result)<<std::endl;`
			`std::cout << "norm ref "<< norm2(ref)<<std::endl;`
Added a comms benchmark 2015-05-02 23:42:30 +01:00			`std::cout << "mflop/s = "<< flops/(t1-t0)<<std::endl;`
Schur complement based red-black inversion working 2015-05-25 13:47:12 +01:00			`err = ref-result;`
Added a comms benchmark 2015-05-02 23:42:30 +01:00			`std::cout << "norm diff "<< norm2(err)<<std::endl;`

Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00
Fixed the stencil sector and Wilson now agrees between stencil based implementation and the cshift based implementation. Managed to reduce the volume of code in this sector a little, but consolidation would be good, perhaps taking common logic out into simple helper functions 2015-04-29 06:23:56 +01:00			`// for(int ss=0;ss<10;ss++ ){`
			`for(int ss=0;ss<0;ss++ ){`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`for(int i=0;i<Ns;i++){`
			`for(int j=0;j<Nc;j++){`
			`ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);`
			`ComplexF * res_p = (ComplexF *)&result._odata[ss]()(i)(j);`
			`std::cout << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;`
			`}`
			`}`
			`}`

Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`{ // Naive wilson dag implementation`
			`ref = zero;`
			`for(int mu=0;mu<Nd;mu++){`

			`// ref = src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x`
			`tmp = U[mu]*Cshift(src,mu,1);`
			`for(int i=0;i<ref._odata.size();i++){`
			`ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;`
			`}`

			`tmp =adj(U[mu])*src;`
			`tmp =Cshift(tmp,mu,-1);`
			`for(int i=0;i<ref._odata.size();i++){`
			`ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;`
			`}`
			`}`
			`}`
Large scale change to support 5d fermion formulations. Have 5d replicated wilson with 4d gauge working and matrix regressing to Ls copies of wilson. 2015-05-31 15:09:02 +01:00			`ref = -0.5*ref;`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`Dw.Dhop(src,result,1);`
			`std::cout << "Called DwDag"<<std::endl;`
			`std::cout << "norm result "<< norm2(result)<<std::endl;`
			`std::cout << "norm ref "<< norm2(ref)<<std::endl;`
Schur complement based red-black inversion working 2015-05-25 13:47:12 +01:00			`err = ref-result;`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`std::cout << "norm diff "<< norm2(err)<<std::endl;`

Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`Grid_finalize();`
			`}`