mirror of https://github.com/paboyle/Grid.git (synced 2025-11-03 21:44:33 +00:00)

Commit: merge upstream develop
@@ -14,6 +14,7 @@ std::string filestem(const int l)
 
 int main (int argc, char ** argv)
 {
+#ifdef HAVE_LIME
   Grid_init(&argc,&argv);
 
   int64_t threads = GridThread::GetThreads();
@@ -42,6 +43,6 @@ int main (int argc, char ** argv)
   }
 
   Grid_finalize();
-
+#endif
  return EXIT_SUCCESS;
 }

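Note on the two hunks above: the body of main is fenced with HAVE_LIME so the binary still builds, and exits cleanly, when Grid is configured without the LIME I/O library. The resulting skeleton, assembled from the two hunks (HAVE_LIME is assumed to come from Grid's configure step; the Grid calls are elided as comments so the sketch stands alone):

#include <cstdlib>

int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
  // Grid_init(&argc,&argv); ... LIME-dependent I/O benchmark ... Grid_finalize();
#endif
  return EXIT_SUCCESS;   // no-op success when LIME is absent
}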
@@ -2,7 +2,7 @@
 #define Benchmark_IO_hpp_
 
 #include <Grid/Grid.h>
-
+#ifdef HAVE_LIME
 #define MSG std::cout << GridLogMessage
 #define SEP \
 "============================================================================="
@@ -104,4 +104,5 @@ void readBenchmark(const Coordinate &latt, const std::string filename,
 
 }
 
+#endif //LIME
 #endif // Benchmark_IO_hpp_

@@ -8,6 +8,7 @@ using namespace Grid;
 
 int main (int argc, char ** argv)
 {
+#ifdef HAVE_LIME
   std::vector<std::string> dir;
   unsigned int             Ls;
   bool                     rb;
@@ -73,6 +74,6 @@ int main (int argc, char ** argv)
   }
 
   Grid_finalize();
-
+#endif
  return EXIT_SUCCESS;
 }

@@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 using namespace Grid;
 
-
 std::vector<int> L_list;
 std::vector<int> Ls_list;
 std::vector<double> mflop_list;
@@ -76,7 +75,6 @@ struct controls {
   int Opt;
   int CommsOverlap;
   Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
-  //  int HugePages;
 };
 
 class Benchmark {
@@ -119,14 +117,15 @@ public:
     std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
     comms_header();
 
-    for(int lat=4;lat<=maxlat;lat+=4){
-      for(int Ls=8;Ls<=8;Ls*=2){
+    for(int lat=16;lat<=maxlat;lat+=8){
+      //      for(int Ls=8;Ls<=8;Ls*=2){
+      { int Ls=12;
 
 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
 
 	std::cout << GridLogMessage<< latt_size <<std::endl;
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
@@ -184,9 +183,6 @@ public:
 	}
 
 	timestat.statistics(t_time);
-	//	for(int i=0;i<t_time.size();i++){
-	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
-	//	}
 
 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
@@ -199,8 +195,6 @@ public:
 		 <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
 		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
-	
 	    }
     }    
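The bandwidth lines above print mean, error, max and min figures per message size. A minimal sketch of the aggregation the timestat object supplies here, limited to the fields this benchmark actually reads (mean, err, min, max); the error convention below is an assumption, not Grid's actual time_statistics implementation:

#include <vector>
#include <cmath>
#include <algorithm>

// Hypothetical stand-in for Grid's time_statistics.
struct time_statistics_sketch {
  double mean, err, min, max;
  void statistics(const std::vector<double>& t) {
    mean = 0; for (double x : t) mean += x;
    mean /= t.size();
    double ss = 0; for (double x : t) ss += (x - mean) * (x - mean);
    err = std::sqrt(ss) / t.size();   // assumed: ~standard error of the mean
    min = *std::min_element(t.begin(), t.end());
    max = *std::max_element(t.begin(), t.end());
  }
};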
@@ -227,14 +221,15 @@ public:
     uint64_t NN;
 
 
-  uint64_t lmax=48;
+  uint64_t lmax=32;
 #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    for(int lat=8;lat<=lmax;lat+=4){
+    for(int lat=8;lat<=lmax;lat+=8){
 
       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
       //      NP= Grid.RankCount();
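NLOOP deserves a note: it scales the repeat count inversely with local volume, so every lattice size streams a comparable total data volume. A quick standalone check of the arithmetic with the new lmax=32 (this is just the macro's formula evaluated and printed):

#include <cstdio>
#include <cstdint>

int main() {
  const uint64_t lmax = 32;
  // Same formula as the NLOOP macro: iterations shrink as lat^4 grows,
  // keeping iterations*volume roughly constant at 100*lmax^4.
  for (uint64_t lat = 8; lat <= lmax; lat += 8) {
    uint64_t nloop = 100*lmax*lmax*lmax*lmax/lat/lat/lat/lat;
    std::printf("lat=%2llu  NLOOP=%llu\n",
                (unsigned long long)lat, (unsigned long long)nloop);
  }
  // Prints: lat= 8 NLOOP=25600 ; lat=16 NLOOP=1600 ; lat=24 NLOOP=316 ; lat=32 NLOOP=100
  return 0;
}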
@@ -242,9 +237,9 @@ public:
 
       Vec rn ; random(sRNG,rn);
 
-      LatticeVec z(&Grid); z=rn;
-      LatticeVec x(&Grid); x=rn;
-      LatticeVec y(&Grid); y=rn;
+      LatticeVec z(&Grid); z=Zero();
+      LatticeVec x(&Grid); x=Zero();
+      LatticeVec y(&Grid); y=Zero();
       double a=2.0;
 
       uint64_t Nloop=NLOOP;
@@ -252,9 +247,9 @@ public:
       double start=usecond();
       for(int i=0;i<Nloop;i++){
 	z=a*x-y;
-	auto x_v = x.View();
-	auto y_v = y.View();
-	auto z_v = z.View();
+	autoView( x_v , x, CpuWrite);
+	autoView( y_v , y, CpuWrite);
+	autoView( z_v , z, CpuRead);
         x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
         y_v[4]=z_v[4];
       }
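This hunk shows the pattern repeated throughout the commit: raw .View() handles become scoped autoView(handle, field, mode) declarations with an explicit access mode, so an accelerator-aware container knows whether host data must be made current and whether the host copy is dirtied. A toy, self-contained sketch of the idiom; the Field/View types below are illustrations, not Grid's implementation, and only the macro shape mirrors Grid's autoView:

#include <cstdio>
#include <cstddef>
#include <vector>

enum ViewMode { CpuRead, CpuWrite };

// Toy container: opening a view declares intent; closing it (end of
// scope) is the point where a stale device copy could be flagged.
template<class T> struct Field {
  std::vector<T> host;
  bool deviceStale = false;
  struct View {
    Field &f; ViewMode m;
    View(Field &fld, ViewMode mode) : f(fld), m(mode) { /* sync host copy here if needed */ }
    ~View() { if (m == CpuWrite) f.deviceStale = true; }
    T &operator[](std::size_t i) { return f.host[i]; }
  };
};
// Mirrors the shape of Grid's autoView(name, field, mode) macro.
#define autoView(name, field, mode) decltype(field)::View name(field, mode)

int main() {
  Field<double> x; x.host.assign(16, 1.0);
  {
    autoView(x_v, x, CpuWrite);  // scoped: the view closes at the end of this block
    x_v[0] = 2.0;
  }
  std::printf("deviceStale=%d\n", (int)x.deviceStale);
  return 0;
}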
@@ -270,191 +265,8 @@ public:
     }
   };
 
-#if 0
-  static double DWF5(int Ls,int L)
-  {
-    //    RealD mass=0.1;
-    RealD M5  =1.8;
-
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
-    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    std::vector<int> seeds5({5,6,7,8});
-    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    ///////// Source preparation ////////////
-    LatticeFermion src   (sFGrid); 
-    LatticeFermion tmp   (sFGrid);
-    std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
-    random(RNG5,src);
-    std::cout << GridLogMessage << "intialised random source" << std::endl;
-
-    RealD N2 = 1.0/::sqrt(norm2(src));
-    src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-
-    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
-    LatticeFermion src_e (sFrbGrid);
-    LatticeFermion src_o (sFrbGrid);
-    LatticeFermion r_e   (sFrbGrid);
-    LatticeFermion r_o   (sFrbGrid);
-    LatticeFermion r_eo  (sFGrid);
-    LatticeFermion err   (sFGrid);
-    {
-
-      pickCheckerboard(Even,src_e,src);
-      pickCheckerboard(Odd,src_o,src);
-
-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
-      const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
-      controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-
-	 WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	 WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-	int nwarm = 100;
-	uint64_t ncall = 1000;
-
-	double t0=usecond();
-	sFGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	}
-	sFGrid->Barrier();
-	double t1=usecond();
-
-	sDw.ZeroCounters();
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	sFGrid->Barrier();
-	
-	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1344.0*volume)/2;
-	double mf_hi, mf_lo, mf_err;
-
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
-
-	sDw.Report();
-
-      }
-      double robust = mflops_worst/mflops_best;;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-
-      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage;
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    }
-    return mflops_best;
-  }
-#endif
 
-  static double DWF(int Ls,int L, double & robust)
+  static double DWF(int Ls,int L)
   {
     RealD mass=0.1;
     RealD M5  =1.8;
@@ -471,37 +283,30 @@ public:
     Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
     Coordinate local({L,L,L,L});
 
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
     NN_global=NN;
     uint64_t SHM=NP/NN;
 
-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
 
     ///////// Welcome message ////////////
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
     std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-
     ///////// Lattice Init ////////////
-    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
     GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
     GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -514,74 +319,31 @@ public:
     GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
     std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 
+    typedef DomainWallFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+    
     ///////// Source preparation ////////////
-    LatticeFermion src   (FGrid); random(RNG5,src);
-    LatticeFermion ref   (FGrid);
-    LatticeFermion tmp   (FGrid);
+    Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Fermion src   (FGrid); random(RNG5,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
     RealD N2 = 1.0/::sqrt(norm2(src));
     std::cout<<GridLogMessage << "Normalising src  "<< N2 <<std::endl;
     src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-    
-
-    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-    ////////////////////////////////////
-    // Naive wilson implementation
-    ////////////////////////////////////
-    {
-      LatticeGaugeField Umu5d(FGrid); 
-      std::vector<LatticeColourMatrix> U(4,FGrid);
-      auto Umu_v = Umu.View();
-      auto Umu5d_v = Umu5d.View();
-      for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-	for(int s=0;s<Ls;s++){
-	  Umu5d_v[Ls*ss+s] = Umu_v[ss];
-	}
-      }
-      ref = Zero();
-      for(int mu=0;mu<Nd;mu++){
-	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
-      }
-      for(int mu=0;mu<Nd;mu++){
-	
-	tmp = U[mu]*Cshift(src,mu+1,1);
-	ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-	
-	tmp =adj(U[mu])*src;
-	tmp =Cshift(tmp,mu+1,-1);
-	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-      }
-      ref = -0.5*ref;
-    }
-
-    LatticeFermion src_e (FrbGrid);
-    LatticeFermion src_o (FrbGrid);
-    LatticeFermion r_e   (FrbGrid);
-    LatticeFermion r_o   (FrbGrid);
-    LatticeFermion r_eo  (FGrid);
-    LatticeFermion err   (FGrid);
     {
 
       pickCheckerboard(Even,src_e,src);
       pickCheckerboard(Odd,src_o,src);
 
-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
       const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
       controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
       }; 
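The controls Cases[] table replaces the compile-time AVX512 variants with a runtime sweep over kernel option x comms overlap x communicator policy; the fmt string "G/S/C ; G/O/C ; G/S/S ; G/O/S " labels the four cases in the same order. A reduced, standalone sketch of the pattern; the enums below are placeholders standing in for the Grid statics named in the hunk:

#include <cstdio>

// Placeholders for WilsonKernelsStatic / CartesianCommunicator values.
enum Opt    { OptGeneric };
enum Comms  { CommsThenCompute, CommsAndCompute };
enum Policy { PolicyConcurrent, PolicySequential };

struct controls { Opt opt; Comms comms; Policy asynch; };

int main() {
  // Same case matrix as the diff: Generic kernel, {sequential,overlapped}
  // comms, {concurrent,sequential} policy -> "G/S/C ; G/O/C ; G/S/S ; G/O/S".
  controls Cases[] = {
    { OptGeneric, CommsThenCompute, PolicyConcurrent  },
    { OptGeneric, CommsAndCompute , PolicyConcurrent  },
    { OptGeneric, CommsThenCompute, PolicySequential  },
    { OptGeneric, CommsAndCompute , PolicySequential  },
  };
  for (const auto &c : Cases)
    std::printf("opt=%d comms=%d policy=%d\n", c.opt, c.comms, c.asynch);
  return 0;
}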
@@ -594,15 +356,12 @@ public:
 
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-	int nwarm = 200;
+	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -610,9 +369,7 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
-	//	if (ncall < 500) ncall = 500;
-	uint64_t ncall = 1000;
+	uint64_t ncall = 50;
 
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 
@@ -649,24 +406,11 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 
-	Dw.Report();
-
-	Dw.DhopEO(src_o,r_e,DaggerNo);
-	Dw.DhopOE(src_e,r_o,DaggerNo);
-	setCheckerboard(r_eo,r_o);
-	setCheckerboard(r_eo,r_e);
-	err = r_eo-ref; 
-	RealD absref = norm2(ref);
-	RealD abserr = norm2(err);
-	std::cout<<GridLogMessage << "norm diff   "<< abserr << " / " << absref<<std::endl;
-	assert(abserr<1.0e-4);
-
       }
-      robust = mflops_worst/mflops_best;
 
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
       std::cout<<GridLogMessage <<fmt << std::endl;
       std::cout<<GridLogMessage ;
 
@@ -680,8 +424,166 @@ public:
     return mflops_best;
   }
 
+
+  static double Staggered(int L)
+  {
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion; 
+    typedef LatticeGaugeFieldF Gauge;
+    
+    Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu); 
+
+    typename Action::ImplParams params;
+    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+  
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+    
+      const int num_cases = 4;
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+      
+      controls Cases [] = {
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+	
+	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+      
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = 500;
+
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+	Ds.ZeroCounters();
+
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1146.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+	
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+	
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+      
+      }
+
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    return mflops_best;
+  }
 };
 
 
 
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -696,62 +598,50 @@ int main (int argc, char ** argv)
 
   int do_memory=1;
   int do_comms =1;
   int do_su3   =0;
-  int do_wilson=1;
   int do_dwf   =1;
 
   if ( do_su3 ) {
     // empty for now
   }
-#if 1
-  int sel=2;
-  Coordinate L_list({8,12,16,24});
-#else
-  int sel=1;
-  Coordinate L_list({8,12});
-#endif
+  int sel=2;
+  std::vector<int> L_list({16,24,32});
   int selm1=sel-1;
-  std::vector<double> robust_list;
 
   std::vector<double> wilson;
   std::vector<double> dwf4;
-  std::vector<double> dwf5;
+  std::vector<double> staggered;
 
-  if ( do_wilson ) {
-    int Ls=1;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
-    }
-  }
+  int Ls=1;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
+  }
 
-  int Ls=16;
-  if ( do_dwf ) {
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
-      dwf4.push_back(result);
-      robust_list.push_back(robust);
-    }
+  Ls=12;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::DWF(Ls,L_list[l]) ;
+    dwf4.push_back(result);
+  }
 
-  if ( do_dwf ) {
+  /*
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::Staggered(L_list[l]) ;
+    staggered.push_back(result);
+  }
+  */
 
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
   for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  }
 
  int NN=NN_global;
  if ( do_memory ) {
@@ -768,24 +658,20 @@ int main (int argc, char ** argv)
     Benchmark::Comms();
   }
 
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
-  }
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
-  std::cout<<std::setprecision(3);
-  std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
+  if ( do_dwf ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
+    std::cout<<std::setprecision(3);
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }
 
  Grid_finalize();
 }

@@ -21,7 +21,7 @@
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #define CUDA_PROFILE
 #endif
 
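The only change in this hunk is the guard rename GRID_NVCC -> GRID_CUDA, tracking the split of Grid's accelerator macros; CUDA_PROFILE is still defined for CUDA builds. For reference, the usual way such a macro brackets a timed region with the CUDA profiler (a sketch using the standard cuda_profiler_api.h hooks; how this particular file consumes CUDA_PROFILE is outside the hunk shown):

#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif

void timed_section(void (*kernels)()) {
#ifdef CUDA_PROFILE
  cudaProfilerStart();   // begin capture under nvprof / Nsight
#endif
  kernels();             // the region being measured
#ifdef CUDA_PROFILE
  cudaProfilerStop();    // end capture
#endif
}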
@@ -129,8 +129,8 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu5d(FGrid);
   std::vector<LatticeColourMatrix> U(4,FGrid);
   {
-    auto Umu5d_v = Umu5d.View();
-    auto Umu_v = Umu.View();
+    autoView( Umu5d_v, Umu5d, CpuWrite);
+    autoView( Umu_v  , Umu  , CpuRead);
     for(int ss=0;ss<Umu.Grid()->oSites();ss++){
       for(int s=0;s<Ls;s++){
 	Umu5d_v[Ls*ss+s] = Umu_v[ss];
@@ -272,8 +272,8 @@ int main (int argc, char ** argv)
       //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
       tmp = U[mu]*Cshift(src,mu+1,1);
       {
-	auto ref_v = ref.View();
-	auto tmp_v = tmp.View();
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
 	for(int i=0;i<ref_v.size();i++){
 	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
 	}
@@ -282,8 +282,8 @@ int main (int argc, char ** argv)
       tmp =adj(U[mu])*src;
       tmp =Cshift(tmp,mu+1,-1);
       {
-	auto ref_v = ref.View();
-	auto tmp_v = tmp.View();
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
 	for(int i=0;i<ref_v.size();i++){
 	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
 	}

@@ -130,11 +130,13 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   LatticeGaugeField Umu5d(FGrid); 
 
   // replicate across fifth dimension
-  auto Umu5d_v = Umu5d.View();
-  auto Umu_v   = Umu.View();
-  for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-    for(int s=0;s<Ls;s++){
-      Umu5d_v[Ls*ss+s] = Umu_v[ss];
-    }
-  }
+  {
+    autoView(Umu5d_v , Umu5d, CpuWrite);
+    autoView(  Umu_v , Umu,   CpuRead);
+    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
+      for(int s=0;s<Ls;s++){
+	Umu5d_v[Ls*ss+s] = Umu_v[ss];
+      }
+    }
+  }

@@ -79,7 +79,7 @@ int main (int argc, char ** argv)
 
     double start=usecond();
     thread_for(t,threads,{
-      auto x_t = x[t].View();
+      autoView( x_t , x[t],CpuRead);
      sum[t] = x_t[0];
      for(int i=0;i<Nloop;i++){
	for(auto ss=x_t.begin();ss<x_t.end();ss++){

@@ -177,9 +177,7 @@ int main (int argc, char ** argv)
      Real nn;      
      double start=usecond();
      for(int i=0;i<Nloop;i++){
-	auto x_v = x.View();
	nn=norm2(x);
-	vsplat(x_v[0]._internal[0],nn);
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

@@ -85,11 +85,11 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
      for(int b=0;b<e2;b++){
	int ss= so+n*stride+b;
	for(int i=0;i<Lblock;i++){
-	  auto lhs_v = lhs[i].View();
+	  autoView(lhs_v, lhs[i], CpuRead);
	  auto left = conjugate(lhs_v[ss]);
	  for(int j=0;j<Rblock;j++){
	    int idx = i+Lblock*j+Lblock*Rblock*r;
-	    auto rhs_v = rhs[j].View();
+	    autoView(rhs_v, rhs[j], CpuRead);
	    auto right = rhs_v[ss];
	    vector_type vv = left()(0)(0) * right()(0)(0)
	      +              left()(0)(1) * right()(0)(1)
@@ -221,12 +221,12 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
      for(int b=0;b<e2;b++){
	int ss= so+n*stride+b;
	for(int i=0;i<Lblock;i++){
- 	  auto lhs_v=lhs[i].View();
+ 	  autoView(lhs_v,lhs[i],CpuRead);
	  auto left = conjugate(lhs_v[ss]);
	  for(int j=0;j<Rblock;j++){
	  for(int mu=0;mu<Ngamma;mu++){
	    
-	    auto rhs_v = rhs[j].View();
+	    autoView(rhs_v,rhs[j],CpuRead);
	    auto right = Gamma(gammas[mu])*rhs_v[ss];

	      vector_type vv = left()(0)(0) * right()(0)(0)
@@ -370,12 +370,12 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
	int ss= so+n*stride+b;
	for(int i=0;i<Lblock;i++){
	  
-	  auto lhs_v=lhs[i].View();
+	  autoView(lhs_v,lhs[i],CpuRead);
	  auto left = conjugate(lhs_v[ss]);
	  for(int j=0;j<Rblock;j++){

	    SpinMatrix_v vv;
-	    auto rhs_v = rhs[j].View();
+	    autoView(rhs_v,rhs[j],CpuRead);
	    auto right = rhs_v[ss];
	    for(int s1=0;s1<Ns;s1++){
	    for(int s2=0;s2<Ns;s2++){
@@ -518,12 +518,12 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m

	for(int i=0;i<Lblock;i++){

-	  auto lhs_v = lhs[i].View();
+	  autoView(lhs_v,lhs[i],CpuRead);
	  auto left = conjugate(lhs_v[ss]);
	  for(int j=0;j<Rblock;j++){

	    SpinMatrix_v vv;
-	    auto rhs_v = rhs[j].View();
+	    autoView(rhs_v,rhs[j],CpuRead);
	    auto right = rhs_v[ss];
	    for(int s1=0;s1<Ns;s1++){
	    for(int s2=0;s2<Ns;s2++){
@@ -537,7 +537,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
	    // Trigger unroll
	    for ( int m=0;m<Nmom;m++){
	      int idx = m+base;
-	      auto mom_v = mom[m].View();
+	      autoView(mom_v,mom[m],CpuRead);
	      auto phase = mom_v[ss];
	      mac(&lvSum[idx],&vv,&phase);
	    }

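All five hunks above migrate per-iteration .View() calls to autoView in place, so a view is still opened once per i/j iteration of the loop nest. When the set of fields is known up front, the handles can instead be acquired once outside the hot loops; a small standalone sketch of that hoisting (plain pointers stand in for Grid's lattice views, this is a design note rather than Grid's code):

#include <cstdio>
#include <cstddef>
#include <vector>

// The point is only where the handles are acquired relative to the loops.
struct Handle { const double *p; };

double contract(const std::vector<std::vector<double>> &lhs,
                const std::vector<std::vector<double>> &rhs, int ss) {
  std::vector<Handle> lv, rv;
  for (const auto &f : lhs) lv.push_back({f.data()}); // opened once ...
  for (const auto &f : rhs) rv.push_back({f.data()});
  double acc = 0;
  for (std::size_t i = 0; i < lv.size(); i++)
    for (std::size_t j = 0; j < rv.size(); j++)
      acc += lv[i].p[ss] * rv[j].p[ss];               // ... read many times
  return acc;
}

int main() {
  std::vector<std::vector<double>> lhs(2, std::vector<double>(4, 1.0));
  std::vector<std::vector<double>> rhs(3, std::vector<double>(4, 2.0));
  std::printf("%f\n", contract(lhs, rhs, 1)); // 2*3 pairs, each 1*2 -> 12
  return 0;
}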
benchmarks/Benchmark_schur.cc (new file, 176 lines)
@@ -0,0 +1,176 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./benchmarks/Benchmark_dwf.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;

  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
  };

void benchDw(std::vector<int> & L, int Ls);

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=12;
  std::vector< std::vector<int> > latts;
#if 1
  latts.push_back(std::vector<int> ({24,24,24,24}) );
  latts.push_back(std::vector<int> ({48,24,24,24}) );
  latts.push_back(std::vector<int> ({96,24,24,24}) );
  latts.push_back(std::vector<int> ({96,48,24,24}) );
  //  latts.push_back(std::vector<int> ({96,48,48,24}) );
  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
#else
  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
  latts.push_back(std::vector<int> ({96,96,96,192}) );
#endif

  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

  for (int l=0;l<latts.size();l++){
    std::vector<int> latt4 = latts[l];
    std::cout << GridLogMessage <<"\t";
    for(int d=0;d<Nd;d++){
      std::cout<<latt4[d]<<"x";
    }
    std::cout <<Ls<<"\t" ;
    benchDw (latt4,Ls);
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  Grid_finalize();
}


void benchDw(std::vector<int> & latt4, int Ls)
{
  /////////////////////////////////////////////////////////////////////////////////////
  // for Nc=3
  /////////////////////////////////////////////////////////////////////////////////////
  // Dw :  Ls*24*(7+48)= Ls*1320 
  //
  // M5D:  Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
  // Meo:  Ls*24*(7+48) + Ls*72 = Ls*1392 
  //
  // Mee:  3*Ns*2*Nc*Ls  // Chroma 6*N5*Nc*Ns 
  //
  // LeemInv : 2*2*Nc*madd*Ls
  // LeeInv  : 2*2*Nc*madd*Ls
  // DeeInv  : 4*2*Nc*mul *Ls
  // UeeInv  : 2*2*Nc*madd*Ls
  // UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
  // QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
  // Mpc => 1452*cbvol*2*Ls flops // 
  //     => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
  /////////////////////////////////////////////////////////////////////////////////////
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  //  long unsigned int single_site_flops     = 8*Nc*(7+16*Nc)*Ls;
  long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;
  long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls;
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  ColourMatrixF cm = ComplexF(1.0,0.0);

  int ncall=300;
  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;
  double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];

  LatticeGaugeFieldF Umu(UGrid); Umu=Zero();
  MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5);

  LatticeFermionF src_o (FrbGrid); src_o=1.0;
  LatticeFermionF r_o   (FrbGrid); r_o=Zero();

  int order =151;
  SchurDiagOneOperator<MobiusFermionF,LatticeFermionF>  Mpc(Dw);
  Chebyshev<LatticeFermionF>      Cheby(0.0,60.0,order);

  {
    Mpc.Mpc(src_o,r_o);
    Mpc.Mpc(src_o,r_o);
    Mpc.Mpc(src_o,r_o);

    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Mpc.Mpc(src_o,r_o);
    }
    double t1=usecond();

    double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo  so CB cancels.
    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0);
    flops=(single_site_quda_flops*volume*ncall);
    std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";

    // Cheby uses MpcDagMpc so 2x flops
    for(int i=0;i<1;i++){
    Cheby(Mpc,src_o,r_o);
    t0=usecond();
    Cheby(Mpc,src_o,r_o);
    t1=usecond();
    flops=(single_site_mpc_flops*volume*2*order);
    std::cout <<"\t"<<flops/(t1-t0);
    flops=(single_site_quda_flops*volume*2*order);
    std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
    std::cout <<std::endl;
    }
  }
  //  Dw.Report();
}
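
As a sanity check on the flop counting above, here is a hypothetical standalone snippet (not part of the benchmark) evaluating the two per-site formulas at the values the file uses, Nc=3 and Ls=12; the Mpc count works out to 2904*Ls = 34848 flops per site, consistent with the 1452*cbvol*2*Ls figure in the comment block:

  // Hypothetical check of the per-site flop formulas above (Nc=3, Ls=12).
  constexpr long Nc = 3, Ls = 12;
  constexpr long dslash = 8*Nc*(7+16*Nc);                        // 1320 flops per 4d site per Ls slice
  constexpr long mpc    = dslash*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;  // 34848 = 2904*Ls
  constexpr long quda   = dslash*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls; // 38880, dense Ls x Ls counting
  static_assert(mpc == 1452*2*Ls, "agrees with the 1452*cbvol*2*Ls comment");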

@@ -88,25 +88,6 @@ int main (int argc, char ** argv)
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  ref = Zero();
  /*  
  { // Naive wilson implementation
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){
      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
	ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
      }

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
	ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
      }
    }
  }
  ref = -0.5*ref;
  */

  RealD mass=0.1;
  RealD c1=9.0/8.0;
@@ -125,10 +106,7 @@ int main (int argc, char ** argv)

  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  Grid_finalize();
}

benchmarks/Benchmark_staggeredF.cc (new file, 114 lines)
@@ -0,0 +1,114 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./benchmarks/Benchmark_staggered.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
 ;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);
  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});

  typedef typename ImprovedStaggeredFermionF::FermionField FermionField; 
  typename ImprovedStaggeredFermionF::ImplParams params; 

  FermionField src   (&Grid); random(pRNG,src);
  FermionField result(&Grid); result=Zero();
  FermionField    ref(&Grid);    ref=Zero();
  FermionField    tmp(&Grid);    tmp=Zero();
  FermionField    err(&Grid);    err=Zero();
  LatticeGaugeFieldF Umu(&Grid); random(pRNG,Umu);
  std::vector<LatticeColourMatrixF> U(4,&Grid);

  double volume=1;
  for(int mu=0;mu<Nd;mu++){
    volume=volume*latt_size[mu];
  }  

  // Only one non-zero (y)
#if 0
  Umu=Zero();
  Complex cone(1.0,0.0);
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
    if(1) {
      if (nn!=2) { U[nn]=Zero(); std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
      //      else       { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
      else       { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; }
    }
    PokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
#endif

  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }

  RealD mass=0.1;
  RealD c1=9.0/8.0;
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  ImprovedStaggeredFermionF Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);

  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
  int ncall=1000;
  for(int i=0;i<ncall;i++){
    Ds.Dhop(src,result,0);
  }
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Ds.Dhop(src,result,0);
  }
  double t1=usecond();
  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 16*66 + 90 == 1146 flops per site
  

  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;

  Grid_finalize();
}
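
A note on units in the rate printed above: usecond() reports time in microseconds (the Schur benchmark divides by 1000*1000 to print seconds), so flops/(t1-t0) is flops per microsecond, which is Mflop/s with no further scaling. A worked example with illustrative numbers only:

  // Illustrative arithmetic only; the volume and timing below are made up.
  double total_flops = 1146.0 * (16*16*16*16) * 1000; // 1146 flops/site, 16^4 sites, ncall=1000
  double t_us        = 2.0e6;                         // suppose the timed loop took 2 seconds
  double mflops      = total_flops / t_us;            // flops/us == Mflop/s (~3.75e4 here)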

@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
#define LADD (8)

  int64_t Nwarm=20;
  int64_t Nloop=500;
  int64_t Nloop=50;

  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
@@ -66,9 +66,9 @@ int main (int argc, char ** argv)
      LatticeColourMatrix x(&Grid);// random(pRNG,x);
      LatticeColourMatrix y(&Grid);// random(pRNG,y);

      auto x_v = x.View();
      auto y_v = y.View();
      auto z_v = z.View();
      autoView( x_v , x, AcceleratorRead);
      autoView( y_v , y, AcceleratorRead);
      autoView( z_v , z, AcceleratorWrite);
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
      for(int64_t i=0;i<Nwarm;i++){
@@ -116,9 +116,9 @@ int main (int argc, char ** argv)
      LatticeColourMatrix x(&Grid);// random(pRNG,x);
      LatticeColourMatrix y(&Grid);// random(pRNG,y);

      auto x_v = x.View();
      auto y_v = y.View();
      auto z_v = z.View();
      autoView( x_v , x, AcceleratorWrite);
      autoView( y_v , y, AcceleratorRead);
      autoView( z_v , z, AcceleratorRead);
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
      for(int64_t i=0;i<Nwarm;i++){
@@ -167,9 +167,9 @@ int main (int argc, char ** argv)
      LatticeColourMatrix x(&Grid);// random(pRNG,x);
      LatticeColourMatrix y(&Grid);// random(pRNG,y);

      auto x_v = x.View();
      auto y_v = y.View();
      auto z_v = z.View();
      autoView( x_v , x, AcceleratorRead);
      autoView( y_v , y, AcceleratorRead);
      autoView( z_v , z, AcceleratorWrite);
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
      for(int64_t i=0;i<Nwarm;i++){
@@ -220,10 +220,10 @@ int main (int argc, char ** argv)
      LatticeColourMatrix y(&Grid);// random(pRNG,y);
      LatticeColourMatrix w(&Grid);// random(pRNG,y);

      auto x_v = x.View();
      auto y_v = y.View();
      auto z_v = z.View();
      auto w_v = z.View();
      autoView( x_v , x, AcceleratorRead);
      autoView( y_v , y, AcceleratorRead);
      autoView( z_v , z, AcceleratorRead);
      autoView( w_v , w, AcceleratorWrite);
      const uint64_t Nsite = x_v.size();
      const uint64_t nsimd = vComplex::Nsimd();
      for(int64_t i=0;i<Nwarm;i++){

@@ -125,8 +125,8 @@ int main (int argc, char ** argv)
      //    ref =  src + Gamma(Gamma::Algebra::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      {
	auto ref_v = ref.View();
	auto tmp_v = tmp.View();
	autoView( ref_v, ref, CpuWrite);
	autoView( tmp_v, tmp, CpuWrite);
	for(int i=0;i<ref_v.size();i++){
	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
	}
@@ -135,8 +135,8 @@ int main (int argc, char ** argv)
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      {
	auto ref_v = ref.View();
	auto tmp_v = tmp.View();
	autoView( ref_v, ref, CpuWrite);
	autoView( tmp_v, tmp, CpuWrite);
	for(int i=0;i<ref_v.size();i++){
	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
	}
@@ -209,8 +209,8 @@ int main (int argc, char ** argv)
  for(int ss=0;ss<0;ss++ ){
    for(int i=0;i<Ns;i++){
      for(int j=0;j<Nc;j++){
	auto ref_v = ref.View();
	auto result_v = result.View();
	autoView( ref_v, ref, CpuWrite);
	autoView( result_v, result, CpuWrite);
	ComplexF * ref_p = (ComplexF *)&ref_v[ss]()(i)(j);
	ComplexF * res_p = (ComplexF *)&result_v[ss]()(i)(j);
	std::cout<<GridLogMessage << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
@@ -226,8 +226,8 @@ int main (int argc, char ** argv)
      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      {
	auto ref_v = ref.View();
	auto tmp_v = tmp.View();
	autoView( ref_v, ref, CpuWrite);
	autoView( tmp_v, tmp, CpuWrite);
	for(int i=0;i<ref_v.size();i++){
	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
	}
@@ -236,8 +236,8 @@ int main (int argc, char ** argv)
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      {
	auto ref_v = ref.View();
	auto tmp_v = tmp.View();
	autoView( ref_v, ref, CpuWrite);
	autoView( tmp_v, tmp, CpuWrite);
	for(int i=0;i<ref_v.size();i++){
	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
	}