mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Weak scale the benchmarks automatically.
This commit is contained in:
		@@ -27,8 +27,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=4){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
			
		||||
@@ -47,8 +47,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
      
 | 
			
		||||
      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*vol*Nvec*sizeof(Real);
 | 
			
		||||
      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -61,8 +61,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=4){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
			
		||||
@@ -79,8 +79,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
     
 | 
			
		||||
      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*vol*Nvec*sizeof(Real);
 | 
			
		||||
      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -92,7 +92,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=4){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
@@ -111,8 +112,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=2*lat*lat*lat*lat*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=lat*lat*lat*lat*Nvec*1;// mul
 | 
			
		||||
      double bytes=2*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=vol*Nvec*1;// mul
 | 
			
		||||
      std::cout <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
@@ -125,8 +126,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=4){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
			
		||||
@@ -144,8 +145,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=lat*lat*lat*lat*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=vol*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
  }    
 | 
			
		||||
 
 | 
			
		||||
@@ -24,8 +24,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  for(int lat=2;lat<=24;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
@@ -40,9 +40,9 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000.0;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=3.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double footprint=2.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(6.0+8.0+8.0)*lat*lat*lat*lat;
 | 
			
		||||
      double bytes=3.0*vol*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double footprint=2.0*vol*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(6.0+8.0+8.0)*vol;
 | 
			
		||||
      std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -56,7 +56,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  for(int lat=2;lat<=24;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
			
		||||
@@ -72,8 +73,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000.0;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat;
 | 
			
		||||
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(6+8+8)*vol;
 | 
			
		||||
      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -86,7 +87,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  for(int lat=2;lat<=24;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
			
		||||
@@ -102,8 +104,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000.0;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat;
 | 
			
		||||
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(6+8+8)*vol;
 | 
			
		||||
      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -116,7 +118,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  for(int lat=2;lat<=24;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
			
		||||
@@ -132,8 +135,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000.0;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(8+8+8)*lat*lat*lat*lat;
 | 
			
		||||
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
 | 
			
		||||
      double flops=Nc*Nc*(8+8+8)*vol;
 | 
			
		||||
      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -54,7 +54,6 @@ namespace Grid {
 | 
			
		||||
  const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
 | 
			
		||||
  const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
  // Command line parsing assist for stock controls
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -27,9 +27,11 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
  for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
      int o  = n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
      int bo = n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
@@ -55,9 +57,11 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
  for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
 | 
			
		||||
      int o=n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
      int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
@@ -103,9 +107,11 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
  for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
      int o   =n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
      int bo  =n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
@@ -129,10 +135,11 @@ PARALLEL_NESTED_LOOP2
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
  for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
      int o      = n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
      int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
@@ -157,9 +164,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
 | 
			
		||||
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
  for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
 
 | 
			
		||||
      int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
			
		||||
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
 | 
			
		||||
@@ -186,9 +195,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
 | 
			
		||||
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block [dimension];
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
  for(int b=0;b<rhs._grid->_slice_block [dimension];b++){
 | 
			
		||||
  for(int n=0;n<e1;n++){
 | 
			
		||||
  for(int b=0;b<e2;b++){
 | 
			
		||||
 | 
			
		||||
      int o  =n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user