From 62dccb32476527cd77aae6ebeba98d28c20f6f9a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 28 May 2015 13:47:01 +0100
Subject: [PATCH] Weak scale the benchmarks automatically.

---
 benchmarks/Grid_memory_bandwidth.cc | 31 +++++++++++----------
 benchmarks/Grid_su3.cc              | 31 +++++++++++----------
 lib/Grid_init.cc                    |  1 -
 lib/cshift/Grid_cshift_common.h     | 43 ++++++++++++++++++-----------
 4 files changed, 60 insertions(+), 46 deletions(-)
diff --git a/benchmarks/Grid_memory_bandwidth.cc b/benchmarks/Grid_memory_bandwidth.cc
index fc959c38..5abdc6a3 100644
--- a/benchmarks/Grid_memory_bandwidth.cc
+++ b/benchmarks/Grid_memory_bandwidth.cc
@@ -27,8 +27,8 @@ int main (int argc, char ** argv)
 
   for(int lat=4;lat<=32;lat+=4){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
-
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: weak-scaled global volume and the flops/bytes products built from it can overflow int
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
       //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
@@ -47,8 +47,8 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;
       
-      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
-      double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real);
+      double flops=vol*Nvec*2;// mul,add
+      double bytes=3*vol*Nvec*sizeof(Real);
       std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 
     }
@@ -61,8 +61,8 @@ int main (int argc, char ** argv)
   
   for(int lat=4;lat<=32;lat+=4){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
-
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: weak-scaled global volume and the flops/bytes products built from it can overflow int
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
       //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
@@ -79,8 +79,8 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;
      
-      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
-      double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real);
+      double flops=vol*Nvec*2;// mul,add
+      double bytes=3*vol*Nvec*sizeof(Real);
       std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 
     }
@@ -92,7 +92,8 @@ int main (int argc, char ** argv)
 
   for(int lat=4;lat<=32;lat+=4){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: weak-scaled global volume and the flops/bytes products built from it can overflow int
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
@@ -111,8 +112,8 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;
       
-      double bytes=2*lat*lat*lat*lat*Nvec*sizeof(Real);
-      double flops=lat*lat*lat*lat*Nvec*1;// mul
+      double bytes=2*vol*Nvec*sizeof(Real);
+      double flops=vol*Nvec*1;// mul
       std::cout <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 
   }
@@ -125,8 +126,8 @@ int main (int argc, char ** argv)
 
   for(int lat=4;lat<=32;lat+=4){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
-
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: weak-scaled global volume and the flops/bytes products built from it can overflow int
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
       //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
@@ -144,8 +145,8 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;
       
-      double bytes=lat*lat*lat*lat*Nvec*sizeof(Real);
-      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
+      double bytes=vol*Nvec*sizeof(Real);
+      double flops=vol*Nvec*2;// mul,add
       std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
 
   }    
diff --git a/benchmarks/Grid_su3.cc b/benchmarks/Grid_su3.cc
index d64e8706..903ea348 100644
--- a/benchmarks/Grid_su3.cc
+++ b/benchmarks/Grid_su3.cc
@@ -24,8 +24,8 @@ int main (int argc, char ** argv)
 
   for(int lat=2;lat<=24;lat+=2){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
-
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: the weak-scaled global volume can exceed the range of int
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
@@ -40,9 +40,9 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000.0;
       
-      double bytes=3.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
-      double footprint=2.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
-      double flops=Nc*Nc*(6.0+8.0+8.0)*lat*lat*lat*lat;
+      double bytes=3.0*vol*Nc*Nc*sizeof(Complex);
+      double footprint=2.0*vol*Nc*Nc*sizeof(Complex);
+      double flops=Nc*Nc*(6.0+8.0+8.0)*vol;
       std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 
     }
@@ -56,7 +56,8 @@ int main (int argc, char ** argv)
 
   for(int lat=2;lat<=24;lat+=2){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: weak-scaled global volume and the integer flops/bytes products built from it can overflow int
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
@@ -72,8 +73,8 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000.0;
       
-      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
-      double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat;
+      double bytes=3*vol*Nc*Nc*sizeof(Complex);
+      double flops=Nc*Nc*(6+8+8)*vol;
       std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 
     }
@@ -86,7 +87,8 @@ int main (int argc, char ** argv)
 
   for(int lat=2;lat<=24;lat+=2){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: weak-scaled global volume and the integer flops/bytes products built from it can overflow int
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
@@ -102,8 +104,8 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000.0;
       
-      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
-      double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat;
+      double bytes=3*vol*Nc*Nc*sizeof(Complex);
+      double flops=Nc*Nc*(6+8+8)*vol;
       std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 
     }
@@ -116,7 +118,8 @@ int main (int argc, char ** argv)
 
   for(int lat=2;lat<=24;lat+=2){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      double vol = static_cast<double>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; // double: weak-scaled global volume and the integer flops/bytes products built from it can overflow int
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
@@ -132,8 +135,8 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000.0;
       
-      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
-      double flops=Nc*Nc*(8+8+8)*lat*lat*lat*lat;
+      double bytes=3*vol*Nc*Nc*sizeof(Complex);
+      double flops=Nc*Nc*(8+8+8)*vol;
       std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 
     }
diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc
index 7fce4793..f72393cb 100644
--- a/lib/Grid_init.cc
+++ b/lib/Grid_init.cc
@@ -54,7 +54,6 @@ namespace Grid {
   const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
   const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
 
-
   ////////////////////////////////////////////////////////////
   // Command line parsing assist for stock controls
   ////////////////////////////////////////////////////////////
diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h
index 97e66b2c..c369fe1c 100644
--- a/lib/cshift/Grid_cshift_common.h
+++ b/lib/cshift/Grid_cshift_common.h
@@ -27,9 +27,11 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
 
   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
   
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block[dimension];
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+  for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
       int o  = n*rhs._grid->_slice_stride[dimension];
       int bo = n*rhs._grid->_slice_block[dimension];
       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
@@ -54,10 +56,12 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
   }
 
   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-    
+
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block[dimension];
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+  for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
 
       int o=n*rhs._grid->_slice_stride[dimension];
       int offset = b+n*rhs._grid->_slice_block[dimension];
@@ -103,9 +107,11 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
 
   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
     
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block[dimension];
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+  for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
       int o   =n*rhs._grid->_slice_stride[dimension];
       int bo  =n*rhs._grid->_slice_block[dimension];
       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
@@ -129,10 +135,11 @@ PARALLEL_NESTED_LOOP2
 
   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
     
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block[dimension];
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
+  for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
       int o      = n*rhs._grid->_slice_stride[dimension];
       int offset = b+n*rhs._grid->_slice_block[dimension];
       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
@@ -156,10 +163,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
 
   int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
   int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
-  
+
+  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
+  int e2=rhs._grid->_slice_block[dimension];
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+  for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
  
       int o =n*rhs._grid->_slice_stride[dimension]+b;
       int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
@@ -185,10 +194,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
 
   int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
   int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
-  
+
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block [dimension];
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-  for(int b=0;b<rhs._grid->_slice_block [dimension];b++){
+  for(int n=0;n<e1;n++){
+  for(int b=0;b<e2;b++){
 
       int o  =n*rhs._grid->_slice_stride[dimension];
       int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);