From 62dccb32476527cd77aae6ebeba98d28c20f6f9a Mon Sep 17 00:00:00 2001 From: Peter Boyle <paboyle@ph.ed.ac.uk> Date: Thu, 28 May 2015 13:47:01 +0100 Subject: [PATCH] Weak scale the benchmarks automatically. --- benchmarks/Grid_memory_bandwidth.cc | 31 +++++++++++---------- benchmarks/Grid_su3.cc | 31 +++++++++++---------- lib/Grid_init.cc | 1 - lib/cshift/Grid_cshift_common.h | 43 ++++++++++++++++++----------- 4 files changed, 60 insertions(+), 46 deletions(-) diff --git a/benchmarks/Grid_memory_bandwidth.cc b/benchmarks/Grid_memory_bandwidth.cc index fc959c38..5abdc6a3 100644 --- a/benchmarks/Grid_memory_bandwidth.cc +++ b/benchmarks/Grid_memory_bandwidth.cc @@ -27,8 +27,8 @@ int main (int argc, char ** argv) for(int lat=4;lat<=32;lat+=4){ - std::vector<int> latt_size ({lat,lat,lat,lat}); - + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); //GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -47,8 +47,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double flops=lat*lat*lat*lat*Nvec*2;// mul,add - double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real); + double flops=vol*Nvec*2;// mul,add + double bytes=3*vol*Nvec*sizeof(Real); std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl; } @@ -61,8 +61,8 @@ int main (int argc, char ** argv) for(int lat=4;lat<=32;lat+=4){ - std::vector<int> latt_size ({lat,lat,lat,lat}); - + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); //GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -79,8 +79,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double flops=lat*lat*lat*lat*Nvec*2;// mul,add - double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real); + double flops=vol*Nvec*2;// mul,add + double bytes=3*vol*Nvec*sizeof(Real); std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl; } @@ -92,7 +92,8 @@ int main (int argc, char ** argv) for(int lat=4;lat<=32;lat+=4){ - std::vector<int> latt_size ({lat,lat,lat,lat}); + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); @@ -111,8 +112,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=2*lat*lat*lat*lat*Nvec*sizeof(Real); - double flops=lat*lat*lat*lat*Nvec*1;// mul + double bytes=2*vol*Nvec*sizeof(Real); + double flops=vol*Nvec*1;// mul std::cout <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl; } @@ -125,8 +126,8 @@ int main (int argc, char ** argv) for(int lat=4;lat<=32;lat+=4){ - std::vector<int> latt_size ({lat,lat,lat,lat}); - + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); //GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -144,8 +145,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=lat*lat*lat*lat*Nvec*sizeof(Real); - double flops=lat*lat*lat*lat*Nvec*2;// mul,add + double bytes=vol*Nvec*sizeof(Real); + double flops=vol*Nvec*2;// mul,add std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl; } diff --git a/benchmarks/Grid_su3.cc b/benchmarks/Grid_su3.cc index d64e8706..903ea348 100644 --- a/benchmarks/Grid_su3.cc +++ b/benchmarks/Grid_su3.cc @@ -24,8 +24,8 @@ int main (int argc, char ** argv) for(int lat=2;lat<=24;lat+=2){ - std::vector<int> latt_size ({lat,lat,lat,lat}); - + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -40,9 +40,9 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double footprint=2.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(6.0+8.0+8.0)*lat*lat*lat*lat; + double bytes=3.0*vol*Nc*Nc*sizeof(Complex); + double footprint=2.0*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(6.0+8.0+8.0)*vol; std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; } @@ -56,7 +56,8 @@ int main (int argc, char ** argv) for(int lat=2;lat<=24;lat+=2){ - std::vector<int> latt_size ({lat,lat,lat,lat}); + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -72,8 +73,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat; + double bytes=3*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(6+8+8)*vol; std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; } @@ -86,7 +87,8 @@ int main (int argc, char ** argv) for(int lat=2;lat<=24;lat+=2){ - std::vector<int> latt_size ({lat,lat,lat,lat}); + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -102,8 +104,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat; + double bytes=3*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(6+8+8)*vol; std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; } @@ -116,7 +118,8 @@ int main (int argc, char ** argv) for(int lat=2;lat<=24;lat+=2){ - std::vector<int> latt_size ({lat,lat,lat,lat}); + std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -132,8 +135,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(8+8+8)*lat*lat*lat*lat; + double bytes=3*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(8+8+8)*vol; std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; } diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc index 7fce4793..f72393cb 100644 --- a/lib/Grid_init.cc +++ b/lib/Grid_init.cc @@ -54,7 +54,6 @@ namespace Grid { const std::vector<int> &GridDefaultLatt(void) {return Grid_default_latt;}; const std::vector<int> &GridDefaultMpi(void) {return Grid_default_mpi;}; - //////////////////////////////////////////////////////////// // Command line parsing assist for stock controls //////////////////////////////////////////////////////////// diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h index 97e66b2c..c369fe1c 100644 --- a/lib/cshift/Grid_cshift_common.h +++ b/lib/cshift/Grid_cshift_common.h @@ -27,9 +27,11 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator< int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){ - for(int b=0;b<rhs._grid->_slice_block[dimension];b++){ + for(int n=0;n<e1;n++){ + for(int b=0;b<e2;b++){ int o = n*rhs._grid->_slice_stride[dimension]; int bo = n*rhs._grid->_slice_block[dimension]; int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup @@ -54,10 +56,12 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_ } int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - + + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){ - for(int b=0;b<rhs._grid->_slice_block[dimension];b++){ + for(int n=0;n<e1;n++){ + for(int b=0;b<e2;b++){ int o=n*rhs._grid->_slice_stride[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension]; @@ -103,9 +107,11 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){ - for(int b=0;b<rhs._grid->_slice_block[dimension];b++){ + for(int n=0;n<e1;n++){ + for(int b=0;b<e2;b++){ int o =n*rhs._grid->_slice_stride[dimension]; int bo =n*rhs._grid->_slice_block[dimension]; int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup @@ -129,10 +135,11 @@ PARALLEL_NESTED_LOOP2 int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){ - for(int b=0;b<rhs._grid->_slice_block[dimension];b++){ - + for(int n=0;n<e1;n++){ + for(int b=0;b<e2;b++){ int o = n*rhs._grid->_slice_stride[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension]; int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); @@ -156,10 +163,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane - + + int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){ - for(int b=0;b<rhs._grid->_slice_block[dimension];b++){ + for(int n=0;n<e1;n++){ + for(int b=0;b<e2;b++){ int o =n*rhs._grid->_slice_stride[dimension]+b; int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); @@ -185,10 +194,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane - + + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block [dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){ - for(int b=0;b<rhs._grid->_slice_block [dimension];b++){ + for(int n=0;n<e1;n++){ + for(int b=0;b<e2;b++){ int o =n*rhs._grid->_slice_stride[dimension]; int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);