diff --git a/benchmarks/Grid_memory_bandwidth.cc b/benchmarks/Grid_memory_bandwidth.cc index fc959c38..5abdc6a3 100644 --- a/benchmarks/Grid_memory_bandwidth.cc +++ b/benchmarks/Grid_memory_bandwidth.cc @@ -27,8 +27,8 @@ int main (int argc, char ** argv) for(int lat=4;lat<=32;lat+=4){ - std::vector latt_size ({lat,lat,lat,lat}); - + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); //GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -47,8 +47,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double flops=lat*lat*lat*lat*Nvec*2;// mul,add - double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real); + double flops=vol*Nvec*2;// mul,add + double bytes=3*vol*Nvec*sizeof(Real); std::cout< latt_size ({lat,lat,lat,lat}); - + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); //GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -79,8 +79,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double flops=lat*lat*lat*lat*Nvec*2;// mul,add - double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real); + double flops=vol*Nvec*2;// mul,add + double bytes=3*vol*Nvec*sizeof(Real); std::cout< latt_size ({lat,lat,lat,lat}); + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); @@ -111,8 +112,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=2*lat*lat*lat*lat*Nvec*sizeof(Real); - double flops=lat*lat*lat*lat*Nvec*1;// mul + double bytes=2*vol*Nvec*sizeof(Real); + double flops=vol*Nvec*1;// mul std::cout < latt_size ({lat,lat,lat,lat}); - + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); //GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -144,8 +145,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=lat*lat*lat*lat*Nvec*sizeof(Real); - double flops=lat*lat*lat*lat*Nvec*2;// mul,add + double bytes=vol*Nvec*sizeof(Real); + double flops=vol*Nvec*2;// mul,add std::cout< latt_size ({lat,lat,lat,lat}); - + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -40,9 +40,9 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double footprint=2.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(6.0+8.0+8.0)*lat*lat*lat*lat; + double bytes=3.0*vol*Nc*Nc*sizeof(Complex); + double footprint=2.0*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(6.0+8.0+8.0)*vol; std::cout< latt_size ({lat,lat,lat,lat}); + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -72,8 +73,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat; + double bytes=3*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(6+8+8)*vol; std::cout< latt_size ({lat,lat,lat,lat}); + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -102,8 +104,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat; + double bytes=3*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(6+8+8)*vol; std::cout< latt_size ({lat,lat,lat,lat}); + std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice(); @@ -132,8 +135,8 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000.0; - double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex); - double flops=Nc*Nc*(8+8+8)*lat*lat*lat*lat; + double bytes=3*vol*Nc*Nc*sizeof(Complex); + double flops=Nc*Nc*(8+8+8)*vol; std::cout< &GridDefaultLatt(void) {return Grid_default_latt;}; const std::vector &GridDefaultMpi(void) {return Grid_default_mpi;}; - //////////////////////////////////////////////////////////// // Command line parsing assist for stock controls //////////////////////////////////////////////////////////// diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h index 97e66b2c..c369fe1c 100644 --- a/lib/cshift/Grid_cshift_common.h +++ b/lib/cshift/Grid_cshift_common.h @@ -27,9 +27,11 @@ Gather_plane_simple (const Lattice &rhs,std::vector_ostride[dimension]; // base offset for start of plane + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ + for(int n=0;n_slice_stride[dimension]; int bo = n*rhs._grid->_slice_block[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup @@ -54,10 +56,12 @@ Gather_plane_extract(const Lattice &rhs,std::vector_ostride[dimension]; // base offset for start of plane - + + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ + for(int n=0;n_slice_stride[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension]; @@ -103,9 +107,11 @@ template void Scatter_plane_simple (Lattice &rhs,std::vector_ostride[dimension]; // base offset for start of plane + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ + for(int n=0;n_slice_stride[dimension]; int bo =n*rhs._grid->_slice_block[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup @@ -129,10 +135,11 @@ PARALLEL_NESTED_LOOP2 int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - + for(int n=0;n_slice_stride[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension]; int ocb=1<CheckerBoardFromOindex(o+b); @@ -156,10 +163,12 @@ template void Copy_plane(Lattice& lhs,Lattice &rhs, int int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane - + + int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc + int e2=rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ + for(int n=0;n_slice_stride[dimension]+b; int ocb=1<CheckerBoardFromOindex(o); @@ -185,10 +194,12 @@ template void Copy_plane_permute(Lattice& lhs,Lattice &r int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane - + + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block [dimension]; PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block [dimension];b++){ + for(int n=0;n_slice_stride[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);