/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./benchmarks/Benchmark_memory_bandwidth.cc Copyright (C) 2015 Author: Peter Boyle Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #include using namespace std; using namespace Grid; using namespace Grid::QCD; typedef WilsonFermion5D WilsonFermion5DR; typedef WilsonFermion5D WilsonFermion5DF; typedef WilsonFermion5D WilsonFermion5DD; std::vector L_list; std::vector Ls_list; std::vector mflop_list; double mflop_ref; double mflop_ref_err; int NN_global; struct time_statistics{ double mean; double err; double min; double max; void statistics(std::vector v){ double sum = std::accumulate(v.begin(), v.end(), 0.0); mean = sum / v.size(); std::vector diff(v.size()); std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); auto result = std::minmax_element(v.begin(), v.end()); min = *result.first; max = *result.second; } }; void comms_header(){ std::cout < simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); for(int mu=0;mu1) nmu++; std::vector t_time(Nloop); time_statistics timestat; std::cout< latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], lat*mpi_layout[2], lat*mpi_layout[3]}); GridCartesian Grid(latt_size,simd_layout,mpi_layout); RealD Nrank = Grid._Nprocessors; RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; std::vector xbuf(8); std::vector rbuf(8); Grid.ShmBufferFreeAll(); for(int d=0;d<8;d++){ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int ncomm; double dbytes; std::vector times(Nloop); for(int i=0;i1 ) { int xmit_to_rank; int recv_from_rank; if ( dir == mu ) { int comm_proc=1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } else { int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], recv_from_rank, bytes,dir); #ifdef GRID_OMP #pragma omp atomic ncomm++; #ifdef GRID_OMP #pragma omp atomic #endif dbytes+=tbytes; } } Grid.Barrier(); double stop=usecond(); t_time[i] = stop-start; // microseconds } timestat.statistics(t_time); // for(int i=0;i > LatticeVec; typedef iVector Vec; std::vector simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); std::cout<({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=4){ std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); Vec rn ; random(sRNG,rn); LatticeVec z(&Grid); z=rn; LatticeVec x(&Grid); x=rn; LatticeVec y(&Grid); y=rn; double a=2.0; uint64_t Nloop=NLOOP; double start=usecond(); for(int i=0;i mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size /////////////////////////////////////////////////////// int threads = GridThread::GetThreads(); std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4); std::vector local({L,L,L,L}); GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global=NN; uint64_t SHM=NP/NN; std::vector internal; if ( SHM == 1 ) internal = std::vector({1,1,1,1}); else if ( SHM == 2 ) internal = std::vector({2,1,1,1}); else if ( SHM == 4 ) internal = std::vector({2,2,1,1}); else if ( SHM == 8 ) internal = std::vector({2,2,2,1}); else assert(0); std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); ///////// Welcome message //////////// std::cout< seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5); std::cout << GridLogMessage << "Initialised RNGs" << std::endl; ///////// Source preparation //////////// LatticeFermion src (sFGrid); random(RNG5,src); LatticeFermion tmp (sFGrid); RealD N2 = 1.0/::sqrt(norm2(src)); src = src*N2; LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); LatticeFermion src_e (sFrbGrid); LatticeFermion src_o (sFrbGrid); LatticeFermion r_e (sFrbGrid); LatticeFermion r_o (sFrbGrid); LatticeFermion r_eo (sFGrid); LatticeFermion err (sFGrid); { pickCheckerboard(Even,src_e,src); pickCheckerboard(Odd,src_o,src); #if defined(AVX512) const int num_cases = 6; std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); #else const int num_cases = 4; std::string fmt("U/S ; U/O ; G/S ; G/O "); #endif controls Cases [] = { #ifdef AVX512 { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, #endif { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } }; for(int c=0;cBarrier(); for(int i=0;iBarrier(); double t1=usecond(); // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); // if (ncall < 500) ncall = 500; uint64_t ncall = 500; sFGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); for(uint64_t i=0;iBarrier(); double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflops mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size /////////////////////////////////////////////////////// int threads = GridThread::GetThreads(); std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4); std::vector local({L,L,L,L}); GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global=NN; uint64_t SHM=NP/NN; std::vector internal; if ( SHM == 1 ) internal = std::vector({1,1,1,1}); else if ( SHM == 2 ) internal = std::vector({2,1,1,1}); else if ( SHM == 4 ) internal = std::vector({2,2,1,1}); else if ( SHM == 8 ) internal = std::vector({2,2,2,1}); else assert(0); std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); ///////// Welcome message //////////// std::cout< seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); std::cout << GridLogMessage << "Initialised RNGs" << std::endl; ///////// Source preparation //////////// LatticeFermion src (FGrid); random(RNG5,src); LatticeFermion ref (FGrid); LatticeFermion tmp (FGrid); RealD N2 = 1.0/::sqrt(norm2(src)); src = src*N2; LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); //////////////////////////////////// // Naive wilson implementation //////////////////////////////////// { LatticeGaugeField Umu5d(FGrid); std::vector U(4,FGrid); for(int ss=0;ssoSites();ss++){ for(int s=0;s(Umu5d,mu); } for(int mu=0;muBarrier(); for(int i=0;iBarrier(); double t1=usecond(); // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); // if (ncall < 500) ncall = 500; uint64_t ncall = 1000; FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); for(uint64_t i=0;iBarrier(); double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflops({8,2,2,2}); #else LebesgueOrder::Block = std::vector({2,2,2,2}); #endif Benchmark::Decomposition(); int do_memory=1; int do_comms =1; int do_su3 =0; int do_wilson=1; int do_dwf =1; if ( do_memory ) { std::cout< L_list({8,12,16,24}); std::vector wilson; std::vector dwf4; std::vector dwf5; if ( do_wilson ) { int Ls=1; std::cout<