// NOTE(review): this copy of the file looks damaged by text extraction. Line
// breaks have been collapsed, so every original `//` comment now swallows the
// remainder of its physical line, and text between angle brackets appears to be
// missing (for example `#include #include` and `std::vector L_list;`). Restore
// the file from version control before building or changing it. The notes added
// here only describe what is still visible.
//
// Visible on the next line: the licence banner, the file-scope result variables
// (L_list, Ls_list, mflop_list, mflop_ref, mflop_ref_err, NN_global, FP),
// struct time_statistics, comms_header(), and a fragment that declares the
// 8-entry vectors xbuf and rbuf.
// time_statistics::statistics(v) stores the mean of v, err = sqrt(sum of squared
// deviations from the mean / (n*(n-1))) with n = v.size(), and the smallest and
// largest elements. It divides by v.size() and by v.size()*(v.size()-1) and
// dereferences the minmax_element result unconditionally, so it needs at least
// two samples for err to be well defined.
/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./benchmarks/Benchmark_usqcd.cc Copyright (C) 2015 Author: Peter Boyle Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #include #include using namespace Grid; std::vector L_list; std::vector Ls_list; std::vector mflop_list; double mflop_ref; double mflop_ref_err; int NN_global; FILE * FP; struct time_statistics{ double mean; double err; double min; double max; void statistics(std::vector v){ double sum = std::accumulate(v.begin(), v.end(), 0.0); mean = sum / v.size(); std::vector diff(v.size()); std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); auto result = std::minmax_element(v.begin(), v.end()); min = *result.first; max = *result.second; } }; void comms_header(){ std::cout <1) nmu++; std::vector t_time(Nloop); time_statistics timestat; std::cout< xbuf(8); std::vector rbuf(8); //Grid.ShmBufferFreeAll(); uint64_t 
// Visible on the next line: bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD),
// then eight xbuf/rbuf pairs of that size obtained from acceleratorAllocDevice.
// Everything after the first `//` on the line (the per-direction loop, the
// memory bandwidth and BLAS benchmark fragments, and the start of the
// LatticeSU4 benchmark) is comment text in this collapsed form.
bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); for(int d=0;d<8;d++){ xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } // int ncomm; double dbytes; for(int dir=0;dir<8;dir++) { int mu =dir % 4; if (mpi_layout[mu]>1 ) { std::vector times(Nloop); for(int i=0;i > LatticeVec; typedef iVector Vec; Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); fprintf(FP,"Memory Bandwidth\n\n"); fprintf(FP,"Bytes, GB/s per node\n"); std::cout<({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=8){ Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); // NP= Grid.RankCount(); NN =Grid.NodeCount(); Vec rn ; random(sRNG,rn); LatticeVec z(&Grid); z=Zero(); LatticeVec x(&Grid); x=Zero(); LatticeVec y(&Grid); y=Zero(); double a=2.0; uint64_t Nloop=NLOOP; double start=usecond(); for(int i=0;i static void BLAS(void) { //int nbasis, int nrhs, int coarseVol int basis[] = { 16,32,64 }; int rhs[] = { 8,12,16 }; int vol = 8*8*8*8; int blk = 4*4*4*4; GridBLAS blas; int fpbits = sizeof(CComplex)*4; std::cout<(M,N,K,BATCH); fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); std::cout<(M,N,K,BATCH); fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); std::cout<(M,N,K,BATCH); fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); std::cout< > LatticeSU4; Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); std::cout<({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=8){ Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol= 
// Visible on the next line: the tail of the LatticeSU4 benchmark setup (the
// GridCartesian construction, NN taken from Grid.NodeCount(), and three
// LatticeSU4 fields set to Zero()). The rest of the line, including the
// DomainWallFermionF benchmark setup and the first Cases[] entries, follows a
// `//` and is comment text in this collapsed form.
latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); NN =Grid.NodeCount(); LatticeSU4 z(&Grid); z=Zero(); LatticeSU4 x(&Grid); x=Zero(); LatticeSU4 y(&Grid); y=Zero(); // double a=2.0; uint64_t Nloop=NLOOP; double start=usecond(); for(int i=0;i mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size /////////////////////////////////////////////////////// int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global=NN; uint64_t SHM=NP/NN; ///////// Welcome message //////////// std::cout< seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); std::cout << GridLogMessage << "Initialised RNGs" << std::endl; typedef DomainWallFermionF Action; typedef typename Action::FermionField Fermion; typedef LatticeGaugeFieldF Gauge; ///////// Source preparation //////////// Gauge Umu(UGrid); SU::HotConfiguration(RNG4,Umu); Fermion src (FGrid); random(RNG5,src); Fermion src_e (FrbGrid); Fermion src_o (FrbGrid); Fermion r_e (FrbGrid); Fermion r_o (FrbGrid); Fermion r_eo (FGrid); Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); { pickCheckerboard(Even,src_e,src); pickCheckerboard(Odd,src_o,src); #ifdef AVX512 const int num_cases = 3; #else const int num_cases = 2; #endif std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); controls Cases [] = { { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, { WilsonKernelsStatic::OptHandUnroll, 
// Visible on the next line: the remaining WilsonKernelsStatic entries of the
// Cases[] table and the timing preamble, where ncall is set to 500 and passed
// to FGrid->Broadcast(0, ...). The ImprovedStaggeredFermionF benchmark setup
// follows a `//` and is comment text in this collapsed form.
WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } }; for(int c=0;cBarrier(); for(int i=0;iBarrier(); double t1=usecond(); uint64_t ncall = 500; FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); for(uint64_t i=0;iBarrier(); double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflops mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size /////////////////////////////////////////////////////// int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global=NN; uint64_t SHM=NP/NN; ///////// Welcome message //////////// std::cout< seeds4({1,2,3,4}); GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); std::cout << GridLogMessage << "Initialised RNGs" << std::endl; RealD mass=0.1; RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; typedef ImprovedStaggeredFermionF Action; typedef typename Action::FermionField Fermion; typedef LatticeGaugeFieldF Gauge; Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); typename Action::ImplParams params; Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); ///////// Source preparation //////////// Fermion src (FGrid); random(RNG4,src); Fermion src_e (FrbGrid); Fermion src_o (FrbGrid); Fermion r_e (FrbGrid); Fermion r_o (FrbGrid); Fermion r_eo (FGrid); { pickCheckerboard(Even,src_e,src); pickCheckerboard(Odd,src_o,src); const int num_cases = 2; std::string 
// Visible on the next line: the fmt label string, the three-entry
// StaggeredKernelsStatic Cases[] table, and the same ncall = 500 broadcast.
// The WilsonCloverFermionF benchmark setup follows a `//` and is comment text
// in this collapsed form. The line ends inside the next fmt string literal.
fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); controls Cases [] = { { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, { StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, { StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } }; for(int c=0;cBarrier(); for(int i=0;iBarrier(); double t1=usecond(); uint64_t ncall = 500; FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); for(uint64_t i=0;iBarrier(); double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflops mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size /////////////////////////////////////////////////////// int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global=NN; uint64_t SHM=NP/NN; ///////// Welcome message //////////// std::cout< seeds4({1,2,3,4}); GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); std::cout << GridLogMessage << "Initialised RNGs" << std::endl; RealD mass=0.1; RealD csw=1.0; typedef WilsonCloverFermionF Action; typedef typename Action::FermionField Fermion; typedef LatticeGaugeFieldF Gauge; Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw); ///////// Source preparation //////////// Fermion src (FGrid); random(RNG4,src); Fermion r (FGrid); { const int num_cases = 1; std::string fmt("G/S/C ; 
// Visible on the next line: the end of the fmt label string, a single-entry
// WilsonKernelsStatic Cases[] table, the ncall = 500 broadcast, and the
// fragments that set the do_su4, do_memory, do_comms, do_blas and do_dslash
// switches and the lattice size list {8,12,16,24,32}.
// NOTE(review): these switches presumably belong to the program entry point,
// whose signature is not visible in this copy - confirm against the intact
// file. The line stops mid-statement, so that routine is incomplete here.
G/O/C ; G/S/S ; G/O/S "); controls Cases [] = { { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, }; for(int c=0;cBarrier(); for(int i=0;iBarrier(); double t1=usecond(); uint64_t ncall = 500; FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); for(uint64_t i=0;iBarrier(); double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflops({2,2,2,2}); Benchmark::Decomposition(); int do_su4=0; int do_memory=1; int do_comms =1; int do_blas =1; int do_dslash=1; int sel=4; std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector clover; std::vector dwf4; std::vector staggered; int Ls=1; if (do_dslash){ std::cout<(); Benchmark::BLAS(); } if ( do_su4 ) { std::cout<