diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 82da2d5d..87bcbd73 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -252,13 +252,16 @@ public: #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) // Need a default/reference implementation + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -348,14 +351,19 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; + ComplexF alphaf(real(alpha),imag(alpha)); + ComplexF betaf(real(beta),imag(beta)); // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { - ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + ComplexF c_mn(0.0); + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; } } } @@ -444,14 +452,17 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { RealD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -558,14 +569,17 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { RealD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -638,43 +652,41 @@ public: for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) + for (int kk = 0; kk < k; ++kk) c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc]; } } } #endif } - void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil) + double benchmark(int M, int N, int K, int BATCH) { - int32_t N_A = nbasis*nbasis*coarseVol*nstencil; - int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time - int32_t N_C = nbasis*nrhs*coarseVol*nstencil; + int32_t N_A = M*K*BATCH; + int32_t N_B = K*N*BATCH; + int32_t N_C = M*N*BATCH; deviceVector A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); deviceVector B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); deviceVector C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); ComplexD alpha(1.0); ComplexD beta (1.0); + RealD flops = 8.0*M*N*K*BATCH; for(int i=0;i<10;i++){ RealD t0 = usecond(); - for(int s=0;s using cshiftAllocator = std::allocator; template using Vector = std::vector >; template using stencilVector = std::vector >; template using commVector = std::vector >; +template using deviceVector = std::vector >; template using cshiftVector = std::vector >; NAMESPACE_END(Grid); diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc new file mode 100644 index 00000000..526e5659 --- /dev/null +++ b/benchmarks/Benchmark_usqcd.cc @@ -0,0 +1,959 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_usqcd.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace Grid; + +std::vector L_list; +std::vector Ls_list; +std::vector mflop_list; + +double mflop_ref; +double mflop_ref_err; + +int NN_global; + +FILE * FP; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void comms_header(){ + std::cout <1) nmu++; + + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< xbuf(8); + std::vector rbuf(8); + //Grid.ShmBufferFreeAll(); + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + // int ncomm; + double dbytes; + + for(int dir=0;dir<8;dir++) { + int mu =dir % 4; + if (mpi_layout[mu]>1 ) { + + std::vector times(Nloop); + for(int i=0;i > LatticeVec; + typedef iVector Vec; + + Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + fprintf(FP,"Memory Bandwidth\n\n"); + fprintf(FP,"Bytes, GB/s per node\n"); + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + // NP= Grid.RankCount(); + NN =Grid.NodeCount(); + + Vec rn ; random(sRNG,rn); + + LatticeVec z(&Grid); z=Zero(); + LatticeVec x(&Grid); x=Zero(); + LatticeVec y(&Grid); y=Zero(); + double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i > LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + NN =Grid.NodeCount(); + + + LatticeSU4 z(&Grid); z=Zero(); + LatticeSU4 x(&Grid); x=Zero(); + LatticeSU4 y(&Grid); y=Zero(); + // double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + typedef DomainWallFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + ///////// Source preparation //////////// + Gauge Umu(UGrid); SU::HotConfiguration(RNG4,Umu); + Fermion src (FGrid); random(RNG5,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 1; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + + typedef ImprovedStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); + + typename Action::ImplParams params; + Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 1; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD csw=1.0; + + typedef WilsonCloverFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); + + Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion r (FGrid); + + { + + const int num_cases = 1; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops({8,2,2,2}); +#else + LebesgueOrder::Block = std::vector({2,2,2,2}); +#endif + Benchmark::Decomposition(); + + int do_su4=0; + int do_memory=1; + int do_comms =1; + int do_blas =1; + + int sel=4; + std::vector L_list({8,12,16,24,32}); + int selm1=sel-1; + + std::vector clover; + std::vector dwf4; + std::vector staggered; + + int Ls=1; + std::cout<