From 5adf2657ddeb238be1e3453509c801c5987edf43 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Sun, 10 Aug 2025 00:00:13 +0100
Subject: [PATCH] Updated to compile and run fast on CUDA

---
 Grid/algorithms/blas/BatchedBlas.h            |   1 +
 Grid/algorithms/blas/MomentumProject.h        |  18 +
 Grid/lattice/Lattice_view.h                   |   2 +-
 Grid/qcd/action/fermion/WilsonCloverHelpers.h |   4 +-
 Grid/qcd/utils/A2Autils.h                     | 418 ++----------------
 Grid/qcd/utils/SUn.impl.h                     |   2 +-
 Grid/util/Lexicographic.h                     |   2 +-
 configure.ac                                  |   2 +-
 8 files changed, 60 insertions(+), 389 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index d79da095..bd01ab74 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -111,6 +111,7 @@ public:
     default:
       GRID_ASSERT(0);
     }
+    return CUBLAS_COMPUTE_32F_FAST_16F;
   }
 #endif
   // Force construct once
diff --git a/Grid/algorithms/blas/MomentumProject.h b/Grid/algorithms/blas/MomentumProject.h
index 888257af..7af7f022 100644
--- a/Grid/algorithms/blas/MomentumProject.h
+++ b/Grid/algorithms/blas/MomentumProject.h
@@ -228,6 +228,11 @@ public:
   //
   void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata)
   {
+    double t_import=0;
+    double t_export=0;
+    double t_gemm =0;
+    double t_allreduce=0;
+    t_import-=usecond();
     this->ImportVector(data);
 
     std::vector< typename Field::scalar_object > projected_planes;
@@ -243,12 +248,14 @@ public:
     acceleratorPut(Vd[0],Vh);
     acceleratorPut(Md[0],Mh);
     acceleratorPut(Pd[0],Ph);
+    t_import+=usecond();
 
     GridBLAS BLAS;
     /////////////////////////////////////////
     // P_im = VMmx . Vxi
     /////////////////////////////////////////
+    t_gemm-=usecond();
     BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                      words*nt,nmom,nxyz,
                      scalar(1.0),
                      Vd,
                      Md,
                      scalar(0.0), // wipe out result
                      Pd);
     BLAS.synchronise();
+    t_gemm+=usecond();
 
+    t_export-=usecond();
     ExportMomentumProjection(projected_planes); // resizes
+    t_export+=usecond();
 
     /////////////////////////////////
     // Reduce across MPI ranks
@@ -275,7 +285,15 @@ public:
       int st = grid->LocalStarts()[nd-1];
       projected_gdata[t+st + gt*m] = projected_planes[t+lt*m];
     }}
+    t_allreduce-=usecond();
     grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words);
+    t_allreduce+=usecond();
+
+    std::cout << GridLogPerformance<<" MomentumProject t_import    "<<t_import<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<" MomentumProject t_gemm      "<<t_gemm<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<" MomentumProject t_export    "<<t_export<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<" MomentumProject t_allreduce "<<t_allreduce<<" us"<<std::endl;
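Note on the instrumentation added above: Grid times a region by subtracting
usecond() on entry and adding it on exit, so each counter ends up holding the
elapsed wall-clock microseconds, and the same counter can bracket several
disjoint regions and report their summed time. A minimal stand-alone sketch of
the idiom (usecond_like and the timed region are invented stand-ins, not Grid
code):

    #include <chrono>
    #include <cstdio>

    // Invented stand-in for Grid's usecond(): wall-clock time in microseconds.
    static double usecond_like(void) {
      using namespace std::chrono;
      return (double)duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    int main(void) {
      double t_gemm = 0;
      t_gemm -= usecond_like();  // entry: subtract the start stamp
      // ... timed region runs here ...
      t_gemm += usecond_like();  // exit: add the end stamp; the difference remains
      printf("t_gemm %.1f us\n", t_gemm);
      return 0;
    }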
diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h
 [one-line change per the diffstat above; hunk lost in extraction]
diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ [first hunk header lost in extraction] @@
 [first triangle_elem overload: the same GRID_ASSERT -> assert change as in
  the hunk below; only its trailing context survived]
     } else { // i > j
@@ -232,7 +232,7 @@ public:
 #else
   template<class vobj> static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
-    GRID_ASSERT(i != j);
+    assert(i != j);
     if(i < j) {
       return triangle()(block)(triangle_index(i, j));
     } else { // i > j
diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h
index ed6023ce..8998126c 100644
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -62,7 +62,7 @@ public:
                           const FermionField *rhs_vj,
                           std::vector<Gamma::Algebra> gammas,
                           const std::vector<ComplexField> &mom,
-                          int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+                          int orthogdim);
 
   template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5, Eigen::RowMajor>
   static void AslashField(TensorType &mat,
@@ -70,7 +70,7 @@ public:
                           const FermionField *rhs_vj,
                           const std::vector<ComplexField> &emB0,
                           const std::vector<ComplexField> &emB1,
-                          int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+                          int orthogdim);
 
   template <typename TensorType>
   typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD, 3, Eigen::RowMajor>, TensorType>::value ||
@@ -136,7 +136,7 @@ typedef iVecComplex<vComplex> vVecComplex;
 typedef Lattice<vVecComplex> LatticeVecComplex;
 #define A2A_GPU_KERNELS
-#ifdef A2A_GPU_KERNELS
+
 template <class FImpl>
 template <typename TensorType>
 void A2Autils<FImpl>::MesonField(TensorType &mat,
@@ -144,7 +144,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
                                  const FermionField *rhs_vj,
                                  std::vector<Gamma::Algebra> gammas,
                                  const std::vector<ComplexField> &mom,
-                                 int orthogdim, double *t_kernel, double *t_gsum)
+                                 int orthogdim)
 {
   const int block=A2Ablocking;
   typedef typename FImpl::SiteSpinor vobj;
@@ -173,24 +173,34 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 [setup partially lost in extraction: timer declarations, the momentum
  projector "MP" and the slice container "sliced" are introduced here, and
  one debug std::cout line is removed]
   for(int i=0;i<Lblock;i++){
   for(int j=0;j<Rblock;j++){
    [view and timer lines lost in extraction]
     accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
       auto left = conjugate(lhs_v(ss));
       auto right = rhs_v(ss);
@@ -203,48 +213,38 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
       }}
       coalescedWrite(SpinMat_v[ss],vv);
     });
+    t_kernel += usecond();
   }// j within block
 
   // After getting the sitewise product do the mom phase loop
-#if 1
 [region partially lost in extraction: a removed debug std::cout; the new
  device path takes column jj of sliced[idx] inside a loop
  for(int mu=0;mu<Ngamma;mu++), while the removed host path took column jj
  of sliced[t] per time slice t and formed:]
-        auto trSG = trace(tmp*Gamma(gammas[mu]));
-        mat(m,mu,t,i,j) = trSG()();
-      }
-      }
-      }
-      }
-#endif
+    });
+    t_gamma += usecond();
   }//jo
   }
+  std::cout << GridLogMessage<<" A2A::MesonField t_view   "<<t_view<<" us"<<std::endl;
 [t_kernel and t_gamma performance prints and the function close lost in
  extraction]
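Note on the reordering in the two MesonField hunks above: it exploits
linearity. Since trace(.) commutes with the slice/momentum sum,
sum_x trace(M(x)*G) == trace((sum_x M(x))*G), so the expensive volume
reduction is done once per (i,j) block and each gamma insertion is traced
against the already-summed 4x4 spin matrices afterwards; this is also why the
t_kernel/t_gsum out-parameters can disappear from the public signatures, with
timing logged internally instead. A self-contained toy check of that identity,
with invented 4x4 complex matrices standing in for the SpinMatrix type:

    #include <array>
    #include <cassert>
    #include <complex>

    using Cplx = std::complex<double>;
    using Mat4 = std::array<std::array<Cplx,4>,4>;

    // trace(M*G) without forming the full product matrix.
    static Cplx traceMG(const Mat4 &M, const Mat4 &G) {
      Cplx t(0.0,0.0);
      for (int a=0; a<4; a++)
        for (int b=0; b<4; b++)
          t += M[a][b] * G[b][a];
      return t;
    }

    int main(void) {
      std::array<Mat4,8> M{};  // eight "sites" on one time slice (invented data)
      Mat4 G{};                // one "gamma" insertion (invented data)
      for (int a=0;a<4;a++) for (int b=0;b<4;b++) {
        G[a][b] = Cplx(a==(b+1)%4, a==b);
        for (int x=0;x<8;x++) M[x][a][b] = Cplx(0.1*x+a, 0.01*b);
      }
      // Trace per site, then sum ...
      Cplx lhs(0.0,0.0);
      Mat4 S{};
      for (int x=0;x<8;x++) {
        lhs += traceMG(M[x],G);
        for (int a=0;a<4;a++) for (int b=0;b<4;b++) S[a][b] += M[x][a][b];
      }
      // ... equals: sum over sites first, a single trace at the end.
      Cplx rhs = traceMG(S,G);
      assert(std::abs(lhs-rhs) < 1e-9);
      return 0;
    }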
@@ [hunk header lost in extraction] @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
                           const FermionField *rhs_vj,
                           const std::vector<ComplexField> &emB0,
                           const std::vector<ComplexField> &emB1,
-                          int orthogdim, double *t_kernel, double *t_gsum)
+                          int orthogdim)
 {
   const int block=A2Ablocking;
   typedef typename FImpl::SiteSpinor vobj;
@@ -355,354 +355,6 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
     }
   }
 }
-#else
-template <class FImpl>
-template <typename TensorType>
-void A2Autils<FImpl>::MesonField(TensorType &mat,
-                                 const FermionField *lhs_wi,
-                                 const FermionField *rhs_vj,
-                                 std::vector<Gamma::Algebra> gammas,
-                                 const std::vector<ComplexField> &mom,
-                                 int orthogdim, double *t_kernel, double *t_gsum)
-{
-  typedef typename FImpl::SiteSpinor vobj;
-
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  typedef iSpinMatrix<vector_type> SpinMatrix_v;
-  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-
-  int Lblock = mat.dimension(3);
-  int Rblock = mat.dimension(4);
-
-  GridBase *grid = lhs_wi[0].Grid();
-
-  const int Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  int Nt = grid->GlobalDimensions()[orthogdim];
-  int Ngamma = gammas.size();
-  int Nmom = mom.size();
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  // will locally sum vectors first
-  // sum across these down to scalars
-  // splitting the SIMD
-  int MFrvol = rd*Lblock*Rblock*Nmom;
-  int MFlvol = ld*Lblock*Rblock*Nmom;
-
-  std::vector<SpinMatrix_v> lvSum(MFrvol);
-  for(int r=0;r<MFrvol;r++) lvSum[r] = Zero();
-
-  std::vector<SpinMatrix_s> lsSum(MFlvol);
-  for(int r=0;r<MFlvol;r++) lsSum[r] = scalar_type(0.0);
-
-  int e1= grid->_slice_nblock[orthogdim];
-  int e2= grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  // potentially wasting cores here if local time extent too small
-  if (t_kernel) *t_kernel = -usecond();
-  for(int r=0;r<rd;r++){
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
-
-    for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-    [sitewise spin-matrix product of conjugate(lhs_wi[i]) with rhs_vj[j] and
-     its mom[m]-weighted accumulation into lvSum lost in extraction]
-    }}
-  }
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  for(int rt=0;rt<rd;rt++){
-
-    Coordinate icoor(Nd);
-    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
-
-    for(int i=0;i<Lblock;i++){
-    for(int j=0;j<Rblock;j++){
-    for(int m=0;m<Nmom;m++){
-
-      int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
-      extract(lvSum[ij_rdx],extracted);
-
-      for(int idx=0;idx<Nsimd;idx++){
-
-        grid->iCoorFromIindex(icoor,idx);
-
-        int ldx = rt+icoor[orthogdim]*rd;
-
-        int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
-
-        lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-
-      }
-    }}}
-  }
-  if (t_kernel) *t_kernel += usecond();
-  GRID_ASSERT(mat.dimension(0) == Nmom);
-  GRID_ASSERT(mat.dimension(1) == Ngamma);
-  GRID_ASSERT(mat.dimension(2) == Nt);
-
-  // ld loop and local only??
-  int pd = grid->_processors[orthogdim];
-  int pc = grid->_processor_coor[orthogdim];
-  thread_for_collapse(2,lt,ld,{
-    for(int pt=0;pt<pd;pt++){
-    [per-time-slice assembly of mat(m,mu,t,i,j) from lsSum, tracing against
-     Gamma(gammas[mu]), lost in extraction]
-    }
-  });
-  if (t_gsum) *t_gsum = -usecond();
-  grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-  if (t_gsum) *t_gsum += usecond();
-}
-
-template <class FImpl>
-template <typename TensorType>
-void A2Autils<FImpl>::AslashField(TensorType &mat,
-                                  const FermionField *lhs_wi,
-                                  const FermionField *rhs_vj,
-                                  const std::vector<ComplexField> &emB0,
-                                  const std::vector<ComplexField> &emB1,
-                                  int orthogdim, double *t_kernel, double *t_gsum)
-{
-  typedef typename FermionField::vector_object vobj;
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  typedef iSpinMatrix<vector_type> SpinMatrix_v;
-  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-  typedef iSinglet<vector_type> Singlet_v;
-  typedef iSinglet<scalar_type> Singlet_s;
-
-  int Lblock = mat.dimension(3);
-  int Rblock = mat.dimension(4);
-
-  GridBase *grid = lhs_wi[0].Grid();
-
-  const int Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  int Nt = grid->GlobalDimensions()[orthogdim];
-  int Nem = emB0.size();
-  GRID_ASSERT(emB1.size() == Nem);
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  // will locally sum vectors first
-  // sum across these down to scalars
-  // splitting the SIMD
-  int MFrvol = rd*Lblock*Rblock*Nem;
-  int MFlvol = ld*Lblock*Rblock*Nem;
-
-  std::vector<vector_type> lvSum(MFrvol);
-  thread_for(r,MFrvol,
-  {
-    lvSum[r] = Zero();
-  });
-
-  std::vector<scalar_type> lsSum(MFlvol);
-  thread_for(r,MFlvol,
-  {
-    lsSum[r] = scalar_type(0.0);
-  });
-
-  int e1= grid->_slice_nblock[orthogdim];
-  int e2= grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  // Nested parallelism would be ok
-  // Wasting cores here. Test case r
-  if (t_kernel) *t_kernel = -usecond();
-  for(int r=0;r<rd;r++){
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
-
-    for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-    [sitewise product contracted against the A-slash insertion built from
-     emB0[m] and emB1[m], accumulated into lvSum; lost in extraction]
-    }}
-  }
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  thread_for(rt,rd,
-  {
-    Coordinate icoor(Nd);
-    ExtractBuffer<scalar_type> extracted(Nsimd);
-
-    for(int i=0;i<Lblock;i++)
-    for(int j=0;j<Rblock;j++)
-    for(int m=0;m<Nem;m++)
-    {
-      int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
-
-      extract(lvSum[ij_rdx],extracted);
-      for(int idx=0;idx<Nsimd;idx++)
-      {
-        grid->iCoorFromIindex(icoor,idx);
-
-        int ldx = rt+icoor[orthogdim]*rd;
-        int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
-
-        lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-      }
-    }
-  });
-  if (t_kernel) *t_kernel += usecond();
-
-  // ld loop and local only??
-  int pd = grid->_processors[orthogdim];
-  int pc = grid->_processor_coor[orthogdim];
-  thread_for_collapse(2,lt,ld,
-  {
-    for(int pt=0;pt<pd;pt++){
-    [per-time-slice assembly of mat from lsSum lost in extraction]
-    }
-  });
-
-  if (t_gsum) *t_gsum = -usecond();
-  grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);
-  if (t_gsum) *t_gsum += usecond();
-}
-#endif
 
 ////////////////////////////////////////////
 // Schematic thoughts about more generalised four quark insertion
 //
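Note on the host-side path deleted above: it unpacked SIMD lanes by hand.
With the reduced extent rd = ld / (number of lanes along orthogdim), the
reduced time rt and the lane's coordinate recover the local time slice as
ldx = rt + icoor[orthogdim]*rd, which is the formula visible in both deleted
routines. A toy model of that index map (extents invented), verifying that
every local slice is produced exactly once:

    #include <cassert>

    int main(void) {
      const int ld = 8;           // local time extent (invented)
      const int simd_extent = 2;  // SIMD lanes along this direction (invented)
      const int rd = ld / simd_extent;
      bool seen[8] = {false};
      for (int rt = 0; rt < rd; rt++) {
        for (int lane_coor = 0; lane_coor < simd_extent; lane_coor++) {
          int ldx = rt + lane_coor * rd;  // the map used by the deleted code
          assert(ldx >= 0 && ldx < ld && !seen[ldx]);
          seen[ldx] = true;
        }
      }
      return 0;  // every local slice was hit exactly once
    }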
diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h
index 3ca3a9a6..bcccc7c4 100644
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@@ -119,7 +119,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
 // Map a su2 subgroup number to the pair of rows that are non zero
 ////////////////////////////////////////////////////////////////////////
 static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
-  GRID_ASSERT((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
+  assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
 
   int spare = su2_index;
   for (i1 = 0; spare >= (ncolour - 1 - i1); i1++) {
diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index bfe9afcf..d494760f 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -31,7 +31,7 @@ namespace Grid{
     static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
       int64_t index64;
       IndexFromCoor(coor,index64,dims);
-      GRID_ASSERT(index64<2*1024*1024*1024LL);
+      assert(index64<2*1024*1024*1024LL);
       index = (int) index64;
     }
diff --git a/configure.ac b/configure.ac
index 84e8f46e..8944ffdc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -292,7 +292,7 @@ AC_ARG_ENABLE([accelerator],
 case ${ac_ACCELERATOR} in
   cuda)
     echo CUDA acceleration
-    LIBS="${LIBS} -lcuda"
+    LIBS="${LIBS} -lcuda -lcublas"
     AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
   sycl)
     echo SYCL acceleration
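Note on the configure.ac change: -lcublas is now required because
BatchedBlas.h calls cuBLAS directly (see the CUBLAS_COMPUTE_32F_FAST_16F
fallback in the first hunk), so CUDA builds must link libcublas as well as
libcuda. A minimal link check, assuming an installed CUDA toolkit (build with
e.g. nvcc check.cc -lcublas):

    #include <cublas_v2.h>
    #include <cstdio>

    int main(void) {
      cublasHandle_t handle;
      // cublasCreate is resolved from libcublas; without -lcublas this
      // fails at link time with an undefined reference.
      if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate failed\n");
        return 1;
      }
      cublasDestroy(handle);
      return 0;
    }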