From 5cc4f3241d38802e0fc90a1d40591cb2033ad28b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 18 Oct 2024 15:42:30 +0000 Subject: [PATCH] Meson field test --- Grid/lattice/Lattice_reduction.h | 6 + Grid/qcd/utils/A2Autils.h | 201 ++++++++++++++++++++++++------- tests/Test_meson_field.cc | 60 +++++---- 3 files changed, 203 insertions(+), 64 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index c7da2945..837e3bea 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -522,11 +522,14 @@ template inline void sliceSum(const Lattice &Data, int ostride=grid->_ostride[orthogdim]; //Reduce Data down to lvSum + RealD t_sum =-usecond(); sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); + t_sum +=usecond(); // Sum across simd lanes in the plane, breaking out orthog dir. Coordinate icoor(Nd); + RealD t_rest =-usecond(); for(int rt=0;rt inline void sliceSum(const Lattice &Data, scalar_type * ptr = (scalar_type *) &result[0]; int words = fd*sizeof(sobj)/sizeof(scalar_type); grid->GlobalSumVector(ptr, words); + t_rest +=usecond(); + std::cout << GridLogMessage << " sliceSum local"< inline std::vector diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index a81ebe6c..70eaf0ab 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -6,6 +6,34 @@ NAMESPACE_BEGIN(Grid); #undef DELTA_F_EQ_2 +/////////////////////////////////////////////////////////////////// +//Meson +// Interested in +// +// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] +// +// Conventional meson field: +// +// = sum_x,y Trace[ sum_j G |v_j(y,ty)> +// = sum_ij PI_ji(tx) PI_ij(ty) +// +// G5-Hermiticity +// +// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] +// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ] +// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> +// = sum_ij PionVV(ty) PionWW(tx) +// +// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker +// expectation value. Otherwise biased. +//////////////////////////////////////////////////////////////////// + template class A2Autils { @@ -26,7 +54,9 @@ public: typedef iSpinColourMatrix SpinColourMatrix_v; - template // output: rank 5 tensor, e.g. Eigen::Tensor + + // output: rank 5 tensor, e.g. Eigen::Tensor + template static void MesonField(TensorType &mat, const FermionField *lhs_wi, const FermionField *rhs_vj, @@ -34,6 +64,14 @@ public: const std::vector &mom, int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); + template + static void MesonFieldGPU(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); + /* static void PionFieldWVmom(Eigen::Tensor &mat, const FermionField *wi, const FermionField *vj, @@ -58,7 +96,8 @@ public: const FermionField *vi, const FermionField *vj, int orthogdim); - + */ + template // output: rank 5 tensor, e.g. 
Eigen::Tensor static void AslashField(TensorType &mat, const FermionField *lhs_wi, @@ -159,14 +198,14 @@ void A2Autils::MesonField(TensorType &mat, int MFlvol = ld*Lblock*Rblock*Nmom; std::vector lvSum(MFrvol); - thread_for( r, MFrvol,{ + for(int r=0;r lsSum(MFlvol); - thread_for(r,MFlvol,{ + for(int r=0;r_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; @@ -174,7 +213,7 @@ void A2Autils::MesonField(TensorType &mat, // potentially wasting cores here if local time extent too small if (t_kernel) *t_kernel = -usecond(); - thread_for(r,rd,{ + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -213,10 +252,10 @@ void A2Autils::MesonField(TensorType &mat, } } } - }); + }; // Sum across simd lanes in the plane, breaking out orthog dir. - thread_for(rt,rd,{ + for(int rt=0;rt extracted(Nsimd); @@ -241,7 +280,7 @@ void A2Autils::MesonField(TensorType &mat, } }}} - }); + } if (t_kernel) *t_kernel += usecond(); assert(mat.dimension(0) == Nmom); assert(mat.dimension(1) == Ngamma); @@ -290,35 +329,115 @@ void A2Autils::MesonField(TensorType &mat, if (t_gsum) *t_gsum += usecond(); } +const int A2Ablocking=8; +template using iVecSpinMatrix = iVector, Ns>, A2Ablocking>; +typedef iVecSpinMatrix VecSpinMatrix; +typedef iVecSpinMatrix vVecSpinMatrix; +typedef Lattice LatticeVecSpinMatrix; -/////////////////////////////////////////////////////////////////// -//Meson -// Interested in -// -// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] -// -// Conventional meson field: -// -// = sum_x,y Trace[ sum_j G |v_j(y,ty)> -// = sum_ij PI_ji(tx) PI_ij(ty) -// -// G5-Hermiticity -// -// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] -// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ] -// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> -// = sum_ij PionVV(ty) PionWW(tx) -// -// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker -// expectation value. Otherwise biased. 
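Note (illustrative, not part of the patch): the comment above defines the meson-field element PI_ij(t) = sum_x w_i^dag(x,t) Gamma v_j(x,t), optionally weighted by a momentum phase exp(i p.x), which is what one entry mat(p, Gamma, t, i, j) of the rank-5 output tensor contains. A minimal scalar reference sketch of that single contraction is given below; the names (SliceField, refMesonField, Ns, Nc, phase) are hypothetical and not Grid identifiers, and the real MesonField kernel works on SIMD-vectorised Grid lattices, loops over all gammas/momenta at once, and global-sums over nodes.

// Naive, single-threaded reference for one (momentum, gamma, t, i, j) entry.
#include <array>
#include <complex>
#include <cstddef>
#include <vector>

using Complex = std::complex<double>;
constexpr int Ns = 4;   // spin components
constexpr int Nc = 3;   // colour components

// One fermion field restricted to a single timeslice: value[x][spin][colour].
using SliceField = std::vector<std::array<std::array<Complex, Nc>, Ns>>;

// PI_ij(t; p, Gamma) = sum_x  e^{i p.x}  w_i^dag(x,t)  Gamma  v_j(x,t)
Complex refMesonField(const SliceField &w_i,
                      const SliceField &v_j,
                      const std::vector<Complex> &phase,   // e^{i p.x} per site of the slice
                      const Complex (&Gamma)[Ns][Ns])      // dense spin matrix insertion
{
  Complex sum = 0.0;
  for (std::size_t x = 0; x < w_i.size(); ++x) {
    Complex site = 0.0;
    for (int s1 = 0; s1 < Ns; ++s1)
      for (int s2 = 0; s2 < Ns; ++s2)
        for (int c = 0; c < Nc; ++c)
          site += std::conj(w_i[x][s1][c]) * Gamma[s1][s2] * v_j[x][s2][c];
    sum += phase[x] * site;   // momentum projection on this timeslice
  }
  return sum;   // one element mat(p, Gamma, t, i, j) of the rank-5 output
}

The production kernel obtains the same number by first forming the colour-traced spin outer product per site, slice-summing it, and only then tracing against each Gamma, which is why a single pass serves all gamma insertions.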
-//////////////////////////////////////////////////////////////////// +template +template +void A2Autils::MesonFieldGPU(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecSpinMatrix MomSpinMat(grid); + + RealD t_afor = 0.0; + RealD t_sum = 0.0; + RealD t_pha = 0.0; + RealD t_trace= 0.0; + uint64_t ncall=0; + + std::vector sliced; + for(int i=0;i void A2Autils::PionFieldXX(Eigen::Tensor &mat, const FermionField *wi, @@ -645,6 +764,7 @@ void A2Autils::PionFieldVV(Eigen::Tensor &mat, const int nog5=0; PionFieldXX(mat,vi,vj,orthogdim,nog5); } +*/ // "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) // @@ -992,9 +1112,9 @@ typename std::enable_if<(std::is_same, TensorType>::va std::is_same>, TensorType>::value), void>::type A2Autils::ContractWWVV(std::vector &WWVV, - const TensorType &WW_sd, - const FermionField *vs, - const FermionField *vd) + const TensorType &WW_sd, + const FermionField *vs, + const FermionField *vd) { GridBase *grid = vs[0].Grid(); @@ -1062,7 +1182,6 @@ A2Autils::ContractWWVV(std::vector &WWVV, } for (int t = 0; t < N_t; t++){ - std::cout << GridLogMessage << "Contraction t = " << t << std::endl; buf = WW_sd[t]; thread_for(ss,grid->oSites(),{ for(int d_o=0;d_o phi(VDIM,&grid); - std::vector rho(VDIM,&grid); - FermionField rho_tmp(&grid); std::cout << GridLogMessage << "Initialising random meson fields" << std::endl; for (unsigned int i = 0; i < VDIM; ++i){ random(pRNG,phi[i]); - random(pRNG,rho_tmp); //ideally only nonzero on t=0 - rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0 } std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl; @@ -82,7 +78,7 @@ int main(int argc, char *argv[]) {1.,1.,1.}, {2.,0.,0.} }; - + // 5 momenta x VDIMxVDIM = 125 calls (x 16 spins) 1.4s => 1400/125 ~10ms per call std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl; std::cout << GridLogMessage << "Computing complex phases" << std::endl; @@ -102,28 +98,47 @@ int main(int argc, char *argv[]) std::cout << GridLogMessage << "Computing complex phases done." 
 << std::endl;

   Eigen::Tensor Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
-  Eigen::Tensor Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
-  Eigen::Tensor Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
+  Eigen::Tensor Mpp_gpu(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);

   // timer
   double start,stop;

   //execute meson field routine
+  std::cout << GridLogMessage << "Meson Field Warmup Begin" << std::endl;
+  A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
+  std::cout << GridLogMessage << "Meson Field Timing Begin" << std::endl;
   start = usecond();
   A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
   stop = usecond();
   std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl;
-  start = usecond();
-  /* Ideally, for this meson field we could pass TSRC (even better a list of timeslices)
-   * to the routine so that all the compnents which are predictably equal to zero are not computed. */
-  A2Autils::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp);
-  stop = usecond();
-  std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl;
-  start = usecond();
-  A2Autils::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp);
-  stop = usecond();
-  std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl;
+  std::cout << GridLogMessage << "Meson Field GPU Warmup Begin" << std::endl;
+  A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
+  std::cout << GridLogMessage << "Meson Field GPU Timing Begin" << std::endl;
+  start = usecond();
+  A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
+  stop = usecond();
+  std::cout << GridLogMessage << "M_gpu(phi,phi) created, execution time " << stop-start << " us" << std::endl;
+
+  for(int mom=0;mom
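The patch is truncated at the loop above, which presumably walks the two rank-5 tensors and verifies that the reference MesonField and the new MesonFieldGPU agree. A hypothetical check of that kind is sketched below; it is not the author's code, and maxAbsDiff is an illustrative name only. The loop order mirrors the tensor layout (momentum, gamma, t, i, j) used throughout the patch.

// Hypothetical element-by-element comparison of the CPU and GPU meson fields.
#include <unsupported/Eigen/CXX11/Tensor>
#include <algorithm>
#include <complex>

double maxAbsDiff(const Eigen::Tensor<std::complex<double>,5> &a,
                  const Eigen::Tensor<std::complex<double>,5> &b)
{
  double diff = 0.0;
  for (Eigen::Index m = 0; m < a.dimension(0); ++m)          // momenta
    for (Eigen::Index g = 0; g < a.dimension(1); ++g)        // gamma insertions
      for (Eigen::Index t = 0; t < a.dimension(2); ++t)      // timeslices
        for (Eigen::Index i = 0; i < a.dimension(3); ++i)    // left (w) index
          for (Eigen::Index j = 0; j < a.dimension(4); ++j)  // right (v) index
            diff = std::max(diff, std::abs(a(m,g,t,i,j) - b(m,g,t,i,j)));
  return diff;
}

// e.g.  std::cout << GridLogMessage << "max |Mpp - Mpp_gpu| = " << maxAbsDiff(Mpp, Mpp_gpu) << std::endl;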