diff --git a/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp b/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp
index 9939e7cc..ba849b0c 100644
--- a/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -51,20 +51,20 @@ class A2AMesonFieldPar : Serializable
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                     int, cacheBlock,
-                                    int, schurBlock,
-                                    int, Nmom,
+                                    int, block,
                                     std::string, v,
                                     std::string, w,
-                                    std::string, output);
+                                    std::string, output,
+                                    std::vector<std::string>, mom);
 };
 
 template <typename FImpl>
 class TA2AMesonField : public Module<A2AMesonFieldPar>
 {
-  public:
+public:
     FERM_TYPE_ALIASES(FImpl, );
     SOLVER_TYPE_ALIASES(FImpl, );
-  public:
+public:
     // constructor
     TA2AMesonField(const std::string name);
     // destructor
@@ -76,18 +76,21 @@ class TA2AMesonField : public Module<A2AMesonFieldPar>
     virtual void setup(void);
     // execution
     virtual void execute(void);
-
+private:
     // Arithmetic help. Move to Grid??
-    virtual void MesonField(Eigen::Tensor<ComplexD, 5> &mat,
-                            const LatticeFermion *lhs,
-                            const LatticeFermion *rhs,
-                            std::vector<Gamma::Algebra> gammas,
-                            const std::vector<LatticeComplex> &mom,
-                            int orthogdim,
-                            double &t0,
-                            double &t1,
-                            double &t2,
-                            double &t3);
+    virtual void makeBlock(Eigen::Tensor<ComplexD, 5> &mat,
+                           const LatticeFermion *lhs,
+                           const LatticeFermion *rhs,
+                           std::vector<Gamma::Algebra> gammas,
+                           const std::vector<LatticeComplex> &mom,
+                           int orthogdim,
+                           double &t0,
+                           double &t1,
+                           double &t2,
+                           double &t3);
+private:
+    bool        hasPhase_{false};
+    std::string momphName_;
 };
 
 MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
@@ -99,7 +102,8 @@ MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField<ZFIMPL>), MContraction);
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
 TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
-    : Module<A2AMesonFieldPar>(name)
+: Module<A2AMesonFieldPar>(name)
+, momphName_(name + "_momph")
 {}
 
@@ -120,18 +124,166 @@ std::vector<std::string> TA2AMesonField<FImpl>::getOutput(void)
     return out;
 }
-
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TA2AMesonField<FImpl>::setup(void)
-{}
+{
+    envCache(std::vector<LatticeComplex>, momphName_, 1,
+             par().mom.size(), env().getGrid());
+    envTmpLat(LatticeComplex, "coor");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AMesonField<FImpl>::execute(void)
+{
+    LOG(Message) << "Computing all-to-all meson fields" << std::endl;
+
+    auto &v = envGet(std::vector<FermionField>, par().v);
+    auto &w = envGet(std::vector<FermionField>, par().w);
+
+    // 2+6+4+4 = 16 gammas
+    // Ordering defined here
+    std::vector<Gamma::Algebra> gammas ({
+        Gamma::Algebra::Gamma5,
+        Gamma::Algebra::Identity,
+        Gamma::Algebra::GammaX,
+        Gamma::Algebra::GammaY,
+        Gamma::Algebra::GammaZ,
+        Gamma::Algebra::GammaT,
+        Gamma::Algebra::GammaXGamma5,
+        Gamma::Algebra::GammaYGamma5,
+        Gamma::Algebra::GammaZGamma5,
+        Gamma::Algebra::GammaTGamma5,
+        Gamma::Algebra::SigmaXY,
+        Gamma::Algebra::SigmaXZ,
+        Gamma::Algebra::SigmaXT,
+        Gamma::Algebra::SigmaYZ,
+        Gamma::Algebra::SigmaYT,
+        Gamma::Algebra::SigmaZT
+    });
+
+    int nt         = env().getDim().back();
+    int N_i        = w.size();
+    int N_j        = v.size();
+    int ngamma     = gammas.size();
+    int nmom       = par().mom.size();
+    int block      = par().block;
+    int cacheBlock = par().cacheBlock;
+
+    ///////////////////////////////////////////////
+    // Momentum setup
+    ///////////////////////////////////////////////
+    auto &ph = envGet(std::vector<LatticeComplex>, momphName_);
+
+    if (!hasPhase_)
+    {
+        MODULE_TIMER("Momentum phases");
+        for (unsigned int j = 0; j < nmom; ++j)
+        {
+            Complex           i(0.0,1.0);
+            std::vector<Real> p;
+
+            envGetTmp(LatticeComplex, coor);
+            p     = strToVec<Real>(par().mom[j]);
+            ph[j] = zero;
+            for(unsigned int mu = 0; mu < p.size(); mu++)
+            {
+                LatticeCoordinate(coor, mu);
+                ph[j] = ph[j] + (p[mu]/env().getDim(mu))*coor;
+            }
+            ph[j] = exp((Real)(2*M_PI)*i*ph[j]);
+        }
+        hasPhase_ = true;
+    }
+    LOG(Message) << "MesonField size " << N_i << "x" << N_j << "x" << nt << std::endl;
+
+    //////////////////////////////////////////////////////////////////////////
+    // i,j     is first loop over block factors reusing 5D matrices
+    // ii,jj   is second loop over cacheBlock factors for high perf contraction
+    // iii,jjj are loops within cacheBlock
+    // Total index is sum of these i+ii+iii etc...
+    //////////////////////////////////////////////////////////////////////////
+
+    double flops   = 0.0;
+    double bytes   = 0.0;
+    double vol     = env().getVolume();
+    double t_schur = 0;
+    double t_contr = 0;
+    double t_int_0 = 0;
+    double t_int_1 = 0;
+    double t_int_2 = 0;
+    double t_int_3 = 0;
+
+    double t0       = usecond();
+    int    NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0);
+    int    NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0);
+
+    for(int i=0;i<N_i;i+=block)
+    for(int j=0;j<N_j;j+=block)
+    {
+        // Get the W and V vectors for this block^2 set of terms
+        int N_ii = MIN(N_i-i,block);
+        int N_jj = MIN(N_j-j,block);
+
+        LOG(Message) << "Meson field block "
+                     << j/block + NBlock_j*i/block + 1
+                     << "/" << NBlock_i*NBlock_j << " [" << i << " .. "
+                     << i+N_ii-1 << ", " << j << " .. " << j+N_jj-1 << "]"
+                     << std::endl;
+
+        Eigen::Tensor<ComplexD,5> mfBlock(nmom,ngamma,nt,N_ii,N_jj);
+
+        ///////////////////////////////////////////////////////////////
+        // Series of cache blocked chunks of the contractions within this block
+        ///////////////////////////////////////////////////////////////
+        for(int ii=0;ii<N_ii;ii+=cacheBlock)
+        for(int jj=0;jj<N_jj;jj+=cacheBlock)
+        {
+            int N_iii = MIN(N_ii-ii,cacheBlock);
+            int N_jjj = MIN(N_jj-jj,cacheBlock);
+
+            Eigen::Tensor<ComplexD,5> mfCache(nmom,ngamma,nt,N_iii,N_jjj);
+
+            t_contr-=usecond();
+            makeBlock(mfCache, &w[i+ii], &v[j+jj], gammas, ph,
+                      env().getNd() - 1, t_int_0, t_int_1, t_int_2, t_int_3);
+            t_contr+=usecond();
+
+            // flops for general N_c & N_s
+            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
+            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
+                   + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
+
+            MODULE_TIMER("Cache copy");
+            for(int iii=0;iii< N_iii;iii++)
+            for(int jjj=0;jjj< N_jjj;jjj++)
+            for(int m =0;m< nmom;m++)
+            for(int g =0;g< ngamma;g++)
+            for(int t =0;t< nt;t++)
+            {
+                mfBlock(m,g,t,ii+iii,jj+jjj) = mfCache(m,g,t,iii,jjj);
+            }
+        }
+    }
+
+    double nodes    = env().getGrid()->NodeCount();
+    double t_kernel = t_int_0 + t_int_1;
+
+    LOG(Message) << "Perf " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node " << std::endl;
+    LOG(Message) << "Perf " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node "    << std::endl;
+}
 
 //////////////////////////////////////////////////////////////////////////////////
 // Cache blocked arithmetic routine
 // Could move to Grid ???
 //////////////////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD, 5> &mat,
+void TA2AMesonField<FImpl>::makeBlock(Eigen::Tensor<ComplexD, 5> &mat,
                                        const LatticeFermion *lhs_wi,
                                        const LatticeFermion *rhs_vj,
                                        std::vector<Gamma::Algebra> gammas,
@@ -142,315 +294,178 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD, 5> &mat,
                                        double &t2,
                                        double &t3)
 {
     typedef typename FImpl::SiteSpinor vobj;
 
     typedef typename vobj::scalar_object sobj;
     typedef typename vobj::scalar_type   scalar_type;
     typedef typename vobj::vector_type   vector_type;
 
     typedef iSpinMatrix<vector_type> SpinMatrix_v;
     typedef iSpinMatrix<scalar_type> SpinMatrix_s;
 
     int Lblock = mat.dimension(3);
     int Rblock = mat.dimension(4);
 
     GridBase *grid = lhs_wi[0]._grid;
 
     const int Nd    = grid->_ndimension;
     const int Nsimd = grid->Nsimd();
 
     int Nt     = grid->GlobalDimensions()[orthogdim];
     int Ngamma = gammas.size();
     int Nmom   = mom.size();
 
     int fd = grid->_fdimensions[orthogdim];
     int ld = grid->_ldimensions[orthogdim];
     int rd = grid->_rdimensions[orthogdim];
 
     // will locally sum vectors first
     // sum across these down to scalars
     // splitting the SIMD
     int MFrvol = rd*Lblock*Rblock*Nmom;
     int MFlvol = ld*Lblock*Rblock*Nmom;
 
     Vector<SpinMatrix_v> lvSum(MFrvol);
     parallel_for (int r = 0; r < MFrvol; r++)
     {
         lvSum[r] = zero;
     }
 
     Vector<SpinMatrix_s> lsSum(MFlvol);
     parallel_for (int r = 0; r < MFlvol; r++)
     {
         lsSum[r] = scalar_type(0.0);
     }
 
     int e1     = grid->_slice_nblock[orthogdim];
     int e2     = grid->_slice_block [orthogdim];
     int stride = grid->_slice_stride[orthogdim];
 
     t0-=usecond();
     MODULE_TIMER("Colour trace * mom.");
     // Nested parallelism would be ok
     // Wasting cores here. Test case r
     parallel_for(int r=0;r<rd;r++)
     {
         int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
 
         for(int n=0;n<e1;n++)
         for(int b=0;b<e2;b++)
         {
             int ss = so+n*stride+b;
 
             for(int i=0;i<Lblock;i++)
             {
                 auto left = conjugate(lhs_wi[i]._odata[ss]);
 
                 for(int j=0;j<Rblock;j++)
                 {
                     SpinMatrix_v vv;
                     auto right = rhs_vj[j]._odata[ss];
 
                     for(int s1=0;s1<Ns;s1++)
                     for(int s2=0;s2<Ns;s2++)
                     {
                         vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                       + left()(s2)(1) * right()(s1)(1)
                                       + left()(s2)(2) * right()(s1)(2);
                     }
 
                     // After getting the sitewise product do the mom phase loop
                     int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
 
                     for(int m=0;m<Nmom;m++)
                     {
                         int idx    = m+base;
                         auto phase = mom[m]._odata[ss];
                         mac(&lvSum[idx],&vv,&phase);
                     }
                 }
             }
         }
     }
     t0+=usecond();
 
     // Sum across simd lanes in the plane, breaking out orthog dir.
     MODULE_TIMER("Local space sum");
     t1-=usecond();
     parallel_for(int rt=0;rt<rd;rt++)
     {
         std::vector<int>          icoor(Nd);
         std::vector<SpinMatrix_s> extracted(Nsimd);
 
         for(int i=0;i<Lblock;i++)
         for(int j=0;j<Rblock;j++)
         for(int m=0;m<Nmom;m++)
         {
             int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
 
             extract(lvSum[ij_rdx],extracted);
             for(int idx=0;idx<Nsimd;idx++)
             {
                 grid->iCoorFromIindex(icoor,idx);
 
                 int ldx    = rt+icoor[orthogdim]*rd;
                 int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
 
                 lsSum[ij_ldx] = lsSum[ij_ldx]+extracted[idx];
             }
         }
     }
     t1+=usecond();
 
     assert(mat.dimension(0) == Nmom);
     assert(mat.dimension(1) == Ngamma);
     assert(mat.dimension(2) == Nt);
     t2-=usecond();
 
     // ld loop and local only??
     MODULE_TIMER("Spin trace");
     int pd = grid->_processors[orthogdim];
     int pc = grid->_processor_coor[orthogdim];
     parallel_for_nest2(int lt=0;lt<ld;lt++)
     {
         for(int pt=0;pt<pd;pt++)
         {
             int t = lt + pt*ld;
 
             if (pt == pc)
             {
                 for(int i=0;i<Lblock;i++)
                 for(int j=0;j<Rblock;j++)
                 for(int m=0;m<Nmom;m++)
                 {
                     int ij_dx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*lt;
 
                     for(int mu=0;mu<Ngamma;mu++)
                     {
                         // this is a bit slow
                         mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
                     }
                 }
             }
             else
             {
                 const scalar_type zz(0.0);
 
                 for(int i=0;i<Lblock;i++)
                 for(int j=0;j<Rblock;j++)
                 for(int mu=0;mu<Ngamma;mu++)
                 for(int m=0;m<Nmom;m++)
                 {
                     mat(m,mu,t,i,j) = zz;
                 }
             }
         }
     }
     t2+=usecond();
 
     ////////////////////////////////////////////////////////////////////
     // This global sum is taking as much as 50% of time on 16 nodes
     // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
     // Healthy size that should suffice
     ////////////////////////////////////////////////////////////////////
     t3-=usecond();
     MODULE_TIMER("Global sum");
     grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
     t3+=usecond();
 }
-
-// execution ///////////////////////////////////////////////////////////////////
-template <typename FImpl>
-void TA2AMesonField<FImpl>::execute(void)
-{
-    LOG(Message) << "Computing A2A meson field" << std::endl;
-
-    auto &v = envGet(std::vector<FermionField>, par().v);
-    auto &w = envGet(std::vector<FermionField>, par().w);
-
-    // 2+6+4+4 = 16 gammas
-    // Ordering defined here
-    std::vector<Gamma::Algebra> gammas ({
-        Gamma::Algebra::Gamma5,
-        Gamma::Algebra::Identity,
-        Gamma::Algebra::GammaX,
-        Gamma::Algebra::GammaY,
-        Gamma::Algebra::GammaZ,
-        Gamma::Algebra::GammaT,
-        Gamma::Algebra::GammaXGamma5,
-        Gamma::Algebra::GammaYGamma5,
-        Gamma::Algebra::GammaZGamma5,
-        Gamma::Algebra::GammaTGamma5,
-        Gamma::Algebra::SigmaXY,
-        Gamma::Algebra::SigmaXZ,
-        Gamma::Algebra::SigmaXT,
-        Gamma::Algebra::SigmaYZ,
-        Gamma::Algebra::SigmaYT,
-        Gamma::Algebra::SigmaZT
-    });
-
-    ///////////////////////////////////////////////
-    // Square assumption for now Nl = Nr = N
-    ///////////////////////////////////////////////
-    int nt = env().getDim(Tp);
-    int nx = env().getDim(Xp);
-    int ny = env().getDim(Yp);
-    int nz = env().getDim(Zp);
-    int N_i = w.size();
-    int N_j = v.size();
-    int ngamma = gammas.size();
-    int schurBlock = par().schurBlock;
-    int cacheBlock = par().cacheBlock;
-    int nmom = par().Nmom;
-    std::vector<ComplexD> corr(nt,ComplexD(0.0));
-
-    ///////////////////////////////////////////////
-    // Momentum setup
-    ///////////////////////////////////////////////
-    GridBase *grid = env().getGrid();
-    std::vector<LatticeComplex> phases(nmom,grid);
-
-    for(int m=0;m<nmom;m++)
-    {
-        phases[m] = Complex(1.0);    // All zero momentum for now
-    }
-
-    LOG(Message) << "MesonField size " << N_i << "x" << N_j << "x" << nt << std::endl;
-
-    //////////////////////////////////////////////////////////////////////////
-    // i,j     is first loop over SchurBlock factors reusing 5D matrices
-    // ii,jj   is second loop over cacheBlock factors for high perf contraction
-    // iii,jjj are loops within cacheBlock
-    // Total index is sum of these i+ii+iii etc...
-    //////////////////////////////////////////////////////////////////////////
-
-    double flops = 0.0;
-    double bytes = 0.0;
-    double vol   = nx*ny*nz*nt;
-    double t_schur=0;
-    double t_contr=0;
-    double t_int_0=0;
-    double t_int_1=0;
-    double t_int_2=0;
-    double t_int_3=0;
-
-    double t0 = usecond();
-    int NBlock_i = N_i/schurBlock + (((N_i % schurBlock) != 0) ? 1 : 0);
-    int NBlock_j = N_j/schurBlock + (((N_j % schurBlock) != 0) ? 1 : 0);
-
-    for(int i=0;i<N_i;i+=schurBlock)
-    for(int j=0;j<N_j;j+=schurBlock)
-    {
-        // Get the W and V vectors for this schurBlock^2 set of terms
-        int N_ii = MIN(N_i-i,schurBlock);
-        int N_jj = MIN(N_j-j,schurBlock);
-
-        Eigen::Tensor<ComplexD,5> mesonFieldBlocked(nmom,ngamma,nt,N_ii,N_jj);
-
-        ///////////////////////////////////////////////////////////////
-        // Series of cache blocked chunks of the contractions within this SchurBlock
-        ///////////////////////////////////////////////////////////////
-        for(int ii=0;ii<N_ii;ii+=cacheBlock)
-        for(int jj=0;jj<N_jj;jj+=cacheBlock)
-        {
-            int N_iii = MIN(N_ii-ii,cacheBlock);
-            int N_jjj = MIN(N_jj-jj,cacheBlock);
-
-            Eigen::Tensor<ComplexD,5> mesonFieldCache(nmom,ngamma,nt,N_iii,N_jjj);
-
-            t_contr-=usecond();
-            MesonField(mesonFieldCache, &w[i+ii], &v[j+jj], gammas, phases, Tp,
-                       t_int_0, t_int_1, t_int_2, t_int_3);
-            t_contr+=usecond();
-
-            // flops for general N_c & N_s
-            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
-            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
-                   + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
-
-            MODULE_TIMER("Cache copy");
-            for(int iii=0;iii< N_iii;iii++)
-            for(int jjj=0;jjj< N_jjj;jjj++)
-            for(int m =0;m< nmom;m++)
-            for(int g =0;g< ngamma;g++)
-            for(int t =0;t< nt;t++)
-            {
-                mesonFieldBlocked(m,g,t,ii+iii,jj+jjj) = mesonFieldCache(m,g,t,iii,jjj);
-            }
-        }
-    }
-
-    double nodes    = grid->NodeCount();
-    double t_kernel = t_int_0 + t_int_1;
-
-    LOG(Message) << "Perf " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node " << std::endl;
-    LOG(Message) << "Perf " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node "    << std::endl;
-}
 
 END_MODULE_NAMESPACE
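
For reference, a minimal usage sketch of the new parameter set introduced by this patch (block/cacheBlock plus a list of momentum strings, replacing schurBlock/Nmom). It assumes the standard Hadrons Application API and include paths of this source tree; the module name "mf", the upstream vector names "a2a_w"/"a2a_v" and all numerical values are illustrative placeholders, not part of the patch.

// Sketch only: wiring the refactored A2AMesonField module from a Hadrons
// application. "a2a_w"/"a2a_v" stand for the outputs of an all-to-all
// vector module and are assumptions, as are the block sizes and momenta.
#include <Grid/Hadrons/Application.hpp>
#include <Grid/Hadrons/Modules.hpp>

using namespace Grid;
using namespace Hadrons;

void addA2AMesonField(Application &application)
{
    MContraction::A2AMesonField::Par mfPar;

    mfPar.block      = 100;                 // outer block over the A2A modes
    mfPar.cacheBlock = 4;                   // inner cache-friendly block
    mfPar.v          = "a2a_v";             // V all-to-all vectors (placeholder name)
    mfPar.w          = "a2a_w";             // W all-to-all vectors (placeholder name)
    mfPar.output     = "mesonfield";        // output stem
    mfPar.mom        = {"0 0 0", "0 0 1"};  // momenta as strings, parsed with strToVec
    application.createModule<MContraction::A2AMesonField>("mf", mfPar);
}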