diff --git a/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp b/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp index 9a09c0b6..cee6858e 100644 --- a/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp +++ b/extras/Hadrons/Modules/MContraction/A2AMesonField.hpp @@ -21,9 +21,9 @@ class A2AMesonFieldPar : Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar, - int, cacheBlock, - int, schurBlock, - int, Nmom, + int, cacheBlock, + int, schurBlock, + int, Nmom, std::string, A2A, std::string, output); }; @@ -52,15 +52,15 @@ class TA2AMesonField : public Module // Arithmetic help. Move to Grid?? virtual void MesonField(Eigen::Tensor &mat, - const LatticeFermion *lhs, - const LatticeFermion *rhs, - std::vector gammas, - const std::vector &mom, - int orthogdim, - double &t0, - double &t1, - double &t2, - double &t3); + const LatticeFermion *lhs, + const LatticeFermion *rhs, + std::vector gammas, + const std::vector &mom, + int orthogdim, + double &t0, + double &t1, + double &t2, + double &t3); }; MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField), MContraction); @@ -160,7 +160,8 @@ void TA2AMesonField::MesonField(Eigen::Tensor &mat, int MFlvol = ld*Lblock*Rblock*Nmom; Vector lvSum(MFrvol); - parallel_for (int r = 0; r < MFrvol; r++){ + parallel_for (int r = 0; r < MFrvol; r++) + { lvSum[r] = zero; } @@ -176,110 +177,113 @@ void TA2AMesonField::MesonField(Eigen::Tensor &mat, t0-=usecond(); // Nested parallelism would be ok // Wasting cores here. Test case r - parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane - for(int n=0;n icoor(Nd); std::vector extracted(Nsimd); - for(int i=0;iiCoorFromIindex(icoor,idx); - for(int idx=0;idxiCoorFromIindex(icoor,idx); - - int ldx = rt+icoor[orthogdim]*rd; - - int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; - - lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]; + int ldx = rt+icoor[orthogdim]*rd; + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]; } - }}} + } } t1+=usecond(); - assert(mat.dimension(0) == Nmom); assert(mat.dimension(1) == Ngamma); assert(mat.dimension(2) == Nt); t2-=usecond(); + // ld loop and local only?? int pd = grid->_processors[orthogdim]; int pc = grid->_processor_coor[orthogdim]; parallel_for_nest2(int lt=0;lt::MesonField(Eigen::Tensor &mat, template void TA2AMesonField::execute(void) { - LOG(Message) << "Computing A2A meson field" << std::endl; + LOG(Message) << "Computing A2A meson field" << std::endl; - auto &a2a = envGet(A2ABase, par().A2A); - - // 2+6+4+4 = 16 gammas - // Ordering defined here - std::vector gammas ( { - Gamma::Algebra::Gamma5, - Gamma::Algebra::Identity, - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT, - Gamma::Algebra::GammaXGamma5, - Gamma::Algebra::GammaYGamma5, - Gamma::Algebra::GammaZGamma5, - Gamma::Algebra::GammaTGamma5, - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::SigmaZT - }); + auto &a2a = envGet(A2ABase, par().A2A); + + // 2+6+4+4 = 16 gammas + // Ordering defined here + std::vector gammas ( { + Gamma::Algebra::Gamma5, + Gamma::Algebra::Identity, + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, + Gamma::Algebra::GammaXGamma5, + Gamma::Algebra::GammaYGamma5, + Gamma::Algebra::GammaZGamma5, + Gamma::Algebra::GammaTGamma5, + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::SigmaZT + }); - /////////////////////////////////////////////// - // Square assumption for now Nl = Nr = N - /////////////////////////////////////////////// - int nt = env().getDim(Tp); - int nx = env().getDim(Xp); - int ny = env().getDim(Yp); - int nz = env().getDim(Zp); - int Nl = a2a.get_Nl(); - int N = Nl + a2a.get_Nh(); - - int ngamma = gammas.size(); + /////////////////////////////////////////////// + // Square assumption for now Nl = Nr = N + /////////////////////////////////////////////// + int nt = env().getDim(Tp); + int nx = env().getDim(Xp); + int ny = env().getDim(Yp); + int nz = env().getDim(Zp); + int Nl = a2a.get_Nl(); + int N = Nl + a2a.get_Nh(); + + int ngamma = gammas.size(); - int schurBlock = par().schurBlock; - int cacheBlock = par().cacheBlock; - int nmom = par().Nmom; + int schurBlock = par().schurBlock; + int cacheBlock = par().cacheBlock; + int nmom = par().Nmom; - /////////////////////////////////////////////// - // Momentum setup - /////////////////////////////////////////////// - GridBase *grid = env().getGrid(1); - std::vector phases(nmom,grid); - for(int m=0;m phases(nmom,grid); + for(int m=0;m mesonField (nmom,ngamma,nt,N,N); - LOG(Message) << "N = Nh+Nl for A2A MesonField is " << N << std::endl; + Eigen::Tensor mesonField (nmom,ngamma,nt,N,N); + LOG(Message) << "N = Nh+Nl for A2A MesonField is " << N << std::endl; - envGetTmp(std::vector, w); - envGetTmp(std::vector, v); - envGetTmp(FermionField, tmp_5d); + envGetTmp(std::vector, w); + envGetTmp(std::vector, v); + envGetTmp(FermionField, tmp_5d); - LOG(Message) << "Finding v and w vectors for N = " << N << std::endl; + LOG(Message) << "Finding v and w vectors for N = " << N << std::endl; - ////////////////////////////////////////////////////////////////////////// - // i,j is first loop over SchurBlock factors reusing 5D matrices - // ii,jj is second loop over cacheBlock factors for high perf contractoin - // iii,jjj are loops within cacheBlock - // Total index is sum of these i+ii+iii etc... - ////////////////////////////////////////////////////////////////////////// - - double flops = 0.0; - double bytes = 0.0; - double vol = nx*ny*nz*nt; - double t_schur=0; - double t_contr=0; - double t_int_0=0; - double t_int_1=0; - double t_int_2=0; - double t_int_3=0; + ////////////////////////////////////////////////////////////////////////// + // i,j is first loop over SchurBlock factors reusing 5D matrices + // ii,jj is second loop over cacheBlock factors for high perf contractoin + // iii,jjj are loops within cacheBlock + // Total index is sum of these i+ii+iii etc... + ////////////////////////////////////////////////////////////////////////// + + double flops = 0.0; + double bytes = 0.0; + double vol = nx*ny*nz*nt; + double t_schur=0; + double t_contr=0; + double t_int_0=0; + double t_int_1=0; + double t_int_2=0; + double t_int_3=0; - double t0 = usecond(); - int N_i = N; - int N_j = N; - for(int i=0;i mesonFieldBlocked(nmom,ngamma,nt,N_iii,N_jjj); + + t_contr-=usecond(); + MesonField(mesonFieldBlocked, &w[ii], &v[jj], gammas, phases,Tp, + t_int_0,t_int_1,t_int_2,t_int_3); + t_contr+=usecond(); + flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma; + + bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj + + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma; /////////////////////////////////////////////////////////////// - // Get the W and V vectors for this schurBlock^2 set of terms - /////////////////////////////////////////////////////////////// - int N_ii = MIN(N_i-i,schurBlock); - int N_jj = MIN(N_j-j,schurBlock); - - t_schur-=usecond(); - for(int ii =0;ii < N_ii;ii++) a2a.return_w(i+ii, tmp_5d, w[ii]); - for(int jj =0;jj < N_jj;jj++) a2a.return_v(j+jj, tmp_5d, v[jj]); - t_schur+=usecond(); - - LOG(Message) << "Found w vectors " << i <<" .. " << i+N_ii-1 << std::endl; - LOG(Message) << "Found v vectors " << j <<" .. " << j+N_jj-1 << std::endl; - - /////////////////////////////////////////////////////////////// - // Series of cache blocked chunks of the contractions within this SchurBlock + // Copy back to full meson field tensor /////////////////////////////////////////////////////////////// - for(int ii=0;ii mesonFieldBlocked(nmom,ngamma,nt,N_iii,N_jjj); + double nodes=grid->NodeCount(); + double t1 = usecond(); + LOG(Message) << " Contraction of MesonFields took "<<(t1-t0)/1.0e6<< " seconds " << std::endl; + LOG(Message) << " Schur "<<(t_schur)/1.0e6<< " seconds " << std::endl; + LOG(Message) << " Contr "<<(t_contr)/1.0e6<< " seconds " << std::endl; + LOG(Message) << " Intern0 "<<(t_int_0)/1.0e6<< " seconds " << std::endl; + LOG(Message) << " Intern1 "<<(t_int_1)/1.0e6<< " seconds " << std::endl; + LOG(Message) << " Intern2 "<<(t_int_2)/1.0e6<< " seconds " << std::endl; + LOG(Message) << " Intern3 "<<(t_int_3)/1.0e6<< " seconds " << std::endl; - t_contr-=usecond(); - MesonField(mesonFieldBlocked, &w[ii], &v[jj], gammas, phases,Tp, - t_int_0,t_int_1,t_int_2,t_int_3); - t_contr+=usecond(); - flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma; + double t_kernel = t_int_0 + t_int_1; + LOG(Message) << " Arith "< + ///////////////////////////////////////////////////////////////////////// + std::vector corr(nt,ComplexD(0.0)); - /////////////////////////////////////////////////////////////// - // Copy back to full meson field tensor - /////////////////////////////////////////////////////////////// - parallel_for_nest2(int iii=0;iii< N_iii;iii++) { - for(int jjj=0;jjj< N_jjj;jjj++) { - for(int m =0;m< nmom;m++) { - for(int g =0;g< ngamma;g++) { - for(int t =0;t< nt;t++) { - mesonField(m,g,t,i+ii+iii,j+jj+jjj) = mesonFieldBlocked(m,g,t,iii,jjj); - }}} + for(int i=0;iNodeCount(); - double t1 = usecond(); - LOG(Message) << " Contraction of MesonFields took "<<(t1-t0)/1.0e6<< " seconds " << std::endl; - LOG(Message) << " Schur "<<(t_schur)/1.0e6<< " seconds " << std::endl; - LOG(Message) << " Contr "<<(t_contr)/1.0e6<< " seconds " << std::endl; - LOG(Message) << " Intern0 "<<(t_int_0)/1.0e6<< " seconds " << std::endl; - LOG(Message) << " Intern1 "<<(t_int_1)/1.0e6<< " seconds " << std::endl; - LOG(Message) << " Intern2 "<<(t_int_2)/1.0e6<< " seconds " << std::endl; - LOG(Message) << " Intern3 "<<(t_int_3)/1.0e6<< " seconds " << std::endl; - - double t_kernel = t_int_0 + t_int_1; - LOG(Message) << " Arith "< - ///////////////////////////////////////////////////////////////////////// - std::vector corr(nt,ComplexD(0.0)); - - for(int i=0;i