/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid 

Source file: extras/Hadrons/Modules/MContraction/A2AMesonField.hpp

Copyright (C) 2015-2018

Author: Antonin Portelli
Author: Peter Boyle
Author: paboyle

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#ifndef Hadrons_MContraction_A2AMesonField_hpp_
#define Hadrons_MContraction_A2AMesonField_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/AllToAllVectors.hpp>
#include <Grid/Hadrons/Modules/MSolver/A2AVectors.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                     All-to-all meson field creation                        *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

typedef std::pair<Gamma::Algebra, Gamma::Algebra> GammaPair;

class A2AMesonFieldPar : Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                    int, cacheBlock,
                                    int, schurBlock,
                                    int, Nmom,
                                    std::string, v,
                                    std::string, w,
                                    std::string, output);
};

template <typename FImpl>
class TA2AMesonField : public Module<A2AMesonFieldPar>
{
public:
    FERM_TYPE_ALIASES(FImpl, );
    SOLVER_TYPE_ALIASES(FImpl, );
public:
    // constructor
    TA2AMesonField(const std::string name);
    // destructor
    virtual ~TA2AMesonField(void){};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
    // Arithmetic help. Move to Grid??
    virtual void MesonField(Eigen::Tensor<ComplexD,5> &mat, 
                            const LatticeFermion *lhs,
                            const LatticeFermion *rhs,
                            std::vector<Gamma::Algebra> gammas,
                            const std::vector<LatticeComplex> &mom,
                            int orthogdim,
                            double &t0, double &t1, double &t2, double &t3);
};

MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField<ZFIMPL>), MContraction);

/******************************************************************************
 *                  TA2AMesonField implementation                             *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
: Module<A2AMesonFieldPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TA2AMesonField<FImpl>::getInput(void)
{
    std::vector<std::string> in = {par().v, par().w};

    return in;
}

template <typename FImpl>
std::vector<std::string> TA2AMesonField<FImpl>::getOutput(void)
{
    std::vector<std::string> out = {};

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2AMesonField<FImpl>::setup(void)
{}

//////////////////////////////////////////////////////////////////////////////////
// Cache blocked arithmetic routine
// Could move to Grid ???
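//
// Schematically (notation introduced here only as a summary of the loop
// bodies below, with p_m the m-th momentum phase and Gamma_mu the mu-th
// gamma insertion), the routine fills
//
//   mat(m, mu, t, i, j) = sum_{x : x_orthogdim = t} e^{i p_m.x} w_i^dag(x) Gamma_mu v_j(x)
//
// The colour indices are contracted site-by-site into a spin matrix, the
// momentum phases are multiplied in, the SIMD lanes are reduced per time
// slice, and only then is the spin trace against each Gamma_mu taken.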
//////////////////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat, 
                                       const LatticeFermion *lhs_wi,
                                       const LatticeFermion *rhs_vj,
                                       std::vector<Gamma::Algebra> gammas,
                                       const std::vector<LatticeComplex> &mom,
                                       int orthogdim,
                                       double &t0, double &t1, double &t2, double &t3) 
{
    typedef typename FImpl::SiteSpinor vobj;

    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;

    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;

    int Lblock = mat.dimension(3); 
    int Rblock = mat.dimension(4);

    GridBase *grid = lhs_wi[0]._grid;

    const int    Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();

    int Nt     = grid->GlobalDimensions()[orthogdim];
    int Ngamma = gammas.size();
    int Nmom   = mom.size();

    int fd = grid->_fdimensions[orthogdim];
    int ld = grid->_ldimensions[orthogdim];
    int rd = grid->_rdimensions[orthogdim];

    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    int MFrvol = rd*Lblock*Rblock*Nmom;
    int MFlvol = ld*Lblock*Rblock*Nmom;

    Vector<SpinMatrix_v> lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++)
    {
        lvSum[r] = zero;
    }

    Vector<SpinMatrix_s> lsSum(MFlvol);
    parallel_for (int r = 0; r < MFlvol; r++)
    {
        lsSum[r] = scalar_type(0.0);
    }

    int e1     = grid->_slice_nblock[orthogdim];
    int e2     = grid->_slice_block [orthogdim];
    int stride = grid->_slice_stride[orthogdim];

    t0-=usecond();
    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    parallel_for(int r=0;r<rd;r++)
    {
        int so = r*grid->_ostride[orthogdim]; // base offset for start of plane

        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++)
        {
            int ss = so + n*stride + b;

            for(int i=0;i<Lblock;i++)
            {
                auto left = conjugate(lhs_wi[i]._odata[ss]);

                for(int j=0;j<Rblock;j++)
                {
                    SpinMatrix_v vv;
                    auto right = rhs_vj[j]._odata[ss];

                    // site-local colour contraction into a spin matrix
                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++)
                    {
                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                      + left()(s2)(1) * right()(s1)(1)
                                      + left()(s2)(2) * right()(s1)(2);
                    }

                    // after the site-wise product, do the momentum phase loop
                    int base = Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*r;
                    for(int m=0;m<Nmom;m++)
                    {
                        int idx    = m + base;
                        auto phase = mom[m]._odata[ss];
                        mac(&lvSum[idx], &vv, &phase);
                    }
                }
            }
        }
    }
    t0+=usecond();

    // Sum across simd lanes in the plane, breaking out the orthogonal direction.
    t1-=usecond();
    parallel_for(int rt=0;rt<rd;rt++)
    {
        std::vector<int> icoor(Nd);
        std::vector<SpinMatrix_s> extracted(Nsimd);

        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
        for(int m=0;m<Nmom;m++)
        {
            int ij_rdx = m + Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*rt;

            extract(lvSum[ij_rdx], extracted);
            for(int idx=0;idx<Nsimd;idx++)
            {
                grid->iCoorFromIindex(icoor,idx);

                int ldx    = rt + icoor[orthogdim]*rd;
                int ij_ldx = m + Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*ldx;

                lsSum[ij_ldx] = lsSum[ij_ldx] + extracted[idx];
            }
        }
    }
    t1+=usecond();

    assert(mat.dimension(0) == Nmom);
    assert(mat.dimension(1) == Ngamma);
    assert(mat.dimension(2) == Nt);
    t2-=usecond();

    // ld loop and local only??
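    // Assembly of the distributed result: each rank owns only the local time
    // slices lt of the orthogonal direction. Below, a slice is written at
    // global time t = lt + pt*ld when this rank holds it (pt == pc) and
    // zero-filled otherwise, so the GlobalSumVector at the end reconstructs
    // the complete (Nmom x Ngamma x Nt x Lblock x Rblock) tensor on every node.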
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    parallel_for_nest2(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++)
        {
            int t = lt + pt*ld;
            if (pt == pc)
            {
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nmom;m++)
                {
                    int ij_dx = m + Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*lt;

                    for(int mu=0;mu<Ngamma;mu++)
                    {
                        // this is a bit slow
                        mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
                    }
                }
            }
            else
            {
                const scalar_type zz(0.0);

                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int mu=0;mu<Ngamma;mu++)
                for(int m=0;m<Nmom;m++)
                {
                    mat(m,mu,t,i,j) = zz;
                }
            }
        }
    }
    t2+=usecond();

    t3-=usecond();
    grid->GlobalSumVector(&mat(0,0,0,0,0), Nmom*Ngamma*Nt*Lblock*Rblock);
    t3+=usecond();
}

// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2AMesonField<FImpl>::execute(void)
{
    LOG(Message) << "Computing A2A meson field" << std::endl;

    auto &v = envGet(std::vector<FermionField>, par().v);
    auto &w = envGet(std::vector<FermionField>, par().w);

    // 2+6+4+4 = 16 gammas
    // Ordering defined here
    std::vector<Gamma::Algebra> gammas ( {
        Gamma::Algebra::Gamma5,
        Gamma::Algebra::Identity,
        Gamma::Algebra::GammaX,
        Gamma::Algebra::GammaY,
        Gamma::Algebra::GammaZ,
        Gamma::Algebra::GammaT,
        Gamma::Algebra::GammaXGamma5,
        Gamma::Algebra::GammaYGamma5,
        Gamma::Algebra::GammaZGamma5,
        Gamma::Algebra::GammaTGamma5,
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::SigmaZT
    });

    ///////////////////////////////////////////////
    // Square assumption for now Nl = Nr = N
    ///////////////////////////////////////////////
    int nt = env().getDim(Tp);
    int nx = env().getDim(Xp);
    int ny = env().getDim(Yp);
    int nz = env().getDim(Zp);
    int N_i = w.size();
    int N_j = v.size();
    int ngamma = gammas.size();

    int schurBlock = par().schurBlock;
    int cacheBlock = par().cacheBlock;
    int nmom       = par().Nmom;

    ///////////////////////////////////////////////
    // Momentum setup
    ///////////////////////////////////////////////
    GridBase *grid = env().getGrid(1);
    std::vector<LatticeComplex> phases(nmom,grid);
    for(int m=0;m<nmom;m++)
    {
        phases[m] = Complex(1.0);    // All zero momentum for now
    }

    Eigen::Tensor<ComplexD,5> mesonField(nmom,ngamma,nt,N_i,N_j);
    LOG(Message) << "MesonField size " << N_i << "x" << N_j << "x" << nt << std::endl;

    //////////////////////////////////////////////////////////////////////////
    // i,j     is the first  loop over SchurBlock factors, reusing 5D matrices
    // ii,jj   is the second loop over cacheBlock factors for high perf contraction
    // iii,jjj are loops within a cacheBlock
    // Total index is the sum of these: i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    double flops = 0.0;
    double bytes = 0.0;
    double vol   = nx*ny*nz*nt;
    double t_schur=0;
    double t_contr=0;
    double t_int_0=0;
    double t_int_1=0;
    double t_int_2=0;
    double t_int_3=0;

    double t0    = usecond();
    int NBlock_i = N_i/schurBlock + (((N_i % schurBlock) != 0) ? 1 : 0);
    int NBlock_j = N_j/schurBlock + (((N_j % schurBlock) != 0) ? 1 : 0);
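    // Illustrative block counts (hypothetical sizes, not taken from this
    // module): for N_i = N_j = 1000 with schurBlock = 250 and cacheBlock = 16,
    // the outer i,j loops visit 4 x 4 Schur blocks, and each 250 x 250 block
    // is swept in ceil(250/16)^2 = 256 cache tiles of at most 16 x 16 (i,j)
    // pairs, i.e. one MesonField call per tile.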
    for(int i=0;i<N_i;i+=schurBlock) // loop over SchurBlocking to suppress 5D matrix overhead
    for(int j=0;j<N_j;j+=schurBlock)
    {
        ///////////////////////////////////////////////////////////////
        // Get the W and V vectors for this block^2 set of terms
        ///////////////////////////////////////////////////////////////
        int N_ii = std::min(N_i-i, schurBlock);
        int N_jj = std::min(N_j-j, schurBlock);

        ///////////////////////////////////////////////////////////////
        // Series of cache blocked chunks of the contractions within this SchurBlock
        ///////////////////////////////////////////////////////////////
        for(int ii=0;ii<N_ii;ii+=cacheBlock)
        for(int jj=0;jj<N_jj;jj+=cacheBlock)
        {
            int N_iii = std::min(N_ii-ii, cacheBlock);
            int N_jjj = std::min(N_jj-jj, cacheBlock);

            Eigen::Tensor<ComplexD,5> mesonFieldBlocked(nmom,ngamma,nt,N_iii,N_jjj);

            t_contr-=usecond();
            MesonField(mesonFieldBlocked, &w[i+ii], &v[j+jj], gammas, phases, Tp,
                       t_int_0,t_int_1,t_int_2,t_int_3);
            t_contr+=usecond();

            // flops for general N_c & N_s
            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
                   + vol * ( 2.0 * sizeof(Complex) * nmom ) * N_iii*N_jjj * ngamma;

            ///////////////////////////////////////////////////////////////
            // Copy back to full meson field tensor
            ///////////////////////////////////////////////////////////////
            parallel_for_nest2(int iii=0;iii< N_iii;iii++)
            for(int jjj=0;jjj< N_jjj;jjj++)
            for(int m =0;m< nmom;m++)
            for(int g =0;g< ngamma;g++)
            for(int t =0;t< nt;t++)
            {
                mesonField(m,g,t,i+ii+iii,j+jj+jjj) = mesonFieldBlocked(m,g,t,iii,jjj);
            }
        }
    }

    double nodes = grid->NodeCount();
    double t1    = usecond();
    LOG(Message) << "Contraction of MesonFields took " << (t1-t0)/1.0e6 << " s" << std::endl;
    LOG(Message) << " Schur   " << t_schur/1.0e6 << " s" << std::endl;
    LOG(Message) << " Contr   " << t_contr/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern0 " << t_int_0/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern1 " << t_int_1/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern2 " << t_int_2/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern3 " << t_int_3/1.0e6 << " s" << std::endl;

    double t_kernel = t_int_0 + t_int_1;
    LOG(Message) << " Arith " << flops/t_kernel/1.0e3/nodes << " Gflop/s/node" << std::endl;
    LOG(Message) << " Arith " << bytes/t_kernel/1.0e3/nodes << " GB/s/node"    << std::endl;

    /////////////////////////////////////////////////////////////////////////
    // Test: Build the pion correlator (two end)
    // < PI_ij(t0) PI_ji (t0+t) >
    /////////////////////////////////////////////////////////////////////////
    std::vector<ComplexD> corr(nt,ComplexD(0.0));

    for(int i=0;i<N_i;i++)
    for(int j=0;j<N_j;j++)
    {
        int m = 0; // first momentum
        int g = 0; // first gamma in the ordering above is gamma5, for the pion
        for(int tsrc=0;tsrc<nt;tsrc++)
        for(int t=0;t<nt;t++)
        {
            int tt = (tsrc+t)%nt;
            corr[t] += mesonField(m,g,tsrc,i,j) * mesonField(m,g,tt,j,i);
        }
    }

    for(int t=0;t<nt;t++) corr[t] = corr[t] / (double)nt;
    for(int t=0;t<nt;t++) LOG(Message) << " " << t << " " << corr[t] << std::endl;
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_A2AMesonField_hpp_
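/******************************************************************************
 * Usage sketch: wiring this module into a Hadrons application. The module
 * name "mf" and the input names "a2a_v"/"a2a_w" are hypothetical; par().v
 * and par().w must name std::vector<FermionField> objects already in the
 * environment (e.g. produced by an all-to-all vector module).
 *
 *   MContraction::A2AMesonField::Par mfPar;
 *   mfPar.cacheBlock = 16;     // tile size for the cache-blocked contraction
 *   mfPar.schurBlock = 250;    // outer block over the A2A vector indices
 *   mfPar.Nmom       = 1;      // number of momentum phases (all unit here)
 *   mfPar.v          = "a2a_v";
 *   mfPar.w          = "a2a_w";
 *   mfPar.output     = "mesonfield";
 *   application.createModule<MContraction::A2AMesonField>("mf", mfPar);
 ******************************************************************************/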