Hadrons: meson fields code cleaning and momentum phases

2026-05-29 21:44:17 +01:00 · 2018-08-11 15:13:43 +01:00
parent ac69f042b1
commit 5be6a51044
1 changed files with 319 additions and 304 deletions
@@ -51,11 +51,11 @@ class A2AMesonFieldPar : Serializable
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                    int, cacheBlock,
-                                    int, schurBlock,
+                                    int, block,
                                    int, Nmom,
                                    std::string, v,
                                    std::string, w,
-                                    std::string, output);
+                                    std::string, output,
                                    std::vector<std::string>, mom);
 };
 template <typename FImpl>
@@ -76,9 +76,9 @@ class TA2AMesonField : public Module<A2AMesonFieldPar>
    virtual void setup(void);
    // execution
    virtual void execute(void);
-
+private:
    // Arithmetic help. Move to Grid??
-    virtual void MesonField(Eigen::Tensor<ComplexD,5> &mat, 
+    virtual void makeBlock(Eigen::Tensor<ComplexD,5> &mat, 
                           const LatticeFermion *lhs,
                           const LatticeFermion *rhs,
                           std::vector<Gamma::Algebra> gammas,
@@ -88,6 +88,9 @@ class TA2AMesonField : public Module<A2AMesonFieldPar>
                           double &t1,
                           double &t2,
                           double &t3);
 private:
    bool        hasPhase_{false};
    std::string momphName_;
 };
 MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
@@ -100,6 +103,7 @@ MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField<ZFIMPL>), MContraction);
 // Constructor: forwards the module name to the Module<A2AMesonFieldPar> base
 // and builds momphName_ by appending "_momph" to that name. momphName_ is the
 // name under which setup() registers, and execute() retrieves, the vector of
 // momentum-phase fields.
 template <typename FImpl>
 TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
 : Module<A2AMesonFieldPar>(name)
 , momphName_(name + "_momph")
 {
 }
@@ -120,18 +124,166 @@ std::vector<std::string> TA2AMesonField<FImpl>::getOutput(void)
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Registers the two objects that execute() later looks up by name:
 //  - a std::vector<LatticeComplex> stored under momphName_, requested with
 //    par().mom.size() and env().getGrid();
 //  - a temporary LatticeComplex called "coor".
 // NOTE(review): presumably the vector holds one phase field per entry of
 // par().mom and envCache keeps it alive across executions -- confirm against
 // the envCache macro definition.
 template <typename FImpl>
 void TA2AMesonField<FImpl>::setup(void)
-{}
+{
    envCache(std::vector<LatticeComplex>, momphName_, 1, 
             par().mom.size(), env().getGrid());
    envTmpLat(LatticeComplex, "coor");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes all-to-all meson field blocks from the vector sets named by
 // par().w (left, index i) and par().v (right, index j).
 //
 // Steps visible in this function:
 //  1. fetch v and w and fix the list (and ordering) of the 16 gamma matrices;
 //  2. fill the momentum-phase fields once, guarded by hasPhase_;
 //  3. walk the (i,j) index space in tiles of size `block`, and inside each
 //     tile in sub-tiles of size `cacheBlock`, calling makeBlock() on every
 //     sub-tile and copying its result into the tile tensor mfBlock;
 //  4. log flop and byte rates built from the timers makeBlock() updates.
 //
 // NOTE(review): `block` and `cacheBlock` are used as loop strides and as
 // divisors without validation; a value of 0 divides by zero in the NBlock
 // computation and a value <= 0 never advances the loops.
 template <typename FImpl>
 void TA2AMesonField<FImpl>::execute(void)
 {
    LOG(Message) << "Computing all-to-all meson fields" << std::endl;
    auto &v = envGet(std::vector<FermionField>, par().v);
    auto &w = envGet(std::vector<FermionField>, par().w);
    // 2+6+4+4 = 16 gammas
    // Ordering defined here
    std::vector<Gamma::Algebra> gammas ( {
        Gamma::Algebra::Gamma5,
        Gamma::Algebra::Identity,    
        Gamma::Algebra::GammaX,
        Gamma::Algebra::GammaY,
        Gamma::Algebra::GammaZ,
        Gamma::Algebra::GammaT,
        Gamma::Algebra::GammaXGamma5,
        Gamma::Algebra::GammaYGamma5,
        Gamma::Algebra::GammaZGamma5,
        Gamma::Algebra::GammaTGamma5,
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::SigmaZT
    });
    // Time extent is taken as the last entry of the lattice dimensions.
    int nt         = env().getDim().back();
    int N_i        = w.size();
    int N_j        = v.size();
    int ngamma     = gammas.size();
    // Number of momenta comes from the list par().mom; par().Nmom is not
    // read in this function.
    int nmom       = par().mom.size();
    int block      = par().block;
    int cacheBlock = par().cacheBlock;
    ///////////////////////////////////////////////
    // Momentum setup
    ///////////////////////////////////////////////
    // The phase fields are only filled on the first call: hasPhase_ is set
    // to true afterwards and the fields are fetched by name on later calls.
    auto &ph = envGet(std::vector<LatticeComplex>, momphName_);
    if (!hasPhase_)
    {
        MODULE_TIMER("Momentum phases");
        // NOTE(review): j is unsigned while nmom is int, so this is a
        // signed/unsigned comparison.
        for (unsigned int j = 0; j < nmom; ++j)
        {
            Complex           i(0.0,1.0);
            std::vector<Real> p;
            envGetTmp(LatticeComplex, coor);
            // Each momentum is given as a string and parsed into reals.
            p     = strToVec<Real>(par().mom[j]);
            ph[j] = zero;
            // Accumulate sum_mu (p[mu]/L_mu)*coor, with L_mu = env().getDim(mu)
            // and coor refreshed by LatticeCoordinate for each direction
            // (presumably the site coordinate along mu -- confirm in Grid).
            for(unsigned int mu = 0; mu < p.size(); mu++)
            {
                LatticeCoordinate(coor, mu);
                ph[j] = ph[j] + (p[mu]/env().getDim(mu))*coor;
            }
            // ph[j] = exp(2*pi*i * accumulated sum)
            ph[j] = exp((Real)(2*M_PI)*i*ph[j]);
        }
        hasPhase_ = true;
    }
    LOG(Message) << "MesonField size " << N_i << "x" << N_j << "x" << nt << std::endl;
    //////////////////////////////////////////////////////////////////////////
    // i,j   is first  loop over block factors reusing 5D matrices
    // ii,jj is second loop over cacheBlock factors for high perf contraction
    // iii,jjj are loops within cacheBlock
    // Total index is sum of these  i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    double flops = 0.0;
    double bytes = 0.0;
    double vol   = env().getVolume();
    double t_schur=0;
    double t_contr=0;
    double t_int_0=0;
    double t_int_1=0;
    double t_int_2=0;
    double t_int_3=0;
    // NOTE(review): t0 is not read again in this function.
    double t0    = usecond();
    // Number of tiles per direction: N/block rounded up.
    int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0);
    int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0);
    for(int i=0;i<N_i;i+=block)
    for(int j=0;j<N_j;j+=block)
    {
        ///////////////////////////////////////////////////////////////
        // Get the W and V vectors for this block^2 set of terms
        ///////////////////////////////////////////////////////////////
        // The last tile in each direction may be smaller than `block`.
        int N_ii = MIN(N_i-i,block);
        int N_jj = MIN(N_j-j,block);
        // NOTE(review): nothing runs between these two timer calls, so
        // t_schur only measures the timer overhead.
        t_schur-=usecond();
        t_schur+=usecond();
        // 1-based running tile number out of NBlock_i*NBlock_j, followed by
        // the inclusive index ranges covered by this tile.
        LOG(Message) << "Meson field block " 
                    << j/block + NBlock_j*i/block + 1 
                    << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                    << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                    << std::endl;
        // Tile result, indexed (momentum, gamma, time, i within tile,
        // j within tile).
        // NOTE(review): mfBlock is filled below but not read afterwards in
        // this function; it goes out of scope at the end of the tile.
        Eigen::Tensor<ComplexD,5> mfBlock(nmom,ngamma,nt,N_ii,N_jj);
        ///////////////////////////////////////////////////////////////
        // Series of cache blocked chunks of the contractions within this block
        /////////////////////////////////////////////////////////////// 
        for(int ii=0;ii<N_ii;ii+=cacheBlock)
        for(int jj=0;jj<N_jj;jj+=cacheBlock)
        {
            int N_iii = MIN(N_ii-ii,cacheBlock);
            int N_jjj = MIN(N_jj-jj,cacheBlock);
            Eigen::Tensor<ComplexD,5> mfCache(nmom,ngamma,nt,N_iii,N_jjj);    
            // makeBlock receives pointers to the first w and v vector of the
            // sub-tile, the phases, the index env().getNd() - 1 and the four
            // internal timers; t_contr brackets the whole call.
            t_contr-=usecond();
            makeBlock(mfCache, &w[i+ii], &v[j+jj], gammas, ph, 
                      env().getNd() - 1, t_int_0, t_int_1, t_int_2, t_int_3);
            t_contr+=usecond();
            // flops for general N_c & N_s
            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
                +  vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
            // Copy the sub-tile result into its position inside the tile.
            MODULE_TIMER("Cache copy");
            for(int iii=0;iii< N_iii;iii++)
            for(int jjj=0;jjj< N_jjj;jjj++)
            for(int m =0;m< nmom;m++)
            for(int g =0;g< ngamma;g++)
            for(int t =0;t< nt;t++)
            {
                mfBlock(m,g,t,ii+iii,jj+jjj) = mfCache(m,g,t,iii,jjj);
            }
        }
    }
    // Rates use only the first two makeBlock timers; t_contr, t_int_2 and
    // t_int_3 are accumulated but not reported here.
    // NOTE(review): the 1.0e3 divisor presumably converts per-microsecond
    // counts to G/s -- confirm the unit returned by usecond().
    double nodes    = env().getGrid()->NodeCount();
    double t_kernel = t_int_0 + t_int_1;
    LOG(Message) << "Perf " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node "  << std::endl;
    LOG(Message) << "Perf " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node "  << std::endl;
 }
 //////////////////////////////////////////////////////////////////////////////////
 // Cache blocked arithmetic routine
 // Could move to Grid ???
 //////////////////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat, 
+void TA2AMesonField<FImpl>::makeBlock(Eigen::Tensor<ComplexD,5> &mat, 
 					 const LatticeFermion *lhs_wi,
 					 const LatticeFermion *rhs_vj,
 					 std::vector<Gamma::Algebra> gammas,
@@ -316,143 +468,6 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
    t3+=usecond();
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Previous version of execute(): computes A2A meson field blocks from the
 // vector sets named by par().w (index i) and par().v (index j), tiling the
 // (i,j) index space by schurBlock and then by cacheBlock, and calling
 // MesonField() on every cache-sized sub-tile. All momentum phases are set
 // to 1 here, i.e. only zero momentum is computed.
 template <typename FImpl>
 void TA2AMesonField<FImpl>::execute(void)
 {
  LOG(Message) << "Computing A2A meson field" << std::endl;
  auto &v = envGet(std::vector<FermionField>, par().v);
  auto &w = envGet(std::vector<FermionField>, par().w);
  // 2+6+4+4 = 16 gammas
  // Ordering defined here
  std::vector<Gamma::Algebra> gammas ( {
    Gamma::Algebra::Gamma5,
    Gamma::Algebra::Identity,    
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
    Gamma::Algebra::GammaXGamma5,
    Gamma::Algebra::GammaYGamma5,
    Gamma::Algebra::GammaZGamma5,
    Gamma::Algebra::GammaTGamma5,
    Gamma::Algebra::SigmaXY,
    Gamma::Algebra::SigmaXZ,
    Gamma::Algebra::SigmaXT,
    Gamma::Algebra::SigmaYZ,
    Gamma::Algebra::SigmaYT,
    Gamma::Algebra::SigmaZT
  });
  ///////////////////////////////////////////////
  // Square assumption for now Nl = Nr = N
  ///////////////////////////////////////////////
  int nt  = env().getDim(Tp);
  int nx  = env().getDim(Xp);
  int ny  = env().getDim(Yp);
  int nz  = env().getDim(Zp);
  int N_i = w.size();
  int N_j = v.size();
  int ngamma = gammas.size();
  int schurBlock = par().schurBlock;
  int cacheBlock = par().cacheBlock;
  // Number of momenta is read from the Nmom parameter.
  int nmom       = par().Nmom;
  // NOTE(review): corr is not read again in this function.
  std::vector<ComplexD> corr(nt,ComplexD(0.0));
  ///////////////////////////////////////////////
  // Momentum setup
  ///////////////////////////////////////////////
  GridBase *grid = env().getGrid();
  std::vector<LatticeComplex> phases(nmom,grid);
  for(int m=0;m<nmom;m++)
  {
    phases[m] = Complex(1.0);    // All zero momentum for now
  }
  LOG(Message) << "MesonField size " << N_i << "x" << N_j << "x" << nt << std::endl;
  //////////////////////////////////////////////////////////////////////////
  // i,j   is first  loop over SchurBlock factors reusing 5D matrices
  // ii,jj is second loop over cacheBlock factors for high perf contraction
  // iii,jjj are loops within cacheBlock
  // Total index is sum of these  i+ii+iii etc...
  //////////////////////////////////////////////////////////////////////////
  double flops = 0.0;
  double bytes = 0.0;
  // Lattice volume as the product of the four extents.
  double vol   = nx*ny*nz*nt;
  double t_schur=0;
  double t_contr=0;
  double t_int_0=0;
  double t_int_1=0;
  double t_int_2=0;
  double t_int_3=0;
  // NOTE(review): t0 is not read again in this function.
  double t0    = usecond();
  // Number of tiles per direction: N/schurBlock rounded up.
  int NBlock_i = N_i/schurBlock + (((N_i % schurBlock) != 0) ? 1 : 0);
  int NBlock_j = N_j/schurBlock + (((N_j % schurBlock) != 0) ? 1 : 0);
  for(int i=0;i<N_i;i+=schurBlock)
  for(int j=0;j<N_j;j+=schurBlock)
  {
    ///////////////////////////////////////////////////////////////
    // Get the W and V vectors for this schurBlock^2 set of terms
    ///////////////////////////////////////////////////////////////
    // The last tile in each direction may be smaller than schurBlock.
    int N_ii = MIN(N_i-i,schurBlock);
    int N_jj = MIN(N_j-j,schurBlock);
    // NOTE(review): nothing runs between these two timer calls.
    t_schur-=usecond();
    t_schur+=usecond();
    LOG(Message) << "Meson field block " 
                 << j/schurBlock + NBlock_j*i/schurBlock + 1 
                 << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                 << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                 << std::endl;
    // Tile result, indexed (momentum, gamma, time, i within tile,
    // j within tile). It is filled below but not read afterwards here.
    Eigen::Tensor<ComplexD,5> mesonFieldBlocked(nmom,ngamma,nt,N_ii,N_jj);
    ///////////////////////////////////////////////////////////////
    // Series of cache blocked chunks of the contractions within this SchurBlock
    /////////////////////////////////////////////////////////////// 
    for(int ii=0;ii<N_ii;ii+=cacheBlock)
    for(int jj=0;jj<N_jj;jj+=cacheBlock)
    {
      int N_iii = MIN(N_ii-ii,cacheBlock);
      int N_jjj = MIN(N_jj-jj,cacheBlock);
      Eigen::Tensor<ComplexD,5> mesonFieldCache(nmom,ngamma,nt,N_iii,N_jjj);    
      // t_contr brackets the whole MesonField call; the four t_int_*
      // timers are updated inside it.
      t_contr-=usecond();
      MesonField(mesonFieldCache, &w[i+ii], &v[j+jj], gammas, phases,Tp,
                 t_int_0,t_int_1,t_int_2,t_int_3);
      t_contr+=usecond();
      // flops for general N_c & N_s
      flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
      bytes  += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
                  +  vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
      // Copy the sub-tile result into its position inside the tile.
      MODULE_TIMER("Cache copy");
      for(int iii=0;iii< N_iii;iii++)
      for(int jjj=0;jjj< N_jjj;jjj++)
      for(int m =0;m< nmom;m++)
      for(int g =0;g< ngamma;g++)
      for(int t =0;t< nt;t++)
      {
        mesonFieldBlocked(m,g,t,ii+iii,jj+jjj) = mesonFieldCache(m,g,t,iii,jjj);
      }
    }
  }
  // Rates use only the first two MesonField timers.
  double nodes=grid->NodeCount();
  double t_kernel = t_int_0 + t_int_1;
  LOG(Message) << "Perf " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node "  << std::endl;
  LOG(Message) << "Perf " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node "  << std::endl;
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE