Meson field test

2025-07-31 11:47:06 +01:00 · 2024-10-18 15:42:30 +00:00
parent a78a61d76f
commit 5cc4f3241d
3 changed files with 203 additions and 64 deletions
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -522,11 +522,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
  int ostride=grid->_ostride[orthogdim];
  //Reduce Data down to lvSum
  RealD t_sum =-usecond();
  sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
  t_sum +=usecond();
  // Sum across simd lanes in the plane, breaking out orthog dir.
  Coordinate icoor(Nd);
  RealD t_rest =-usecond();
  for(int rt=0;rt<rd;rt++){
    extract(lvSum[rt],extracted);
@@ -556,6 +559,9 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
  t_rest +=usecond();
  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object> 
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -6,6 +6,34 @@ NAMESPACE_BEGIN(Grid);
 #undef DELTA_F_EQ_2
 ///////////////////////////////////////////////////////////////////
 //Meson 
 // Interested in
 //
 //      sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
 //
 // Conventional meson field:
 //                 
 //    = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)|  G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
 //    = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty) (x)|G| v_j(y,ty) >
 //    = sum_ij PI_ji(tx) PI_ij(ty)
 //
 // G5-Hermiticity
 //
 //      sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
 //    = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
 //    = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)|  G g5 sum_i   (|v_j(y,ty)> <w_i(x,tx)|)^dag ]      --  (*)
 //
 // NB:  Dag applies to internal indices spin,colour,complex
 //
 //    = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)|  G g5  |w_i(x,tx)> <v_i(y,ty)| ]
 //    = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)|  G g5 |w_i(x,tx)> 
 //    = sum_ij  PionVV(ty) PionWW(tx)
 //
 // (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker
 //     expectation value. Otherwise biased.
 ////////////////////////////////////////////////////////////////////
 template <typename FImpl>
 class A2Autils 
 {
@@ -26,7 +54,9 @@ public:
  typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;
-  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  
  // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
  template <typename TensorType> 
  static void MesonField(TensorType &mat, 
 			 const FermionField *lhs_wi,
 			 const FermionField *rhs_vj,
@@ -34,6 +64,14 @@ public:
 			 const std::vector<ComplexField > &mom,
 			 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
  template <typename TensorType> 
  static void MesonFieldGPU(TensorType &mat, 
 			 const FermionField *lhs_wi,
 			 const FermionField *rhs_vj,
 			 std::vector<Gamma::Algebra> gammas,
 			 const std::vector<ComplexField > &mom,
 			 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
  /*
  static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat, 
 			     const FermionField *wi,
 			     const FermionField *vj,
@@ -58,6 +96,7 @@ public:
 			  const FermionField *vi,
 			  const FermionField *vj,
 			  int orthogdim);
  */
  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
  static void AslashField(TensorType &mat, 
@@ -159,14 +198,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  int MFlvol = ld*Lblock*Rblock*Nmom;
  std::vector<SpinMatrix_v > lvSum(MFrvol);
-  thread_for( r, MFrvol,{
+  for(int r=0;r<MFrvol;r++){
    lvSum[r] = Zero();
-  });
+  }
  std::vector<SpinMatrix_s > lsSum(MFlvol);             
-  thread_for(r,MFlvol,{
+  for(int r=0;r<MFlvol;r++){
    lsSum[r]=scalar_type(0.0);
-  });
+  }
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
@@ -174,7 +213,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  // potentially wasting cores here if local time extent too small
  if (t_kernel) *t_kernel = -usecond();
-  thread_for(r,rd,{
+  for(int r=0;r<rd;r++) {
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
@@ -213,10 +252,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	}
      }
    }
-  });
+  };
  // Sum across simd lanes in the plane, breaking out orthog dir.
-  thread_for(rt,rd,{
+  for(int rt=0;rt<rd;rt++){
    Coordinate icoor(Nd);
    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               
@@ -241,7 +280,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
      }
    }}}
-  });
+  }
  if (t_kernel) *t_kernel += usecond();
  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Ngamma);
@@ -290,35 +329,115 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  if (t_gsum) *t_gsum += usecond();
 }
 const int A2Ablocking=8;
 template<typename vtype> using iVecSpinMatrix = iVector<iMatrix<iScalar<vtype>, Ns>, A2Ablocking>;
 typedef iVecSpinMatrix<Complex  >             VecSpinMatrix;
 typedef iVecSpinMatrix<vComplex >             vVecSpinMatrix;
 typedef Lattice<vVecSpinMatrix>               LatticeVecSpinMatrix;
-///////////////////////////////////////////////////////////////////
+template <class FImpl>
-//Meson 
+template <typename TensorType>
-// Interested in
+void A2Autils<FImpl>::MesonFieldGPU(TensorType &mat, 
-//
+				    const FermionField *lhs_wi,
-//      sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
+				    const FermionField *rhs_vj,
-//
+				    std::vector<Gamma::Algebra> gammas,
-// Conventional meson field:
+				    const std::vector<ComplexField > &mom,
-//                 
+				    int orthogdim, double *t_kernel, double *t_gsum) 
-//    = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)|  G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
+{
-//    = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty) (x)|G| v_j(y,ty) >
+  const int block=A2Ablocking;
-//    = sum_ij PI_ji(tx) PI_ij(ty)
+  typedef typename FImpl::SiteSpinor vobj;
 //
 // G5-Hermiticity
 //
 //      sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
 //    = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
 //    = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)|  G g5 sum_i   (|v_j(y,ty)> <w_i(x,tx)|)^dag ]      --  (*)
 //
 // NB:  Dag applies to internal indices spin,colour,complex
 //
 //    = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)|  G g5  |w_i(x,tx)> <v_i(y,ty)| ]
 //    = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)|  G g5 |w_i(x,tx)> 
 //    = sum_ij  PionVV(ty) PionWW(tx)
 //
 // (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker
 //     expectation value. Otherwise biased.
 ////////////////////////////////////////////////////////////////////
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Lblock = mat.dimension(3); 
  int Rblock = mat.dimension(4);
  assert(Lblock % block==0);
  //  assert(Rblock % block==0);
  GridBase *grid = lhs_wi[0].Grid();
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  int Nt     = grid->GlobalDimensions()[orthogdim];
  int Ngamma = gammas.size();
  int Nmom   = mom.size();
  LatticeVecSpinMatrix SpinMat(grid);
  LatticeVecSpinMatrix MomSpinMat(grid);
  RealD t_afor = 0.0;
  RealD t_sum  = 0.0;
  RealD t_pha  = 0.0;
  RealD t_trace= 0.0;
  uint64_t ncall=0;
  std::vector<VecSpinMatrix> sliced;
  for(int i=0;i<Lblock;i++){
    autoView(SpinMat_v,SpinMat,AcceleratorWrite);
    autoView(lhs_v,lhs_wi[i],AcceleratorRead);
    for(int jo=0;jo<Rblock;jo+=block){
      for(int j=jo;j<MIN(Rblock,jo+block);j++){
 	int jj=j%block;
 	autoView(rhs_v,rhs_vj[j],AcceleratorRead); // Create a vector of views
 	//////////////////////////////////////////
 	// Should write a SpinOuterColorTrace
 	//////////////////////////////////////////
 	std::cout <<GridLogMessage << "accelerator_for for i="<<i<<" j="<<j<<std::endl;
 	t_afor-=usecond();
 	accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
 	    auto left = conjugate(lhs_v(ss));
 	    auto right = rhs_v(ss);
 	    auto vv =  SpinMat_v(ss);
 	    for(int s1=0;s1<Ns;s1++){
 	      for(int s2=0;s2<Ns;s2++){
 		vv(jj)(s1,s2)() = left()(s2)(0) * right()(s1)(0)
 		  +               left()(s2)(1) * right()(s1)(1)
 		  +               left()(s2)(2) * right()(s1)(2);
 	      }}
 	    coalescedWrite(SpinMat_v[ss],vv);
 	  });
      }// j within block
      t_afor+=usecond();
      // After getting the sitewise product do the mom phase loop
      for(int m=0;m<Nmom;m++){
 	std::cout <<GridLogMessage << "mom "<<m<<" i="<<i<<" jo="<<jo<<std::endl;
 	t_pha-=usecond();
 	MomSpinMat   = SpinMat * mom[m];
 	t_pha+=usecond();
 	t_sum-=usecond();
 	ncall++;
 	sliceSum(MomSpinMat,sliced,orthogdim);
 	t_sum+=usecond();
 	t_trace-=usecond();
 	for(int mu=0;mu<Ngamma;mu++){
 	  std::cout <<GridLogMessage << "trace Gamma "<<mu<<std::endl;
 	  for(int t=0;t<sliced.size();t++){
 	    for(int j=jo;j<MIN(Rblock,jo+block);j++){
 	      int jj=j%block;
 	      auto tmp = sliced[t](jj);
 	      auto trSG = trace(tmp*Gamma(gammas[mu]));
 	      mat(m,mu,t,i,j) = trSG()();
 	    }
 	  }
 	}
 	t_trace+=usecond();
      }
    }//jo
  }
  std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_afor  "<<t_afor<<" us"<<std::endl;
  std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_pha   "<<t_pha<<" us"<<std::endl;
  std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_sum   "<<t_sum<<" us"<<std::endl;
  std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU N_sum   "<<ncall<<" calls"<<std::endl;
  std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_trace "<<t_trace<<" us"<<std::endl;
 }
 /*
 template<class FImpl>
 void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat, 
 				  const FermionField *wi,
@@ -645,6 +764,7 @@ void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
  const int nog5=0;
  PionFieldXX(mat,vi,vj,orthogdim,nog5);
 }
 */
 // "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
 //
@@ -1062,7 +1182,6 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
  }
  for (int t = 0; t < N_t; t++){
    std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
    buf = WW_sd[t];
    thread_for(ss,grid->oSites(),{
      for(int d_o=0;d_o<N_d;d_o+=d_unroll){
--- a/tests/Test_meson_field.cc
+++ b/tests/Test_meson_field.cc
@@ -56,13 +56,9 @@ int main(int argc, char *argv[])
  // MesonField lhs and rhs vectors
  std::vector<FermionField> phi(VDIM,&grid);
  std::vector<FermionField> rho(VDIM,&grid);
  FermionField rho_tmp(&grid);
  std::cout << GridLogMessage << "Initialising random meson fields" << std::endl;
  for (unsigned int i = 0; i < VDIM; ++i){
    random(pRNG,phi[i]);
    random(pRNG,rho_tmp); //ideally only nonzero on t=0
    rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0
  }
  std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl;
@@ -82,7 +78,7 @@ int main(int argc, char *argv[])
 	  {1.,1.,1.},
 	  {2.,0.,0.}
  };
-
+  // 5 momenta x VDIMxVDIM = 125 calls (x 16 spins) 1.4s => 1400/125 ~10ms per call
  std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl;
  std::cout << GridLogMessage << "Computing complex phases" << std::endl;
@@ -102,27 +98,46 @@ int main(int argc, char *argv[])
  std::cout << GridLogMessage << "Computing complex phases done." << std::endl;
  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
-  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
+  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp_gpu(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
  Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
  // timer
  double start,stop;
  //execute meson field routine
  std::cout << GridLogMessage << "Meson Field Warmup Begin" << std::endl;
  A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
  std::cout << GridLogMessage << "Meson Field Timing Begin" << std::endl;
  start = usecond();
  A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
  stop = usecond();
  std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl;
  std::cout << GridLogMessage << "Meson Field GPU Warmup Begin" << std::endl;
  A2Autils<WilsonImplR>::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
  std::cout << GridLogMessage << "Meson Field GPU Timing Begin" << std::endl;
  start = usecond();
-  /* Ideally, for this meson field we could pass TSRC (even better a list of timeslices)
+  A2Autils<WilsonImplR>::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
   * to the routine so that all the compnents which are predictably equal to zero are not computed. */
  A2Autils<WilsonImplR>::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp);
  stop = usecond();
-  std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl;
+  std::cout << GridLogMessage << "M_gpu(phi,phi) created, execution time " << stop-start << " us" << std::endl;
-  start = usecond();
+
-  A2Autils<WilsonImplR>::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp);
+  for(int mom=0;mom<momenta.size();mom++){
-  stop = usecond();
+    for(int mu=0;mu<Gmu.size();mu++){
-  std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl;
+      for(int t=0;t<Nt;t++){
 	for(int v=0;v<VDIM;v++){
 	  for(int w=0;w<VDIM;w++){
 	    std::cout << GridLogMessage
 		      << " " << mom
 		      << " " << mu
 		      << " " << t
 		      << " " << v
 		      << " " << w
 		      << " " << Mpp_gpu(mom,mu,t,v,w)
 		      << " " << Mpp(mom,mu,t,v,w) << std::endl;
 	  }
 	}
      }
    }
  }
  std::string FileName = "Meson_Fields";
 #ifdef HAVE_HDF5
@@ -134,12 +149,11 @@ int main(int argc, char *argv[])
  using Default_Writer = Grid::BinaryWriter;
  FileName.append(".bin");
 #endif
-
+  {
    Default_Writer w(FileName);
    write(w,"phi_phi",Mpp);
-  write(w,"phi_rho",Mpr);
+    write(w,"phi_phi_gpu",Mpp_gpu);
-  write(w,"rho_rho",Mrr);
+  }
  // epilogue
  std::cout << GridLogMessage << "Grid is finalizing now" << std::endl;
  Grid_finalize();