Merge pull request #243 from fionnoh/feature/A2A_current_insertion

Feature/A2A current insertion
2025-10-22 00:44:45 +01:00 · 2019-10-22 13:55:53 +01:00
parent 202f025fc7 a55d0ba8fe
commit c97f780784
11 changed files with 835 additions and 483 deletions
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -317,116 +317,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  }
 }

-template<class vobj>
-static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
-{
-  // std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
-
-  typedef typename vobj::scalar_type scalar_type;
-  std::vector<scalar_type> lsSum;
-  localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
-  globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
-  // std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
-}
-
-template <class vobj>
-static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
-{
-  // std::cout << GridLogMessage << "Start prep" << std::endl;
-  typedef typename vobj::vector_type   vector_type;
-  typedef typename vobj::scalar_type   scalar_type;
-  GridBase  *grid = lhs.Grid();
-  assert(grid!=NULL);
-  conformable(grid,rhs.Grid());
-
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-  // std::cout << GridLogMessage << "Start alloc" << std::endl;
-
-  Vector<vector_type> lvSum(rd); // will locally sum vectors first
-  lsSum.resize(ld,scalar_type(0.0));                    // sum across these down to scalars
-  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  
-  // std::cout << GridLogMessage << "End alloc" << std::endl;
-
-  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
-  for(int r=0;r<rd;r++){
-    lvSum[r]=Zero();
-  }
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-  // std::cout << GridLogMessage << "End prep" << std::endl;
-  // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
-  vector_type vv;
-  auto l_v=lhs.View();
-  auto r_v=rhs.View();
-  thread_for( r,rd,{
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int ss = so + n * stride + b;
-        vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
-        lvSum[r] = lvSum[r] + vv;
-      }
-    }
-  });
-  // std::cout << GridLogMessage << "End parallel inner product" << std::endl;
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  Coordinate icoor(Nd);
-  for(int rt=0;rt<rd;rt++){
-
-    iScalar<vector_type> temp; 
-    temp._internal = lvSum[rt];
-    extract(temp,extracted);
-
-    for(int idx=0;idx<Nsimd;idx++){
-
-      grid->iCoorFromIindex(icoor,idx);
-
-      int ldx =rt+icoor[orthogdim]*rd;
-
-      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
-
-    }
-  }
-  // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
-}
-template <class vobj>
-static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
-{
-  typedef typename vobj::scalar_type scalar_type;
-  GridBase *grid = lhs.Grid();
-  int fd = result.size();
-  int ld = lsSum.size();
-  // sum over nodes.
-  std::vector<scalar_type> gsum;
-  gsum.resize(fd, scalar_type(0.0));
-  // std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
-  for(int t=0;t<fd;t++){
-    int pt = t/ld; // processor plane
-    int lt = t%ld;
-    if ( pt == grid->_processor_coor[orthogdim] ) {
-      gsum[t]=lsSum[lt];
-    }
-  }
-  // std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
-  // std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
-  grid->GlobalSumVector(&gsum[0], fd);
-  // std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
-
-  result = gsum;
-}
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -67,8 +67,21 @@ public:
        const std::vector<ComplexField> &emB1,
        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);

-  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
-			   const Eigen::Tensor<ComplexD,3> &WW_sd,
+  template <typename TensorType>
+  typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                           std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                           void>::type
+  static ContractWWVV(std::vector<PropagatorField> &WWVV,
+			   const TensorType &WW_sd,
+			   const FermionField *vs,
+			   const FermionField *vd);
+
+  template <typename TensorType>
+  typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                            std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                            void>::type
+  static ContractWWVV(std::vector<PropagatorField> &WWVV,
+			   const TensorType &WW_sd,
 			   const FermionField *vs,
 			   const FermionField *vd);

@@ -98,6 +111,11 @@ public:
 			const FermionField *vd,
 			int orthogdim);
 #endif
+private:
+  inline static void OuterProductWWVV(PropagatorField &WWVV,
+                               const vobj &lhs,
+                               const vobj &rhs,
+                               const int Ns, const int ss);
 };

 template <class FImpl>
@@ -968,9 +986,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
 // Take WW_sd v^dag_d (x) v_s
 // 

-template<class FImpl>
-void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
-				   const Eigen::Tensor<ComplexD,3> &WW_sd,
+template <class FImpl>
+template <typename TensorType>
+typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                         std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                         void>::type
+A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
+				   const TensorType &WW_sd,
 				   const FermionField *vs,
 				   const FermionField *vd)
 {
@@ -992,39 +1014,100 @@ void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
    for(int d_o=0;d_o<N_d;d_o+=d_unroll){
      for(int t=0;t<N_t;t++){
      for(int s=0;s<N_s;s++){
-	auto vs_v = vs[s].View();
-	auto tmp1 = vs_v[ss];
-	vobj tmp2 = Zero();
-	vobj tmp3 = Zero();
-	for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
-	  auto vd_v = vd[d].View();
-	  Scalar_v coeff = WW_sd(t,s,d);
-	  tmp3 = conjugate(vd_v[ss]);
-	  mac(&tmp2, &coeff, &tmp3);
-	}
+  auto vs_v = vs[s].View();
+  auto tmp1 = vs_v[ss];
+  vobj tmp2 = Zero();
+  vobj tmp3 = Zero();
+  for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+    auto vd_v = vd[d].View();
+    Scalar_v coeff = WW_sd(t,s,d);
+    tmp3 = conjugate(vd_v[ss]);
+    mac(&tmp2, &coeff, &tmp3);
+  }

-	//////////////////////////
-	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
-	//////////////////////////
-	auto WWVV_v = WWVV[t].View();
-	for(int s1=0;s1<Ns;s1++){
-	for(int s2=0;s2<Ns;s2++){
-	  WWVV_v[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
-	  WWVV_v[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
-	  WWVV_v[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
-	}}
+  //////////////////////////
+  // Fast outer product of tmp1 with tmp2; summing up to d_unroll terms into tmp2 first cuts the outer-product count by that factor
+  //////////////////////////
+  OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);

      }}
    }
  });
 }

+template <class FImpl>
+template <typename TensorType>
+typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD, 3>, TensorType>::value ||
+                          std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                          void>::type
+A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
+                              const TensorType &WW_sd,
+                              const FermionField *vs,
+                              const FermionField *vd)
+{
+  GridBase *grid = vs[0].Grid();
+
+  int nd    = grid->_ndimension;
+  int Nsimd = grid->Nsimd();
+  int N_t = WW_sd.dimensions()[0];
+  int N_s = WW_sd.dimensions()[1];
+  int N_d = WW_sd.dimensions()[2];
+
+  int d_unroll = 32;// Empirical optimisation
+
+  Eigen::Matrix<Complex, -1, -1, Eigen::RowMajor> buf;
+
+  for(int t=0;t<N_t;t++){
+    WWVV[t] = Zero();
+  }
+
+  for (int t = 0; t < N_t; t++){
+    std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
+    buf = WW_sd[t];
+    thread_for(ss,grid->oSites(),{
+      for(int d_o=0;d_o<N_d;d_o+=d_unroll){
+        for(int s=0;s<N_s;s++){
+    auto vs_v = vs[s].View();
+    auto tmp1 = vs_v[ss];
+    vobj tmp2 = Zero();
+    vobj tmp3 = Zero();
+    for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+      auto vd_v = vd[d].View();
+      Scalar_v coeff = buf(s,d);
+      tmp3 = conjugate(vd_v[ss]);
+      mac(&tmp2, &coeff, &tmp3);
+    }
+
+    //////////////////////////
+    // Fast outer product of tmp1 with tmp2; summing up to d_unroll terms into tmp2 first cuts the outer-product count by that factor
+    //////////////////////////
+    OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
+      }}
+    });
+  }
+}
+
+template <class FImpl>
+inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
+                                             const vobj &lhs,
+                                             const vobj &rhs,
+                                             const int Ns, const int ss)
+{
+  auto WWVV_v = WWVV.View();
+  for (int s1 = 0; s1 < Ns; s1++){
+    for (int s2 = 0; s2 < Ns; s2++){
+      WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
+      WWVV_v[ss]()(s1,s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
+      WWVV_v[ss]()(s1,s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
+    }
+  }
+}

 template<class FImpl>
 void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0,