Staggered coalesced read

2025-11-24 08:29:32 +00:00 · 2021-03-29 20:01:15 +02:00
parent 8bdadbadac
commit bb89a82a07
3 changed files with 82 additions and 45 deletions
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -72,19 +72,23 @@ public:
  StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
-  static accelerator_inline void multLink(SiteSpinor &phi,
+  template<class _Spinor>
  static accelerator_inline void multLink(_Spinor &phi,
 		       const SiteDoubledGaugeField &U,
-		       const SiteSpinor &chi,
+		       const _Spinor &chi,
 		       int mu)
  {
-    mult(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
    mult(&phi(), &UU, &chi());
  }
-  static accelerator_inline void multLinkAdd(SiteSpinor &phi,
+  template<class _Spinor>
  static accelerator_inline void multLinkAdd(_Spinor &phi,
 			  const SiteDoubledGaugeField &U,
-			  const SiteSpinor &chi,
+			  const _Spinor &chi,
 			  int mu)
  {
-    mac(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
    mac(&phi(), &UU, &chi());
  }
  template <class ref>
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -184,18 +184,22 @@ public:
      mat = TraceIndex<SpinIndex>(P); 
    }
-    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
    {
      for (int mu = 0; mu < Nd; mu++)
      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
    }
-
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+  {
-      
+#undef USE_OLD_INSERT_FORCE    
    int Ls=Btilde.Grid()->_fdimensions[0];
    autoView( mat_v , mat, AcceleratorWrite);
 #ifdef USE_OLD_INSERT_FORCE    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
      const int Nsimd = SiteSpinor::Nsimd();
      autoView( tmp_v , tmp, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
@@ -208,6 +212,29 @@ public:
 	});
    }
    PokeIndex<LorentzIndex>(mat,tmp,mu);
 #else
    {
      const int Nsimd = SiteSpinor::Nsimd();
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
 	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
 	  zeroit(sum);  
 	  for(int s=0;s<Ls;s++){
 	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
  	      auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF]()(spn) );
 	      auto op = outerProduct(bb,aa);
  	      sum = sum + op;
 	    }
 	  }
  	  coalescedWrite(mat_v[sU](mu)(), sum);
      });
    }
 #endif    
  }
 };
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid);
 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
+    int perm= SE->_permute;						\
-      chi_p = &chi;						\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
      permute(chi,  in[SE->_offset], ptype);			\
    } else {							\
      chi_p = &in[SE->_offset];					\
    }								\
  } else {							\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  multLink(Uchi, U[sU], *chi_p, Dir);			
+  acceleratorSynchronise();					\
  multLink(Uchi, U[sU], chi, Dir);			
 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
+    int perm= SE->_permute;						\
-      chi_p = &chi;						\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
      permute(chi,  in[SE->_offset], ptype);			\
    } else {							\
      chi_p = &in[SE->_offset];					\
    }								\
  } else if ( st.same_node[Dir] ) {				\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane);                 \
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }
 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }
 template <class Impl>
@@ -84,12 +77,14 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
 					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
-  const SiteSpinor *chi_p;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  SiteSpinor chi;
+  calcSpinor chi;
-  SiteSpinor Uchi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  //  for(int s=0;s<LLs;s++){
  //
@@ -118,7 +113,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
    if ( dag ) { 
      Uchi = - Uchi;
    } 
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };
@@ -130,13 +125,16 @@ template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
+						const FermionFieldView &in, FermionFieldView &out,int dag)
-  const SiteSpinor *chi_p;
+{
-  SiteSpinor chi;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  SiteSpinor Uchi;
+  calcSpinor chi;
  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew ;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -165,7 +163,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
    if ( dag ) {
      Uchi = - Uchi;
    }
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };
@@ -178,14 +176,17 @@ template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
+						const FermionFieldView &in, FermionFieldView &out,int dag)
-  const SiteSpinor *chi_p;
+{
-  //  SiteSpinor chi;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  SiteSpinor Uchi;
+  calcSpinor chi;
  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int nmu=0;
  int skew ;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -211,11 +212,12 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
    }
-    if ( nmu ) { 
+    if ( nmu ) {
-      if ( dag ) { 
+      auto _out = coalescedRead(out[sF],lane);
-	out[sF] = out[sF] - Uchi;
+      if ( dag ) {
 	coalescedWrite(out[sF], _out-Uchi,lane);
      } else { 
-	out[sF] = out[sF] + Uchi;
+	coalescedWrite(out[sF], _out+Uchi,lane);
      }
    }
  }
@@ -261,6 +263,8 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v , UUU, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
@@ -301,6 +305,8 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v ,   U, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);