Finally starting to get decent performance on Volta

2025-11-17 03:59:30 +00:00 · 2018-07-13 12:06:18 -04:00
parent 2cc07450f4
commit b2b5137d28
5 changed files with 203 additions and 98 deletions
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -215,6 +215,14 @@ public:
    mult(&phi(), &U(mu), &chi());
  }
      
+#ifdef GPU_VEC
+  static accelerator_inline void copyLinkGpu(int lane,                 // lane index handed to extractLane/insertLane
+					     SiteDoubledGaugeField & UU,   // destination, written through insertLane
+					     const SiteDoubledGaugeField &U) // source, read through extractLane
+  { // Per-lane copy: take lane 'lane' out of U and insert it into the same lane of UU.
+    auto U_l   = extractLane(lane,U);
+    insertLane(lane,UU,U_l);
+  }
  static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -224,6 +232,17 @@ public:
    auto U_l   = extractLane(lane,U(mu));
    phi() =  U_l * chi();
  }
+#else
+  static accelerator_inline void multLinkGpu(int lane,               // not used by this non-GPU_VEC variant
+					     SiteHalfSpinor &phi,        // output: receives U(mu) * chi
+					     const SiteDoubledGaugeField &U,
+					     const SiteHalfSpinor &chi,  // input half spinor
+					     int mu)                     // selects which link U(mu) is applied
+  { // Whole-object form of multLinkGpu: no extractLane, operates on the full site objects.
+    auto U_l   = U(mu);      // by-value local of the selected link
+    phi() =  U_l * chi();
+  }
+#endif
    
  static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
@@ -364,7 +383,13 @@ public:
    }
    mult(&phi(), &UU(), &chi());
  }
-
+#ifdef GPU_VEC
+  static accelerator_inline void copyLinkGpu(int lane,                 // not used by this variant
+					     SiteDoubledGaugeField & UU,   // destination, overwritten in full
+					     const SiteDoubledGaugeField &U) // source
+  { // This implementation copies the whole site object instead of a single lane.
+    UU = U;
+  }
  static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -374,6 +399,17 @@ public:
    auto U_l   = U(mu);
    phi() =  U_l * chi();
  }
+#else
+  static accelerator_inline void multLinkGpu(int lane,               // not used by this non-GPU_VEC variant
+					     SiteHalfSpinor &phi,        // output: receives U(mu) * chi
+					     const SiteDoubledGaugeField &U,
+					     const SiteHalfSpinor &chi,  // input half spinor
+					     int mu)                     // selects which link U(mu) is applied
+  { // Whole-object form of multLinkGpu: operates on the full site objects.
+    auto U_l   = U(mu);      // by-value local of the selected link
+    phi() =  U_l * chi();
+  }
+#endif

  static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -104,6 +104,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -121,6 +122,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
  vstream(out[sF], result);
+#endif
 };

 template <class Impl>
@@ -128,6 +130,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
 						      SiteHalfSpinor *buf, int sF,
 						      int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -145,6 +148,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
  vstream(out[sF], result);
+#endif
 };
  ////////////////////////////////////////////////////////////////////
  // Interior kernels
@@ -154,6 +158,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  Do
 							    SiteHalfSpinor *buf, int sF,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -172,6 +177,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  Do
  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
  vstream(out[sF], result);
+#endif
 };

 template <class Impl>
@@ -179,6 +185,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  Doubl
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -196,6 +203,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  Doubl
  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
  vstream(out[sF], result);
+#endif
 };
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
@@ -205,6 +213,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  Do
 							    SiteHalfSpinor *buf, int sF,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifndef GRID_NVCC
  //  SiteHalfSpinor tmp;
  //  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -225,6 +234,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  Do
  if ( nmu ) { 
    out[sF] = out[sF] + result; 
  }
+#endif
 };

 template <class Impl>
@@ -232,6 +242,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  Doubl
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifndef GRID_NVCC
  //  SiteHalfSpinor tmp;
  //  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -252,12 +263,14 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  Doubl
  if ( nmu ) { 
    out[sF] = out[sF] + result; 
  }
+#endif
 };

 template <class Impl>
 accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
 {
+#ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteSpinor result;
@@ -275,6 +288,7 @@ accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFie
  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
  vstream(out[sF], result);
+#endif
 }

 /*******************************************************************************
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -54,59 +54,11 @@ public:

  static void Dhop(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 		   int Ls, int Nsite, const FermionField &in, FermionField &out,
-		   int interior=1,int exterior=1) 
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  //	  uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   sF = cur;         cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v,buf,sF,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
+		   int interior=1,int exterior=1) ;
  static void DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 		      int Ls, int Nsite, const FermionField &in, FermionField &out,
-		      int interior=1,int exterior=1) 
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
+		      int interior=1,int exterior=1) ;

-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  // uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   sF = cur;         cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,sF,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
   
  template <bool EnableBool = true> static accelerator
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
@@ -275,11 +227,11 @@ public:

 private:
  // Specialised variants
-  static accelerator void GpuDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-				      int sF,  int sU, const FermionFieldView &in, FermionFieldView &out);
+  static accelerator_inline void GpuDhopSite(StencilView &st,  SiteDoubledGaugeField &U, SiteHalfSpinor * buf,
+					     int Ls, int sF,  int sU, const FermionFieldView &in, FermionFieldView &out);
  
-  static accelerator void GpuDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					 int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  static accelerator_inline void GpuDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
 					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
--- a/lib/qcd/action/fermion/WilsonKernelsGpu.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsGpu.cc
@@ -57,30 +57,51 @@ accelerator_inline int get_my_lane_offset(int Nsimd)
 #endif
 }

-
+#ifdef GPU_VEC
 #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  synchronise();							\
  if (SE->_is_local) {							\
    int mask = Nsimd >> (ptype + 1);					\
    int plane= SE->_permute ? (lane ^ mask) : lane;			\
-    auto in_l = extractLane(plane,in[SE->_offset]);			\
+    auto in_l = extractLane(plane,in[SE->_offset+s]);			\
    spProj(chi,in_l);							\
  } else {								\
-    chi  = extractLane(lane,buf[SE->_offset]);				\
+    chi  = extractLane(lane,buf[SE->_offset+s]);			\
  }									\
  synchronise();
+#else 
+#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
+  if (SE->_is_local) {							\
+    auto in_t = in[SE->_offset+s];					\
+    if (SE->_permute) {							\
+      spProj(tmp, in_t);						\
+      permute(chi, tmp, ptype);						\
+    } else {								\
+      spProj(chi, in_t);						\
+    }									\
+  } else {								\
+    chi  = buf[SE->_offset+s];						\
+  }									\
+  synchronise();
+#endif

 template <class Impl>
-accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-						     SiteHalfSpinor *buf, int sF,
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
+						     SiteHalfSpinor *buf, int Ls, int s,
 						     int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
+#else 
+  SiteHalfSpinor chi;
+  SiteHalfSpinor Uchi;
+  SiteHalfSpinor tmp;
+  SiteSpinor   result;
+#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
-
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset= get_my_lane_offset(Nsimd);
@@ -88,69 +109,80 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau

  StencilEntry *SE;
  int ptype;
-
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;
 #ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
 #else
  int lane = lane_offset; {
 #endif
-    SE = st.GetEntry(ptype, Xp, sF);
+    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); 
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXp(result, Uchi);

-    SE = st.GetEntry(ptype, Yp, sF);
+    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYp(result, Uchi);
      
-    SE = st.GetEntry(ptype, Zp, sF);
+    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZp(result, Uchi);

-    SE = st.GetEntry(ptype, Tp, sF);
+    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTp(result, Uchi);

-    SE = st.GetEntry(ptype, Xm, sF);
+    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXm(result, Uchi);

-    SE = st.GetEntry(ptype, Ym, sF);
+    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYm(result, Uchi);


-    SE = st.GetEntry(ptype, Zm, sF);
+    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZm(result, Uchi);

-    SE = st.GetEntry(ptype, Tm, sF);
+    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTm(result, Uchi);

    synchronise();
+#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
+#else
+  vstream(out[sF], result);
+#endif
  }
 }

 template <class Impl>
-accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
-						  SiteHalfSpinor *buf, int sF,
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
+						  SiteHalfSpinor *buf,  int Ls, int s,
 						  int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
+#else 
+  SiteHalfSpinor chi;
+  SiteHalfSpinor Uchi;
+  SiteHalfSpinor tmp;
+  SiteSpinor   result;
+#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
-
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset= get_my_lane_offset(Nsimd);
@@ -158,54 +190,62 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF

  StencilEntry *SE;
  int ptype;
+  // Forces some degree of coalesce on the table look ups
+  // Could also use wide load instructions on the data structure
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;

 #ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
 #else
  int lane = lane_offset; {
 #endif
-    SE = st.GetEntry(ptype, Xp, sF);
+    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
    spReconXm(result, Uchi);

-    SE = st.GetEntry(ptype, Yp, sF);
+    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
    accumReconYm(result, Uchi);
      
-    SE = st.GetEntry(ptype, Zp, sF);
+    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
    accumReconZm(result, Uchi);

-    SE = st.GetEntry(ptype, Tp, sF);
+    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
    accumReconTm(result, Uchi);

-    SE = st.GetEntry(ptype, Xm, sF);
+    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
    accumReconXp(result, Uchi);

-    SE = st.GetEntry(ptype, Ym, sF);
+    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
    accumReconYp(result, Uchi);

-    SE = st.GetEntry(ptype, Zm, sF);
+    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
    accumReconZp(result, Uchi);

-    SE = st.GetEntry(ptype, Tm, sF);
+    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);

    synchronise();
+#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
+#else
+  vstream(out[sF], result);
+#endif
  }

 };
@@ -213,20 +253,20 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
 // Template specialise Gparity to empty for now
 #define GPU_EMPTY(A)							\
  template <>								\
-accelerator void							\
+accelerator_inline void							\
 WilsonKernels<A>::GpuDhopSite(StencilView &st,				\
-			      DoubledGaugeFieldView &U,			\
-			      SiteHalfSpinor *buf, int sF,		\
+			      SiteDoubledGaugeField &U,			\
+			      SiteHalfSpinor *buf, int Ls, int sF,	\
 			      int sU,					\
 			      const FermionFieldView &in,		\
 			      FermionFieldView &out) { assert(0);};	\
  template <>								\
-  accelerator void							\
+  accelerator_inline void							\
  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,			\
-				DoubledGaugeFieldView &U,		\
-				   SiteHalfSpinor *buf, int sF,		\
-				int sU,					\
-				const FermionFieldView &in,		\
+				   DoubledGaugeFieldView &U,		\
+				   SiteHalfSpinor *buf, int Ls,int sF,	\
+				   int sU,				\
+				   const FermionFieldView &in,		\
 				   FermionFieldView &out) { assert(0);};

 GPU_EMPTY(GparityWilsonImplF);
@@ -234,6 +274,67 @@ GPU_EMPTY(GparityWilsonImplFH);
 GPU_EMPTY(GparityWilsonImplD);
 GPU_EMPTY(GparityWilsonImplDF);

+// Hopping term driver.  When Opt==OptGpu and both interior and exterior are set, GpuDhopSite is run
+// over NN = Nsite*Ls*nsimd flat indices; otherwise DhopSite is run once per gauge site sU with
+// sF = Ls*sU.  The fallback path does not use 'buf': it passes st.CommBuf() instead.
+template <class Impl>
+void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			       int Ls, int Nsite, const FermionField &in, FermionField &out,
+			       int interior,int exterior) 
+{
+    auto U_v   = U.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    auto st_v  = st.View();
+    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
+      const uint64_t nsimd = Simd::Nsimd();
+      const uint64_t    NN = Nsite*Ls*nsimd;
+      accelerator_loopN( sss, NN, {
+	  uint64_t cur  = sss / nsimd;   // strip the lane part (sss % nsimd) of the flat index
+	  uint64_t   s  = cur%Ls;        // offset inside the Ls block; the kernel forms sF = Ls*sU + s
+	  uint64_t   sU = cur / Ls;      // gauge-field site index
+	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
+      });
+    } else { 
+      accelerator_loop( ss, U_v, {
+	int sU = ss;
+        int sF = Ls * sU;
+        DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
+      });
+    }
+  }
+  // Daggered hopping term driver.  When Opt==OptGpu and both interior and exterior are set,
+  // GpuDhopSiteDag is run over NN = Nsite*Ls*nsimd flat indices; otherwise DhopSiteDag is run once
+  // per gauge site sU with sF = Ls*sU.  The fallback path passes st.CommBuf(), not 'buf'.
+  template <class Impl>
+  void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+				    int Ls, int Nsite, const FermionField &in, FermionField &out,
+				    int interior,int exterior) 
+  {
+    auto U_v   = U.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    auto st_v  = st.View();
+    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
+      const uint64_t nsimd = Simd::Nsimd();
+      const uint64_t    NN = Nsite*Ls*nsimd;
+      accelerator_loopN( sss, NN, {
+	  uint64_t cur  = sss / nsimd;   // strip the lane part (sss % nsimd) of the flat index
+	  uint64_t   s  = cur%Ls;        // offset inside the Ls block; the kernel forms sF = Ls*sU + s
+	  uint64_t   sU = cur / Ls;      // gauge-field site index
+	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
+      });
+    } else { 
+      accelerator_loop( ss, U_v, {
+	int sU = ss;
+        int sF = Ls * sU;
+        // Hand the kernel the stencil view st_v (as Dhop does for DhopSite), not the stencil object st.
+        DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
+      });
+    }
+  }
+
+
 /*
 GPU_EMPTY(DomainWallVec5dImplF);
 GPU_EMPTY(DomainWallVec5dImplFH);
--- a/lib/simd/Grid_gpu_vec.h
+++ b/lib/simd/Grid_gpu_vec.h
@@ -36,7 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

-#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
+  //#define COALESCE_GRANULARITY (64) // bytes for coalesce granularity of target: Pascal, Volta
+  //#define COALESCE_GRANULARITY (32) // bytes for coalesce granularity of target: Pascal, Volta
+  #define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta

 template<class pair>
 class GpuComplex {