Finally starting to get decent performance on Volta

2026-01-05 09:29:35 +00:00 · 2018-07-13 12:06:18 -04:00
parent 2cc07450f4
commit b2b5137d28
5 changed files with 203 additions and 98 deletions
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -215,6 +215,14 @@ public:
    mult(&phi(), &U(mu), &chi());
  }
 #ifdef GPU_VEC
  // Copies one lane of the doubled gauge link site: lane `lane` is pulled out
  // of U with extractLane and stored into the same lane of UU with insertLane.
  static accelerator_inline void copyLinkGpu(int lane,
                                             SiteDoubledGaugeField & UU,
                                             const SiteDoubledGaugeField &U)
  {
    auto lane_link = extractLane(lane,U);
    insertLane(lane,UU,lane_link);
  }
  static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -224,6 +232,17 @@ public:
    auto U_l   = extractLane(lane,U(mu));
    phi() =  U_l * chi();
  }
 #else
  // Non-GPU_VEC variant: multiplies chi by the mu component of the doubled
  // gauge link U and stores the product in phi. `lane` is accepted so the
  // signature parallels the GPU_VEC variant, but it is not used here.
  static accelerator_inline void multLinkGpu(int lane,
                                             SiteHalfSpinor &phi,
                                             const SiteDoubledGaugeField &U,
                                             const SiteHalfSpinor &chi,
                                             int mu)
  {
    auto link_mu = U(mu);
    phi() = link_mu * chi();
  }
 #endif
  static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
@@ -364,7 +383,13 @@ public:
    }
    mult(&phi(), &UU(), &chi());
  }
-
+#ifdef GPU_VEC
  // Copies the doubled gauge link site U into UU with a plain assignment.
  // `lane` is not used in this variant; the whole site object is assigned.
  static accelerator_inline void copyLinkGpu(int lane,
 					     SiteDoubledGaugeField & UU,
 					     const SiteDoubledGaugeField &U)
  {
    UU = U;
  }
  static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -374,6 +399,17 @@ public:
    auto U_l   = U(mu);
    phi() =  U_l * chi();
  }
 #else
  // Non-GPU_VEC variant: takes the mu component of the doubled gauge link U,
  // multiplies it onto chi and writes the result to phi. `lane` is unused.
  static accelerator_inline void multLinkGpu(int lane,
                                             SiteHalfSpinor &phi,
                                             const SiteDoubledGaugeField &U,
                                             const SiteHalfSpinor &chi,
                                             int mu)
  {
    auto link_mu = U(mu);
    phi() = link_mu * chi();
  }
 #endif
  static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -104,6 +104,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out)
 {
 #ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -121,6 +122,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
  vstream(out[sF], result);
 #endif
 };
 template <class Impl>
@@ -128,6 +130,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
 						      SiteHalfSpinor *buf, int sF,
 						      int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
 #ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -145,6 +148,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
  vstream(out[sF], result);
 #endif
 };
  ////////////////////////////////////////////////////////////////////
  // Interior kernels
@@ -154,6 +158,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  Do
 							    SiteHalfSpinor *buf, int sF,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
 #ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -172,6 +177,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  Do
  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
  vstream(out[sF], result);
 #endif
 };
 template <class Impl>
@@ -179,6 +185,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  Doubl
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
 #ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -196,6 +203,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  Doubl
  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
  vstream(out[sF], result);
 #endif
 };
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
@@ -205,6 +213,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  Do
 							    SiteHalfSpinor *buf, int sF,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
 #ifndef GRID_NVCC
  //  SiteHalfSpinor tmp;
  //  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -225,6 +234,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  Do
  if ( nmu ) { 
    out[sF] = out[sF] + result; 
  }
 #endif
 };
 template <class Impl>
@@ -232,6 +242,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  Doubl
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
 #ifndef GRID_NVCC
  //  SiteHalfSpinor tmp;
  //  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -252,12 +263,14 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  Doubl
  if ( nmu ) { 
    out[sF] = out[sF] + result; 
  }
 #endif
 };
 template <class Impl>
 accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
 {
 #ifndef GRID_NVCC
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteSpinor result;
@@ -275,6 +288,7 @@ accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFie
  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
  vstream(out[sF], result);
 #endif
 }
 /*******************************************************************************
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -54,59 +54,11 @@ public:
  static void Dhop(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 		   int Ls, int Nsite, const FermionField &in, FermionField &out,
-		   int interior=1,int exterior=1) 
+		   int interior=1,int exterior=1) ;
  {
    auto U_v   = U.View();
    auto in_v  = in.View();
    auto out_v = out.View();
    auto st_v  = st.View();
    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
      const uint64_t nsimd = Simd::Nsimd();
      const uint64_t    NN = Nsite*Ls*nsimd;
      accelerator_loopN( sss, NN, {
 	  uint64_t cur  = sss;
 	  //	  uint64_t lane = cur % nsimd;
 	  cur = cur / nsimd;
 	  uint64_t   sF = cur;         cur = cur / Ls;
 	  uint64_t   sU = cur;
 	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v,buf,sF,sU,in_v,out_v);
      });
    } else { 
      accelerator_loop( ss, U_v, {
 	int sU = ss;
        int sF = Ls * sU;
        DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
      });
    }
  }
  static void DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 		      int Ls, int Nsite, const FermionField &in, FermionField &out,
-		      int interior=1,int exterior=1) 
+		      int interior=1,int exterior=1) ;
  {
    auto U_v   = U.View();
    auto in_v  = in.View();
    auto out_v = out.View();
    auto st_v  = st.View();
    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
      const uint64_t nsimd = Simd::Nsimd();
      const uint64_t    NN = Nsite*Ls*nsimd;
      accelerator_loopN( sss, NN, {
 	  uint64_t cur  = sss;
 	  // uint64_t lane = cur % nsimd;
 	  cur = cur / nsimd;
 	  uint64_t   sF = cur;         cur = cur / Ls;
 	  uint64_t   sU = cur;
 	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,sF,sU,in_v,out_v);
      });
    } else { 
      accelerator_loop( ss, U_v, {
 	int sU = ss;
        int sF = Ls * sU;
        DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
      });
    }
  }
  template <bool EnableBool = true> static accelerator
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
@@ -275,11 +227,11 @@ public:
 private:
  // Specialised variants
-  static accelerator void GpuDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+  static accelerator_inline void GpuDhopSite(StencilView &st,  SiteDoubledGaugeField &U, SiteHalfSpinor * buf,
-				      int sF,  int sU, const FermionFieldView &in, FermionFieldView &out);
+					     int Ls, int sF,  int sU, const FermionFieldView &in, FermionFieldView &out);
-  static accelerator void GpuDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+  static accelerator_inline void GpuDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					 int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+						int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
 					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
--- a/lib/qcd/action/fermion/WilsonKernelsGpu.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsGpu.cc
@@ -57,30 +57,51 @@ accelerator_inline int get_my_lane_offset(int Nsimd)
 #endif
 }
-
+#ifdef GPU_VEC
 #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  synchronise();							\
  if (SE->_is_local) {							\
    int mask = Nsimd >> (ptype + 1);					\
    int plane= SE->_permute ? (lane ^ mask) : lane;			\
-    auto in_l = extractLane(plane,in[SE->_offset]);			\
+    auto in_l = extractLane(plane,in[SE->_offset+s]);			\
    spProj(chi,in_l);							\
  } else {								\
-    chi  = extractLane(lane,buf[SE->_offset]);				\
+    chi  = extractLane(lane,buf[SE->_offset+s]);			\
  }									\
  synchronise();
 #else 
 // Non-GPU_VEC form of the stencil-leg load/projection step.
 //   Dir    - direction label; not referenced inside the macro body.
 //   spProj - routine invoked as spProj(dst, src) on the loaded input site.
 // Expects SE, ptype, s, in, buf, chi and tmp to be in scope where it expands.
 // When SE->_is_local is set, in[SE->_offset+s] is loaded and passed through
 // spProj; if SE->_permute is also set the projected value goes via tmp and
 // permute(chi, tmp, ptype). Otherwise chi is read from buf[SE->_offset+s].
 // The macro always finishes with synchronise().
 #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  if (SE->_is_local) {							\
    auto in_t = in[SE->_offset+s];					\
    if (SE->_permute) {							\
      spProj(tmp, in_t);						\
      permute(chi, tmp, ptype);						\
    } else {								\
      spProj(chi, in_t);						\
    }									\
  } else {								\
    chi  = buf[SE->_offset+s];						\
  }									\
  synchronise();
 #endif
 template <class Impl>
-accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-						     SiteHalfSpinor *buf, int sF,
+						     SiteHalfSpinor *buf, int Ls, int s,
 						     int sU, const FermionFieldView &in, FermionFieldView &out)
 {
 #ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
 #else 
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor   result;
 #endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  uint64_t lane_offset= get_my_lane_offset(Nsimd);
@@ -88,69 +109,80 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
  StencilEntry *SE;
  int ptype;
-
+  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;
 #ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
 #else
  int lane = lane_offset; {
 #endif
-    SE = st.GetEntry(ptype, Xp, sF);
+    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); 
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXp(result, Uchi);
-    SE = st.GetEntry(ptype, Yp, sF);
+    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYp(result, Uchi);
-    SE = st.GetEntry(ptype, Zp, sF);
+    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZp(result, Uchi);
-    SE = st.GetEntry(ptype, Tp, sF);
+    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTp(result, Uchi);
-    SE = st.GetEntry(ptype, Xm, sF);
+    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXm(result, Uchi);
-    SE = st.GetEntry(ptype, Ym, sF);
+    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYm(result, Uchi);
-    SE = st.GetEntry(ptype, Zm, sF);
+    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZm(result, Uchi);
-    SE = st.GetEntry(ptype, Tm, sF);
+    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTm(result, Uchi);
    synchronise();
 #ifdef GPU_VEC
    insertLane (lane,out[sF],result);
 #else
  vstream(out[sF], result);
 #endif
  }
 }
 template <class Impl>
-accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
-						  SiteHalfSpinor *buf, int sF,
+						  SiteHalfSpinor *buf,  int Ls, int s,
 						  int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
 #ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
 #else 
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor   result;
 #endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  uint64_t lane_offset= get_my_lane_offset(Nsimd);
@@ -158,54 +190,62 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
  StencilEntry *SE;
  int ptype;
  // Forces some degree of coalesce on the table look ups
  // Could also use wide load instructions on the data structure
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;
 #ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
 #else
  int lane = lane_offset; {
 #endif
-    SE = st.GetEntry(ptype, Xp, sF);
+    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
    spReconXm(result, Uchi);
-    SE = st.GetEntry(ptype, Yp, sF);
+    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
    accumReconYm(result, Uchi);
-    SE = st.GetEntry(ptype, Zp, sF);
+    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
    accumReconZm(result, Uchi);
-    SE = st.GetEntry(ptype, Tp, sF);
+    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
    accumReconTm(result, Uchi);
-    SE = st.GetEntry(ptype, Xm, sF);
+    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
    accumReconXp(result, Uchi);
-    SE = st.GetEntry(ptype, Ym, sF);
+    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
    accumReconYp(result, Uchi);
-    SE = st.GetEntry(ptype, Zm, sF);
+    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
    accumReconZp(result, Uchi);
-    SE = st.GetEntry(ptype, Tm, sF);
+    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);
    synchronise();
 #ifdef GPU_VEC
    insertLane (lane,out[sF],result);
 #else
  vstream(out[sF], result);
 #endif
  }
 };
@@ -213,20 +253,20 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
 // Template specialise Gparity to empty for now
 #define GPU_EMPTY(A)							\
  template <>								\
-accelerator void							\
+accelerator_inline void							\
 WilsonKernels<A>::GpuDhopSite(StencilView &st,				\
-			      DoubledGaugeFieldView &U,			\
+			      SiteDoubledGaugeField &U,			\
-			      SiteHalfSpinor *buf, int sF,		\
+			      SiteHalfSpinor *buf, int Ls, int sF,	\
 			      int sU,					\
 			      const FermionFieldView &in,		\
 			      FermionFieldView &out) { assert(0);};	\
  template <>								\
-  accelerator void							\
+  accelerator_inline void							\
  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,			\
-				DoubledGaugeFieldView &U,		\
+				   DoubledGaugeFieldView &U,		\
-				   SiteHalfSpinor *buf, int sF,		\
+				   SiteHalfSpinor *buf, int Ls,int sF,	\
-				int sU,					\
+				   int sU,				\
-				const FermionFieldView &in,		\
+				   const FermionFieldView &in,		\
 				   FermionFieldView &out) { assert(0);};
 GPU_EMPTY(GparityWilsonImplF);
@@ -234,6 +274,67 @@ GPU_EMPTY(GparityWilsonImplFH);
 GPU_EMPTY(GparityWilsonImplD);
 GPU_EMPTY(GparityWilsonImplDF);
 // Host-side driver for the hopping term.
 //   Opt               - kernel selector; WilsonKernelsStatic::OptGpu picks the GPU path.
 //   st, U, in, out    - stencil, doubled gauge field, input and output fermion
 //                       fields; a View() of each is taken and handed to the kernels.
 //   buf               - half-spinor buffer forwarded to the GPU kernel.
 //   Ls, Nsite         - the GPU loop covers Nsite*Ls*Nsimd work items.
 //   interior,exterior - the GPU path is taken only when both are non-zero.
 // NOTE(review): the fallback loop does not forward interior/exterior to
 // DhopSite and uses st.CommBuf() rather than buf - confirm this is intended.
 template <class Impl>
 void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			       int Ls, int Nsite, const FermionField &in, FermionField &out,
 			       int interior,int exterior) 
 {
    auto U_v   = U.View();
    auto in_v  = in.View();
    auto out_v = out.View();
    auto st_v  = st.View();
    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
      const uint64_t nsimd = Simd::Nsimd();
      // Widen before multiplying: Nsite*Ls evaluated in int could overflow
      // before the result is converted to 64 bits.
      const uint64_t    NN = static_cast<uint64_t>(Nsite)*static_cast<uint64_t>(Ls)*nsimd;
      accelerator_loopN( sss, NN, {
 	  // Decompose the flat index: lane (fastest) is discarded here, then s, then sU.
 	  uint64_t cur  = sss;
 	  cur = cur / nsimd;
 	  uint64_t   s  = cur%Ls;
 	  cur = cur / Ls;
 	  uint64_t   sU = cur;
 	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
      });
    } else { 
      // One iteration per gauge site; DhopSite receives the first 5d index of the site.
      accelerator_loop( ss, U_v, {
 	int sU = ss;
        int sF = Ls * sU;
        DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
      });
    }
 }
  // Host-side driver for the daggered hopping term; mirrors Dhop.
  //   Opt               - kernel selector; WilsonKernelsStatic::OptGpu picks the GPU path.
  //   st, U, in, out    - stencil, doubled gauge field, input and output fermion
  //                       fields; a View() of each is taken and handed to the kernels.
  //   buf               - half-spinor buffer forwarded to the GPU kernel.
  //   Ls, Nsite         - the GPU loop covers Nsite*Ls*Nsimd work items.
  //   interior,exterior - the GPU path is taken only when both are non-zero.
  // NOTE(review): the fallback loop does not forward interior/exterior to
  // DhopSiteDag and uses st.CommBuf() rather than buf - confirm this is intended.
  template <class Impl>
  void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				    int Ls, int Nsite, const FermionField &in, FermionField &out,
 				    int interior,int exterior) 
  {
    auto U_v   = U.View();
    auto in_v  = in.View();
    auto out_v = out.View();
    auto st_v  = st.View();
    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
      const uint64_t nsimd = Simd::Nsimd();
      // Widen before multiplying: Nsite*Ls evaluated in int could overflow
      // before the result is converted to 64 bits.
      const uint64_t    NN = static_cast<uint64_t>(Nsite)*static_cast<uint64_t>(Ls)*nsimd;
      accelerator_loopN( sss, NN, {
 	  // Decompose the flat index: lane (fastest) is discarded here, then s, then sU.
 	  uint64_t cur  = sss;
 	  cur = cur / nsimd;
 	  uint64_t   s  = cur%Ls;
 	  cur = cur / Ls;
 	  uint64_t   sU = cur;
 	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
      });
    } else { 
      // Pass the stencil view (st_v), matching the Dhop fallback loop, instead
      // of the host-side stencil object.
      accelerator_loop( ss, U_v, {
 	int sU = ss;
        int sF = Ls * sU;
        DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
      });
    }
  }
 /*
 GPU_EMPTY(DomainWallVec5dImplF);
 GPU_EMPTY(DomainWallVec5dImplFH);
--- a/lib/simd/Grid_gpu_vec.h
+++ b/lib/simd/Grid_gpu_vec.h
@@ -36,7 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
-#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
+  //#define COALESCE_GRANULARITY (64) // bytes for coalesce granularity of target: Pascal, Volta
  //#define COALESCE_GRANULARITY (32) // bytes for coalesce granularity of target: Pascal, Volta
  #define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
 template<class pair>
 class GpuComplex {