diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index e7a20abd..4bb8ce33 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -215,6 +215,14 @@ public:
     mult(&phi(), &U(mu), &chi());
   }
       
+#ifdef GPU_VEC
+  static accelerator_inline void copyLinkGpu(int lane,  // Copies lane `lane` of U into the same lane of UU; only compiled when GPU_VEC is defined.
+					     SiteDoubledGaugeField & UU,  // destination, written through insertLane
+					     const SiteDoubledGaugeField &U)  // source, read through extractLane
+  {
+    auto U_l   = extractLane(lane,U);  // value of the requested lane taken from the whole U object
+    insertLane(lane,UU,U_l);  // other lanes of UU are not passed to this call
+  }
   static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -224,6 +232,17 @@ public:
     auto U_l   = extractLane(lane,U(mu));
     phi() =  U_l * chi();
   }
+#else
+  static accelerator_inline void multLinkGpu(int lane,  // Variant compiled when GPU_VEC is not defined; `lane` is not referenced in the body.
+					     SiteHalfSpinor &phi,  // output: phi() receives U(mu) * chi()
+					     const SiteDoubledGaugeField &U,
+					     const SiteHalfSpinor &chi,
+					     int mu) 
+  {
+    auto U_l   = U(mu);  // U(mu) used as-is, with no extractLane (the GPU_VEC overload extracts a lane from U(mu))
+    phi() =  U_l * chi();
+  }
+#endif
     
   static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
@@ -364,7 +383,13 @@ public:
     }
     mult(&phi(), &UU(), &chi());
   }
-
+#ifdef GPU_VEC
+  static accelerator_inline void copyLinkGpu(int lane,  // Assigns the whole of U to UU; `lane` is not referenced in the body.
+					     SiteDoubledGaugeField & UU,  // destination, fully overwritten
+					     const SiteDoubledGaugeField &U)  // source
+  {
+    UU = U;  // NOTE(review): the other copyLinkGpu in this header copies a single lane via extractLane/insertLane -- confirm a full copy is intended here
+  }
   static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -374,6 +399,17 @@ public:
     auto U_l   = U(mu);
     phi() =  U_l * chi();
   }
+#else
+  static accelerator_inline void multLinkGpu(int lane,  // Variant compiled when GPU_VEC is not defined; `lane` is not referenced in the body.
+					     SiteHalfSpinor &phi,  // output: phi() receives U(mu) * chi(); the GPU_VEC overload above takes a scalar_object phi instead
+					     const SiteDoubledGaugeField &U,
+					     const SiteHalfSpinor &chi,
+					     int mu) 
+  {
+    auto U_l   = U(mu);  // same two statements as the GPU_VEC overload directly above
+    phi() =  U_l * chi();
+  }
+#endif
 
   static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index 0d01263c..893aee3e 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -104,6 +104,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifndef GRID_NVCC
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
   SiteHalfSpinor *chi_p;
@@ -121,6 +122,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
   GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
   GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
   vstream(out[sF], result);
+#endif
 };
 
 template <class Impl>
@@ -128,6 +130,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
 						      SiteHalfSpinor *buf, int sF,
 						      int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifndef GRID_NVCC
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
   SiteHalfSpinor *chi_p;
@@ -145,6 +148,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
   GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
   GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
   vstream(out[sF], result);
+#endif
 };
   ////////////////////////////////////////////////////////////////////
   // Interior kernels
@@ -154,6 +158,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  Do
 							    SiteHalfSpinor *buf, int sF,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifndef GRID_NVCC
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
   SiteHalfSpinor *chi_p;
@@ -172,6 +177,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  Do
   GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
   GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
   vstream(out[sF], result);
+#endif
 };
 
 template <class Impl>
@@ -179,6 +185,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  Doubl
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifndef GRID_NVCC
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
   SiteHalfSpinor *chi_p;
@@ -196,6 +203,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  Doubl
   GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
   GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
   vstream(out[sF], result);
+#endif
 };
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
@@ -205,6 +213,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  Do
 							    SiteHalfSpinor *buf, int sF,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifndef GRID_NVCC
   //  SiteHalfSpinor tmp;
   //  SiteHalfSpinor chi;
   SiteHalfSpinor *chi_p;
@@ -225,6 +234,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  Do
   if ( nmu ) { 
     out[sF] = out[sF] + result; 
   }
+#endif
 };
 
 template <class Impl>
@@ -232,6 +242,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  Doubl
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifndef GRID_NVCC
   //  SiteHalfSpinor tmp;
   //  SiteHalfSpinor chi;
   SiteHalfSpinor *chi_p;
@@ -252,12 +263,14 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  Doubl
   if ( nmu ) { 
     out[sF] = out[sF] + result; 
   }
+#endif
 };
 
 template <class Impl>
 accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
 {
+#ifndef GRID_NVCC
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
   SiteSpinor result;
@@ -275,6 +288,7 @@ accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFie
   GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
   GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
   vstream(out[sF], result);
+#endif
 }
 
 /*******************************************************************************
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index 09c086d8..a0922934 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -54,59 +54,11 @@ public:
 
   static void Dhop(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 		   int Ls, int Nsite, const FermionField &in, FermionField &out,
-		   int interior=1,int exterior=1) 
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  //	  uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   sF = cur;         cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v,buf,sF,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
+		   int interior=1,int exterior=1) ;
   static void DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 		      int Ls, int Nsite, const FermionField &in, FermionField &out,
-		      int interior=1,int exterior=1) 
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
+		      int interior=1,int exterior=1) ;
 
-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  // uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   sF = cur;         cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,sF,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
    
   template <bool EnableBool = true> static accelerator
   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
@@ -275,11 +227,11 @@ public:
 
 private:
   // Specialised variants
-  static accelerator void GpuDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-				      int sF,  int sU, const FermionFieldView &in, FermionFieldView &out);
+  static accelerator_inline void GpuDhopSite(StencilView &st,  SiteDoubledGaugeField &U, SiteHalfSpinor * buf,
+					     int Ls, int sF,  int sU, const FermionFieldView &in, FermionFieldView &out);
   
-  static accelerator void GpuDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					 int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  static accelerator_inline void GpuDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
 
   static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
 					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
diff --git a/lib/qcd/action/fermion/WilsonKernelsGpu.cc b/lib/qcd/action/fermion/WilsonKernelsGpu.cc
index ac0d3ffa..8ac5e55b 100644
--- a/lib/qcd/action/fermion/WilsonKernelsGpu.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsGpu.cc
@@ -57,30 +57,51 @@ accelerator_inline int get_my_lane_offset(int Nsimd)
 #endif
 }
 
-
+#ifdef GPU_VEC
 #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
   synchronise();							\
   if (SE->_is_local) {							\
     int mask = Nsimd >> (ptype + 1);					\
     int plane= SE->_permute ? (lane ^ mask) : lane;			\
-    auto in_l = extractLane(plane,in[SE->_offset]);			\
+    auto in_l = extractLane(plane,in[SE->_offset+s]);			\
     spProj(chi,in_l);							\
   } else {								\
-    chi  = extractLane(lane,buf[SE->_offset]);				\
+    chi  = extractLane(lane,buf[SE->_offset+s]);			\
   }									\
   synchronise();
+#else 
+#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) /* non-GPU_VEC variant; Dir is not used in the body; relies on SE, ptype, in, buf, s, tmp and chi being in scope where it is expanded */ \
+  if (SE->_is_local) {							\
+    auto in_t = in[SE->_offset+s];					\
+    if (SE->_permute) {							\
+      spProj(tmp, in_t);						\
+      permute(chi, tmp, ptype);						\
+    } else {								\
+      spProj(chi, in_t);						\
+    }									\
+  } else {								\
+    chi  = buf[SE->_offset+s];						\
+  }									\
+  synchronise();  // last statement of the macro; unlike the GPU_VEC variant there is no synchronise() before the branch
+#endif
 
 template <class Impl>
-accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-						     SiteHalfSpinor *buf, int sF,
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
+						     SiteHalfSpinor *buf, int Ls, int s,
 						     int sU, const FermionFieldView &in, FermionFieldView &out)
 {
+#ifdef GPU_VEC
   typename SiteHalfSpinor::scalar_object chi;
   typename SiteHalfSpinor::scalar_object Uchi;
   typename SiteSpinor::scalar_object   result;
+#else 
+  SiteHalfSpinor chi;
+  SiteHalfSpinor Uchi;
+  SiteHalfSpinor tmp;
+  SiteSpinor   result;
+#endif
   typedef typename SiteSpinor::scalar_type scalar_type;
   typedef typename SiteSpinor::vector_type vector_type;
-
   constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
 
   uint64_t lane_offset= get_my_lane_offset(Nsimd);
@@ -88,69 +109,80 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
 
   StencilEntry *SE;
   int ptype;
-
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;
 #ifndef __CUDA_ARCH__
   for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
 #else
   int lane = lane_offset; {
 #endif
-    SE = st.GetEntry(ptype, Xp, sF);
+    SE = st.GetEntry(ptype, Xp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); 
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
     spReconXp(result, Uchi);
 
-    SE = st.GetEntry(ptype, Yp, sF);
+    SE = st.GetEntry(ptype, Yp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
     accumReconYp(result, Uchi);
       
-    SE = st.GetEntry(ptype, Zp, sF);
+    SE = st.GetEntry(ptype, Zp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
     accumReconZp(result, Uchi);
 
-    SE = st.GetEntry(ptype, Tp, sF);
+    SE = st.GetEntry(ptype, Tp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
     accumReconTp(result, Uchi);
 
-    SE = st.GetEntry(ptype, Xm, sF);
+    SE = st.GetEntry(ptype, Xm, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
     accumReconXm(result, Uchi);
 
-    SE = st.GetEntry(ptype, Ym, sF);
+    SE = st.GetEntry(ptype, Ym, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
     accumReconYm(result, Uchi);
 
 
-    SE = st.GetEntry(ptype, Zm, sF);
+    SE = st.GetEntry(ptype, Zm, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
     accumReconZm(result, Uchi);
 
-    SE = st.GetEntry(ptype, Tm, sF);
+    SE = st.GetEntry(ptype, Tm, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
     Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
     accumReconTm(result, Uchi);
 
     synchronise();
+#ifdef GPU_VEC
     insertLane (lane,out[sF],result);
+#else
+  vstream(out[sF], result);
+#endif
   }
 }
 
 template <class Impl>
-accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
-						  SiteHalfSpinor *buf, int sF,
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
+						  SiteHalfSpinor *buf,  int Ls, int s,
 						  int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
+#ifdef GPU_VEC
   typename SiteHalfSpinor::scalar_object chi;
   typename SiteHalfSpinor::scalar_object Uchi;
   typename SiteSpinor::scalar_object   result;
+#else 
+  SiteHalfSpinor chi;
+  SiteHalfSpinor Uchi;
+  SiteHalfSpinor tmp;
+  SiteSpinor   result;
+#endif
   typedef typename SiteSpinor::scalar_type scalar_type;
   typedef typename SiteSpinor::vector_type vector_type;
-
   constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
 
   uint64_t lane_offset= get_my_lane_offset(Nsimd);
@@ -158,54 +190,62 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
 
   StencilEntry *SE;
   int ptype;
+  // Stencil entries are looked up at ssF = Ls*sU for every s; this forces some degree of coalescing on the table lookups
+  // Could also use wide load instructions on the data structure
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;
 
 #ifndef __CUDA_ARCH__
   for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
 #else
   int lane = lane_offset; {
 #endif
-    SE = st.GetEntry(ptype, Xp, sF);
+    SE = st.GetEntry(ptype, Xp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
     spReconXm(result, Uchi);
 
-    SE = st.GetEntry(ptype, Yp, sF);
+    SE = st.GetEntry(ptype, Yp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
     accumReconYm(result, Uchi);
       
-    SE = st.GetEntry(ptype, Zp, sF);
+    SE = st.GetEntry(ptype, Zp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
     accumReconZm(result, Uchi);
 
-    SE = st.GetEntry(ptype, Tp, sF);
+    SE = st.GetEntry(ptype, Tp, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
     accumReconTm(result, Uchi);
 
-    SE = st.GetEntry(ptype, Xm, sF);
+    SE = st.GetEntry(ptype, Xm, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
     accumReconXp(result, Uchi);
 
-    SE = st.GetEntry(ptype, Ym, sF);
+    SE = st.GetEntry(ptype, Ym, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
     accumReconYp(result, Uchi);
 
-    SE = st.GetEntry(ptype, Zm, sF);
+    SE = st.GetEntry(ptype, Zm, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
     accumReconZp(result, Uchi);
 
-    SE = st.GetEntry(ptype, Tm, sF);
+    SE = st.GetEntry(ptype, Tm, ssF);
     GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
     accumReconTp(result, Uchi);
 
     synchronise();
+#ifdef GPU_VEC
     insertLane (lane,out[sF],result);
+#else
+  vstream(out[sF], result);
+#endif
   }
 
 };
@@ -213,20 +253,20 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
 // Template specialise Gparity to empty for now
 #define GPU_EMPTY(A)							\
   template <>								\
-accelerator void							\
+accelerator_inline void							\
 WilsonKernels<A>::GpuDhopSite(StencilView &st,				\
-			      DoubledGaugeFieldView &U,			\
-			      SiteHalfSpinor *buf, int sF,		\
+			      SiteDoubledGaugeField &U,			\
+			      SiteHalfSpinor *buf, int Ls, int sF,	\
 			      int sU,					\
 			      const FermionFieldView &in,		\
 			      FermionFieldView &out) { assert(0);};	\
   template <>								\
-  accelerator void							\
+  accelerator_inline void							\
   WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,			\
-				DoubledGaugeFieldView &U,		\
-				   SiteHalfSpinor *buf, int sF,		\
-				int sU,					\
-				const FermionFieldView &in,		\
+				   DoubledGaugeFieldView &U,		\
+				   SiteHalfSpinor *buf, int Ls,int sF,	\
+				   int sU,				\
+				   const FermionFieldView &in,		\
 				   FermionFieldView &out) { assert(0);};
 
 GPU_EMPTY(GparityWilsonImplF);
@@ -234,6 +274,67 @@ GPU_EMPTY(GparityWilsonImplFH);
 GPU_EMPTY(GparityWilsonImplD);
 GPU_EMPTY(GparityWilsonImplDF);
 
+template <class Impl>  // Dhop: calls GpuDhopSite when Opt==OptGpu and both interior and exterior are non-zero, otherwise DhopSite once per element of U_v.
+void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			       int Ls, int Nsite, const FermionField &in, FermionField &out,
+			       int interior,int exterior) 
+{
+    auto U_v   = U.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    auto st_v  = st.View();
+    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
+      const uint64_t nsimd = Simd::Nsimd();
+      const uint64_t    NN = Nsite*Ls*nsimd;  // loop extent: one index per (sU, s, lane) combination
+      accelerator_loopN( sss, NN, {
+	  uint64_t cur  = sss;
+	  //	  uint64_t lane = cur % nsimd;
+	  cur = cur / nsimd;  // drop the lane part of the index (the lane itself is not decoded here)
+	  uint64_t   s  = cur%Ls;  // position within the Ls extent
+	  uint64_t   sF = cur;         cur = cur / Ls;  // NOTE(review): sF is not referenced again in this loop body; GpuDhopSite recomputes it from Ls, sU and s
+	  uint64_t   sU = cur;
+	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);  // passes the single element U_v[sU], not the view
+      });
+    } else { 
+      accelerator_loop( ss, U_v, {
+	int sU = ss;
+        int sF = Ls * sU;  // first s-index for this sU; Ls is passed on to DhopSite
+        DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);  // NOTE(review): uses st.CommBuf() rather than the `buf` argument -- confirm they are the same buffer
+      });
+    }
+  }
+  template <class Impl>  // DhopDag: calls GpuDhopSiteDag when Opt==OptGpu and both interior and exterior are non-zero, otherwise DhopSiteDag once per element of U_v.
+  void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+				    int Ls, int Nsite, const FermionField &in, FermionField &out,
+				    int interior,int exterior) 
+  {
+    auto U_v   = U.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    auto st_v  = st.View();
+
+    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
+      const uint64_t nsimd = Simd::Nsimd();
+      const uint64_t    NN = Nsite*Ls*nsimd;  // loop extent: one index per (sU, s, lane) combination
+      accelerator_loopN( sss, NN, {
+	  uint64_t cur  = sss;
+	  // uint64_t lane = cur % nsimd;
+	  cur = cur / nsimd;  // drop the lane part of the index (the lane itself is not decoded here)
+	  uint64_t   s  = cur%Ls;  // position within the Ls extent
+	  uint64_t   sF = cur;         cur = cur / Ls;  // NOTE(review): sF is not referenced again in this loop body; GpuDhopSiteDag recomputes it from Ls, sU and s
+	  uint64_t   sU = cur;
+	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);  // passes the whole view U_v (Dhop passes U_v[sU] to GpuDhopSite instead)
+      });
+    } else { 
+      accelerator_loop( ss, U_v, {
+	int sU = ss;
+        int sF = Ls * sU;  // first s-index for this sU; Ls is passed on to DhopSiteDag
+        DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);  // NOTE(review): passes `st` where Dhop passes `st_v` to DhopSite -- confirm which is intended; also uses st.CommBuf() rather than `buf`
+      });
+    }
+  }
+
+
 /*
 GPU_EMPTY(DomainWallVec5dImplF);
 GPU_EMPTY(DomainWallVec5dImplFH);
diff --git a/lib/simd/Grid_gpu_vec.h b/lib/simd/Grid_gpu_vec.h
index 3850e403..0d30c68d 100644
--- a/lib/simd/Grid_gpu_vec.h
+++ b/lib/simd/Grid_gpu_vec.h
@@ -36,7 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 namespace Grid {
 
-#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
+  //#define COALESCE_GRANULARITY (64) // bytes for coalesce granularity of target: Pascal, Volta
+  //#define COALESCE_GRANULARITY (32) // bytes for coalesce granularity of target: Pascal, Volta
+  #define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
 
 template<class pair>
 class GpuComplex {