Mirror of https://github.com/paboyle/Grid.git, synced 2025-06-19 08:17:05 +01:00

Merge branch 'develop' into feature/baryon

2020-06-25 16:24:07 +01:00
139 changed files with 6115 additions and 2632 deletions

View File

@ -114,19 +114,22 @@ public:
U = adj(Cshift(U, mu, -1));
PokeIndex<LorentzIndex>(Uadj, U, mu);
}
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
autoView(Umu_v,Umu,CpuRead);
autoView(Uadj_v,Uadj,CpuRead);
autoView(Uds_v,Uds,CpuWrite);
thread_for( lidx, GaugeGrid->lSites(), {
Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu, lcoor);
peekLocalSite(ScalarUmu, Umu_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj, lcoor);
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds, lcoor);
}
pokeLocalSite(ScalarUds, Uds_v, lcoor);
});
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
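Most hunks in this merge follow the same mechanical pattern as the one above: bare Lattice::View() handles are replaced by scoped autoView(name, field, mode) declarations with an explicit access mode (CpuRead, CpuWrite, AcceleratorRead, ...), and explicit site loops move into thread_for / accelerator_for. A minimal sketch of the new-style idiom, assuming a trivially small field operation (the function and field names are illustrative, not part of the commit):

#include <Grid/Grid.h>
using namespace Grid;

// Sketch only: scoped views with explicit access modes plus a threaded
// site loop, the idiom adopted throughout this merge.
void AddFieldsSketch(LatticeComplex &z, const LatticeComplex &x, const LatticeComplex &y)
{
  autoView( x_v , x, CpuRead);    // read-only host view
  autoView( y_v , y, CpuRead);
  autoView( z_v , z, CpuWrite);   // write view; all views close at end of scope
  thread_for(ss, x.Grid()->oSites(), {
    z_v[ss] = x_v[ss] + y_v[ss];
  });
}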

View File

@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
NAMESPACE_CHECK(Staggered);
@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_NVCC
#ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;

View File

@ -96,11 +96,11 @@ public:
int sl = St._simd_layout[direction];
Coordinate icoor;
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = SIMTlane(Nsimd);
int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd;
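The hunk above swaps the CUDA-only __CUDA_ARCH__ guard and SIMTlane helper for the portable GRID_SIMT macro and acceleratorSIMTlane. A hedged sketch of how a site kernel body typically selects its lane under the new abstraction (the helper function itself is illustrative, not Grid API):

#include <Grid/Grid.h>
using namespace Grid;

// Sketch only: portable SIMT-lane selection inside a site kernel body.
template<class vobj> accelerator_inline int LaneSketch(void)
{
#ifdef GRID_SIMT
  return acceleratorSIMTlane(vobj::Nsimd());  // GPU build: one SIMD lane per device thread
#else
  return 0;                                   // CPU build: scalar path covers all lanes
#endif
}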
@ -232,15 +232,17 @@ public:
if ( Params.twists[mu] ) {
Uconj = where(coor==neglink,-Uconj,Uconj);
}
auto U_v = U.View();
auto Uds_v = Uds.View();
auto Uconj_v = Uconj.View();
auto Utmp_v= Utmp.View();
thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss]();
});
{
autoView( U_v , U, CpuRead);
autoView( Uconj_v , Uconj, CpuRead);
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss]();
});
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
Uconj = adj(Cshift(Uconj,mu,-1));
@ -250,19 +252,25 @@ public:
Utmp = where(coor==0,Uconj,Utmp);
}
thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
});
{
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
});
}
Utmp = Uconj;
if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp);
}
thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
});
{
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
});
}
}
}
@ -272,11 +280,14 @@ public:
GaugeLinkField link(mat.Grid());
// use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
auto link_v = link.View();
auto tmp_v = tmp.View();
thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
});
{
autoView( link_v , link, CpuWrite);
autoView( tmp_v , tmp, CpuRead);
thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
});
}
PokeIndex<LorentzIndex>(mat, link, mu);
return;
}
@ -306,16 +317,18 @@ public:
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
auto tmp_v = tmp.View();
auto Atilde_v = Atilde.View();
auto Btilde_v = Btilde.View();
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
{
autoView( tmp_v , tmp, CpuWrite);
autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}

View File

@ -61,8 +61,8 @@ public:
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base

View File

@ -0,0 +1,194 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
#define GRID_QCD_NAIVE_STAG_FERMION_H
NAMESPACE_BEGIN(Grid);
class NaiveStaggeredFermionStatic {
public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
};
template <class Impl>
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
void M(const FermionField &in, FermionField &out);
void Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
NaiveStaggeredFermion(GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_U );
DoubledGaugeField &GetU(void) { return Umu ; } ;
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &srct,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
NAMESPACE_END(Grid);
#endif
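As a usage sketch for the new operator declared above (the grids, seeds, and parameter values are placeholders chosen for illustration; only the constructor and M signatures follow this header):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  // Standard Grid test-program setup; all values illustrative.
  GridCartesian *UGrid =
    SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                   GridDefaultSimd(Nd, vComplexD::Nsimd()),
                                   GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);

  std::vector<int> seeds({1, 2, 3, 4});
  GridParallelRNG RNG(UGrid);
  RNG.SeedFixedIntegers(seeds);

  LatticeGaugeFieldD U(UGrid);
  SU3::HotConfiguration(RNG, U);

  RealD mass = 0.1, c1 = 1.0, u0 = 1.0;               // placeholder parameters
  NaiveStaggeredFermionD Dns(U, *UGrid, *UrbGrid, mass, c1, u0);

  NaiveStaggeredFermionD::FermionField src(UGrid), res(UGrid);
  gaussian(RNG, src);
  Dns.M(src, res);                                    // apply D_naive + m

  Grid_finalize();
  return 0;
}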

View File

@ -47,23 +47,34 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
public:
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
protected:
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
@ -71,15 +82,18 @@ public:
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
@ -87,27 +101,10 @@ public:
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
public:
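The template<int Naik> parameter added to the generic and hand-unrolled kernels above turns the choice between the improved stencil (one-link plus Naik three-link) and the naive stencil (one-link only) into a compile-time switch, so a single kernel body serves both DhopImproved and DhopNaive. A toy illustration of the idiom (not Grid code):

// Toy sketch: an int template parameter selects the extra term at compile
// time; the branch is on a compile-time constant, so the optimizer removes it.
template<int Naik>
double HopSumSketch(double one_link_term, double three_link_term)
{
  double r = one_link_term;
  if (Naik) r += three_link_term;   // dropped when instantiated with Naik == 0
  return r;
}

// Usage: HopSumSketch<1>(a,b) for the improved path, HopSumSketch<0>(a,b) for the naive path.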

View File

@ -113,20 +113,7 @@ public:
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
{
GridBase *GaugeGrid = U_ds.Grid();
thread_for(lidx, GaugeGrid->lSites(),{
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, U_ds, lcoor);
peekLocalSite(ScalarU, U, lcoor);
ScalarUds(mu) = ScalarU();
});
assert(0);
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term

View File

@ -257,15 +257,16 @@ private:
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@ -281,9 +282,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()();
@ -299,9 +300,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()());
@ -317,9 +318,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()());
@ -335,9 +336,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()());
@ -354,9 +355,9 @@ private:
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
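The clover-term fills above move from host thread_for loops over .View() handles to accelerator_for with AcceleratorWrite/AcceleratorRead views, so they can run on the device. A minimal sketch of that idiom, with a trivially small copy as the loop body (names illustrative, not part of the commit):

#include <Grid/Grid.h>
using namespace Grid;

// Sketch only: device-capable site loop with scoped accelerator views.
// The trailing '1' is the per-site SIMD nesting argument used by the
// clover fills above.
void CopySketch(LatticeComplex &T, const LatticeComplex &F)
{
  autoView( T_v , T, AcceleratorWrite);
  autoView( F_v , F, AcceleratorRead);
  accelerator_for(i, T.Grid()->oSites(), 1, {
    T_v[i] = F_v[i];
  });
}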

View File

@ -106,10 +106,10 @@ public:
const _SpinorField & phi,
int mu)
{
auto out_v= out.View();
auto phi_v= phi.View();
auto Umu_v= Umu.View();
thread_for(sss,out.Grid()->oSites(),{
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
accelerator_for(sss,out.Grid()->oSites(),1,{
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
});
}
@ -191,18 +191,19 @@ public:
int Ls=Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
auto tmp_v = tmp.View();
auto Btilde_v = Btilde.View();
auto Atilde_v = Atilde.View();
thread_for(sss,tmp.Grid()->oSites(),{
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
}
});
{
autoView( tmp_v , tmp, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,tmp.Grid()->oSites(),1,{
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
}
});
}
PokeIndex<LorentzIndex>(mat,tmp,mu);
}
};

View File

@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_NVCC
#ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
#ifndef GRID_NVCC
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
#endif
#ifndef GRID_NVCC
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
////////////////////////////////////////////////
// GENERAL CAYLEY CASE

View File

@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(phi , phi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
{
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(phi , phi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
int Ls=this->Ls;
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
GridBase *grid=psi_i.Grid();
int Ls=this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
auto plee = & lee [0];
auto pdee = & dee [0];

View File

@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi, psi_i,CpuRead);
autoView(phi, phi_i,CpuRead);
autoView(chi, chi_i,CpuWrite);
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd();
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi=psi_i.View();
auto phi=phi_i.View();
auto chi=chi_i.View();
autoView(psi,psi_i,CpuRead);
autoView(phi,phi_i,CpuRead);
autoView(chi,chi_i,CpuWrite);
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
Vector<iSinglet<Simd> > &Matm)
{
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,CpuRead);
autoView(chi , chi_i,CpuWrite);
#ifndef AVX512
{
SiteHalfSpinor BcastP;
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
EnableIf<Impl::LsVectorised,int> sfinae=0;
#ifndef AVX512
{
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,CpuRead);
autoView(chi , chi_i,CpuWrite);
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
}
#else
{
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,CpuRead);
autoView(chi , chi_i,CpuWrite);
// pointers
// MASK_REGS;
#define Chi_00 %zmm0

View File

@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls;
GridBase* grid = psi_i.Grid();
auto phi = phi_i.View();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView( phi , phi_i, AcceleratorRead);
autoView( psi , psi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
GridBase* grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView( psi , psi_i, AcceleratorRead);
autoView( phi , phi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid();
auto psi=psi_i.View();
auto chi=chi_i.View();
autoView( psi, psi_i, AcceleratorRead);
autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls;
auto plee = & this->lee[0];
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView( psi, psi_i, AcceleratorRead);
autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls;
auto plee = & this->lee[0];

View File

@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
Compressor compressor;
Stencil.HaloExchange(in,compressor);
auto Umu_v = Umu.View();
auto UUUmu_v = UUUmu.View();
auto in_v = in.View();
auto out_v = out.View();
autoView( Umu_v , Umu, CpuRead);
autoView( UUUmu_v , UUUmu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
thread_for( ss,Umu.Grid()->oSites(),{
for(int s=0;s<Ls;s++){
int sU=ss;
@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor;
int LLs = in.Grid()->_rdimensions[0];
@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopFaceTime-=usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
// Remove explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
DhopComputeTime-=usecond();
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U.Grid()->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopCommTime += ctime;
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopComputeTime+=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
DhopComputeTime2-=usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
});
} else {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
});
DhopComputeTime2-=usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2+=usecond();
#else
assert(0);
#endif
}
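The rewritten driver above replaces the hand-rolled OpenMP split between communication and compute threads with a simpler two-pass scheme: compute interior sites while the halo exchange is in flight, then compute exterior (surface) sites once it completes, with Kernels::DhopImproved taking interior/exterior flags. A toy sketch of just that control flow, with comms and kernel stubbed out as callables (nothing here is Grid API; only the flag convention follows the diff):

#include <functional>

// Toy sketch of the two-pass overlapped-comms control flow.
void OverlappedDhopSketch(std::function<void()>        begin_comms,
                          std::function<void()>        complete_comms,
                          std::function<void(int,int)> kernel /* (interior, exterior) */)
{
  begin_comms();     // start the halo exchange asynchronously
  kernel(1, 0);      // interior pass: bulk sites that need no remote data
  complete_comms();  // wait for halos to arrive and be merged
  kernel(0, 1);      // exterior pass: surface sites that consume the halo data
}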
template<class Impl>
@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor;
int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
});
} else {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
});
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
}
/*CHANGE END*/

View File

@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
////////////////////////
// Call the single hop
////////////////////////
auto U_v = U.View();
auto UUU_v = UUU.View();
auto B_v = B.View();
auto Btilde_v = Btilde.View();
autoView( U_v , U, CpuRead);
autoView( UUU_v , UUU, CpuRead);
autoView( B_v , B, CpuWrite);
autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
Compressor compressor;
Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View();
auto UUUmu_v = UUUmu.View();
auto in_v = in.View();
auto out_v = out.View();
autoView( Umu_v , Umu, CpuRead);
autoView( UUUmu_v , UUUmu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
thread_for( sss, in.Grid()->oSites(),{
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
});
@ -403,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
template <class Impl>
@ -417,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
Compressor compressor;
int len = U.Grid()->oSites();
@ -426,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
}
} else {
st.CommunicateThreaded();
}
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
@ -487,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond();
{
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
} else {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
}
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
#else
assert(0);
#endif
}
@ -528,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
DhopComputeTime -= usecond();
if (dag == DaggerYes) {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
} else {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();

View File

@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm;
auto plee = & this->lee [0];
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls;
auto pm = this->pm;

View File

@ -0,0 +1,499 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid)
{
int vol4;
int LLs=1;
c1=_c1;
u0=_u0;
vol4= _grid->oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4= _cbgrid->oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
{
ImportGauge(_U);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
// https://arxiv.org/pdf/hep-lat/9712010.pdf
//
// mom space action.
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
//
// must track through staggered flavour/spin reduction in literature to
// turn to free propagator for the one component chi field, a la page 4/5
// of above link to implement a Fourier-based solver.
////////////////////////////////////////////////////////////
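For reference, the momentum-space form quoted in the comment above can be written out as (a reconstruction of the comment only; c1 is the one-link and c2 the Naik coefficient, and the naive operator corresponds to c2 = 0):

    D(p) \;=\; i\sum_{\mu}\gamma_{\mu}\,\bigl(c_{1}\sin p_{\mu} + c_{2}\sin 3p_{\mu}\bigr) \;+\; m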
template <class Impl>
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
{
GaugeLinkField U(GaugeGrid());
DoubledGaugeField _UUU(GaugeGrid());
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
// Discard the Naik as Naive
////////////////////////////////////////////////////////
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
////////////////////////////////////////////////////////
// Apply scale factors to get the right fermion Kinetic term
// Could pass coeffs into the double store to save work.
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
}
CopyGaugeCheckerboards();
}
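A hedged reconstruction of the operator these scale factors produce, following the 0.5 ( U chi(x+mu) - Udag(x-mu) chi(x-mu) ) comment above. It assumes the staggered phases \eta_\mu(x) are folded into the doubled links by Impl::DoubleStore, with u_0 the tadpole factor:

    D\,\chi(x) \;=\; m\,\chi(x) \;+\; \frac{c_{1}}{2u_{0}}\sum_{\mu}\eta_{\mu}(x)\,\Bigl[\,U_{\mu}(x)\,\chi(x+\hat\mu) \;-\; U_{\mu}^{\dagger}(x-\hat\mu)\,\chi(x-\hat\mu)\,\Bigr]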
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass);
out = scal * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField & mat,
const FermionField &A, const FermionField &B, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor;
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
Atilde = A;
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////
// Call the single hop
////////////////////////
autoView( U_v , U, CpuRead);
autoView( B_v , B, CpuWrite);
autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
assert(0);// need to figure out the force interface with a blasted three link term.
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
mat.Checkerboard() = U.Checkerboard();
DerivInternal(Stencil, Umu, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Even);
assert(U.Checkerboard() == Odd);
mat.Checkerboard() = Odd;
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Odd);
assert(U.Checkerboard() == Even);
mat.Checkerboard() = Even;
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
assert(0); // Not implemented yet
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
Compressor compressor;
Stencil.HaloExchange(in, compressor);
autoView( Umu_v , Umu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
// thread_for( sss, in.Grid()->oSites(),{
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
// });
assert(0);
};
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
Compressor compressor;
int len = U.Grid()->oSites();
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime += usecond();
DhopComputeTime2 -= usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void NaiveStaggeredFermion<Impl>::Report(void)
{
Coordinate latt = _grid->GlobalDimensions();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
assert(0);
}
NAMESPACE_END(Grid);

View File

@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
assert(0);
@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T );
// This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
// for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
}
#include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
{
#ifdef AVX512
@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
// This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);
@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
}
#include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);

View File

@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
}
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
}
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) {
result()()(0) = - even_0 - odd_0;
@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
}
// Add sum of all exterior connected stencil legs
if ( nmu ) {
if ( dag ) {
@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
}
}
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI
NAMESPACE_END(Grid);

View File

@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
chi_p = &in[SE->_offset]; \
} \
} else { \
chi_p = &buf[SE->_offset]; \
@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
chi_p = &in[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], *chi_p, Dir); \
multLink(Uchi, U[sU], *chi_p, Dir); \
}
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U[sU], *chi_p, Dir); \
multLink(Uchi, U[sU], *chi_p, Dir); \
}
template <class Impl>
@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag) {
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag)
{
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
int ptype;
int skew;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
//
// int sF=LLs*sU+s;
{
skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) {
Uchi = - Uchi;
}
@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
int ptype;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
{
skew = 0;
Uchi=Zero();
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) {
Uchi = - Uchi;
}
@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
// SiteSpinor chi;
@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
int nmu=0;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
{
skew = 0;
Uchi=Zero();
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( nmu ) {
if ( dag ) {
out[sF] = out[sF] - Uchi;
@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=1;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out,
int dag,int interior,int exterior)
{
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
};
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
{
// Disp should be either +1,-1,+3,-3
// What about "dag" ?
@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
assert(0);
}
#define KERNEL_CALLNB(A,improved) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
#define ASM_CALL(A) \
const uint64_t NN = Nsite*Ls; \
thread_for( ss, NN, { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
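For orientation, this is roughly what KERNEL_CALL(DhopSiteGeneric,1) expands to in the drivers below; a sketch assembled from the macro bodies above, not extra code in the patch:
  const uint64_t NN = Nsite*Ls;
  accelerator_forNB( ss, NN, Simd::Nsimd(), {
    int sF = ss;      // 5d (fermion) site index
    int sU = ss/Ls;   // 4d (gauge) site index
    ThisKernel:: template DhopSiteGeneric<1>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);
  });
  accelerator_barrier();  // appended by KERNEL_CALL; KERNEL_CALLNB omits it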
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
autoView( UUU_v , UUU, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
autoView( UUU_v , U, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}
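The only functional difference between the two drivers is the Naik template argument; summarising the calls above:
  // DhopImproved: KERNEL_CALL(...,1) -> one-hop U legs plus three-hop UUU (Naik) legs
  // DhopNaive   : KERNEL_CALL(...,0) -> one-hop U legs only; here UUU_v merely aliases U,
  //               so the same kernels instantiate without a separate long-link field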
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid);

View File

@ -98,32 +98,35 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
Coordinate lcoor;
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
for (int site = 0; site < lvol; site++)
{
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CloverTerm, lcoor);
Qxinv = Zero();
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
for (int site = 0; site < lvol; site++) {
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CTv, lcoor);
Qxinv = Zero();
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
}
}
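A note on the flattening used in the Eigen copy above (DimRep = 3 for the fundamental representation is an assumption; other representations change it):
  // EigenCloverOp(a + j*DimRep, b + k*DimRep): colour indices (a,b) run fastest and
  // spin indices (j,k) slowest, giving an (Ns*DimRep) x (Ns*DimRep) matrix per site
  // -- 12 x 12 for Ns = 4, DimRep = 3 -- which Eigen inverts site by site.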
// Separate the even and odd parts

View File

@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor);
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}
}
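One way to retire the FIXME above would be an element-wise identity over the whole lattice; a sketch, assuming log and sqrt overloads for scalar lattices are available as exp is:
  //   acosh(x) = log( x + sqrt(x*x - one) )   for x >= 1
  //   a = log( cosha + sqrt(cosha*cosha - one) );   // would replace the peek/poke loop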
Wea = ( exp( a) * abs(W) );
@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor);
}
pokeLocalSite(cc,a_v,lcoor);
}}
Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) );

View File

@ -67,7 +67,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
diag_mass = 4.0 + mass;
}
int vol4;
vol4=Fgrid.oSites();
Stencil.BuildSurfaceList(1,vol4);
vol4=Hgrid.oSites();
StencilEven.BuildSurfaceList(1,vol4);
StencilOdd.BuildSurfaceList(1,vol4);
}
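  // (Inferred) BuildSurfaceList precomputes each stencil's list of surface sites,
  // which the interior/exterior split of Dhop uses to overlap communications with
  // interior compute.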
template <class Impl>
@ -483,32 +488,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid());
#if 0
PropagatorField tmp1(_grid), tmp2(_grid);
q_out = Zero();
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
// Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1);
auto tmp1_v = tmp1.View();
auto tmp2_v = tmp2.View();
auto q_in_1_v=q_in_1.View();
auto q_in_2_v=q_in_2.View();
auto q_out_v = q_out.View();
auto Umu_v = Umu.View();
thread_for(sU, Umu.Grid()->oSites(),{
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
q_in_2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
tmp2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
});
#else
#endif
assert(0);
}
@ -524,62 +504,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
{
conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid());
#if 0
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
Complex i(0.0,1.0);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
q_out = Zero();
LatticeInteger coords(_grid);
LatticeCoordinate(coords, Tp);
// Need q(x + mu) and q(x - mu).
tmp = Cshift(q_in, mu, 1);
tmpFwd = tmp*lattice_cmplx;
tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1);
auto coords_v = coords.View();
auto tmpFwd_v = tmpFwd.View();
auto tmpBwd_v = tmpBwd.View();
auto Umu_v = Umu.View();
auto q_out_v = q_out.View();
thread_for(sU, Umu.Grid()->oSites(), {
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vPredicate t_mask;
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
Integer timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
// Repeat for backward direction.
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
(coords_v[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
});
#else
#endif
assert(0);
}
NAMESPACE_END(Grid);

View File

@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
// Generic implementation; move to different file?
////////////////////////////////////////////
/*
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
uint4 * chip_pun = (uint4 *)&chip;
@ -51,7 +52,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
#endif
return;
}
*/
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
@ -61,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
synchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else if ( st.same_node[Dir] ) { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
synchronise(); \
acceleratorSynchronise(); \
if (SE->_is_local || st.same_node[Dir] ) { \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
} \
synchronise();
acceleratorSynchronise();
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
Recon(result, Uchi); \
nmu++; \
} \
synchronise();
acceleratorSynchronise();
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
if (SE->_is_local ) { \
@ -99,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
synchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
Recon(result, Uchi);
@ -126,7 +128,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
StencilEntry *SE;
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
@ -153,7 +155,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
@ -181,7 +183,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
StencilEntry *SE;
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
@ -203,7 +205,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
typedef decltype(coalescedRead(in[0])) calcSpinor;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
calcHalfSpinor chi;
// calcHalfSpinor *chi_p;
@ -239,7 +241,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
int ptype;
int nmu=0;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
@ -270,7 +272,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
int ptype;
int nmu=0;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
@ -300,7 +302,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
StencilEntry *SE; \
int ptype; \
const int Nsimd = SiteHalfSpinor::Nsimd(); \
const int lane=SIMTlane(Nsimd); \
const int lane=acceleratorSIMTlane(Nsimd); \
\
SE = st.GetEntry(ptype, dir, sF); \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
@ -328,7 +330,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
StencilEntry *SE;
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
SE = st.GetEntry(ptype, dir, sF);
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
@ -346,30 +348,30 @@ template <class Impl>
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out)
{
auto U_v = U.View();
auto in_v = in.View();
auto st_v = st.View();
autoView(U_v ,U,AcceleratorRead);
autoView(in_v ,in,AcceleratorRead);
autoView(st_v ,st,AcceleratorRead);
auto out_Xm = out[0].View();
auto out_Ym = out[1].View();
auto out_Zm = out[2].View();
auto out_Tm = out[3].View();
auto out_Xp = out[4].View();
auto out_Yp = out[5].View();
auto out_Zp = out[6].View();
auto out_Tp = out[7].View();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
autoView(out_Xm,out[0],AcceleratorWrite);
autoView(out_Ym,out[1],AcceleratorWrite);
autoView(out_Zm,out[2],AcceleratorWrite);
autoView(out_Tm,out[3],AcceleratorWrite);
autoView(out_Xp,out[4],AcceleratorWrite);
autoView(out_Yp,out[5],AcceleratorWrite);
autoView(out_Zp,out[6],AcceleratorWrite);
autoView(out_Tp,out[7],AcceleratorWrite);
auto CBp=st.CommBuf();
accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls;
int sF =sss;
DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
});
}
@ -381,17 +383,18 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
assert(dirdisp<=7);
assert(dirdisp>=0);
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
autoView(U_v ,U ,AcceleratorRead);
autoView(in_v ,in ,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v ,st ,AcceleratorRead);
auto CBp=st.CommBuf();
#define LoopBody(Dir) \
case Dir : \
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
case Dir : \
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \
int sU=ss; \
int sF = s+Ls*sU; \
DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
} \
}); \
break;
@ -435,26 +438,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
autoView(U_v , U,AcceleratorRead);
autoView(in_v , in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif
@ -466,26 +469,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
autoView(U_v ,U,AcceleratorRead);
autoView(in_v ,in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v ,st,AcceleratorRead);
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
@ -493,5 +496,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
assert(0 && " Kernel optimisation case not covered ");
}
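  // Summary of the guards above (as in the staggered dispatch): with GRID_CUDA
  // defined only the Generic, coalescedRead-based kernels are launched; the
  // OptHandUnroll and OptInlineAsm paths remain host-build only.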
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid);

View File

@ -0,0 +1,36 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
NAMESPACE_END(Grid);

View File

@ -0,0 +1,37 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class NaiveStaggeredFermion<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -88,6 +88,7 @@ done
CC_LIST=" \
ImprovedStaggeredFermion5DInstantiation \
ImprovedStaggeredFermionInstantiation \
NaiveStaggeredFermionInstantiation \
StaggeredKernelsInstantiation "
for impl in $STAG_IMPL_LIST

View File

@ -86,9 +86,9 @@ public:
// Move this elsewhere? FIXME
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
auto U_v = U.View();
auto W_v = W.View();
thread_for( ss, U.Grid()->oSites(), {
autoView(U_v,U,AcceleratorWrite);
autoView(W_v,W,AcceleratorRead);
accelerator_for( ss, U.Grid()->oSites(), 1, {
U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
});
}
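This hunk is representative of the view-handling change made throughout the patch; schematically:
  // old:  auto X_v = X.View();                       thread_for(ss, ..., { ... X_v[ss] ... });
  // new:  autoView(X_v, X, AcceleratorRead/Write or CpuRead/Write);
  //       accelerator_for(ss, ..., nsimd,            { ... X_v[ss] ... });
  // The autoView scope states the intended access mode up front so the view can
  // arrange any host/device data movement before the loop body runs.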
@ -131,15 +131,14 @@ public:
//static std::chrono::duration<double> diff;
//auto start = std::chrono::high_resolution_clock::now();
auto U_v = U.View();
auto P_v = P.View();
thread_for(ss, P.Grid()->oSites(),{
autoView(U_v,U,AcceleratorWrite);
autoView(P_v,P,AcceleratorRead);
accelerator_for(ss, P.Grid()->oSites(),1,{
for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
}
});
//auto end = std::chrono::high_resolution_clock::now();
//auto end = std::chrono::high_resolution_clock::now();
// diff += end - start;
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
}

View File

@ -89,8 +89,8 @@ public:
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
auto p_v = p.View();
auto action_v = action.View();
autoView( p_v , p, CpuRead);
autoView( action_v , action, CpuWrite);
for (int mu = 0; mu < Ndim; mu++)
{
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
@ -146,8 +146,8 @@ public:
for (int point = 0; point < npoint; point++)
{
auto p_v = p.View();
auto force_v = force.View();
autoView( p_v , p, CpuRead);
autoView( force_v , force, CpuWrite);
int permute_type;
StencilEntry *SE;

View File

@ -80,10 +80,11 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
//static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
// HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB");
//static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
// HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB");
static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
// add the staggered, scalar versions here

View File

@ -49,7 +49,7 @@ public:
private:
const unsigned int smearingLevels;
Smear_Stout<Gimpl> StoutSmearing;
Smear_Stout<Gimpl> *StoutSmearing;
std::vector<GaugeField> SmearedSet;
// Member functions
@ -72,7 +72,7 @@ private:
previous_u = *ThinLinks;
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
{
StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
StoutSmearing->smear(SmearedSet[smearLvl], previous_u);
previous_u = SmearedSet[smearLvl];
// For debug purposes
@ -93,7 +93,7 @@ private:
GaugeLinkField SigmaKPrime_mu(grid);
GaugeLinkField GaugeKmu(grid), Cmu(grid);
StoutSmearing.BaseSmear(C, GaugeK);
StoutSmearing->BaseSmear(C, GaugeK);
SigmaK = Zero();
iLambda = Zero();
@ -107,7 +107,7 @@ private:
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
pokeLorentz(iLambda, iLambda_mu, mu);
}
StoutSmearing.derivative(SigmaK, iLambda,
StoutSmearing->derivative(SigmaK, iLambda,
GaugeK); // derivative of SmearBase
return SigmaK;
}
@ -144,14 +144,14 @@ private:
// Exponential
iQ2 = iQ * iQ;
iQ3 = iQ * iQ2;
StoutSmearing.set_uw(u, w, iQ2, iQ3);
StoutSmearing.set_fj(f0, f1, f2, u, w);
StoutSmearing->set_uw(u, w, iQ2, iQ3);
StoutSmearing->set_fj(f0, f1, f2, u, w);
e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
// Getting B1, B2, Gamma and Lambda
// simplify this part, redundant calculations in set_fj
xi0 = StoutSmearing.func_xi0(w);
xi1 = StoutSmearing.func_xi1(w);
xi0 = StoutSmearing->func_xi0(w);
xi1 = StoutSmearing->func_xi1(w);
u2 = u * u;
w2 = w * w;
cosw = cos(w);
@ -219,7 +219,7 @@ public:
/* Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
Smear_Stout<Gimpl>& Stout)
: smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
: smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL)
{
for (unsigned int i = 0; i < smearingLevels; ++i)
SmearedSet.push_back(*(new GaugeField(UGrid)));
@ -227,7 +227,7 @@ public:
/*! For just thin links */
SmearedConfiguration()
: smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
: smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
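  // Rationale (inferred): holding the smearing engine by pointer lets this
  // thin-links-only constructor default it to nullptr, which the previous
  // by-value Smear_Stout member could not.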
// attach the smeared routines to the thin links U and fill the smeared set
void set_Field(GaugeField &U)

View File

@ -185,13 +185,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
for(int i=0;i<Lblock;i++){
auto lhs_v = lhs_wi[i].View();
// Recreate view potentially expensive outside of UVM mode
autoView(lhs_v,lhs_wi[i],CpuRead);
auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){
SpinMatrix_v vv;
auto rhs_v = rhs_vj[j].View();
// Recreate view potentially expensive outside of UVM mode
autoView(rhs_v,rhs_vj[j],CpuRead);
auto right = rhs_v[ss];
for(int s1=0;s1<Ns;s1++){
for(int s2=0;s2<Ns;s2++){
@ -204,11 +205,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){
int idx = m+base;
auto mom_v = mom[m].View();
autoView(mom_v,mom[m],CpuRead);
auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase);
}
}
}
}
@ -371,7 +371,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View();
autoView(wi_v,wi[i],CpuRead);
auto w = conjugate(wi_v[ss]);
if (g5) {
w()(2)(0) = - w()(2)(0);
@ -383,7 +383,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
}
for(int j=0;j<Rblock;j++){
auto vj_v=vj[j].View();
autoView(vj_v,vj[j],CpuRead);
auto v = vj_v[ss];
auto vv = v()(0)(0);
@ -518,12 +518,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View();
autoView(wi_v,wi[i],CpuRead);
auto w = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++){
auto vj_v = vj[j].View();
autoView(vj_v,vj[j],CpuRead);
auto v = vj_v[ss];
auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
@ -544,7 +544,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){
int idx = m+base;
auto mom_v = mom[m].View();
autoView(mom_v,mom[m],CpuRead);
auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase()()());
}
@ -730,13 +730,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for(int i=0;i<Lblock;i++)
{
auto wi_v = lhs_wi[i].View();
autoView(wi_v,lhs_wi[i],CpuRead);
auto left = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++)
{
SpinMatrix_v vv;
auto vj_v = rhs_vj[j].View();
autoView(vj_v,rhs_vj[j],CpuRead);
auto right = vj_v[ss];
for(int s1=0;s1<Ns;s1++)
@ -752,8 +752,8 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for ( int m=0;m<Nem;m++)
{
auto emB0_v = emB0[m].View();
auto emB1_v = emB1[m].View();
autoView(emB0_v,emB0[m],CpuRead);
autoView(emB1_v,emB1[m],CpuRead);
int idx = m+base;
auto b0 = emB0_v[ss];
auto b1 = emB1_v[ss];
@ -1014,21 +1014,21 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int t=0;t<N_t;t++){
for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View();
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View();
Scalar_v coeff = WW_sd(t,s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
autoView(vs_v,vs[s],CpuRead);
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
autoView(vd_v,vd[d],CpuRead);
Scalar_v coeff = WW_sd(t,s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
}}
}
@ -1067,21 +1067,20 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
thread_for(ss,grid->oSites(),{
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View();
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View();
Scalar_v coeff = buf(s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
autoView(vs_v,vs[s],CpuRead);
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
autoView(vd_v,vd[d],CpuRead);
Scalar_v coeff = buf(s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
}}
});
}
@ -1093,7 +1092,7 @@ inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
const vobj &rhs,
const int Ns, const int ss)
{
auto WWVV_v = WWVV.View();
autoView(WWVV_v,WWVV,CpuWrite);
for (int s1 = 0; s1 < Ns; s1++){
for (int s2 = 0; s2 < Ns; s2++){
WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
@ -1122,10 +1121,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View();
auto WWVV1_v = WWVV1.View();
auto O_trtr_v= O_trtr.View();
auto O_fig8_v= O_fig8.View();
autoView(WWVV0_v , WWVV0,CpuRead);
autoView(WWVV1_v , WWVV1,CpuRead);
autoView(O_trtr_v, O_trtr,CpuWrite);
autoView(O_fig8_v, O_fig8,CpuWrite);
thread_for(ss,grid->oSites(),{
typedef typename ComplexField::vector_object vobj;
@ -1166,10 +1165,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View();
auto WWVV1_v = WWVV1.View();
auto O_trtr_v= O_trtr.View();
auto O_fig8_v= O_fig8.View();
autoView( WWVV0_v , WWVV0,CpuRead);
autoView( WWVV1_v , WWVV1,CpuRead);
autoView( O_trtr_v, O_trtr,CpuWrite);
autoView( O_fig8_v, O_fig8,CpuWrite);
thread_for(ss,grid->oSites(),{

View File

@ -350,11 +350,11 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
GridBase *grid = q1_left.Grid();
auto vbaryon_corr= baryon_corr.View();
auto v1 = q1_left.View();
auto v2 = q2_left.View();
auto v3 = q3_left.View();
autoView(vbaryon_corr, baryon_corr,CpuWrite);
autoView( v1 , q1_left, CpuRead);
autoView( v2 , q2_left, CpuRead);
autoView( v3 , q3_left, CpuRead);
Real bytes =0.;
bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
@ -989,10 +989,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View();
auto vq_loop = qq_loop.View();
auto vd_tf = qd_tf.View();
auto vs_ti = qs_ti.View();
autoView( vcorr, stn_corr, CpuWrite);
autoView( vq_loop , qq_loop, CpuRead);
autoView( vd_tf , qd_tf, CpuRead);
autoView( vs_ti , qs_ti, CpuRead);
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto Dq_loop = vq_loop[ss];
@ -1029,13 +1029,13 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View();
auto vq_ti = qq_ti.View();
auto vq_tf = qq_tf.View();
auto vd_tf = qd_tf.View();
auto vs_ti = qs_ti.View();
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
autoView( vcorr , stn_corr, CpuWrite);
autoView( vq_ti , qq_ti, CpuRead);
autoView( vq_tf , qq_tf, CpuRead);
autoView( vd_tf , qd_tf, CpuRead);
autoView( vs_ti , qs_ti, CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{
auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss];
auto Dd_tf = vd_tf[ss];

View File

@ -47,8 +47,8 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
GridBase *grid=x.Grid();
Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View();
auto z_v = z.View();
autoView(x_v, x, AcceleratorRead);
autoView(z_v, z, AcceleratorWrite);
accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
coalescedWrite(z_v[ss],tmp);
@ -63,9 +63,9 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
conformable(x,z);
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
// FIXME -- need a new class of accelerator_loop to implement this
//
uint64_t nloop = grid->oSites()/Ls;
@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
conformable(x,z);
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
conformable(x,z);
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -189,8 +189,8 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
conformable(x,z);
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -222,8 +222,8 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
static_assert(nbasis % 2 == 0, "");
int nb = nbasis / 2;
auto z_v = z.View();
auto x_v = x.View();
autoView( z_v, z, AcceleratorWrite);
autoView( x_v, x, AcceleratorRead);
accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
{
for(int n = 0; n < nb; ++n) {

View File

@ -222,11 +222,11 @@ public:
conformable(subgroup, Determinant);
int i0, i1;
su2SubGroupIndex(i0, i1, su2_index);
auto subgroup_v = subgroup.View();
auto source_v = source.View();
auto Determinant_v = Determinant.View();
thread_for(ss, grid->oSites(), {
autoView( subgroup_v , subgroup,AcceleratorWrite);
autoView( source_v , source,AcceleratorRead);
autoView( Determinant_v , Determinant,AcceleratorWrite);
accelerator_for(ss, grid->oSites(), 1, {
subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
@ -257,15 +257,16 @@ public:
su2SubGroupIndex(i0, i1, su2_index);
dest = 1.0; // start out with identity
auto dest_v = dest.View();
auto subgroup_v = subgroup.View();
thread_for(ss, grid->oSites(),
autoView( dest_v , dest, AcceleratorWrite);
autoView( subgroup_v, subgroup, AcceleratorRead);
accelerator_for(ss, grid->oSites(),1,
{
dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
});
}
///////////////////////////////////////////////
@ -608,8 +609,8 @@ public:
// reunitarise??
template <typename LatticeMatrixType>
static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out,
double scale = 1.0) {
static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0)
{
GridBase *grid = out.Grid();
typedef typename LatticeMatrixType::vector_type vector_type;
@ -618,8 +619,7 @@ public:
typedef iSinglet<vector_type> vTComplexType;
typedef Lattice<vTComplexType> LatticeComplexType;
typedef typename GridTypeMapper<
typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
LatticeComplexType ca(grid);
LatticeMatrixType lie(grid);
@ -629,6 +629,7 @@ public:
MatrixType ta;
lie = Zero();
for (int a = 0; a < AdjointDimension; a++) {
random(pRNG, ca);
@ -640,6 +641,7 @@ public:
la = ci * ca * ta;
lie = lie + la; // e^{i la ta}
}
taExp(lie, out);
}
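Schematically, the loop above assembles a random algebra element before exponentiating (ci is assumed to carry the i*scale factor set earlier in this routine):
  // lie = sum_a ci * c_a * T_a   with c_a a random complex scalar field,
  // out = taExp(lie), i.e. roughly exp(i * scale * sum_a c_a T_a): a group
  // element near the identity for small scale.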