Staggered move to accelerator

2025-09-18 09:11:04 +01:00 · 2020-05-28 08:33:06 -04:00
parent cf2938688a
commit 006cc8a8f1
18 changed files with 1319 additions and 356 deletions
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);

+#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
 NAMESPACE_CHECK(Staggered);
@@ -282,6 +283,10 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;

+typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
+typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
+typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
+
 typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -0,0 +1,194 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
+#define GRID_QCD_NAIVE_STAG_FERMION_H
+
+NAMESPACE_BEGIN(Grid);
+
+class NaiveStaggeredFermionStatic {
+public:
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+  static const int npoint = 8;
+};
+
+template <class Impl>
+class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef StaggeredKernels<Impl> Kernels;
+
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }
+
+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  GridBase *GaugeGrid(void) { return _grid; }
+  GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
+  GridBase *FermionGrid(void) { return _grid; }
+  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
+
+  //////////////////////////////////////////////////////////////////
+  // override multiply; cut number routines if pass dagger argument
+  // and also make interface more uniformly consistent
+  //////////////////////////////////////////////////////////////////
+  RealD M(const FermionField &in, FermionField &out);
+  RealD Mdag(const FermionField &in, FermionField &out);
+
+  /////////////////////////////////////////////////////////
+  // half checkerboard operations
+  /////////////////////////////////////////////////////////
+  void Meooe(const FermionField &in, FermionField &out);
+  void MeooeDag(const FermionField &in, FermionField &out);
+  void Mooee(const FermionField &in, FermionField &out);
+  void MooeeDag(const FermionField &in, FermionField &out);
+  void MooeeInv(const FermionField &in, FermionField &out);
+  void MooeeInvDag(const FermionField &in, FermionField &out);
+
+  ////////////////////////
+  // Derivative interface
+  ////////////////////////
+  // Interface calls an internal routine
+  void DhopDeriv  (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+
+  ///////////////////////////////////////////////////////////////
+  // non-hermitian hopping term; half cb or both
+  ///////////////////////////////////////////////////////////////
+  void Dhop  (const FermionField &in, FermionField &out, int dag);
+  void DhopOE(const FermionField &in, FermionField &out, int dag);
+  void DhopEO(const FermionField &in, FermionField &out, int dag);
+
+  ///////////////////////////////////////////////////////////////
+  // Multigrid assistance; force term uses too
+  ///////////////////////////////////////////////////////////////
+  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
+  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
+
+  ///////////////////////////////////////////////////////////////
+  // Extra methods added by derived
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl &st, 
+		     DoubledGaugeField &U,
+		     GaugeField &mat, 
+		     const FermionField &A, const FermionField &B, int dag);
+
+  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+			       const FermionField &in, FermionField &out, int dag);
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+				   const FermionField &in, FermionField &out, int dag);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Grid own interface Constructor
+  //////////////////////////////////////////////////////////////////////////
+  NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
+			GridRedBlackCartesian &Hgrid, RealD _mass,
+			RealD _c1, RealD _u0,
+			const ImplParams &p = ImplParams());
+  NaiveStaggeredFermion(GridCartesian &Fgrid,
+			GridRedBlackCartesian &Hgrid, RealD _mass,
+			RealD _c1, RealD _u0,
+			const ImplParams &p = ImplParams());
+
+  // DoubleStore impl dependent
+  void ImportGauge      (const GaugeField &_U );
+  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  void CopyGaugeCheckerboards(void);
+
+  ///////////////////////////////////////////////////////////////
+  // Data members required to support the functionality
+  ///////////////////////////////////////////////////////////////
+
+  //    protected:
+public:
+  // any other parameters of action ???
+  virtual int   isTrivialEE(void) { return 1; };
+  virtual RealD Mass(void) { return mass; }
+  RealD mass;
+  RealD u0;
+  RealD c1;
+
+  GridBase *_grid;
+  GridBase *_cbgrid;
+
+  // Defines the stencils for even and odd
+  StencilImpl Stencil;
+  StencilImpl StencilEven;
+  StencilImpl StencilOdd;
+
+  // Copy of the gauge field , with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;
+
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                PropagatorField &src,
+                                Current curr_type,
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           PropagatorField &srct,
+                           Current curr_type,
+                           unsigned int mu, 
+                           unsigned int tmin,
+                           unsigned int tmax,
+			   ComplexField &lattice_cmplx);
+};
+
+typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
+typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -49,21 +49,32 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   
 public:

+  void DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
+		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
+		    const FermionField &in, FermionField &out, int dag, int interior,int exterior);
+  void DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
+		 DoubledGaugeField &U,
+		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
+  
  void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 		     int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
+ protected:    

   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
+   void DhopSiteGeneric(StencilView &st, 
 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
+   void DhopSiteGenericInt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
+   void DhopSiteGenericExt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -71,15 +82,18 @@ public:
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
+   void DhopSiteHand(StencilView &st, 
 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
+   void DhopSiteHandInt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
+   void DhopSiteHandExt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -87,27 +101,10 @@ public:
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+   void DhopSiteAsm(StencilView &st, 
 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
 		    const FermionFieldView &in, FermionFieldView &out,int dag);
-   ///////////////////////////////////////////////////////////////////////////////////////////////////
-   // Generic interface; fan out to right routine
-   ///////////////////////////////////////////////////////////////////////////////////////////////////
-   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
-		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
-		 SiteSpinor * buf, int LLs, int sU,
-		 const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
-
-   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
-		    DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
-		    SiteSpinor * buf, int LLs, int sU,
-		    const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
-
-   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
-		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
-		 SiteSpinor * buf, int LLs, int sU,
-		 const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
  
 public:

--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
-#ifdef GRID_OMP
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
  else
-#endif
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }

@@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
 								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 								   const FermionField &in, FermionField &out,int dag)
 {
-#ifdef GRID_OMP
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-
  Compressor compressor; 

  int LLs = in.Grid()->_rdimensions[0];
@@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
+  DhopFaceTime+=usecond();
+
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);
+
  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
+  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  double ctime=0;
-  double ptime=0;
-
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
+  // Removed explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
+  DhopComputeTime-=usecond();
  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = U.Grid()->oSites(); // 4d vol
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
+    int interior=1;
+    int exterior=0;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-
-      // do the compute
-      auto   U_v  =   U.View(CpuRead);
-      auto UUU_v  = UUU.View(CpuRead);
-      auto  in_v  =  in.View(CpuRead);
-      auto out_v  = out.View(CpuWrite);
-
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
-        }
-      }
-        ptime = usecond() - start;
-    } else {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
-  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
-
-  // First to enter, last to leave timing
-  st.CollateThreads();
+  DhopComputeTime+=usecond();

  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

-  DhopComputeTime2-=usecond();
+  st.CommunicateComplete(requests);
+  DhopCommTime +=usecond();

-  auto   U_v  =   U.View(CpuRead);
-  auto UUU_v  = UUU.View(CpuRead);
-  auto  in_v  =  in.View(CpuRead);
-  auto out_v  = out.View(CpuWrite);
-  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    thread_for( ss,sz,{
-      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
-    });
-  } else {
-    int sz=st.surface_list.size();
-    thread_for( ss,sz,{
-      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
-    });
+  DhopComputeTime2-=usecond();
+  {
+    int interior=0;
+    int exterior=1;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime2+=usecond();
-#else
-  assert(0);
-#endif
-
 }

 template<class Impl>
@@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  Compressor compressor;
  int LLs = in.Grid()->_rdimensions[0];

-
-
 //double t1=usecond();
  DhopTotalTime -= usecond();
  DhopCommTime -= usecond();
@@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  auto   U_v  =   U.View(CpuRead);
-  auto UUU_v  = UUU.View(CpuRead);
-  auto  in_v  =  in.View(CpuRead);
-  auto out_v  = out.View(CpuWrite);
-  if (dag == DaggerYes) {
-    thread_for( ss,U.Grid()->oSites(),{
-      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
-    });
-  } else {
-    thread_for( ss,U.Grid()->oSites(),{
-      int sU=ss;
-      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
-    });
+  {
+    int interior=1;
+    int exterior=1;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
- //double t2=usecond();
- //std::cout << __FILE__ << " " << __func__  << " Total Time " << DhopTotalTime << std::endl;
- //std::cout << __FILE__ << " " << __func__  << " Total Time Org " << t2-t1 << std::endl;
- //std::cout << __FILE__ << " " << __func__  << " Comml Time " << DhopCommTime << std::endl;
- //std::cout << __FILE__ << " " << __func__  << " Compute Time " << DhopComputeTime << std::endl;

 }
 /*CHANGE END*/
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -395,11 +395,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
 						  const FermionField &in,
 						  FermionField &out, int dag) 
 {
-#ifdef GRID_OMP
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
  else
-#endif
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }
 template <class Impl>
@@ -409,7 +407,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 								 const FermionField &in,
 								 FermionField &out, int dag) 
 {
-#ifdef GRID_OMP
  Compressor compressor; 
  int len =  U.Grid()->oSites();

@@ -418,60 +415,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
+  DhopFaceTime    += usecond();
+
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);
+
+  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
+  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
-#pragma omp parallel 
  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-
-      // do the compute
-      auto U_v   = U.View(CpuRead);
-      auto UUU_v = UUU.View(CpuRead);
-      auto in_v  = in.View(CpuRead);
-      auto out_v = out.View(CpuWrite);
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
-        }
-      }
-    } else {
-      st.CommunicateThreaded();
-    }
+    int interior=1;
+    int exterior=0;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime    += usecond();

+  st.CommunicateComplete(requests);
+  DhopCommTime +=usecond();
+
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
@@ -479,28 +446,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st

  DhopComputeTime2    -= usecond();
  {
-    auto U_v   = U.View(CpuRead);
-    auto UUU_v = UUU.View(CpuRead);
-    auto in_v  = in.View(CpuRead);
-    auto out_v = out.View(CpuWrite);
-    if (dag == DaggerYes) {
-      int sz=st.surface_list.size();
-      thread_for(ss,sz,{
-	int sU = st.surface_list[ss];
-	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
-      });
-    } else {
-      int sz=st.surface_list.size();
-      thread_for(ss,sz,{
-	int sU = st.surface_list[ss];
-	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
-      });
-    }
+    int interior=0;
+    int exterior=1;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime2    += usecond();
-#else
-  assert(0);
-#endif
 }


@@ -520,19 +470,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();

-  auto U_v   =   U.View(CpuRead);
-  auto UUU_v = UUU.View(CpuRead);
-  auto in_v  =  in.View(CpuRead);
-  auto out_v = out.View(CpuWrite);
  DhopComputeTime -= usecond();
-  if (dag == DaggerYes) {
-    thread_for(sss, in.Grid()->oSites(),{
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
-    });
-  } else {
-    thread_for(sss, in.Grid()->oSites(),{
-      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
-    });
+  {
+    int interior=1;
+    int exterior=1;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
--- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
@@ -0,0 +1,499 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+#pragma once 
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////
+// Constructor and gauge import
+/////////////////////////////////
+
+template <class Impl>
+NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
+						   RealD _mass,
+						   RealD _c1, RealD _u0,
+						   const ImplParams &p)
+  : Kernels(p),
+    _grid(&Fgrid),
+    _cbgrid(&Hgrid),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
+    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
+    mass(_mass),
+    Lebesgue(_grid),
+    LebesgueEvenOdd(_cbgrid),
+    Umu(&Fgrid),
+    UmuEven(&Hgrid),
+    UmuOdd(&Hgrid),
+    _tmp(&Hgrid)
+{
+  int vol4;
+  int LLs=1;
+  c1=_c1;
+  u0=_u0;
+  vol4= _grid->oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+  vol4= _cbgrid->oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+  StencilOdd.BuildSurfaceList(LLs,vol4);
+}
+
+template <class Impl>
+NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
+						   GridRedBlackCartesian &Hgrid, RealD _mass,
+						   RealD _c1, RealD _u0,
+						   const ImplParams &p)
+  : NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
+{
+  ImportGauge(_U);
+}
+
+////////////////////////////////////////////////////////////
+// Momentum space propagator should be 
+// https://arxiv.org/pdf/hep-lat/9712010.pdf
+//
+// mom space action.
+//   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
+//
+// must track through staggered flavour/spin reduction in literature to 
+// turn to free propagator for the one component chi field, a la page 4/5
+// of above link to implement a Fourier-based solver.
+////////////////////////////////////////////////////////////
+
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
+{
+  pickCheckerboard(Even, UmuEven,  Umu);
+  pickCheckerboard(Odd,  UmuOdd ,  Umu);
+}
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U) 
+{
+  GaugeLinkField U(GaugeGrid());
+  DoubledGaugeField _UUU(GaugeGrid());
+  ////////////////////////////////////////////////////////
+  // Double Store should take two fields for Naik and one hop separately.
+  // Discard the Naik links, since this is the Naive action
+  ////////////////////////////////////////////////////////
+  Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
+
+  ////////////////////////////////////////////////////////
+  // Apply scale factors to get the right fermion Kinetic term
+  // Could pass coeffs into the double store to save work.
+  // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) 
+  ////////////////////////////////////////////////////////
+  for (int mu = 0; mu < Nd; mu++) {
+
+    U = PeekIndex<LorentzIndex>(Umu, mu);
+    PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
+    
+    U = PeekIndex<LorentzIndex>(Umu, mu+4);
+    PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
+
+  }
+
+  CopyGaugeCheckerboards();
+}
+
+/////////////////////////////
+// Implement the interface
+/////////////////////////////
+
+template <class Impl>
+RealD NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Dhop(in, out, DaggerNo);
+  return axpy_norm(out, mass, in, out);
+}
+
+template <class Impl>
+RealD NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Dhop(in, out, DaggerYes);
+  return axpy_norm(out, mass, in, out);
+}
+
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerNo);
+  } else {
+    DhopOE(in, out, DaggerNo);
+  }
+}
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerYes);
+  } else {
+    DhopOE(in, out, DaggerYes);
+  }
+}
+
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  typename FermionField::scalar_type scal(mass);
+  out = scal * in;
+}
+
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Mooee(in, out);
+}
+
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  out = (1.0 / (mass)) * in;
+}
+
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) 
+{
+  out.Checkerboard() = in.Checkerboard();
+  MooeeInv(in, out);
+}
+
+///////////////////////////////////
+// Internal
+///////////////////////////////////
+
+// Shared worker for the DhopDeriv* entry points.
+//
+// UNFINISHED: the direction loop hits assert(0) at the end of its first
+// iteration, and nothing is ever written to mat.  The visible body only
+//   - checks the dag flag,
+//   - copies A into a temporary (Atilde, not used afterwards),
+//   - halo-exchanges B through the stencil, and
+//   - runs Kernels::DhopDirKernel over all outer sites for mu = 0.
+//
+// Parameters:
+//   st   stencil used for the halo exchange and passed to the kernel
+//   U    doubled gauge field; its view is passed twice to DhopDirKernel
+//   mat  intended output (currently untouched)
+//   A,B  fermion fields; only B is passed to the kernel
+//   dag  must be DaggerNo or DaggerYes
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 						GaugeField & mat,
 						const FermionField &A, const FermionField &B, int dag) 
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor;
 
   FermionField Btilde(B.Grid());
   FermionField Atilde(B.Grid());
   Atilde = A;
 
   st.HaloExchange(B, compressor);
 
   for (int mu = 0; mu < Nd; mu++) {
 
     ////////////////////////
     // Call the single hop
     ////////////////////////
+    // NOTE(review): B is a const input but its view is opened with CpuWrite;
+    // CpuRead looks like the intended mode -- confirm.
     auto U_v   = U.View(CpuRead);
     auto B_v      = B.View(CpuWrite);
     auto Btilde_v = Btilde.View(CpuWrite);
+    // sss is passed as both site arguments of the kernel; displacement is
+    // fixed at 1.
     thread_for(sss,B.Grid()->oSites(),{
       Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
     });
 
+    // NOTE(review): the "three link term" wording appears to be carried over
+    // from the improved staggered action -- confirm whether it applies here.
     assert(0);// need to figure out the force interface with a blasted three link term.
     
   }
 }
+
+// Full-grid derivative entry point: checks that U, V and mat share the full
+// grid, then delegates to DerivInternal with the full stencil and gauge field.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  // All three fields must live on the full grid.
+  conformable(U.Grid(), _grid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());
+
+  // The output inherits the checkerboard of U.
+  mat.Checkerboard() = U.Checkerboard();
+
+  DerivInternal(Stencil, Umu, mat, U, V, dag);
+}
+
+// Checkerboarded derivative with V on Even and U on Odd; the output is
+// tagged Odd.  Delegates to DerivInternal with StencilEven and UmuOdd.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  // All three fields must live on the checkerboarded grid.
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());
+
+  // Parity requirements on the two fermion fields.
+  assert(V.Checkerboard() == Even);
+  assert(U.Checkerboard() == Odd);
+
+  mat.Checkerboard() = Odd;
+  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
+}
+
+// Checkerboarded derivative with V on Odd and U on Even; the output is
+// tagged Even.  Delegates to DerivInternal with StencilOdd and UmuEven.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  // All three fields must live on the checkerboarded grid.
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());
+
+  // Parity requirements on the two fermion fields.
+  assert(V.Checkerboard() == Odd);
+  assert(U.Checkerboard() == Even);
+
+  mat.Checkerboard() = Even;
+  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
+}
+
+// Hopping term on the full grid.  Adds 2 to the DhopCalls counter (the
+// checkerboarded DhopOE/DhopEO add 1 each).
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
+{
+  DhopCalls += 2;
+
+  // Input must be on the full grid and share it with the output.
+  conformable(in.Grid(), _grid);
+  conformable(in.Grid(), out.Grid());
+
+  // Result keeps the checkerboard tag of the source.
+  out.Checkerboard() = in.Checkerboard();
+
+  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
+}
+
+// Checkerboarded hop taking an Even input and producing an Odd output.
+// Adds 1 to the DhopCalls counter.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
+{
+  DhopCalls += 1;
+
+  // Input must be on the half (checkerboarded) grid; the grid comparison
+  // with out does not look at the checkerboard tag.
+  conformable(in.Grid(), _cbgrid);
+  conformable(in.Grid(), out.Grid());
+
+  // Even in, Odd out.
+  assert(in.Checkerboard() == Even);
+  out.Checkerboard() = Odd;
+
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
+}
+
+// Checkerboarded hop taking an Odd input and producing an Even output.
+// Adds 1 to the DhopCalls counter.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
+{
+  DhopCalls += 1;
+
+  // Input must be on the half (checkerboarded) grid; the grid comparison
+  // with out does not look at the checkerboard tag.
+  conformable(in.Grid(), _cbgrid);
+  conformable(in.Grid(), out.Grid());
+
+  // Odd in, Even out.
+  assert(in.Checkerboard() == Odd);
+  out.Checkerboard() = Even;
+
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
+}
+
+// Single-direction operator: a thin forwarder to DhopDir with the same
+// direction and displacement.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
+{
+  DhopDir(in, out, dir, disp);
+}
+// All-directions variant of Mdir.  Placeholder only: calling it trips the
+// assert below.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
+{
+  // Not implemented yet
+  assert(0);
+}
+
+// Single-direction hop.
+//
+// UNFINISHED: the kernel loop is commented out and the function ends in
+// assert(0), so out is never written.  What still runs before the assert is
+// the halo exchange of in through the full-grid Stencil and the opening of
+// views on Umu, in and out.
+//
+// Parameters:
+//   in    source field (halo-exchanged, then viewed read-only)
+//   out   destination field (viewed for write, but left untouched)
+//   dir   direction index, currently unused
+//   disp  displacement, currently unused
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
 
   Compressor compressor;
   Stencil.HaloExchange(in, compressor);
   auto Umu_v   =   Umu.View(CpuRead);
   auto in_v    =  in.View(CpuRead);
   auto out_v   = out.View(CpuWrite);
+  // NOTE(review): the disabled call below passes one gauge view, whereas the
+  // DhopDirKernel call in DerivInternal passes the gauge view twice (U and
+  // UUU) -- the argument list needs updating before this is re-enabled.
   //  thread_for( sss, in.Grid()->oSites(),{
   //    Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
   //  });
   assert(0);
 };
+
+
+// Dispatches the hopping term to the overlapped-comms or serial-comms
+// implementation according to the global StaggeredKernelsStatic::Comms
+// setting.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+					       DoubledGaugeField &U,
+					       const FermionField &in,
+					       FermionField &out, int dag)
+{
+  const bool overlapComms =
+    (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute);
+
+  if (!overlapComms) {
+    // Every other setting falls back to the serial path.
+    DhopInternalSerialComms(st,lo,U,in,out,dag);
+    return;
+  }
+
+  DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+}
+// Hopping term with communication overlapped against the interior compute.
+//
+// Sequence:
+//   1. gather the halo faces                       (DhopFaceTime)
+//   2. start communication                         (DhopCommTime begins)
+//   3. merge shared-memory faces                   (DhopFaceTime)
+//   4. interior-only kernel pass                   (DhopComputeTime)
+//   5. wait for communication to complete          (DhopCommTime ends)
+//   6. merge the received faces                    (DhopFaceTime)
+//   7. exterior-only kernel pass                   (DhopComputeTime2)
+// DhopTotalTime brackets the whole routine.
+//
+// Timer convention: each timer is decremented by usecond() on entry to a
+// region and incremented by usecond() on exit, so it accumulates elapsed
+// microseconds.
+//
+// Parameters:
+//   st   stencil driving the gather / communicate / merge steps
+//   lo   Lebesgue ordering, forwarded to the kernel driver
+//   U    doubled gauge field
+//   in   source field
+//   out  destination field
+//   dag  must be DaggerNo or DaggerYes (same check as the serial path)
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+							      DoubledGaugeField &U,
+							      const FermionField &in,
+							      FermionField &out, int dag) 
+{
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  Compressor compressor; 
+
+  DhopTotalTime   -= usecond();
+
+  DhopFaceTime    -= usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  DhopFaceTime    += usecond();
+
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);
+
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);
+  DhopFaceTime+= usecond();
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Removed explicit thread comms
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime    -= usecond();
+  {
+    int interior=1;
+    int exterior=0;
+    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+  }
+  DhopComputeTime    += usecond();
+
+  st.CommunicateComplete(requests);
+  DhopCommTime +=usecond();
+
+  // First to enter, last to leave timing
+  DhopFaceTime    -= usecond();
+  st.CommsMerge(compressor);
+  // Close the interval with +=; a second -= here would subtract the
+  // timestamp twice and corrupt the accumulated face time.
+  DhopFaceTime    += usecond();
+
+  DhopComputeTime2    -= usecond();
+  {
+    int interior=0;
+    int exterior=1;
+    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+  }
+  DhopComputeTime2    += usecond();
+
+  // Stop the total timer started at the top, as the serial path does.
+  DhopTotalTime   += usecond();
+}
+
+// Hopping term with communication and compute run back to back: a blocking
+// halo exchange followed by one kernel pass that handles interior and
+// exterior legs together.  Updates DhopTotalTime, DhopCommTime and
+// DhopComputeTime.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+							  DoubledGaugeField &U,
+							  const FermionField &in,
+							  FermionField &out, int dag)
+{
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  DhopTotalTime   -= usecond();
+
+  // Communication phase
+  DhopCommTime    -= usecond();
+  Compressor compressor;
+  st.HaloExchange(in, compressor);
+  DhopCommTime    += usecond();
+
+  // Compute phase: both flags set, so a single pass covers every leg
+  DhopComputeTime -= usecond();
+  {
+    const int interior = 1;
+    const int exterior = 1;
+    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+  }
+  DhopComputeTime += usecond();
+
+  DhopTotalTime   += usecond();
+}
+
+  ////////////////////////////////////////////////////////////////
+  // Reporting
+  ////////////////////////////////////////////////////////////////
+// Prints the accumulated Dhop call count and per-call timings, derived
+// flop-rate figures, and the reports of the three stencils.
+//
+// Side effect: DhopComputeTime is overwritten in place (summed across ranks
+// with GlobalSum, then divided by the processor count), so calling Report
+// twice without ZeroCounters in between averages an already averaged value.
+//
+// NOTE(review): every per-call figure divides by DhopCalls with no guard for
+// a zero call count.
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::Report(void) 
 {
+  // Global lattice volume = product of the first Nd global dimensions.
   Coordinate latt = _grid->GlobalDimensions();
   RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
   RealD NP = _grid->_Nprocessors;
   RealD NN = _grid->NodeCount();
 
   std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
 
+  // These per-call timings use this rank's counters, before any averaging.
   std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
   std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
   std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
   std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
 
   // Average the compute time
   _grid->GlobalSum(DhopComputeTime);
   DhopComputeTime/=NP;
 
+  // NOTE(review): 1154 is presumably a flops-per-site count; this operator
+  // has no three-link legs, so confirm the constant is right for it.
   RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
   std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
   
+  // Same figure computed from the total time (DhopTotalTime is not
+  // rank-averaged here).
   RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
   std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
 
+  // Per-stencil reports.
   std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
   std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
   std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 }
+// Resets every Dhop call/timing accumulator of this operator and the
+// counters of its three stencils.
+template<class Impl>
+void NaiveStaggeredFermion<Impl>::ZeroCounters(void) 
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  // Exterior-pass timer accumulated by DhopInternalOverlappedComms; it must
+  // be cleared with the others or it keeps growing across resets.
+  DhopComputeTime2= 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
+
+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented.
+////////////////////////////////////////////////////////
+// Placeholder for the conserved-current contraction.  Not implemented:
+// calling it trips the assert below and no argument is touched.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                           PropagatorField &q_in_2,
+                                                           PropagatorField &q_out,
+                                                           PropagatorField &src,
+                                                           Current curr_type,
+                                                           unsigned int mu)
+{
+  // Not implemented.
+  assert(0);
+}
+
+// Placeholder for the sequential conserved-current insertion.  Not
+// implemented: calling it trips the assert below and no argument is touched.
+template <class Impl>
+void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                      PropagatorField &q_out,
+                                                      PropagatorField &src,
+                                                      Current curr_type,
+                                                      unsigned int mu,
+                                                      unsigned int tmin,
+                                                      unsigned int tmax,
+                                                      ComplexField &lattice_cmplx)
+{
+  // Not implemented.
+  assert(0);
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
 					 DoubledGaugeFieldView &U,
 					 DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs,
+					 SiteSpinor *buf, int sF,
 					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  assert(0);
@@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
  
+
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
+								    SiteSpinor *buf, int sF,
 								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
  StencilEntry *SE2;
  StencilEntry *SE3;

-   for(int s=0;s<LLs;s++){
+  //   for(int s=0;s<LLs;s++){

-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 }

 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st, 
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
+								    SiteSpinor *buf, int sF,
 								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
 #ifdef AVX512
@@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
  StencilEntry *SE2;
  StencilEntry *SE3;

-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
  // This is the single precision 5th direction vectorised kernel

 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
+							       SiteSpinor *buf, int sF,
 							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
  StencilEntry *SE2;
  StencilEntry *SE3;

-  for(int s=0;s<LLs;s++){
-    
-    int sF=s+LLs*sU;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHIa(addr0,addr1);
@@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 }

 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
+							       SiteSpinor *buf, int sF,
 							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
  StencilEntry *SE2;
  StencilEntry *SE3;

-  for(int s=0;s<LLs;s++){
-    
-    int sF=s+LLs*sU;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHIa(addr0,addr1);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);


 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
+void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
-					  SiteSpinor *buf, int LLs, int sU, 
+					  SiteSpinor *buf, int sF, int sU, 
 					  const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;

-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=s+LLs*sU;
+  {

    skew = 0;
    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
@@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
+    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
@@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
-    
+    }    
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
      result()()(1) = - even_1 - odd_1;
@@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,


 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
+void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;

-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=s+LLs*sU;
+  {

    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
@@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
+    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
@@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
-
+    }
    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
@@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,


 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
+void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;

-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=s+LLs*sU;
+  {

    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
@@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
+    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
@@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
-
+    }
    // Add sum of all exterior connected stencil legs
    if ( nmu ) { 
      if ( dag ) {
@@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  }
 }

+/*
 #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-
+*/
 #undef LOAD_CHI

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
+void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out, int dag) {
+					     SiteSpinor *buf, int sF, int sU, 
+					     const FermionFieldView &in, FermionFieldView &out, int dag) 
+{
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
@@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  int ptype;
  int skew;

-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //  for(int s=0;s<LLs;s++){
+  //
+  //    int sF=LLs*sU+s;
+  {
    skew = 0;
    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
+    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
+    }
    if ( dag ) { 
      Uchi = - Uchi;
    } 
@@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  // Only contributions from interior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
+void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU, 
+						SiteSpinor *buf, int sF, int sU, 
 						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
@@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  int ptype;
  int skew ;

-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=LLs*sU+s;
+  {
    skew = 0;
    Uchi=Zero();
    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
+    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
+    }
    if ( dag ) {
      Uchi = - Uchi;
    }
@@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  // Only contributions from exterior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
+void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU,
+						SiteSpinor *buf, int sF, int sU,
 						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  //  SiteSpinor chi;
@@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
  int nmu=0;
  int skew ;

-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //  for(int s=0;s<LLs;s++){
+  //    int sF=LLs*sU+s;
+  {
    skew = 0;
    Uchi=Zero();
    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
+    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
-
+    }
    if ( nmu ) { 
      if ( dag ) { 
 	out[sF] = out[sF] - Uchi;
@@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
-
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs, int sU,
-					 const FermionFieldView &in, FermionFieldView &out,
-					 int interior,int exterior)
-{
-  int dag=1;
-  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-				      SiteSpinor *buf, int LLs, int sU,
-				      const FermionFieldView &in, FermionFieldView &out,
-				      int interior,int exterior)
-{
-  int dag=0;
-  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-				      SiteSpinor *buf, int LLs,
-				      int sU, const FermionFieldView &in, FermionFieldView &out,
-				      int dag,int interior,int exterior) 
-{
-  switch(Opt) {
-#ifdef AVX512
-  case OptInlineAsm:
-    if ( interior && exterior ) {
-      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else { 
-      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
-      assert(0);
-    }
-    break;
-#endif
-  case OptHandUnroll:
-    if ( interior && exterior ) {
-      DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( interior ) {
-      DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( exterior ) {
-      DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-  case OptGeneric:
-    if ( interior && exterior ) {
-      DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( interior ) {
-      DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( exterior ) {
-      DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-  default:
-    std::cout<<"Oops Opt = "<<Opt<<std::endl;
-    assert(0);
-    break;
-  }
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
-					    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
+void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
+					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 {
  // Disp should be either +1,-1,+3,-3
  // What about "dag" ?
@@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
  assert(0);
 }

+// Kernel launch helpers used by the Dhop* drivers in this file.
+//
+// All three macros expect these names in the expansion scope:
+//   Nsite, Ls, st_v, U_v, UUU_v, buf, in_v, out_v, dag, and the typedef
+//   ThisKernel.
+// Each expansion declares a local NN = Nsite*Ls, so a macro can be used at
+// most once per enclosing scope.  For loop index ss the kernel receives
+// sF = ss and sU = ss/Ls.
+//
+// KERNEL_CALLNB(A,improved): launches ThisKernel::A<improved> over NN sites
+// through accelerator_forNB and does not issue a barrier itself.
 #define KERNEL_CALLNB(A,improved)					\
   const uint64_t    NN = Nsite*Ls;					\
   accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
       int sF = ss;							\
       int sU = ss/Ls;							\
       ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
     });
 
+// KERNEL_CALL(A,improved): KERNEL_CALLNB followed by accelerator_barrier().
 #define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); 
 
+// ASM_CALL(A): runs the non-template kernel ThisKernel::A over NN sites with
+// thread_for.
 #define ASM_CALL(A)							\
   const uint64_t    NN = Nsite*Ls;					\
   thread_for( ss, NN, {							\
       int sF = ss;							\
       int sU = ss/Ls;							\
       ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
   });
+
+// Driver for the improved hopping term: opens accelerator views on the
+// fields and stencil, then launches the site kernel selected by Opt and the
+// interior/exterior flags, with the kernel template argument set to 1.
+//
+// Parameters:
+//   st        stencil; supplies the view and the comms buffer
+//   lo        Lebesgue ordering (not used in this body)
+//   U, UUU    gauge fields, viewed read-only
+//   in        source field, viewed read-only
+//   out       destination field, viewed for write
+//   dag       dagger flag forwarded to the site kernel
+//   interior  non-zero selects interior legs
+//   exterior  non-zero selects exterior legs
+//
+// Kernel selection:
+//   both flags     -> DhopSiteGeneric / DhopSiteHand / DhopSiteAsm
+//   interior only  -> DhopSiteGenericInt / DhopSiteHandInt
+//   exterior only  -> DhopSiteGenericExt / DhopSiteHandExt
+// The Hand and Asm variants are compiled out when GRID_CUDA is defined.
+// Any combination with no matching kernel reaches the final assert.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
   GridBase *FGrid=in.Grid();  
   GridBase *UGrid=U.Grid();  
   typedef StaggeredKernels<Impl> ThisKernel;
   auto UUU_v = UUU.View(AcceleratorRead);
   auto U_v   =   U.View(AcceleratorRead);
   auto in_v  =  in.View(AcceleratorRead);
   auto out_v = out.View(AcceleratorWrite);
   auto st_v  =  st.View(AcceleratorRead);
   SiteSpinor * buf = st.CommBuf();
     
+  // If the fermion grid has one more dimension than the gauge grid, Ls is
+  // taken from reduced dimension 0 of the fermion grid; otherwise Ls = 1.
   int Ls=1;
   if(FGrid->Nd()==UGrid->Nd()+1){
     Ls    = FGrid->_rdimensions[0];
   }
   int Nsite = UGrid->oSites();
 
+  // Each macro expands inside its own braces because it declares a local NN.
   if( interior && exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
 #ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
     if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
 #ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
 #endif
   } else if( exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
 #ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
 #endif
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
+// Driver for the naive hopping term: opens accelerator views on the fields
+// and stencil, then launches the site kernel selected by Opt and the
+// interior/exterior flags, with the kernel template argument set to 0.
+//
+// Parameters:
+//   st        stencil; supplies the view and the comms buffer
+//   lo        Lebesgue ordering (not used in this body)
+//   U         gauge field, viewed read-only
+//   in        source field, viewed read-only
+//   out       destination field, viewed for write
+//   dag       dagger flag forwarded to the site kernel
+//   interior  non-zero selects interior legs
+//   exterior  non-zero selects exterior legs
+//
+// Kernel selection:
+//   both flags     -> DhopSiteGeneric / DhopSiteHand
+//   interior only  -> DhopSiteGenericInt / DhopSiteHandInt
+//   exterior only  -> DhopSiteGenericExt / DhopSiteHandExt
+// The Hand variants are compiled out when GRID_CUDA is defined, and there is
+// no assembly kernel for the naive action.  Any combination with no matching
+// kernel reaches the final assert instead of returning with out unwritten.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
+				       DoubledGaugeField &U,
+				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
+{
+  GridBase *FGrid=in.Grid();  
+  GridBase *UGrid=U.Grid();  
+  typedef StaggeredKernels<Impl> ThisKernel;
+  // The launch macros name UUU_v, so a second view of U stands in for it.
+  auto UUU_v=   U.View(AcceleratorRead);
+  auto U_v   =   U.View(AcceleratorRead);
+  auto in_v  =  in.View(AcceleratorRead);
+  auto out_v = out.View(AcceleratorWrite);
+  auto st_v  =  st.View(AcceleratorRead);
+  SiteSpinor * buf = st.CommBuf();
+
+  // If the fermion grid has one more dimension than the gauge grid, Ls is
+  // taken from reduced dimension 0 of the fermion grid; otherwise Ls = 1.
+  int Ls=1;
+  if(FGrid->Nd()==UGrid->Nd()+1){
+    Ls    = FGrid->_rdimensions[0];
+  }
+  int Nsite = UGrid->oSites();
+  
+  // Each macro expands inside its own braces because it declares a local NN.
+  if( interior && exterior ) { 
+    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
+#ifndef GRID_CUDA
+    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
+#endif
+  } else if( interior ) {
+    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
+#ifndef GRID_CUDA
+    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
+#endif
+  } else if( exterior ) { 
+    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
+#ifndef GRID_CUDA
+    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
+#endif
+  }
+  // Fail loudly, matching DhopImproved, rather than silently doing nothing.
+  assert(0 && " Kernel optimisation case not covered ");
+}
+
+
+#undef KERNEL_CALLNB
+#undef KERNEL_CALL
+#undef ASM_CALL
+
 NAMESPACE_END(Grid);


--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -496,5 +496,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   assert(0 && " Kernel optimisation case not covered ");
  }

+#undef KERNEL_CALLNB
+#undef KERNEL_CALL
+#undef ASM_CALL
+
 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::vector<int> NaiveStaggeredFermionStatic::directions{0, 1, 2, 3, 0, 1, 2, 3};        // eight entries: axes 0..3 listed twice; NOTE(review): presumably paired index-wise with displacements - confirm
+const std::vector<int> NaiveStaggeredFermionStatic::displacements{1, 1, 1, 1, -1, -1, -1, -1}; // eight entries: +1 for the first four, -1 for the last four
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): IMPLEMENTATION is not defined in this file; presumably this header, resolved per implementation directory, supplies it - confirm
+template class NaiveStaggeredFermion<IMPLEMENTATION>; // explicit instantiation of the operator for that single implementation
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc
@@ -0,0 +1 @@
+../NaiveStaggeredFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc
@@ -0,0 +1 @@
+../NaiveStaggeredFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -88,6 +88,7 @@ done
 CC_LIST=" \
  ImprovedStaggeredFermion5DInstantiation \
  ImprovedStaggeredFermionInstantiation \
+  NaiveStaggeredFermionInstantiation \
  StaggeredKernelsInstantiation "

 for impl in $STAG_IMPL_LIST
--- a/benchmarks/Benchmark_staggered.cc
+++ b/benchmarks/Benchmark_staggered.cc
@@ -87,26 +87,6 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
-  ref = Zero();
-  /*  
-  { // Naive wilson implementation
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
-      tmp = U[mu]*Cshift(src,mu,1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
-      }
-
-      tmp =adj(U[mu])*src;
-      tmp =Cshift(tmp,mu,-1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
-      }
-    }
-  }
-  ref = -0.5*ref;
-  */

  RealD mass=0.1;
  RealD c1=9.0/8.0;
@@ -125,10 +105,7 @@ int main (int argc, char ** argv)
  
  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-  err = ref-result; 
-  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  Grid_finalize();
 }
--- a/tests/core/Test_staggered_naive.cc
+++ b/tests/core/Test_staggered_naive.cc
@@ -0,0 +1,282 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_staggered_naive.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+ ;
+
+// Test driver for NaiveStaggeredFermionR.  Dhop is compared with a reference operator built from
+// covariant shifts; the even/odd split, the dagger, Mooee/MooeeInv and the hermiticity of MpcDagMpc
+// are then exercised.  Every residual is only printed to the log - nothing is asserted.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(&Grid);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);
+  pRNG.SeedFixedIntegers(seeds);
+  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+
+  typedef typename NaiveStaggeredFermionR::FermionField FermionField;
+  typedef typename NaiveStaggeredFermionR::ComplexField ComplexField;
+  typename NaiveStaggeredFermionR::ImplParams params;
+
+  FermionField src   (&Grid); random(pRNG,src);
+  FermionField result(&Grid); result=Zero();
+  FermionField    ref(&Grid);    ref=Zero();
+  FermionField    tmp(&Grid);    tmp=Zero();
+  FermionField    err(&Grid);    err=Zero();
+  FermionField phi   (&Grid); random(pRNG,phi);
+  FermionField chi   (&Grid); random(pRNG,chi);
+  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  std::vector<LatticeColourMatrix> U(Nd,&Grid);
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }
+
+  // Split the gauge field into one colour matrix per direction
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  /* Debug force unit
+    U[mu] = 1.0;
+    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+  */
+  }
+
+  RealD mass=0.1;
+  RealD c1=9.0/8.0;
+  RealD u0=1.0;
+
+  { // Reference: one-hop staggered operator assembled from covariant shifts
+    ref = Zero();
+    RealD c1tad = 0.5*c1/u0;
+
+    Lattice<iScalar<vInteger> > x(&Grid); LatticeCoordinate(x,0);
+    Lattice<iScalar<vInteger> > y(&Grid); LatticeCoordinate(y,1);
+    Lattice<iScalar<vInteger> > z(&Grid); LatticeCoordinate(z,2);
+    Lattice<iScalar<vInteger> > t(&Grid); LatticeCoordinate(t,3);
+
+    Lattice<iScalar<vInteger> > lin_z(&Grid); lin_z=x+y;
+    Lattice<iScalar<vInteger> > lin_t(&Grid); lin_t=x+y+z;
+
+    for(int mu=0;mu<Nd;mu++){
+
+      // Staggered phase for direction mu: +1, flipped to -1 on sites where the sum of the
+      // coordinates of all lower directions is odd (no flip at all for mu==0).
+      ComplexField phases(&Grid); phases=1.0;
+
+      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
+      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
+      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
+
+      tmp = PeriodicBC::CovShiftForward(U[mu],mu,src);
+      ref = ref +c1tad*tmp*phases; // Forward 1 hop
+
+      tmp = PeriodicBC::CovShiftBackward(U[mu],mu,src);
+      ref = ref -c1tad*tmp*phases; // Backward 1 hop
+
+    }
+    //    ref = ref + mass * src;
+  }
+
+  NaiveStaggeredFermionR Ds(Umu,Grid,RBGrid,mass,c1,u0,params);
+
+  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation         "<<std::endl;
+  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
+
+  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
+  int ncall=1000;
+  // Time ncall back-to-back applications of Dhop; every call overwrites result
+  double t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Ds.Dhop(src,result,0);
+  }
+  double t1=usecond();
+  double t2=0;
+  // NOTE(review): the 16 and 15 below look inherited from the improved-staggered test; the reference
+  // operator above sums only 8 hop terms (4 directions, forward and backward) - confirm the count.
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
+
+  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  err = ref-result;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+
+  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"= Testing that Deo + Doe = Dunprec "<<std::endl;
+  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
+
+  FermionField src_e   (&RBGrid);
+  FermionField src_o   (&RBGrid);
+  FermionField r_e   (&RBGrid);
+  FermionField r_o   (&RBGrid);
+  FermionField r_eo  (&Grid);
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  Ds.Meooe(src_e,r_o);  std::cout<<GridLogMessage<<"Applied Meo"<<std::endl;
+  Ds.Meooe(src_o,r_e);  std::cout<<GridLogMessage<<"Applied Moe"<<std::endl;
+  Ds.Dhop (src,ref,DaggerNo);
+
+  // Merge both checkerboard results into one full-lattice field
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  err= ref - r_eo;
+  std::cout<<GridLogMessage << "EO norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(r_eo) <<std::endl;
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"= Test Ddagger is the dagger of D by requiring                "<<std::endl;
+  std::cout<<GridLogMessage<<"=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  "<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+
+  FermionField chi_e   (&RBGrid);
+  FermionField chi_o   (&RBGrid);
+
+  FermionField dchi_e  (&RBGrid);
+  FermionField dchi_o  (&RBGrid);
+
+  FermionField phi_e   (&RBGrid);
+  FermionField phi_o   (&RBGrid);
+
+  FermionField dphi_e  (&RBGrid);
+  FermionField dphi_o  (&RBGrid);
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+  pickCheckerboard(Even,phi_e,phi);
+  pickCheckerboard(Odd ,phi_o,phi);
+
+  Ds.Meooe(chi_e,dchi_o);
+  Ds.Meooe(chi_o,dchi_e);
+  Ds.MeooeDag(phi_e,dphi_o);
+  Ds.MeooeDag(phi_o,dphi_e);
+
+  ComplexD pDce = innerProduct(phi_e,dchi_e);
+  ComplexD pDco = innerProduct(phi_o,dchi_o);
+  ComplexD cDpe = innerProduct(chi_e,dphi_e);
+  ComplexD cDpo = innerProduct(chi_o,dphi_o);
+
+  std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
+  std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
+
+  // dchi_e was produced from chi_o (and dphi_o from phi_e), so opposite parities are paired here
+  std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
+  std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"= Test MeeInv Mee = 1                                         "<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+
+  Ds.Mooee(chi_e,src_e);
+  Ds.MooeeInv(src_e,phi_e);
+
+  Ds.Mooee(chi_o,src_o);
+  Ds.MooeeInv(src_o,phi_o);
+
+  setCheckerboard(phi,phi_e);
+  setCheckerboard(phi,phi_o);
+
+  err = phi-chi;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"= Test MeeInvDag MeeDag = 1                                   "<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+
+  Ds.MooeeDag(chi_e,src_e);
+  Ds.MooeeInvDag(src_e,phi_e);
+
+  Ds.MooeeDag(chi_o,src_o);
+  Ds.MooeeInvDag(src_o,phi_o);
+
+  setCheckerboard(phi,phi_e);
+  setCheckerboard(phi,phi_o);
+
+  err = phi-chi;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"= Test MpcDagMpc is Hermitian              "<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+
+  random(pRNG,phi);
+  random(pRNG,chi);
+  pickCheckerboard(Even,chi_e,chi);
+  pickCheckerboard(Odd ,chi_o,chi);
+  pickCheckerboard(Even,phi_e,phi);
+  pickCheckerboard(Odd ,phi_o,phi);
+
+  SchurDiagMooeeOperator<NaiveStaggeredFermionR,FermionField> HermOpEO(Ds);
+  // t1/t2 just fill the two trailing scalar arguments (presumably outputs - confirm); not inspected
+  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+
+  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+
+  pDce = innerProduct(phi_e,dchi_e);
+  pDco = innerProduct(phi_o,dchi_o);
+  cDpe = innerProduct(chi_e,dphi_e);
+  cDpo = innerProduct(chi_o,dphi_o);
+
+  std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
+  std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
+
+  // Here dchi_e/dphi_e come from chi_e/phi_e themselves, so same-parity elements are compared
+  std::cout<<GridLogMessage <<"pDco - conj(cDpo) "<< pDco-conj(cDpo) <<std::endl;
+  std::cout<<GridLogMessage <<"pDce - conj(cDpe) "<< pDce-conj(cDpe) <<std::endl;
+
+  Grid_finalize();
+}
				`@@ -0,0 +1 @@`
				`../NaiveStaggeredFermionInstantiation.cc.master`