Staggereed move to accelerator

2025-11-04 22:09:31 +00:00 · 2020-05-28 08:33:06 -04:00
parent cf2938688a
commit 006cc8a8f1
18 changed files with 1319 additions and 356 deletions
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
 #include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
 NAMESPACE_CHECK(Staggered);
@@ -282,6 +283,10 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
 typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -62,8 +62,8 @@ public:
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
-      double DhopComputeTime2;
+  double DhopComputeTime2;
-      double DhopFaceTime;
+  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -0,0 +1,194 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #ifndef GRID_QCD_NAIVE_STAG_FERMION_H
 #define GRID_QCD_NAIVE_STAG_FERMION_H
 NAMESPACE_BEGIN(Grid);
 class NaiveStaggeredFermionStatic {
 public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static const int npoint = 8;
 };
 template <class Impl>
 class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
 public:
  INHERIT_IMPL_TYPES(Impl);
  typedef StaggeredKernels<Impl> Kernels;
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
  GridBase *GaugeGrid(void) { return _grid; }
  GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
  RealD M(const FermionField &in, FermionField &out);
  RealD Mdag(const FermionField &in, FermionField &out);
  /////////////////////////////////////////////////////////
  // half checkerboard operations
  /////////////////////////////////////////////////////////
  void Meooe(const FermionField &in, FermionField &out);
  void MeooeDag(const FermionField &in, FermionField &out);
  void Mooee(const FermionField &in, FermionField &out);
  void MooeeDag(const FermionField &in, FermionField &out);
  void MooeeInv(const FermionField &in, FermionField &out);
  void MooeeInvDag(const FermionField &in, FermionField &out);
  ////////////////////////
  // Derivative interface
  ////////////////////////
  // Interface calls an internal routine
  void DhopDeriv  (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  ///////////////////////////////////////////////////////////////
  // non-hermitian hopping term; half cb or both
  ///////////////////////////////////////////////////////////////
  void Dhop  (const FermionField &in, FermionField &out, int dag);
  void DhopOE(const FermionField &in, FermionField &out, int dag);
  void DhopEO(const FermionField &in, FermionField &out, int dag);
  ///////////////////////////////////////////////////////////////
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
  ///////////////////////////////////////////////////////////////
  // Extra methods added by derived
  ///////////////////////////////////////////////////////////////
  void DerivInternal(StencilImpl &st, 
 		     DoubledGaugeField &U,
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);
  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 			       const FermionField &in, FermionField &out, int dag);
  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 				   const FermionField &in, FermionField &out, int dag);
  //////////////////////////////////////////////////////////////////////////
  // Grid own interface Constructor
  //////////////////////////////////////////////////////////////////////////
  NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
 			GridRedBlackCartesian &Hgrid, RealD _mass,
 			RealD _c1, RealD _u0,
 			const ImplParams &p = ImplParams());
  NaiveStaggeredFermion(GridCartesian &Fgrid,
 			GridRedBlackCartesian &Hgrid, RealD _mass,
 			RealD _c1, RealD _u0,
 			const ImplParams &p = ImplParams());
  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_U );
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
  //    protected:
 public:
  // any other parameters of action ???
  virtual int   isTrivialEE(void) { return 1; };
  virtual RealD Mass(void) { return mass; }
  RealD mass;
  RealD u0;
  RealD c1;
  GridBase *_grid;
  GridBase *_cbgrid;
  // Defines the stencils for even and odd
  StencilImpl Stencil;
  StencilImpl StencilEven;
  StencilImpl StencilOdd;
  // Copy of the gauge field , with even and odd subsets
  DoubledGaugeField Umu;
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
                                PropagatorField &src,
                                Current curr_type,
                                unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
                           PropagatorField &srct,
                           Current curr_type,
                           unsigned int mu, 
                           unsigned int tmin,
                           unsigned int tmax,
 			   ComplexField &lattice_cmplx);
 };
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -47,23 +47,34 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
  INHERIT_IMPL_TYPES(Impl);
  typedef FermionOperator<Impl> Base;
-public:
+ public:
-    
+
-   void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
+  void DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
-		      int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
+		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		    const FermionField &in, FermionField &out, int dag, int interior,int exterior);
  void DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeField &U,
 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
  void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 		     int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
 protected:    
   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
   void DhopSiteGeneric(StencilView &st, 
 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
   void DhopSiteGenericInt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
   void DhopSiteGenericExt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -71,15 +82,18 @@ public:
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
   void DhopSiteHand(StencilView &st, 
 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
   void DhopSiteHandInt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik>
   void DhopSiteHandExt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -87,27 +101,10 @@ public:
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+   void DhopSiteAsm(StencilView &st, 
 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
 		    const FermionFieldView &in, FermionFieldView &out,int dag);
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   // Generic interface; fan out to right routine
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
 		 const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
 		    DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU,
 		    const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
 		 const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
 public:
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
 #ifdef GRID_OMP
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
  else
 #endif
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }
@@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
 								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 								   const FermionField &in, FermionField &out,int dag)
 {
 #ifdef GRID_OMP
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  Compressor compressor; 
  int LLs = in.Grid()->_rdimensions[0];
@@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime+=usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();
  double ctime=0;
  double ptime=0;
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
+  // Remove explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
+  DhopComputeTime-=usecond();
  {
-    int tid = omp_get_thread_num();
+    int interior=1;
-    int nthreads = omp_get_num_threads();
+    int exterior=0;
-    int ncomms = CartesianCommunicator::nCommThreads;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    if (ncomms == -1) ncomms = 1;
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      double start = usecond();
      nthreads -= ncomms;
      int ttid  = tid - ncomms;
      int n     = U.Grid()->oSites(); // 4d vol
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
        myblock = ttid * chunk + ttid;
        myn = chunk+1;
      } else {
        myblock = ttid*chunk + rem;
        myn = chunk;
      }
      // do the compute
      auto   U_v  =   U.View(CpuRead);
      auto UUU_v  = UUU.View(CpuRead);
      auto  in_v  =  in.View(CpuRead);
      auto out_v  = out.View(CpuWrite);
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
        }
      }
        ptime = usecond() - start;
    } else {
      double start = usecond();
      st.CommunicateThreaded();
      ctime = usecond() - start;
    }
  }
-  DhopCommTime += ctime;
+  DhopComputeTime+=usecond();
  DhopComputeTime+=ptime;
  // First to enter, last to leave timing
  st.CollateThreads();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
-  DhopComputeTime2-=usecond();
+  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
-  auto   U_v  =   U.View(CpuRead);
+  DhopComputeTime2-=usecond();
-  auto UUU_v  = UUU.View(CpuRead);
+  {
-  auto  in_v  =  in.View(CpuRead);
+    int interior=0;
-  auto out_v  = out.View(CpuWrite);
+    int exterior=1;
-  if (dag == DaggerYes) {
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    int sz=st.surface_list.size();
    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
    });
  } else {
    int sz=st.surface_list.size();
    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
    });
  }
  DhopComputeTime2+=usecond();
 #else
  assert(0);
 #endif
 }
 template<class Impl>
@@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  Compressor compressor;
  int LLs = in.Grid()->_rdimensions[0];
 //double t1=usecond();
  DhopTotalTime -= usecond();
  DhopCommTime -= usecond();
@@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  auto   U_v  =   U.View(CpuRead);
+  {
-  auto UUU_v  = UUU.View(CpuRead);
+    int interior=1;
-  auto  in_v  =  in.View(CpuRead);
+    int exterior=1;
-  auto out_v  = out.View(CpuWrite);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  if (dag == DaggerYes) {
    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
    });
  } else {
    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 //double t2=usecond();
 //std::cout << __FILE__ << " " << __func__  << " Total Time " << DhopTotalTime << std::endl;
 //std::cout << __FILE__ << " " << __func__  << " Total Time Org " << t2-t1 << std::endl;
 //std::cout << __FILE__ << " " << __func__  << " Comml Time " << DhopCommTime << std::endl;
 //std::cout << __FILE__ << " " << __func__  << " Compute Time " << DhopComputeTime << std::endl;
 }
 /*CHANGE END*/
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -395,11 +395,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
 						  const FermionField &in,
 						  FermionField &out, int dag) 
 {
 #ifdef GRID_OMP
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
  else
 #endif
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }
 template <class Impl>
@@ -409,7 +407,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 								 const FermionField &in,
 								 FermionField &out, int dag) 
 {
 #ifdef GRID_OMP
  Compressor compressor; 
  int len =  U.Grid()->oSites();
@@ -418,60 +415,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  st.CommsMergeSHM(compressor);
  DhopFaceTime    += usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
+  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
 #pragma omp parallel 
  {
-    int tid = omp_get_thread_num();
+    int interior=1;
-    int nthreads = omp_get_num_threads();
+    int exterior=0;
-    int ncomms = CartesianCommunicator::nCommThreads;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    if (ncomms == -1) ncomms = 1;
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      nthreads -= ncomms;
      int ttid  = tid - ncomms;
      int n     = len;
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
        myblock = ttid * chunk + ttid;
        myn = chunk+1;
      } else {
        myblock = ttid*chunk + rem;
        myn = chunk;
      }
      // do the compute
      auto U_v   = U.View(CpuRead);
      auto UUU_v = UUU.View(CpuRead);
      auto in_v  = in.View(CpuRead);
      auto out_v = out.View(CpuWrite);
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
        }
      }
    } else {
      st.CommunicateThreaded();
    }
  }
  DhopComputeTime    += usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
@@ -479,28 +446,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  DhopComputeTime2    -= usecond();
  {
-    auto U_v   = U.View(CpuRead);
+    int interior=0;
-    auto UUU_v = UUU.View(CpuRead);
+    int exterior=1;
-    auto in_v  = in.View(CpuRead);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    auto out_v = out.View(CpuWrite);
    if (dag == DaggerYes) {
      int sz=st.surface_list.size();
      thread_for(ss,sz,{
 	int sU = st.surface_list[ss];
 	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
      });
    } else {
      int sz=st.surface_list.size();
      thread_for(ss,sz,{
 	int sU = st.surface_list[ss];
 	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
      });
    }
  }
  DhopComputeTime2    += usecond();
 #else
  assert(0);
 #endif
 }
@@ -520,19 +470,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();
  auto U_v   =   U.View(CpuRead);
  auto UUU_v = UUU.View(CpuRead);
  auto in_v  =  in.View(CpuRead);
  auto out_v = out.View(CpuWrite);
  DhopComputeTime -= usecond();
-  if (dag == DaggerYes) {
+  {
-    thread_for(sss, in.Grid()->oSites(),{
+    int interior=1;
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    int exterior=1;
-    });
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  } else {
    thread_for(sss, in.Grid()->oSites(),{
      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
--- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
@@ -0,0 +1,499 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #pragma once 
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////
 // Constructor and gauge import
 /////////////////////////////////
 template <class Impl>
 NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
 						   RealD _mass,
 						   RealD _c1, RealD _u0,
 						   const ImplParams &p)
  : Kernels(p),
    _grid(&Fgrid),
    _cbgrid(&Hgrid),
    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
    _tmp(&Hgrid)
 {
  int vol4;
  int LLs=1;
  c1=_c1;
  u0=_u0;
  vol4= _grid->oSites();
  Stencil.BuildSurfaceList(LLs,vol4);
  vol4= _cbgrid->oSites();
  StencilEven.BuildSurfaceList(LLs,vol4);
  StencilOdd.BuildSurfaceList(LLs,vol4);
 }
 template <class Impl>
 NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
 						   GridRedBlackCartesian &Hgrid, RealD _mass,
 						   RealD _c1, RealD _u0,
 						   const ImplParams &p)
  : NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
 {
  ImportGauge(_U);
 }
 ////////////////////////////////////////////////////////////
 // Momentum space propagator should be 
 // https://arxiv.org/pdf/hep-lat/9712010.pdf
 //
 // mom space action.
 //   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
 //
 // must track through staggered flavour/spin reduction in literature to 
 // turn to free propagator for the one component chi field, a la page 4/5
 // of above link to implmement fourier based solver.
 ////////////////////////////////////////////////////////////
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
 {
  pickCheckerboard(Even, UmuEven,  Umu);
  pickCheckerboard(Odd,  UmuOdd ,  Umu);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U) 
 {
  GaugeLinkField U(GaugeGrid());
  DoubledGaugeField _UUU(GaugeGrid());
  ////////////////////////////////////////////////////////
  // Double Store should take two fields for Naik and one hop separately.
  // Discard teh Naik as Naive
  ////////////////////////////////////////////////////////
  Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
  ////////////////////////////////////////////////////////
  // Apply scale factors to get the right fermion Kinetic term
  // Could pass coeffs into the double store to save work.
  // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) 
  ////////////////////////////////////////////////////////
  for (int mu = 0; mu < Nd; mu++) {
    U = PeekIndex<LorentzIndex>(Umu, mu);
    PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
    U = PeekIndex<LorentzIndex>(Umu, mu+4);
    PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
  }
  CopyGaugeCheckerboards();
 }
 /////////////////////////////
 // Implement the interface
 /////////////////////////////
 template <class Impl>
 RealD NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, mass, in, out);
 }
 template <class Impl>
 RealD NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, mass, in, out);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
  }
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
  }
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(mass);
  out = scal * in;
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  out = (1.0 / (mass)) * in;
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) 
 {
  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in, out);
 }
 ///////////////////////////////////
 // Internal
 ///////////////////////////////////
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 						GaugeField & mat,
 						const FermionField &A, const FermionField &B, int dag) 
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor;
  FermionField Btilde(B.Grid());
  FermionField Atilde(B.Grid());
  Atilde = A;
  st.HaloExchange(B, compressor);
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////
    // Call the single hop
    ////////////////////////
    auto U_v   = U.View(CpuRead);
    auto B_v      = B.View(CpuWrite);
    auto Btilde_v = Btilde.View(CpuWrite);
    thread_for(sss,B.Grid()->oSites(),{
      Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
    });
    assert(0);// need to figure out the force interface with a blasted three link term.
  }
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
  conformable(U.Grid(), _grid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());
  mat.Checkerboard() = U.Checkerboard();
  DerivInternal(Stencil, Umu, mat, U, V, dag);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());
  assert(V.Checkerboard() == Even);
  assert(U.Checkerboard() == Odd);
  mat.Checkerboard() = Odd;
  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());
  assert(V.Checkerboard() == Odd);
  assert(U.Checkerboard() == Even);
  mat.Checkerboard() = Even;
  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
  out.Checkerboard() = in.Checkerboard();
  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;
  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;
  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  DhopDir(in, out, dir, disp);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
  assert(0); // Not implemented yet
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
  auto Umu_v   =   Umu.View(CpuRead);
  auto in_v    =  in.View(CpuRead);
  auto out_v   = out.View(CpuWrite);
  //  thread_for( sss, in.Grid()->oSites(),{
  //    Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
  //  });
  assert(0);
 };
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 					       DoubledGaugeField &U,
 					       const FermionField &in,
 					       FermionField &out, int dag) 
 {
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else
    DhopInternalSerialComms(st,lo,U,in,out,dag);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
 							      DoubledGaugeField &U,
 							      const FermionField &in,
 							      FermionField &out, int dag) 
 {
  Compressor compressor; 
  int len =  U.Grid()->oSites();
  DhopTotalTime   -= usecond();
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime    += usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime    += usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
  DhopFaceTime    -= usecond();
  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime2    += usecond();
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
 							  DoubledGaugeField &U,
 							  const FermionField &in,
 							  FermionField &out, int dag) 
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  DhopTotalTime   -= usecond();
  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();
  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 };
  ////////////////////////////////////////////////////////////////
  // Reporting
  ////////////////////////////////////////////////////////////////
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::Report(void) 
 {
  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
  // Average the compute time
  _grid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::ZeroCounters(void) 
 {
  DhopCalls       = 0;
  DhopTotalTime   = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopFaceTime    = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 							      PropagatorField &q_in_2,
 							      PropagatorField &q_out,
 							      PropagatorField &src,
 							      Current curr_type,
 							      unsigned int mu)
 {
  assert(0);
 }
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                         PropagatorField &q_out,
                                                         PropagatorField &src,
                                                         Current curr_type,
                                                         unsigned int mu, 
                                                         unsigned int tmin,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  assert(0);
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
 					 DoubledGaugeFieldView &U,
 					 DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs,
+					 SiteSpinor *buf, int sF,
 					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  assert(0);
@@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
+								    SiteSpinor *buf, int sF,
 								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
  StencilEntry *SE2;
  StencilEntry *SE3;
-   for(int s=0;s<LLs;s++){
+  //   for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 }
 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st, 
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
+								    SiteSpinor *buf, int sF,
 								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
 #ifdef AVX512
@@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
  StencilEntry *SE2;
  StencilEntry *SE3;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
+							       SiteSpinor *buf, int sF,
 							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
  StencilEntry *SE2;
  StencilEntry *SE3;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    
+  //    int sF=s+LLs*sU;
-    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHIa(addr0,addr1);
@@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 }
 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
+							       SiteSpinor *buf, int sF,
 							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
  StencilEntry *SE2;
  StencilEntry *SE3;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    
+  //    int sF=s+LLs*sU;
-    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHIa(addr0,addr1);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
-					  SiteSpinor *buf, int LLs, int sU, 
+					  SiteSpinor *buf, int sF, int sU, 
 					  const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    skew = 0;
    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
@@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
@@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
-    
+    }    
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
      result()()(1) = - even_1 - odd_1;
@@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
@@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
@@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
-
+    }
    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
@@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
@@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
@@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
-
+    }
    // Add sum of all exterior connected stencil legs
    if ( nmu ) { 
      if ( dag ) {
@@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  }
 }
 /*
 #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-
+*/
 #undef LOAD_CHI
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);		\
+      permute(chi,  in[SE->_offset], ptype);			\
    } else {							\
-      chi_p = &in[SE->_offset];				\
+      chi_p = &in[SE->_offset];					\
    }								\
  } else {							\
    chi_p = &buf[SE->_offset];					\
@@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);		\
+      permute(chi,  in[SE->_offset], ptype);			\
    } else {							\
-      chi_p = &in[SE->_offset];				\
+      chi_p = &in[SE->_offset];					\
    }								\
  } else if ( st.same_node[Dir] ) {				\
    chi_p = &buf[SE->_offset];					\
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], *chi_p, Dir);			\
+    multLink(Uchi, U[sU], *chi_p, Dir);				\
  }
 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
@@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U[sU], *chi_p, Dir);			\
+    multLink(Uchi, U[sU], *chi_p, Dir);				\
  }
 template <class Impl>
@@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out, int dag) {
+					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
@@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  int ptype;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //
  //    int sF=LLs*sU+s;
  {
    skew = 0;
    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
    }
    if ( dag ) { 
      Uchi = - Uchi;
    } 
@@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  // Only contributions from interior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU, 
+						SiteSpinor *buf, int sF, int sU, 
 						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
@@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  int ptype;
  int skew ;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //    int sF=LLs*sU+s;
  {
    skew = 0;
    Uchi=Zero();
    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
    }
    if ( dag ) {
      Uchi = - Uchi;
    }
@@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  // Only contributions from exterior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU,
+						SiteSpinor *buf, int sF, int sU,
 						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  //  SiteSpinor chi;
@@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
  int nmu=0;
  int skew ;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //    int sF=LLs*sU+s;
  {
    skew = 0;
    Uchi=Zero();
    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
-
+    }
    if ( nmu ) { 
      if ( dag ) { 
 	out[sF] = out[sF] - Uchi;
@@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
-					 SiteSpinor *buf, int LLs, int sU,
+					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 					 const FermionFieldView &in, FermionFieldView &out,
 					 int interior,int exterior)
 {
  int dag=1;
  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
 };
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs, int sU,
 				      const FermionFieldView &in, FermionFieldView &out,
 				      int interior,int exterior)
 {
  int dag=0;
  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
 };
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs,
 				      int sU, const FermionFieldView &in, FermionFieldView &out,
 				      int dag,int interior,int exterior) 
 {
  switch(Opt) {
 #ifdef AVX512
  case OptInlineAsm:
    if ( interior && exterior ) {
      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    } else { 
      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
      assert(0);
    }
    break;
 #endif
  case OptHandUnroll:
    if ( interior && exterior ) {
      DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    } else if ( interior ) {
      DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    } else if ( exterior ) {
      DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    }
    break;
  case OptGeneric:
    if ( interior && exterior ) {
      DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    } else if ( interior ) {
      DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    } else if ( exterior ) {
      DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    }
    break;
  default:
    std::cout<<"Oops Opt = "<<Opt<<std::endl;
    assert(0);
    break;
  }
 };
 template <class Impl>
 void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
 					    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
 {
  // Disp should be either +1,-1,+3,-3
  // What about "dag" ?
@@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
  assert(0);
 }
 #define KERNEL_CALLNB(A,improved)					\
  const uint64_t    NN = Nsite*Ls;					\
  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
      int sF = ss;							\
      int sU = ss/Ls;							\
      ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
    });
 #define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); 
 #define ASM_CALL(A)							\
  const uint64_t    NN = Nsite*Ls;					\
  thread_for( ss, NN, {							\
      int sF = ss;							\
      int sU = ss/Ls;							\
      ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
  });
 template <class Impl>
 void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
  auto UUU_v = UUU.View(AcceleratorRead);
  auto U_v   =   U.View(AcceleratorRead);
  auto in_v  =  in.View(AcceleratorRead);
  auto out_v = out.View(AcceleratorWrite);
  auto st_v  =  st.View(AcceleratorRead);
  SiteSpinor * buf = st.CommBuf();
  int Ls=1;
  if(FGrid->Nd()==UGrid->Nd()+1){
    Ls    = FGrid->_rdimensions[0];
  }
  int Nsite = UGrid->oSites();
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
    if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
 #endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
 #endif
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
 template <class Impl>
 void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
  auto UUU_v=   U.View(AcceleratorRead);
  auto U_v   =   U.View(AcceleratorRead);
  auto in_v  =  in.View(AcceleratorRead);
  auto out_v = out.View(AcceleratorWrite);
  auto st_v  =  st.View(AcceleratorRead);
  SiteSpinor * buf = st.CommBuf();
  int Ls=1;
  if(FGrid->Nd()==UGrid->Nd()+1){
    Ls    = FGrid->_rdimensions[0];
  }
  int Nsite = UGrid->oSites();
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
 #endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
 #endif
  }
 }
 #undef KERNEL_CALLNB
 #undef KERNEL_CALL
 #undef ASM_CALL
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -496,5 +496,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   assert(0 && " Kernel optimisation case not covered ");
  }
 #undef KERNEL_CALLNB
 #undef KERNEL_CALL
 #undef ASM_CALL
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc
@@ -0,0 +1,36 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
 const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master
@@ -0,0 +1,37 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class NaiveStaggeredFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc
@@ -0,0 +1 @@
 ../NaiveStaggeredFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc
@@ -0,0 +1 @@
 ../NaiveStaggeredFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -88,6 +88,7 @@ done
 CC_LIST=" \
  ImprovedStaggeredFermion5DInstantiation \
  ImprovedStaggeredFermionInstantiation \
  NaiveStaggeredFermionInstantiation \
  StaggeredKernelsInstantiation "
 for impl in $STAG_IMPL_LIST
--- a/benchmarks/Benchmark_staggered.cc
+++ b/benchmarks/Benchmark_staggered.cc
@@ -87,26 +87,6 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  ref = Zero();
  /*  
  { // Naive wilson implementation
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){
      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
 	ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
      }
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
 	ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
      }
    }
  }
  ref = -0.5*ref;
  */
  RealD mass=0.1;
  RealD c1=9.0/8.0;
@@ -125,10 +105,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  Grid_finalize();
 }
--- a/tests/core/Test_staggered_naive.cc
+++ b/tests/core/Test_staggered_naive.cc
@@ -0,0 +1,282 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_wilson.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 ;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);
  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
  typedef typename NaiveStaggeredFermionR::FermionField FermionField; 
  typedef typename NaiveStaggeredFermionR::ComplexField ComplexField; 
  typename NaiveStaggeredFermionR::ImplParams params; 
  FermionField src   (&Grid); random(pRNG,src);
  FermionField result(&Grid); result=Zero();
  FermionField    ref(&Grid);    ref=Zero();
  FermionField    tmp(&Grid);    tmp=Zero();
  FermionField    err(&Grid);    tmp=Zero();
  FermionField phi   (&Grid); random(pRNG,phi);
  FermionField chi   (&Grid); random(pRNG,chi);
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);
  double volume=1;
  for(int mu=0;mu<Nd;mu++){
    volume=volume*latt_size[mu];
  }  
  // Only one non-zero (y)
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  /* Debug force unit
    U[mu] = 1.0;
    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
  */
  }
  ref = Zero();
  RealD mass=0.1;
  RealD c1=9.0/8.0;
  RealD u0=1.0;
  { // Simple improved staggered implementation
    ref = Zero();
    RealD c1tad = 0.5*c1/u0;
    Lattice<iScalar<vInteger> > coor(&Grid);
    Lattice<iScalar<vInteger> > x(&Grid); LatticeCoordinate(x,0);
    Lattice<iScalar<vInteger> > y(&Grid); LatticeCoordinate(y,1);
    Lattice<iScalar<vInteger> > z(&Grid); LatticeCoordinate(z,2);
    Lattice<iScalar<vInteger> > t(&Grid); LatticeCoordinate(t,3);
    Lattice<iScalar<vInteger> > lin_z(&Grid); lin_z=x+y;
    Lattice<iScalar<vInteger> > lin_t(&Grid); lin_t=x+y+z;
    for(int mu=0;mu<Nd;mu++){
      // Staggered Phase.
      ComplexField phases(&Grid);	phases=1.0;
      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
      tmp = PeriodicBC::CovShiftForward(U[mu],mu,src);
      ref = ref +c1tad*tmp*phases; // Forward 1 hop
      tmp = PeriodicBC::CovShiftBackward(U[mu],mu,src);
      ref = ref -c1tad*tmp*phases; // Backward 1 hop
    }
    //    ref = ref + mass * src;
  }
  NaiveStaggeredFermionR Ds(Umu,Grid,RBGrid,mass,c1,u0,params);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation         "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
  int ncall=1000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Ds.Dhop(src,result,0);
  }
  double t1=usecond();
  double t2;
  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that Deo + Doe = Dunprec "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  FermionField src_e   (&RBGrid);
  FermionField src_o   (&RBGrid);
  FermionField r_e   (&RBGrid);
  FermionField r_o   (&RBGrid);
  FermionField r_eo  (&Grid);
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);
  Ds.Meooe(src_e,r_o);  std::cout<<GridLogMessage<<"Applied Meo"<<std::endl;
  Ds.Meooe(src_o,r_e);  std::cout<<GridLogMessage<<"Applied Moe"<<std::endl;
  Ds.Dhop (src,ref,DaggerNo);
  setCheckerboard(r_eo,r_o);
  setCheckerboard(r_eo,r_e);
  err= ref - r_eo;
  std::cout<<GridLogMessage << "EO norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(r_eo) <<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test Ddagger is the dagger of D by requiring                "<<std::endl;
  std::cout<<GridLogMessage<<"=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  FermionField chi_e   (&RBGrid);
  FermionField chi_o   (&RBGrid);
  FermionField dchi_e  (&RBGrid);
  FermionField dchi_o  (&RBGrid);
  FermionField phi_e   (&RBGrid);
  FermionField phi_o   (&RBGrid);
  FermionField dphi_e  (&RBGrid);
  FermionField dphi_o  (&RBGrid);
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  Ds.Meooe(chi_e,dchi_o);
  Ds.Meooe(chi_o,dchi_e);
  Ds.MeooeDag(phi_e,dphi_o);
  Ds.MeooeDag(phi_o,dphi_e);
  ComplexD pDce = innerProduct(phi_e,dchi_e);
  ComplexD pDco = innerProduct(phi_o,dchi_o);
  ComplexD cDpe = innerProduct(chi_e,dphi_e);
  ComplexD cDpo = innerProduct(chi_o,dphi_o);
  std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
  std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
  std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
  std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
  std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
  std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
  std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
  std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test MeeInv Mee = 1                                         "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  Ds.Mooee(chi_e,src_e);
  Ds.MooeeInv(src_e,phi_e);
  Ds.Mooee(chi_o,src_o);
  Ds.MooeeInv(src_o,phi_o);
  setCheckerboard(phi,phi_e);
  setCheckerboard(phi,phi_o);
  err = phi-chi;
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test MeeInvDag MeeDag = 1                                   "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  Ds.MooeeDag(chi_e,src_e);
  Ds.MooeeInvDag(src_e,phi_e);
  Ds.MooeeDag(chi_o,src_o);
  Ds.MooeeInvDag(src_o,phi_o);
  setCheckerboard(phi,phi_e);
  setCheckerboard(phi,phi_o);
  err = phi-chi;
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test MpcDagMpc is Hermitian              "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  random(pRNG,phi);
  random(pRNG,chi);
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  SchurDiagMooeeOperator<NaiveStaggeredFermionR,FermionField> HermOpEO(Ds);
  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
  pDce = innerProduct(phi_e,dchi_e);
  pDco = innerProduct(phi_o,dchi_o);
  cDpe = innerProduct(chi_e,dphi_e);
  cDpo = innerProduct(chi_o,dphi_o);
  std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
  std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
  std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDco-conj(cDpo) <<std::endl;
  std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDce-conj(cDpe) <<std::endl;
  Grid_finalize();
 }
		`@@ -0,0 +1 @@`
							`../NaiveStaggeredFermionInstantiation.cc.master`