Merge remote-tracking branch 'origin/develop' into temporary-smearing

2026-07-03 00:43:29 +01:00 · 2016-07-04 17:28:40 +01:00
parent 6ce174cd60 d6737e4bd8
commit 9cb90f714e
107 changed files with 7839 additions and 4572 deletions
@@ -90,7 +90,7 @@ namespace QCD {
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;

-    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;

    // Spin matrix
@@ -383,7 +383,6 @@ namespace QCD {
    //////////////////////////////////////////////
    // Poke scalars
    //////////////////////////////////////////////
-
    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
    {
      pokeIndex<SpinIndex>(lhs,rhs,i);
@@ -407,7 +406,41 @@ namespace QCD {
      pokeIndex<LorentzIndex>(lhs,rhs,i);
    }

-
+    //////////////////////////////////////////////
+    // Fermion <-> propagator assignments
+    //////////////////////////////////////////////
+    // Copy fermion f into the (spin s, colour c) column of propagator p.
+    // For each row spin j and row colour i, the fermion component (j,i) is
+    // written to the propagator entry with spin indices (j,s) and colour
+    // indices (i,c). No other entry of p is written.
+    template <class Prop, class Ferm>
+    void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
+    {
+        for(int spinRow = 0; spinRow < Ns; ++spinRow)
+        {
+            auto fermSpin  = peekSpin(f, spinRow);
+            auto propBlock = peekSpin(p, spinRow, s);
+
+            for(int colRow = 0; colRow < Nc; ++colRow)
+            {
+                auto fermComponent = peekColour(fermSpin, colRow);
+                pokeColour(propBlock, fermComponent, colRow, c);
+            }
+            // Write the updated colour block back into the propagator.
+            pokeSpin(p, propBlock, spinRow, s);
+        }
+    }
+    
+    // Extract the (spin s, colour c) column of propagator p into fermion f.
+    // For each row spin j and row colour i, the propagator entry with spin
+    // indices (j,s) and colour indices (i,c) is written to fermion
+    // component (j,i).
+    template <class Prop, class Ferm>
+    void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
+    {
+        for(int spinRow = 0; spinRow < Ns; ++spinRow)
+        {
+            auto fermSpin  = peekSpin(f, spinRow);
+            auto propBlock = peekSpin(p, spinRow, s);
+
+            for(int colRow = 0; colRow < Nc; ++colRow)
+            {
+                auto propEntry = peekColour(propBlock, colRow, c);
+                pokeColour(fermSpin, propEntry, colRow);
+            }
+            // Write the filled spin component back into the fermion.
+            pokeSpin(f, fermSpin, spinRow);
+        }
+    }
+    
    //////////////////////////////////////////////
    // transpose array and scalar
    //////////////////////////////////////////////
@@ -109,10 +109,12 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction

 #define FermOpTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
-  template class A<WilsonImplD>;  \
+  template class A<WilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		

+#define GparityFermOpTemplateInstantiate(A) 
+
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
@@ -208,6 +210,14 @@ typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;

+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
+typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
+typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
+typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
+typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
+
+
  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
@@ -527,6 +527,7 @@ namespace QCD {
  }

  FermOpTemplateInstantiate(CayleyFermion5D);
+  GparityFermOpTemplateInstantiate(CayleyFermion5D);

 }}

@@ -130,7 +130,7 @@ namespace Grid {

      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
      typedef WilsonImplParams ImplParams;
-      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

      ImplParams Params;

@@ -142,6 +142,10 @@ namespace Grid {
        mult(&phi(),&U(mu),&chi());
      }

+      // Copy the gauge-link element `memory` into `reg` by plain assignment.
+      template<class ref>
+      inline void loadLinkElement(Simd & reg,ref &memory){
+	reg = memory;
+      }
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
      {
        conformable(Uds._grid,GaugeGrid);
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP

    };

+
+
+    ///////
+    // Single flavour four spinors with colour index, 5d redblack
+    ///////
+    // Implementation policy for single-flavour four-spinors with a colour
+    // index. The doubled gauge field is stored with scalar (non-SIMD)
+    // elements; link elements are splatted into SIMD registers when used.
+    template<class S,int Nrepresentation=Nc>
+    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    public:
+
+      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+      
+      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
+      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
+      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+    
+      typedef iImplSpinor    <Simd>           SiteSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
+      typedef Lattice<SiteSpinor>             FermionField;
+
+      // Make the doubled gauge field a *scalar*
+      typedef iImplDoubledGaugeField<typename Simd::scalar_type>    SiteDoubledGaugeField; // This is a scalar
+      typedef iImplGaugeField<typename Simd::scalar_type>           SiteScalarGaugeField;  // scalar
+      typedef iImplGaugeLink <typename Simd::scalar_type>           SiteScalarGaugeLink;   // scalar
+
+      typedef Lattice<SiteDoubledGaugeField>                  DoubledGaugeField;
+
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+      typedef WilsonImplParams ImplParams;
+      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
+
+      ImplParams Params;
+
+      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+
+      // This implementation always reports that comms and compute are not overlapped.
+      bool overlapCommsCompute(void) { return false; };
+    
+      // Load one scalar link element from `memory` into `reg` via vsplat.
+      template<class ref>
+      inline void loadLinkElement(Simd & reg,ref &memory){
+        vsplat(reg,memory);
+      }
+
+      // phi = U(mu) * chi. Each element of the scalar link U(mu) is first
+      // splatted into a SIMD link UU. SE and St are not used here.
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+      {
+        SiteGaugeLink UU;
+        for(int i=0;i<Nrepresentation;i++){
+          for(int j=0;j<Nrepresentation;j++){
+            vsplat(UU()()(i,j),U(mu)()(i,j));
+          }
+        }
+        mult(&phi(),&UU(),&chi());
+      }
+
+      // Fill the scalar doubled gauge field Uds from Umu, site by site.
+      // Slots [0,Nd) receive the links of Umu; slots [Nd,2*Nd) receive
+      // adj(Cshift(U_mu,mu,-1)) for each mu.
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+      {
+        SiteScalarGaugeField  ScalarUmu;
+        SiteDoubledGaugeField ScalarUds;
+
+        // Uadj(mu) = adjoint of the link shifted by -1 in direction mu.
+        GaugeLinkField U   (Umu._grid);
+        GaugeField     Uadj(Umu._grid);
+        for(int mu=0;mu<Nd;mu++){
+          U = PeekIndex<LorentzIndex>(Umu,mu);
+          U = adj(Cshift(U,mu,-1));
+          PokeIndex<LorentzIndex>(Uadj,U,mu);
+        }
+
+        for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
+          std::vector<int> lcoor;
+          GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
+
+          // Use Nd rather than a literal 4 so the copy loops agree with the
+          // Nd-sized gauge field type and with the loop that builds Uadj.
+          peekLocalSite(ScalarUmu,Umu,lcoor);
+          for(int mu=0;mu<Nd;mu++) ScalarUds(mu) = ScalarUmu(mu);
+
+          peekLocalSite(ScalarUmu,Uadj,lcoor);
+          for(int mu=0;mu<Nd;mu++) ScalarUds(mu+Nd) = ScalarUmu(mu);
+
+          pokeLocalSite(ScalarUds,Uds,lcoor);
+        }
+
+      }
+
+      // Force insertion is not implemented for this Impl; calling it aborts.
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+        assert(0);
+      }   
+
+      // Force insertion is not implemented for this Impl; calling it aborts.
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+        assert(0);
+      }
+
+    };
+
+
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
@@ -205,7 +303,7 @@ PARALLEL_FOR_LOOP
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;

      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
-      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

      typedef GparityWilsonImplParams ImplParams;

@@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
 	conformable(Uds._grid,GaugeGrid);
 	conformable(Umu._grid,GaugeGrid);
 	
-	GaugeLinkField Utmp(GaugeGrid);
-	GaugeLinkField U(GaugeGrid);
+	GaugeLinkField Utmp (GaugeGrid);
+	GaugeLinkField U    (GaugeGrid);
 	GaugeLinkField Uconj(GaugeGrid);
 	
 	Lattice<iScalar<vInteger> > coor(GaugeGrid);
@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

+    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
+    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
+    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
+
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
@@ -48,14 +48,16 @@ namespace Grid {
 			GridCartesian         &FourDimGrid,
 			GridRedBlackCartesian &FourDimRedBlackGrid,
 			RealD _mass,RealD _M5,
-			RealD scale) :
+//			RealD scale):
+			RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
      MobiusFermion<Impl>(_Umu,
 		    FiveDimGrid,
 		    FiveDimRedBlackGrid,
 		    FourDimGrid,
-		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
+	FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
+//		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
      {
      }

@@ -48,12 +48,7 @@ namespace QCD {
      mu=p;
    };

-    virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
-      return spinproject(in);
-    }
-
-    SiteHalfSpinor spinproject(const SiteSpinor &in)
-    {
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
      SiteHalfSpinor ret;
      int mudag=mu;
      if (!dag) {
@@ -92,6 +87,173 @@ namespace QCD {
    }
  };

+  /////////////////////////
+  // optimised versions
+  /////////////////////////
+
+  // Functor returning the spProjXp projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonXpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjXp(projected,in);
+      return projected;
+    }
+  };
+  // Functor returning the spProjYp projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonYpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjYp(projected,in);
+      return projected;
+    }
+  };
+  // Functor returning the spProjZp projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonZpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjZp(projected,in);
+      return projected;
+    }
+  };
+  // Functor returning the spProjTp projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonTpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjTp(projected,in);
+      return projected;
+    }
+  };
+
+  // Functor returning the spProjXm projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonXmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjXm(projected,in);
+      return projected;
+    }
+  };
+  // Functor returning the spProjYm projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonYmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjYm(projected,in);
+      return projected;
+    }
+  };
+  // Functor returning the spProjZm projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonZmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjZm(projected,in);
+      return projected;
+    }
+  };
+  // Functor returning the spProjTm projection of a full spinor.
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonTmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor projected;
+      spProjTm(projected,in);
+      return projected;
+    }
+  };
+
+    // Fast comms buffer manipulation which should inline right through (avoid direction
+    // dependent logic that prevents inlining)
+  // CartesianStencil extended with "Opt" halo-exchange entry points that
+  // gather each of the eight directions with its own fixed spin-projection
+  // compressor instead of a single direction-dependent one.
+  template<class vobj,class cobj>
+  class WilsonStencil : public CartesianStencil<vobj,cobj> {
+  public:
+
+    // Forwards every argument unchanged to the CartesianStencil constructor.
+    WilsonStencil(GridBase *grid,
+                  int npoints,
+                  int checkerboard,
+                  const std::vector<int> &directions,
+                  const std::vector<int> &distances)  : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
+      {    };
+
+    // Gather the halo for `source`, then start Communicate() on a new thread.
+    // The returned thread must be handed to HaloExchangeOptComplete().
+    template < class compressor>
+    std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
+      this->Mergers.resize(0); 
+      this->Packets.resize(0);
+      this->HaloGatherOpt(source,compress);
+      // Capture only `this`: the thread outlives this call, so references to
+      // the arguments or locals of this function must not be captured.
+      return std::thread([this] { this->Communicate(); });
+    }
+
+    // Blocking exchange: Begin immediately followed by Complete.
+    template < class compressor>
+    void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
+    {
+      auto thr = this->HaloExchangeOptBegin(source,compress);
+      this->HaloExchangeOptComplete(thr);
+    }
+
+    // Run CommsMerge(), then join the communication thread; the time spent
+    // in join() is accumulated into jointime.
+    void HaloExchangeOptComplete(std::thread &thr) 
+    {
+      this->CommsMerge(); // spins
+      this->jointime-=usecond();
+      thr.join();
+      this->jointime+=usecond();
+    }
+
+    // Gather all eight directions of `source` into the comms buffer. Only
+    // compress.dag is read from `compress`; the projection itself is done
+    // by the per-direction compressors below. Time is added to halogtime.
+    template < class compressor>
+    void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
+    {
+      // conformable(source._grid,_grid);
+      assert(source._grid==this->_grid);
+      this->halogtime-=usecond();
+
+      assert (this->comm_buf.size() == this->_unified_buffer_size );
+      this->u_comm_offset=0;
+
+      // dirs[k] is the stencil direction gathered with the k-th projector
+      // (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm). With dag set the mapping is the identity;
+      // otherwise the first Nd and last Nd directions are swapped.
+      // The table is a plain local rather than a function-local static, so
+      // concurrent gathers on different stencils do not share mutable state.
+      const int dag = compress.dag;
+      std::vector<int> dirs(Nd*2);
+      for(int mu=0;mu<Nd;mu++){
+        if ( dag ) {
+          dirs[mu]   =mu;
+          dirs[mu+Nd]=mu+Nd;
+        } else { 
+          dirs[mu]   =mu+Nd;
+          dirs[mu+Nd]=mu;
+        }
+      }
+
+      // NOTE: the explicit indices 0..7 below assume Nd==4.
+      WilsonXpCompressor<cobj,vobj> XpCompress;
+      this->HaloGatherDir(source,XpCompress,dirs[0]);
+
+      WilsonYpCompressor<cobj,vobj> YpCompress;
+      this->HaloGatherDir(source,YpCompress,dirs[1]);
+
+      WilsonZpCompressor<cobj,vobj> ZpCompress;
+      this->HaloGatherDir(source,ZpCompress,dirs[2]);
+
+      WilsonTpCompressor<cobj,vobj> TpCompress;
+      this->HaloGatherDir(source,TpCompress,dirs[3]);
+
+      WilsonXmCompressor<cobj,vobj> XmCompress;
+      this->HaloGatherDir(source,XmCompress,dirs[4]);
+
+      WilsonYmCompressor<cobj,vobj> YmCompress;
+      this->HaloGatherDir(source,YmCompress,dirs[5]);
+
+      WilsonZmCompressor<cobj,vobj> ZmCompress;
+      this->HaloGatherDir(source,ZmCompress,dirs[6]);
+
+      WilsonTmCompressor<cobj,vobj> TmCompress;
+      this->HaloGatherDir(source,TmCompress,dirs[7]);
+
+      // Every slot of the unified buffer must have been filled.
+      assert(this->u_comm_offset==this->_unified_buffer_size);
+      this->halogtime+=usecond();
+    }
+
+  };
+

 }} // namespace close
 #endif
@@ -64,7 +64,9 @@ namespace QCD {
  template<class Impl>
  void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  {
-    Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
+    GaugeField HUmu(_Umu._grid);
+    HUmu = _Umu*(-0.5);
+    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    pickCheckerboard(Even,UmuEven,Umu);
    pickCheckerboard(Odd ,UmuOdd,Umu);
  }
@@ -286,121 +288,27 @@ PARALLEL_FOR_LOOP
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
  {
-    if ( Impl::overlapCommsCompute () ) { 
-      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
-    } else { 
-      DhopInternalCommsThenCompute(st,U,in,out,dag);
-    }
-  }
-  template<class Impl>
-  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
-							 const FermionField &in, FermionField &out,int dag) {
-
    assert((dag==DaggerNo) ||(dag==DaggerYes));

    Compressor compressor(dag);
    st.HaloExchange(in,compressor);
    
    if ( dag == DaggerYes ) {
-      if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
-	}
+      for(int sss=0;sss<in._grid->oSites();sss++){
+	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    } else {
-      if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
-	}
+      for(int sss=0;sss<in._grid->oSites();sss++){
+	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    }
  };

-
-  template<class Impl>
-  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
-						     const FermionField &in, FermionField &out,int dag) {
-
-    assert((dag==DaggerNo) ||(dag==DaggerYes));
-
-    Compressor compressor(dag);
-
-    auto handle = st.HaloExchangeBegin(in,compressor);
-
-    bool local    = true;
-    bool nonlocal = false;
-    if ( dag == DaggerYes ) {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    } else {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    }
-
-    st.HaloExchangeComplete(handle);
-
-    local    = false;
-    nonlocal = true;
-    if ( dag == DaggerYes ) {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    } else {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    }
-
-  };
-
 
  FermOpTemplateInstantiate(WilsonFermion);
+  GparityFermOpTemplateInstantiate(WilsonFermion);


 }}
@@ -114,12 +114,6 @@ namespace Grid {
      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 			const FermionField &in, FermionField &out,int dag) ;

-      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
-				    const FermionField &in, FermionField &out,int dag) ;
-      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
-				    const FermionField &in, FermionField &out,int dag) ;
-
-
      // Constructor
      WilsonFermion(GaugeField &_Umu,
 		    GridCartesian         &Fgrid,
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -38,8 +38,6 @@ namespace QCD {
 // S-direction is INNERMOST and takes no part in the parity.
 const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
 const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
-int WilsonFermion5DStatic::HandOptDslash;
-int WilsonFermion5DStatic::AsmOptDslash;

  // 5d lattice for DWF.
 template<class Impl>
@@ -67,10 +65,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
-  
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
-
  assert(FiveDimRedBlackGrid._checker_dim==1);

  // Dimension zero of the five-d is the Ls direction
@@ -99,16 +95,74 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

  // Allocate the required comms buffer
  ImportGauge(_Umu);
-  alltime=0;
-  commtime=0;
-  jointime=0;
-  dslashtime=0;
-  dslash1time=0;
 }  
+
+// Constructor for the layout in which the five-dimensional red-black grid is
+// checkerboarded in dimension 0 (the s-direction) and all SIMD lanes lie
+// along that dimension. The asserts below check these layout requirements.
+// The `simd` argument is not read in this body.
+template<class Impl>
+WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
+				       GridCartesian         &FiveDimGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridCartesian         &FourDimGrid,
+				       RealD _M5,const ImplParams &p) :
+  Kernels(p),
+  _FiveDimGrid        (&FiveDimGrid),
+  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
+  _FourDimGrid        (&FourDimGrid),
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  M5(_M5),
+  Umu(_FourDimGrid),
+  UmuEven(_FourDimGrid),
+  UmuOdd (_FourDimGrid),
+  Lebesgue(_FourDimGrid),
+  LebesgueEvenOdd(_FourDimGrid)
+{
+  int nsimd = Simd::Nsimd();
+
+  // some assertions
+  assert(FiveDimGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
+  assert(FourDimGrid._ndimension==4);
+
+  // Dimension zero of the five-d is the Ls direction
+  Ls=FiveDimGrid._fdimensions[0];
+  assert(FiveDimGrid._processors[0]         ==1);
+  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
+
+  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+  assert(FiveDimRedBlackGrid._processors[0] ==1);
+  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
+
+  // Other dimensions must match the decomposition of the four-D fields 
+  for(int d=0;d<4;d++){
+    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+
+    // Compare, do not assign: `=` here would overwrite the caller's grid
+    // layout in debug builds and make the check always succeed.
+    assert(FourDimGrid._simd_layout[d]==1);
+    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
+
+    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+  }
+
+  {
+    // Double-store the gauge field scaled by -0.5. The even and odd copies
+    // are both the full field here, not checkerboard-picked.
+    GaugeField HUmu(_Umu._grid);
+    HUmu = _Umu*(-0.5);
+    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
+    UmuEven=Umu;// Really want a reference.
+    UmuOdd =Umu;
+  }
+}  
+
+
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
-  Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
+  GaugeField HUmu(_Umu._grid);
+  HUmu = _Umu*(-0.5);
+  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
 }
@@ -232,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 }


-template<class Impl>
-void WilsonFermion5D<Impl>::Report(void)
-{
-  std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
-  std::cout<<GridLogMessage << "Wilson5d      time "<<alltime <<" us"<<std::endl;
-  std::cout<<GridLogMessage << "HaloBegin     time "<<commtime <<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Dslash        time "<<dslashtime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Dslash1       time "<<dslash1time<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "HaloComplete  time "<<jointime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil all gather      time "<<Stencil.halogtime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil splice   gather time "<<Stencil.splicetime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "********************"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil gather        "<<Stencil.gathertime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil gather simd   "<<Stencil.gathermtime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil merge  simd   "<<Stencil.mergetime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil spin   simd   "<<Stencil.spintime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "********************"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil MB/s          "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
-  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "Stencil join     time "<<Stencil.jointime<<" us"<<std::endl;
-  std::cout<<GridLogMessage << "********************"<<std::endl;
-}
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 				  const FermionField &A,
@@ -277,280 +307,32 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag)
-{
-  if ( Impl::overlapCommsCompute () ) { 
-    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
-  } else { 
-    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
-  }
-}
-
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
-					 DoubledGaugeField & U,
-					 const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  alltime-=usecond();
  Compressor compressor(dag);

-  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
+  int LLs = in._grid->_rdimensions[0];
  
-  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
-  commtime +=usecond();
-
-  jointime -=usecond();
-  jointime +=usecond();
+  st.HaloExchange(in,compressor);
  
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  // Not loop ordering and data layout.
-  // Designed to create 
-  // - per thread reuse in L1 cache for U
-  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
-    if( this->HandOptDslash ) {
-#pragma omp parallel for schedule(static)
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-      }
-    } else { 
 PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=ss;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
+    for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	int sF=LLs*sU;
+	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
    }
  } else {
-    if( this->AsmOptDslash ) {
-      //      for(int i=0;i<1;i++){
-      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-      //	PerformanceCounter Counter(i);
-      //	Counter.Start();
-
-#pragma omp parallel for 
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  for(int s=soff;s<soff+swork;s++){
-
-	    sU=ss+ ssoff;
-
-	    if ( LebesgueOrder::UseLebesgueOrder ) {
-	      sU = lo.Reorder(sU);
-	    }
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
-	  }
-	}
-      }
-      //      Counter.Stop();
-      //      Counter.Report();
-      //      }
-    } else if( this->HandOptDslash ) {
-      /*
-
-#pragma omp parallel for schedule(static)
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  sU=ss+ ssoff;
-	  for(int s=soff;s<soff+swork;s++){
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      */
-
-#pragma omp parallel for schedule(static)
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    } else { 
 PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
+    for(int ss=0;ss<U._grid->oSites();ss++){
+      int sU=ss;
+      int sF=LLs*sU;
+      Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
    }
  }
-  dslashtime +=usecond();
-  alltime+=usecond();
 }

-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
-						     DoubledGaugeField & U,
-						     const FermionField &in, FermionField &out,int dag)
-{
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  alltime-=usecond();
-
-  int calls;
-  int updates;
-  Compressor compressor(dag);
-
-  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
-  
-  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  commtime +=usecond();
-  
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  // Not loop ordering and data layout.
-  // Designed to create 
-  // - per thread reuse in L1 cache for U
-  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-  bool local    = true;
-  bool nonlocal = false;
-  dslashtime -=usecond();
-  if ( dag == DaggerYes ) {
-    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	  }
-      }
-    } else { 
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=ss;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	  }
-	}
-      }
-    }
-  } else {
-    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	}
-      }
-    } else { 
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	}
-      }
-    }
-  }
-  dslashtime +=usecond();
-
-  jointime -=usecond();
-  st.HaloExchangeComplete(handle);
-  jointime +=usecond();
-
-  local    = false;
-  nonlocal = true;
-  dslash1time -=usecond();
-  if ( dag == DaggerYes ) {
-    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	  }
-      }
-    } else { 
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=ss;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	  }
-	}
-      }
-    }
-  } else {
-    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	}
-      }
-    } else { 
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
-	}
-      }
-    }
-  }
-  dslash1time +=usecond();
-  alltime+=usecond();
-
-}

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
@@ -593,7 +375,10 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 }

 FermOpTemplateInstantiate(WilsonFermion5D);
-
+GparityFermOpTemplateInstantiate(WilsonFermion5D);
+template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
+template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
+  
 }}


@@ -1,3 +1,4 @@
+
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -48,8 +49,6 @@ namespace Grid {
    class WilsonFermion5DStatic { 
    public:
      // S-direction is INNERMOST and takes no part in the parity.
-      static int AsmOptDslash; // these are a temporary hack
-      static int HandOptDslash; // these are a temporary hack
      static const std::vector<int> directions;
      static const std::vector<int> displacements;
      const int npoint = 8;
@@ -61,11 +60,7 @@ namespace Grid {
    public:
     INHERIT_IMPL_TYPES(Impl);
     typedef WilsonKernels<Impl> Kernels;
-     double alltime;
-     double jointime;
-     double commtime;
-     double dslashtime;
-     double dslash1time;
+
      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////
@@ -86,6 +81,7 @@ namespace Grid {
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

      // These can be overridden by fancy 5d chiral action
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@@ -120,19 +116,6 @@ namespace Grid {
 			FermionField &out,
 			int dag);

-      void DhopInternalCommsThenCompute(StencilImpl & st,
-			LebesgueOrder &lo,
-			DoubledGaugeField &U,
-			const FermionField &in, 
-			FermionField &out,
-			int dag);
-      void DhopInternalCommsOverlapCompute(StencilImpl & st,
-			LebesgueOrder &lo,
-			DoubledGaugeField &U,
-			const FermionField &in, 
-			FermionField &out,
-			int dag);
-
      // Constructors
      WilsonFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
@@ -141,14 +124,21 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      double _M5,const ImplParams &p= ImplParams());

+      // Constructor variant: leading 'simd' argument, no FourDimRedBlackGrid parameter
+      WilsonFermion5D(int simd, 
+		      GaugeField &_Umu,
+		      GridCartesian         &FiveDimGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridCartesian         &FourDimGrid,
+		      double _M5,const ImplParams &p= ImplParams());
+
      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);

-      void Report(void);
      ///////////////////////////////////////////////////////////////
      // Data members require to support the functionality
      ///////////////////////////////////////////////////////////////
-    protected:
+    public:

      // Add these to the support from Wilson
      GridBase *_FourDimGrid;
@@ -31,440 +31,410 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 namespace QCD {

+  int WilsonKernelsStatic::HandOpt;
+  int WilsonKernelsStatic::AsmOpt;
+
 template<class Impl> 
 WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};

-  // Need controls to do interior, exterior, or both
+template<class Impl> 
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
+{
+  if ( AsmOpt ) {
+    // Assembler path: Ls and Ns are forwarded, so the asm kernel is expected to walk the site range itself.
+    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
+
+  } else {
+    // Per-site path: Ns gauge sites starting at sU, each with Ls consecutive fermion sites starting at sF.
+    for(int site=0;site<Ns;site++) {
+      for(int s=0;s<Ls;s++) {
+	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
+	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
+	sF++;
+      }
+      sU++;
+    }
+
+  }
+}
+
 template<class Impl> 
 void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
+					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
+{
+  // No asm implementation yet.
+  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
+  //  else
+  for(int site=0;site<Ns;site++) {
+    for(int s=0;s<Ls;s++) {
+      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
+      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
+      sF++;
+    }
+    sU++;
+  }
+}
+
+
+  ////////////////////////////////////////////
+  // Generic implementation; move to different file?
+  ////////////////////////////////////////////
+
+template<class Impl> 
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					   int sF,int sU,const FermionField &in, FermionField &out)
 {
  SiteHalfSpinor  tmp;    
  SiteHalfSpinor  chi;    
+  SiteHalfSpinor *chi_p;
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

-  int num = 0;
-
-  result=zero;
-
  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

-  if (local && SE->_is_local ) { 
+  if (SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
-  }
-
-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
+  } else { 
+    chi_p=&buf[SE->_offset];
  }
  
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
-    accumReconXp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
+  spReconXp(result,Uchi);
    
  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
-    accumReconYp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
+  accumReconYp(result,Uchi);

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
-    accumReconZp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
+  accumReconZp(result,Uchi);

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
-    accumReconTp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
+  accumReconTp(result,Uchi);

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
-    accumReconXm(result,Uchi);
-    num++;
-  }
-
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
+  accumReconXm(result,Uchi);
+  
  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
-    accumReconYm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
+  accumReconYm(result,Uchi);
  
  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
-    accumReconZm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
+  accumReconZm(result,Uchi);

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else { 
      spProjTm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
+  accumReconTm(result,Uchi);

-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
-    accumReconTm(result,Uchi);
-    num++;
-  }
-
-  if ( local ) {
-    vstream(out._odata[sF],result*(-0.5));
-  } else if ( num ) { 
-    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
-  }
+  vstream(out._odata[sF],result);
 };


  // Need controls to do interior, exterior, or both
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
-					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						  int sF,int sU,const FermionField &in, FermionField &out)
 {
  SiteHalfSpinor  tmp;    
  SiteHalfSpinor  chi;    
+  SiteHalfSpinor *chi_p;    
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

-  int num = 0;
-
-  result=zero;
-
  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
-  }
-
-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
+  } else { 
+    chi_p=&buf[SE->_offset];
  }
  
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
-    accumReconXp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
+  spReconXp(result,Uchi);
    
  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
-    accumReconYp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
+  accumReconYp(result,Uchi);

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
-    accumReconZp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
+  accumReconZp(result,Uchi);

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
-    accumReconTp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
+  accumReconTp(result,Uchi);

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
-    accumReconXm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
+  accumReconXm(result,Uchi);

  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
-    accumReconYm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
+  accumReconYm(result,Uchi);
  
  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
-    accumReconZm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
+  accumReconZm(result,Uchi);

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else { 
      spProjTm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
+  accumReconTm(result,Uchi);

-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
-    accumReconTm(result,Uchi);
-    num++;
-  }
-
-  if ( local ) {
-    vstream(out._odata[sF],result*(-0.5));
-  } else if ( num ) { 
-    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
-  }
+  vstream(out._odata[sF],result);
 };

 template<class Impl> 
@@ -593,19 +563,13 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
    spReconTm(result,Uchi);
  }

-  vstream(out._odata[sF],result*(-0.5));
+  vstream(out._odata[sF],result);
 }

-#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
-template<class Impl> 
-void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
-					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					      int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
-{
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-}
-#endif

  FermOpTemplateInstantiate(WilsonKernels);

+template class WilsonKernels<DomainWallRedBlack5dImplF>;		
+template class WilsonKernels<DomainWallRedBlack5dImplD>;
+
 }}
@@ -38,37 +38,56 @@ namespace Grid {
    // Helper routines that implement Wilson stencil for a single site.
    // Common to both the WilsonFermion and WilsonFermion5D
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    class WilsonKernelsStatic { 
+    public:
+      // Run-time switches tested by the Dhop dispatchers to pick the asm / hand-unrolled kernels.
+      static int AsmOpt;  // these are a temporary hack
+      static int HandOpt; // these are a temporary hack
+    };

-    template<class Impl> class WilsonKernels : public FermionOperator<Impl> { 
+    template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
    public:

     INHERIT_IMPL_TYPES(Impl);
     typedef FermionOperator<Impl> Base;
     
    public:
+
     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			   int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+			   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
      
     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			      int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
+			      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);

     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);

+    private:
+     // Specialised variants
+     void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			   int sF,int sU, const FermionField &in, FermionField &out);
+      
+     void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			      int sF,int sU,const FermionField &in,FermionField &out);
+
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);

-     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+
+     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+			      int sF,int sU,const FermionField &in, FermionField &out);
     
-     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-				 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+				 int sF,int sU,const FermionField &in, FermionField &out);
+    public:

     WilsonKernels(const ImplParams &p= ImplParams());
     
@@ -2,6 +2,8 @@

    Grid physics library, www.github.com/paboyle/Grid 

+
+
    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc

    Copyright (C) 2015
@@ -26,320 +28,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
 #include <Grid.h>
-#if defined(AVX512) || defined (IMCI)
-
-#include <simd/Avx512Asm.h>
-
-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-#undef VZERO
-#undef VTIMESI
-#undef VTIMESMINUSI
-
-#define VZERO(A)                  VZEROf(A)
-#define VMOV(A,B)                 VMOVf(A,B)
-#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
-
-#define VADD(A,B,C)               VADDf(A,B,C)
-#define VSUB(A,B,C)               VSUBf(A,B,C)
-#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULf(Uri,Uir,Chi,UChi,Z)
-#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
-
-#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
-#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
-#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
-#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
-
-#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
-#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
-#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
-#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
-
-#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
-#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
-#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
-#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
-
-#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
-#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
-#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
-#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
-
-#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
-#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
-
-#define VPERM0(A,B)               VPERM0f(A,B)
-#define VPERM1(A,B)               VPERM1f(A,B)
-#define VPERM2(A,B)               VPERM2f(A,B)
-#define VPERM3(A,B)               VPERM3f(A,B)
-#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
-
-#define ZEND1(A,B,C)               ZEND1f(A,B,C)
-#define ZEND2(A,B,C)               ZEND2f(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
-#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
-
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 

 namespace Grid {
 namespace QCD {

+
+  ///////////////////////////////////////////////////////////
+  // Default to no assembler implementation
+  ///////////////////////////////////////////////////////////
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
-						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
+					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
-  uint64_t  now;
-  uint64_t first ;
-  int offset,local,perm, ptype;
-  const SiteHalfSpinor *pbuf = & buf[0];
-  const SiteSpinor   *plocal = & in._odata[0];
-  void *pf;
-  int osites = in._grid->oSites();
-
-  
-  StencilEntry *SE;
-
-  //#define STAMP(i) timers[i] = __rdtsc() ; 
-#define STAMP(i) //timers[i] = __rdtsc() ; 
-
-  MASK_REGS;
-
-  first = __rdtsc();
-
-  SE=st.GetEntry(ptype,Xm,ss);
-
-#if 0
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-  LOAD64(%r9,pf);
-  __asm__( 
-	  VPREFETCH(0,%r9)
-	  VPREFETCH(1,%r9)
-	  VPREFETCH(2,%r9)
-	  VPREFETCH(3,%r9)
-	  VPREFETCH(4,%r9)
-	  VPREFETCH(5,%r9)
-	  VPREFETCH(6,%r9)
-	  VPREFETCH(7,%r9)
-	  VPREFETCH(8,%r9)
-	  VPREFETCH(9,%r9)
-	  VPREFETCH(10,%r9)
-	  VPREFETCH(11,%r9) );
-#endif
-
-  // Xm
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-
-  // Prefetch
-  SE=st.GetEntry(ptype,Ym,ss);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-  
-  if ( local ) {
-    XM_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFXM(Xm,pf);
-  }
-  XM_RECON;
-
-  // Ym
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-
-  // Prefetch
-  SE=st.GetEntry(ptype,Zm,ss);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-  
-  if ( local ) {
-    YM_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFYM(Ym,pf);
-  }
-  YM_RECON_ACCUM;
-
-  // Zm
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-
-  // Prefetch
-  SE=st.GetEntry(ptype,Tm,ss);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-  if ( local ) {
-    ZM_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFZM(Zm,pf);
-  }
-  ZM_RECON_ACCUM;
-
-  // Tm
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  SE=st.GetEntry(ptype,Tp,ss);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-
-  if ( local ) {
-    TM_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFTM(Tm,pf);
-  }
-  TM_RECON_ACCUM;
-
-  // Tp
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-
-  // Prefetch
-  SE=st.GetEntry(ptype,Zp,ss);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-  
-  if ( local ) {
-    TP_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFTP(Tp,pf);
-  }
-  TP_RECON_ACCUM;
-
-  // Zp
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-
-  // Prefetch
-  SE=st.GetEntry(ptype,Yp,ss);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-  if ( local ) {
-    ZP_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFZP(Zp,pf);
-  }
-  ZP_RECON_ACCUM;
-
-
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-
-  // Prefetch
-  SE=st.GetEntry(ptype,Xp,ss);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-  
-  if ( local ) {
-    YP_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFYP(Yp,pf);
-  }
-  YP_RECON_ACCUM;
-
-  // Xp
-  perm   = SE->_permute;
-  offset = SE->_offset;
-  local  = SE->_is_local;
-    
-  //  PREFETCH_R(A);
-
-  // Prefetch
-  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-  if ( local ) {
-    XP_PROJMEM(&plocal[offset]);
-    if ( perm) {
-      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(&pbuf[offset]);
-  }
-  {
-    MULT_2SPIN_DIR_PFXP(Xp,pf);
-  }
-  XP_RECON_ACCUM;
-
- debug:
-  SAVE_RESULT(&out._odata[ss]);
-
+  assert(0);
 }

-  template class WilsonKernels<WilsonImplF>;		
-  template class WilsonKernels<WilsonImplD>; 
+#if defined(AVX512) 
+
+
+  ///////////////////////////////////////////////////////////
+  // If we are AVX512 specialise the single precision routine
+  ///////////////////////////////////////////////////////////
+
+#include <simd/Intel512wilson.h>
+#include <simd/Intel512single.h>
+
+static Vector<vComplexF> signs;
+
+int setupSigns(void ){
+  Vector<vComplexF> bother(2);
+  signs = bother;
+  vrsign(signs[0]);
+  visign(signs[1]);
+  return 1;
+}
+static int signInit = setupSigns();
+
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+
+template<>
+void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						     int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+template<>
+void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								   int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

-}}
 #endif
+
+template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							      int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+
+template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+}}
+
@@ -0,0 +1,164 @@
+{
+  int locala,perma, ptypea;
+  int localb,permb, ptypeb;
+  uint64_t basea, baseb;
+  uint64_t basex;
+  const uint64_t plocal =(uint64_t) & in._odata[0];
+
+  //  vComplexF isigns[2] = { signs[0], signs[1] };
+  vComplexF *isigns = &signs[0];
+
+  MASK_REGS;
+
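+  for(int site=0;site<Ns;site++) { // NOTE(review): Ns is the enclosing function's int argument here (outer loop count); it shadows the spin-dimension constant QCD::Ns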
+  for(int s=0;s<Ls;s++) {
+
+  ////////////////////////////////
+  // Xp
+  ////////////////////////////////
+  int ent=ss*8;// 2*Ndim stencil entries per site; the eight GetInfo calls below consume them in order via ent++
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; // fetched one leg ahead: baseb/localb/permb are consumed by the Yp section
+  basex = basea;
+
+  if ( locala ) {
+    LOAD64(%r10,isigns);
+    XM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+  }
+  LOAD64(%r10,isigns);
+  XM_RECON;
+
+  ////////////////////////////////
+  // Yp
+  ////////////////////////////////
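+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; // info for the next (Zp) leg; NOTE(review): direction argument is Xp, not Zp - confirm GetInfo selects by ent only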
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zp
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Xm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    XP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  XP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Ym
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tm
+  ////////////////////////////////
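+  basea = (uint64_t)&out._odata[ss]; // last leg: no further stencil entry to fetch, so the output site address is handed to MULT_2SPIN_DIR_PFTM instead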
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TP_RECON_ACCUM;
+
+  PREFETCH_CHIMU(basex);
+  SAVE_RESULT(&out._odata[ss]);
+
+  
+  ss++;
+  } 
+  sU++;
+  }
+}
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chi_11 = ref()(1)(1);\
    Chi_12 = ref()(1)(2);

+// To splat or not to splat depends on the implementation
 #define MULT_2SPIN(A)\
   auto & ref(U._odata[sU](A));	\
-    U_00 = ref()(0,0);\
-    U_10 = ref()(1,0);\
-    U_20 = ref()(2,0);\
-    U_01 = ref()(0,1);\
-    U_11 = ref()(1,1);				\
-    U_21 = ref()(2,1);\
+   Impl::loadLinkElement(U_00,ref()(0,0));	\
+   Impl::loadLinkElement(U_10,ref()(1,0));	\
+   Impl::loadLinkElement(U_20,ref()(2,0));	\
+   Impl::loadLinkElement(U_01,ref()(0,1));	\
+   Impl::loadLinkElement(U_11,ref()(1,1));	\
+   Impl::loadLinkElement(U_21,ref()(2,1));	\
    UChi_00 = U_00*Chi_00;\
    UChi_10 = U_00*Chi_10;\
    UChi_01 = U_10*Chi_00;\
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_11+= U_11*Chi_11;\
    UChi_02+= U_21*Chi_01;\
    UChi_12+= U_21*Chi_11;\
-    U_00 = ref()(0,2);\
-    U_10 = ref()(1,2);\
-    U_20 = ref()(2,2);\
+    Impl::loadLinkElement(U_00,ref()(0,2));	\
+    Impl::loadLinkElement(U_10,ref()(1,2));	\
+    Impl::loadLinkElement(U_20,ref()(2,2));	\
    UChi_00+= U_00*Chi_02;\
    UChi_10+= U_00*Chi_12;\
    UChi_01+= U_10*Chi_02;\
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_02+= U_20*Chi_02;\
    UChi_12+= U_20*Chi_12;

+
 #define PERMUTE_DIR(dir)			\
      permute##dir(Chi_00,Chi_00);\
      permute##dir(Chi_01,Chi_01);\
@@ -309,546 +311,10 @@ namespace Grid {
 namespace QCD {


-template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
-						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
-{
-  //  std::cout << "Hand op Dhop "<<std::endl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
-  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
-  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
-  
-  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
-  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
-  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
-
-  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
-  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
-  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
-
-  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
-  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
-  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
-
-  REGISTER Simd Chi_00;    // two spinor; 6 regs
-  REGISTER Simd Chi_01;
-  REGISTER Simd Chi_02;
-
-  REGISTER Simd Chi_10;
-  REGISTER Simd Chi_11;
-  REGISTER Simd Chi_12;   // 14 left
-
-  REGISTER Simd UChi_00;  // two spinor; 6 regs
-  REGISTER Simd UChi_01;
-  REGISTER Simd UChi_02;
-
-  REGISTER Simd UChi_10;
-  REGISTER Simd UChi_11;
-  REGISTER Simd UChi_12;  // 8 left
-
-  REGISTER Simd U_00;  // two rows of U matrix
-  REGISTER Simd U_10;
-  REGISTER Simd U_20;  
-  REGISTER Simd U_01;
-  REGISTER Simd U_11;
-  REGISTER Simd U_21;  // 2 reg left.
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-
-  StencilEntry *SE;
-  int offset, ptype;
-  int num = 0;
-
-  // Xp
-  SE=st.GetEntry(ptype,Xp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xp);
-    XP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Yp
-  SE=st.GetEntry(ptype,Yp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Yp);
-    YP_RECON_ACCUM;
-    num++;  
-  }
-
-
-  // Zp
-  SE=st.GetEntry(ptype,Zp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }  
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zp);
-    ZP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tp
-  SE=st.GetEntry(ptype,Tp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tp);
-    TP_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Xm
-  SE=st.GetEntry(ptype,Xm,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xm);
-    XM_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Ym
-  SE=st.GetEntry(ptype,Ym,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Ym);
-    YM_RECON_ACCUM;
-    num++;  
-  }
-
-  // Zm
-  SE=st.GetEntry(ptype,Zm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zm);
-    ZM_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tm
-  SE=st.GetEntry(ptype,Tm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tm);
-    TM_RECON_ACCUM;
-    num++;  
-  }
-
-  SiteSpinor & ref (out._odata[ss]);
-  if ( Local ) {
-    vstream(ref()(0)(0),result_00*(-0.5));
-    vstream(ref()(0)(1),result_01*(-0.5));
-    vstream(ref()(0)(2),result_02*(-0.5));
-    vstream(ref()(1)(0),result_10*(-0.5));
-    vstream(ref()(1)(1),result_11*(-0.5));
-    vstream(ref()(1)(2),result_12*(-0.5));
-    vstream(ref()(2)(0),result_20*(-0.5));
-    vstream(ref()(2)(1),result_21*(-0.5));
-    vstream(ref()(2)(2),result_22*(-0.5));
-    vstream(ref()(3)(0),result_30*(-0.5));
-    vstream(ref()(3)(1),result_31*(-0.5));
-    vstream(ref()(3)(2),result_32*(-0.5));
-    return 1;
-  } else if ( num ) { 
-    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
-    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
-    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
-    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
-    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
-    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
-    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
-    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
-    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
-    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
-    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
-    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
-    return 1;
-  }
-  return 0;
-}
-
-
-
-
-template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
-{
-  //  std::cout << "Hand op Dhop "<<std::endl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
-  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
-  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
-  
-  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
-  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
-  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
-
-  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
-  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
-  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
-
-  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
-  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
-  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
-
-  REGISTER Simd Chi_00;    // two spinor; 6 regs
-  REGISTER Simd Chi_01;
-  REGISTER Simd Chi_02;
-
-  REGISTER Simd Chi_10;
-  REGISTER Simd Chi_11;
-  REGISTER Simd Chi_12;   // 14 left
-
-  REGISTER Simd UChi_00;  // two spinor; 6 regs
-  REGISTER Simd UChi_01;
-  REGISTER Simd UChi_02;
-
-  REGISTER Simd UChi_10;
-  REGISTER Simd UChi_11;
-  REGISTER Simd UChi_12;  // 8 left
-
-  REGISTER Simd U_00;  // two rows of U matrix
-  REGISTER Simd U_10;
-  REGISTER Simd U_20;  
-  REGISTER Simd U_01;
-  REGISTER Simd U_11;
-  REGISTER Simd U_21;  // 2 reg left.
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-
-  StencilEntry *SE;
-  int offset, ptype;
-  int num = 0;
-
-  // Xp
-  SE=st.GetEntry(ptype,Xp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xp);
-    XM_RECON_ACCUM;
-    num++;  
-  }
-
-
-  // Yp
-  SE=st.GetEntry(ptype,Yp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Yp);
-    YM_RECON_ACCUM;
-    num++;  
-  }
-
-
-  // Zp
-  SE=st.GetEntry(ptype,Zp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }  
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zp);
-    ZM_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tp
-  SE=st.GetEntry(ptype,Tp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tp);
-    TM_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Xm
-  SE=st.GetEntry(ptype,Xm,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xm);
-    XP_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Ym
-  SE=st.GetEntry(ptype,Ym,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Ym);
-    YP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Zm
-  SE=st.GetEntry(ptype,Zm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zm);
-    ZP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tm
-  SE=st.GetEntry(ptype,Tm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tm);
-    TP_RECON_ACCUM;
-    num++;  
-  }
-
-  SiteSpinor & ref (out._odata[ss]);
-  if ( Local ) {
-    vstream(ref()(0)(0),result_00*(-0.5));
-    vstream(ref()(0)(1),result_01*(-0.5));
-    vstream(ref()(0)(2),result_02*(-0.5));
-    vstream(ref()(1)(0),result_10*(-0.5));
-    vstream(ref()(1)(1),result_11*(-0.5));
-    vstream(ref()(1)(2),result_12*(-0.5));
-    vstream(ref()(2)(0),result_20*(-0.5));
-    vstream(ref()(2)(1),result_21*(-0.5));
-    vstream(ref()(2)(2),result_22*(-0.5));
-    vstream(ref()(3)(0),result_30*(-0.5));
-    vstream(ref()(3)(1),result_31*(-0.5));
-    vstream(ref()(3)(2),result_32*(-0.5));
-    return 1;
-  } else if ( num ) { 
-    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
-    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
-    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
-    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
-    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
-    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
-    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
-    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
-    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
-    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
-    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
-    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
-    return 1;
-  }
-  return 0;
-}
-
-  /*
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -1073,89 +539,346 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel

  {
    SiteSpinor & ref (out._odata[ss]);
-    vstream(ref()(0)(0),result_00*(-0.5));
-    vstream(ref()(0)(1),result_01*(-0.5));
-    vstream(ref()(0)(2),result_02*(-0.5));
-    vstream(ref()(1)(0),result_10*(-0.5));
-    vstream(ref()(1)(1),result_11*(-0.5));
-    vstream(ref()(1)(2),result_12*(-0.5));
-    vstream(ref()(2)(0),result_20*(-0.5));
-    vstream(ref()(2)(1),result_21*(-0.5));
-    vstream(ref()(2)(2),result_22*(-0.5));
-    vstream(ref()(3)(0),result_30*(-0.5));
-    vstream(ref()(3)(1),result_31*(-0.5));
-    vstream(ref()(3)(2),result_32*(-0.5));
+    vstream(ref()(0)(0),result_00);
+    vstream(ref()(0)(1),result_01);
+    vstream(ref()(0)(2),result_02);
+    vstream(ref()(1)(0),result_10);
+    vstream(ref()(1)(1),result_11);
+    vstream(ref()(1)(2),result_12);
+    vstream(ref()(2)(0),result_20);
+    vstream(ref()(2)(1),result_21);
+    vstream(ref()(2)(2),result_22);
+    vstream(ref()(3)(0),result_30);
+    vstream(ref()(3)(1),result_31);
+    vstream(ref()(3)(2),result_32);
  }
 }
-*/
+
+template<class Impl>
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       int ss,int sU,const FermionField &in, FermionField &out)
+{
+  //  std::cout << "Hand op Dhop "<<std::endl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  REGISTER Simd result_00; // 12 regs on knc
+  REGISTER Simd result_01;
+  REGISTER Simd result_02;
+  
+  REGISTER Simd result_10;
+  REGISTER Simd result_11;
+  REGISTER Simd result_12;
+
+  REGISTER Simd result_20;
+  REGISTER Simd result_21;
+  REGISTER Simd result_22;
+
+  REGISTER Simd result_30;
+  REGISTER Simd result_31;
+  REGISTER Simd result_32; // 20 left
+
+  REGISTER Simd Chi_00;    // two spinor; 6 regs
+  REGISTER Simd Chi_01;
+  REGISTER Simd Chi_02;
+
+  REGISTER Simd Chi_10;
+  REGISTER Simd Chi_11;
+  REGISTER Simd Chi_12;   // 14 left
+
+  REGISTER Simd UChi_00;  // two spinor; 6 regs
+  REGISTER Simd UChi_01;
+  REGISTER Simd UChi_02;
+
+  REGISTER Simd UChi_10;
+  REGISTER Simd UChi_11;
+  REGISTER Simd UChi_12;  // 8 left
+
+  REGISTER Simd U_00;  // two rows of U matrix
+  REGISTER Simd U_10;
+  REGISTER Simd U_20;  
+  REGISTER Simd U_01;
+  REGISTER Simd U_11;
+  REGISTER Simd U_21;  // 2 reg left.
+
+#define Chimu_00 Chi_00
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 UChi_00
+#define Chimu_21 UChi_01
+#define Chimu_22 UChi_02
+#define Chimu_30 UChi_10
+#define Chimu_31 UChi_11
+#define Chimu_32 UChi_12
+
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  
+  // Xp
+  SE=st.GetEntry(ptype,Xp,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+  
+  if ( local ) {
+    LOAD_CHIMU;
+    XP_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+
+  {
+    MULT_2SPIN(Xp);
+  }
+  XP_RECON;
+
+  // Yp
+  SE=st.GetEntry(ptype,Yp,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+  
+  if ( local ) {
+    LOAD_CHIMU;
+    YP_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+  {
+    MULT_2SPIN(Yp);
+  }
+  YP_RECON_ACCUM;
+
+
+  // Zp
+  SE=st.GetEntry(ptype,Zp,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+  
+  if ( local ) {
+    LOAD_CHIMU;
+    ZP_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+  {
+    MULT_2SPIN(Zp);
+  }
+  ZP_RECON_ACCUM;
+
+  // Tp
+  SE=st.GetEntry(ptype,Tp,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+  
+  if ( local ) {
+    LOAD_CHIMU;
+    TP_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+  {
+    MULT_2SPIN(Tp);
+  }
+  TP_RECON_ACCUM;
+  
+  // Xm
+  SE=st.GetEntry(ptype,Xm,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+  
+  if ( local ) {
+    LOAD_CHIMU;
+    XM_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+  {
+    MULT_2SPIN(Xm);
+  }
+  XM_RECON_ACCUM;
+  
+  // Ym
+  SE=st.GetEntry(ptype,Ym,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+  
+  if ( local ) {
+    LOAD_CHIMU;
+    YM_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+  {
+    MULT_2SPIN(Ym);
+  }
+  YM_RECON_ACCUM;
+
+  // Zm
+  SE=st.GetEntry(ptype,Zm,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHIMU;
+    ZM_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+  {
+    MULT_2SPIN(Zm);
+  }
+  ZM_RECON_ACCUM;
+
+  // Tm
+  SE=st.GetEntry(ptype,Tm,ss);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHIMU;
+    TM_PROJ;
+    if ( perm) {
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else { 
+    LOAD_CHI;
+  }
+  {
+    MULT_2SPIN(Tm);
+  }
+  TM_RECON_ACCUM;
+
+  {
+    SiteSpinor & ref (out._odata[ss]);
+    vstream(ref()(0)(0),result_00);
+    vstream(ref()(0)(1),result_01);
+    vstream(ref()(0)(2),result_02);
+    vstream(ref()(1)(0),result_10);
+    vstream(ref()(1)(1),result_11);
+    vstream(ref()(1)(2),result_12);
+    vstream(ref()(2)(0),result_20);
+    vstream(ref()(2)(1),result_21);
+    vstream(ref()(2)(2),result_22);
+    vstream(ref()(3)(0),result_30);
+    vstream(ref()(3)(1),result_31);
+    vstream(ref()(3)(2),result_32);
+  }
+}
+
+
  ////////////////////////////////////////////////
  // Specialise Gparity to simple implementation
  ////////////////////////////////////////////////
 template<>
-int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+							     int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
-  //check consistency of return types between these functions and the ones in WilsonKernels.cc
-  return 0;
-  
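+  assert(0); // Hand-unrolled kernel is not provided for GparityWilsonImplF. NOTE(review): assert is compiled out under NDEBUG, so a release build silently does nothing here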
 }

 template<>
-int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+								int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
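+  assert(0); // Hand-unrolled dagger kernel is not provided for GparityWilsonImplF. NOTE(review): assert is compiled out under NDEBUG, so a release build silently does nothing here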
 }

 template<>
-int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+							     int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
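+  assert(0); // Hand-unrolled kernel is not provided for GparityWilsonImplD. NOTE(review): assert is compiled out under NDEBUG, so a release build silently does nothing here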
 }

 template<>
-int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+								int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
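+  assert(0); // Hand-unrolled dagger kernel is not provided for GparityWilsonImplD. NOTE(review): assert is compiled out under NDEBUG, so a release build silently does nothing here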
 }



-template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+////////////// Wilson ; uses this implementation /////////////////////
+// Need Nc=3 though //
+
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
-template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							       int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
-template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+							       int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
-template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+								  int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
+								  int ss,int sU,const FermionField &in, FermionField &out);


-template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
-template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
-template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
-template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+									 int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
+									 int ss,int sU,const FermionField &in, FermionField &out);
+
+
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+									 int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+									 int ss,int sU,const FermionField &in, FermionField &out);
+

 }}
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 #define INHERIT_GIMPL_TYPES(GImpl) \
    typedef typename GImpl::Simd                           Simd;\
    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
-    typedef typename GImpl::GaugeField               GaugeField;	
+    typedef typename GImpl::GaugeField               GaugeField;\
+    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
+    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;

    // 
    template<class S,int Nrepresentation=Nc>
@@ -62,9 +64,9 @@ template<class Gimpl> class WilsonLoops;

    // Move this elsewhere?
    static inline void AddGaugeLink(GaugeField& U, GaugeLinkField& W, int mu){  // U[mu] += W 
-PARALLEL_FOR_LOOP
+    PARALLEL_FOR_LOOP
      for(auto ss=0;ss<U._grid->oSites();ss++){
-	U._odata[ss]._internal[mu] = U._odata[ss]._internal[mu] + W._odata[ss]._internal;
+	         U._odata[ss]._internal[mu] = U._odata[ss]._internal[mu] + W._odata[ss]._internal;
        }  
    }
    
@@ -92,13 +92,13 @@ public:
    
    // Create integrator, including the smearing policy
    // Smearing policy
-    std::cout << GridLogMessage << " Creating the Stout class\n";
-    double rho = 0.1; // smearing parameter
+    std::cout << GridLogDebug << " Creating the Stout class\n";
+    double rho = 0.1; // smearing parameter, now hardcoded
    int Nsmear = 1;   // number of smearing levels
    Smear_Stout<Gimpl> Stout(rho);
-    std::cout << GridLogMessage << " Creating the SmearedConfiguration class\n";
+    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
    SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
-    std::cout << GridLogMessage << " done\n";
+    std::cout << GridLogDebug << " done\n";
    //////////////
    typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> >  IntegratorType;// change here to change the algorithm
    IntegratorParameters MDpar(20);
@@ -116,27 +116,27 @@ public:

    if ( StartType == HotStart ) {
      // Hot start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::HotConfiguration(pRNG, U);
    } else if ( StartType == ColdStart ) { 
      // Cold start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::ColdConfiguration(pRNG, U);
    } else if ( StartType == TepidStart ) {       
      // Tepid start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::TepidConfiguration(pRNG, U);
    } else if ( StartType == CheckpointStart ) { 
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      // CheckpointRestart
      Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
@@ -144,7 +144,7 @@ public:

    // Attach the gauge field to the smearing Policy and create the fill the smeared set
    // notice that the unit configuration is singular in this procedure
-    std::cout << GridLogMessage << "Filling the smeared set\n";
+    std::cout << GridLogMessage << "Filling the smeared set\n"; 
    SmearingPolicy.set_GaugeField(U);
    
    HybridMonteCarlo<GaugeField,IntegratorType>  HMC(HMCpar, MDynamics,sRNG,pRNG,U); 
@@ -60,6 +60,31 @@ namespace Grid {
      "-Gamma5  ",
      "         "
    };
+    
+    SpinMatrix makeGammaProd(const unsigned int i)
+    {
+      // Ordered product of the gamma matrices selected by the four low bits
+      // of i (bit 0: GammaX, bit 1: GammaY, bit 2: GammaZ, bit 3: GammaT),
+      // right-multiplied in that order onto a matrix initialised with "= 1.".
+      // Higher bits of i are ignored.
+      static const Gamma::GammaMatrix direction[4] =
+      {
+        Gamma::GammaMatrix::GammaX,
+        Gamma::GammaMatrix::GammaY,
+        Gamma::GammaMatrix::GammaZ,
+        Gamma::GammaMatrix::GammaT
+      };
+      SpinMatrix prod;
+      prod = 1.;
+      for (unsigned int bit = 0; bit < 4; ++bit)
+      {
+        if ((i >> bit) & 0x1)
+        {
+          prod = prod*Gamma(direction[bit]);
+        }
+      }
+      return prod;
+    }

    //    void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){
    //      vHalfSpinColourVector hspin;
@@ -82,7 +82,10 @@ namespace QCD {
    GammaMatrix _g;

  };
-
+  
+    // Make gamma products (Chroma convention)
+    SpinMatrix makeGammaProd(const unsigned int i);
+    
    /* Gx
     *  0 0  0  i    
     *  0 0  i  0    
@@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    LatticeMatrix Umu(out._grid);
    for(int mu=0;mu<Nd;mu++){
      LieRandomize(pRNG,Umu,0.01);
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
    }
  }
  static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
    LatticeMatrix Umu(out._grid);
    Umu=1.0;
    for(int mu=0;mu<Nd;mu++){
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
    }
  }

@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 {
  return new GridRedBlackCartesian(FourDimGrid); 
 }
-
+GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
+{ // Forwards to makeFourDimGrid with a SIMD layout of 1 in each of four directions.
+  const std::vector<int> unit_simd(4,1);
+  return makeFourDimGrid(latt,unit_simd,mpi);
+}
 GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;
@@ -58,6 +62,7 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
  return new GridCartesian(latt5,simd5,mpi5); 
 }

+
 GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 }

+
+GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  // Entry 0 of each layout vector describes the Ls direction: extent Ls, SIMD
+  // factor FourDimGrid->Nsimd(), processor count 1.  Entries 1..N4 copy
+  // _fdimensions[] and _processors[] of FourDimGrid with a SIMD factor of 1.
+  int ndim4 = FourDimGrid->_ndimension;
+  std::vector<int> latt5(ndim4+1);
+  std::vector<int> simd5(ndim4+1,1);
+  std::vector<int>  mpi5(ndim4+1,1);
+  latt5[0] = Ls;  simd5[0] = FourDimGrid->Nsimd();
+  for(int d=0;d<ndim4;d++){
+    latt5[d+1] = FourDimGrid->_fdimensions[d];
+     mpi5[d+1] = FourDimGrid->_processors[d];
+  }
+  return new GridCartesian(latt5,simd5,mpi5);
+}
+
+GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  // Same layout vectors as makeFiveDimDWFGrid (entry 0 is the Ls direction and
+  // takes FourDimGrid->Nsimd()); additionally cb5 is 1 in every one of the
+  // N4+1 entries and cbd is 0, both forwarded to GridRedBlackCartesian.
+  int ndim4 = FourDimGrid->_ndimension;
+  int cbd   = 0;
+  std::vector<int> latt5(ndim4+1);
+  std::vector<int> simd5(ndim4+1,1);
+  std::vector<int>  mpi5(ndim4+1,1);
+  std::vector<int>   cb5(ndim4+1,1);
+  latt5[0] = Ls;  simd5[0] = FourDimGrid->Nsimd();
+  for(int d=0;d<ndim4;d++){
+    latt5[d+1] = FourDimGrid->_fdimensions[d];
+     mpi5[d+1] = FourDimGrid->_processors[d];
+  }
+  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
+}
+
+
 }}
@@ -35,9 +35,14 @@ class SpaceTimeGrid {

  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);
+
  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
  static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);

+  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
+  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
+  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
+
 };

 }}
@@ -52,9 +52,9 @@ namespace Grid {
 	// or this-> ; there is no "this" in a static method. This forces explicit Gimpl scope
 	// resolution throughout the usage in this file, and rather defeats the purpose of deriving
 	// from Gimpl.
-	plaq= Gimpl::CovShiftBackward(U[mu],mu,
-				      Gimpl::CovShiftBackward(U[nu],nu,
-							      Gimpl::CovShiftForward (U[mu],mu,U[nu])));
+	plaq = Gimpl::CovShiftBackward(U[mu],mu,
+		   Gimpl::CovShiftBackward(U[nu],nu,
+		   Gimpl::CovShiftForward (U[mu],mu,U[nu])));
      }
      //////////////////////////////////////////////////
      // trace of directed plaquette oriented in mu,nu plane
@@ -100,16 +100,16 @@ namespace Grid {
      //////////////////////////////////////////////////
      // average over all x,y,z,t and over all planes of plaquette
      //////////////////////////////////////////////////
-      static RealD avgPlaquette(const GaugeLorentz &Umu){
-	
-	RealD sumplaq = sumPlaquette(Umu);
-	
-	double vol = Umu._grid->gSites();
-	
-	double faces = (1.0*Nd*(Nd-1))/2.0;
-	
-	return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
-      }
+	static RealD avgPlaquette(const GaugeLorentz &Umu){
+		const RealD  total  = sumPlaquette(Umu);        // value being normalised
+		const double volume = Umu._grid->gSites();
+		const double planes = (1.0*Nd*(Nd-1))/2.0;      // Nd*(Nd-1)/2
+		return total/volume/planes/Nc; // Nd , Nc dependent... FIXME
+	}
+
+      //////////////////////////////////////////////////
+      // average over traced single links
+      //////////////////////////////////////////////////
      static RealD linkTrace(const GaugeLorentz &Umu){
 	std::vector<GaugeMat> U(4,Umu._grid);
 	
@@ -126,47 +126,6 @@ namespace Grid {
 	
 	return p.real()/vol/4.0/3.0;
      };
-      //////////////////////////////////////////////////
-      // the sum over all staples on each site
-      //////////////////////////////////////////////////
-      static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
-	
-	GridBase *grid = Umu._grid;
-	
-	std::vector<GaugeMat> U(4,grid);
-	for(int d=0;d<Nd;d++){
-	  U[d] = PeekIndex<LorentzIndex>(Umu,d);
-	}
-	staple = zero;
-		
-	
-	for(int nu=0;nu<Nd;nu++){
-	  
-	  if(nu != mu) {
-	    
-	    // mu
-	    // ^
-	    // |__>  nu
-	    
-	    //    __ 
-	    //      |
-	    //    __|
-	    //
-	    
-	    staple+=Gimpl::ShiftStaple(Gimpl::CovShiftForward (U[nu],nu, 
-							       Gimpl::CovShiftBackward(U[mu],mu,
-										       Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
-	    
-	    //  __ 
-	    // |   
-	    // |__ 
-	    //
-	    //
-	    staple+=Gimpl::ShiftStaple(Gimpl::CovShiftBackward(U[nu],nu,		  		  
-							       Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
-	  }
-	}
-      }
      
      //////////////////////////////////////////////////
      // the sum over all staples on each site in direction mu,nu
@@ -210,6 +169,51 @@ namespace Grid {
 	}
      }

+//////////////////////////////////////////////////
+// the sum over all staples on each site:
+//   staple (output, overwritten) = sum over nu != mu of the two
+//   covariant-shift products drawn below, each passed through
+//   Gimpl::ShiftStaple(...,mu).  Umu is the input gauge field.
+//////////////////////////////////////////////////
+  static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
+
+    GridBase *grid = Umu._grid;
+
+    // One link field per direction, peeked out of the Lorentz index of Umu
+    std::vector<GaugeMat> U(Nd,grid);
+    for(int d=0;d<Nd;d++){
+      U[d] = PeekIndex<LorentzIndex>(Umu,d);
+    }
+    staple = zero;
+
+    for(int nu=0;nu<Nd;nu++){
+
+      if(nu != mu) {
+
+        // mu
+        // ^
+        // |__>  nu
+
+        //    __
+        //      |
+        //    __|
+        //
+        staple+=Gimpl::ShiftStaple(
+                Gimpl::CovShiftForward (U[nu],nu,
+                Gimpl::CovShiftBackward(U[mu],mu,
+                Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
+
+        //  __
+        // |
+        // |__
+        //
+        staple+=Gimpl::ShiftStaple(
+                Gimpl::CovShiftBackward(U[nu],nu,
+                Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
+      }
+    }
+  }
+

      //////////////////////////////////////////////////
      // the sum over all staples on each site in direction mu,nu, upper part
@@ -247,246 +251,246 @@ namespace Grid {



-
-      //////////////////////////////////////////////////////
-      // Similar to above for rectangle is required
-      //////////////////////////////////////////////////////
-      static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
-      {
-	rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
-	  adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
-	rect = rect + 
+  //////////////////////////////////////////////////////
+  // Similar to above for rectangle is required
+  //////////////////////////////////////////////////////
+  static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
+  {
+    rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
+	adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
+    rect = rect + 
          Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[nu],nu,U[nu]))* // ->||
-	  adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
+      adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
+  }
+  static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
+  { // rect = trace of the matrix field that dirRectangle builds for (mu,nu)
+    GaugeMat untraced(U[0]._grid);
+    dirRectangle(untraced,U,mu,nu);
+    rect = trace(untraced);
+  }
+  static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
+  {
+    LatticeComplex siteRect(U[0]._grid);
+    Rect=zero;
+    for(int mu=1;mu<Nd;mu++){
+      for(int nu=0;nu<mu;nu++){
+	traceDirRectangle(siteRect,U,mu,nu);
+	Rect = Rect + siteRect;
      }
-      static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
-      {
-	GaugeMat sp(U[0]._grid);
-	dirRectangle(sp,U,mu,nu);
-	rect=trace(sp);
-      }
-      static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
-      {
-	LatticeComplex siteRect(U[0]._grid);
-	Rect=zero;
-	for(int mu=1;mu<Nd;mu++){
-	  for(int nu=0;nu<mu;nu++){
-	    traceDirRectangle(siteRect,U,mu,nu);
-	    Rect = Rect + siteRect;
-	  }
-	}
-      }
-      //////////////////////////////////////////////////
-      // sum over all x,y,z,t and over all planes of plaquette
-      //////////////////////////////////////////////////
-      static RealD sumRectangle(const GaugeLorentz &Umu){
-	std::vector<GaugeMat> U(4,Umu._grid);
+    }
+  }

-	for(int mu=0;mu<Nd;mu++){
-	  U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-	}
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of rectangle
+  //////////////////////////////////////////////////
+  static RealD sumRectangle(const GaugeLorentz &Umu){
+    std::vector<GaugeMat> U(Nd,Umu._grid);

-	LatticeComplex Rect(Umu._grid);
+    for(int mu=0;mu<Nd;mu++){
+      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    }
+
+    LatticeComplex Rect(Umu._grid);
    
-	siteRectangle(Rect,U);
+    siteRectangle(Rect,U);
    
-	TComplex Tp = sum(Rect);
-	Complex p  = TensorRemove(Tp);
-	return p.real();
-      }
-      //////////////////////////////////////////////////
-      // average over all x,y,z,t and over all planes of plaquette
-      //////////////////////////////////////////////////
-      static RealD avgRectangle(const GaugeLorentz &Umu){
+    TComplex Tp = sum(Rect);
+    Complex p  = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of rectangle
+  //////////////////////////////////////////////////
+  static RealD avgRectangle(const GaugeLorentz &Umu){

-	RealD sumrect = sumRectangle(Umu);
+    RealD sumrect = sumRectangle(Umu);
    
-	double vol = Umu._grid->gSites();
+    double vol = Umu._grid->gSites();
    
-	double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
+    double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
    
-	return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
-      }
+    return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
+  }

-      //////////////////////////////////////////////////
-      // the sum over all staples on each site
-      //////////////////////////////////////////////////
-      static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
-	U2 = U * Cshift(U,mu,1);
-      }
+  //////////////////////////////////////////////////
+  // Double link: U2 = U * Cshift(U,mu,1)   (U times U shifted by 1 along mu)
+  //////////////////////////////////////////////////
+  static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
+    U2 = U * Cshift(U,mu,1);
+  }

-      ////////////////////////////////////////////////////////////////////////////
-      // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
-      // but need to track two deep where cross boundary and apply a conjugation).
-      // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
-      ////////////////////////////////////////////////////////////////////////////
-      static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
+  ////////////////////////////////////////////////////////////////////////////
+  // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
+  // but need to track two deep where cross boundary and apply a conjugation).
+  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
+  ////////////////////////////////////////////////////////////////////////////
+  static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){

-	Stap = zero;
+    Stap = zero;

-	GridBase *grid = U[0]._grid;
+    GridBase *grid = U[0]._grid;

-	GaugeMat Staple2x1 (grid);
-	GaugeMat tmp (grid);
+    GaugeMat Staple2x1 (grid);
+    GaugeMat tmp (grid);

-	for(int nu=0;nu<Nd;nu++){
-	  if ( nu!=mu) {
+    for(int nu=0;nu<Nd;nu++){
+      if ( nu!=mu) {

-	    // Up staple    ___ ___ 
-	    //             |       |
-	    tmp = Cshift(adj(U[nu]),nu,-1); 
-	    tmp = adj(U2[mu])*tmp;
-	    tmp = Cshift(tmp,mu,-2);
+	// Up staple    ___ ___ 
+	//             |       |
+	tmp = Cshift(adj(U[nu]),nu,-1); 
+	tmp = adj(U2[mu])*tmp;
+	tmp = Cshift(tmp,mu,-2);

-	    Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
+	Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);


-	    // Down staple
-	    //             |___ ___|
-	    //
-	    tmp = adj(U2[mu])*U[nu];
-	    Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
+	// Down staple
+	//             |___ ___|
+	//
+	tmp = adj(U2[mu])*U[nu];
+	Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));


-	    //              ___ ___
-	    //             |    ___|
-	    //             |___ ___|
-	    //
+	//              ___ ___
+	//             |    ___|
+	//             |___ ___|
+	//

-	    Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
+	Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);

-	    //              ___ ___
-	    //             |___    |
-	    //             |___ ___|
-	    //
+	//              ___ ___
+	//             |___    |
+	//             |___ ___|
+	//

-	    //	tmp= Staple2x1* Cshift(U[mu],mu,-2);
-	    //	Stap+= Cshift(tmp,mu,1) ;
-	    Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;
+	//	tmp= Staple2x1* Cshift(U[mu],mu,-2);
+	//	Stap+= Cshift(tmp,mu,1) ;
+	Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;

-	    //       --    
-	    //      |  |              
-	    //          
-	    //      |  | 
+	//       --    
+	//      |  |              
+	//          
+	//      |  | 
 	
-	    tmp = Cshift(adj(U2[nu]),nu,-2);
-	    tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
-	    tmp = U2[nu]*Cshift(tmp,nu,2);
-	    Stap+= Cshift(tmp, mu, 1);
+	tmp = Cshift(adj(U2[nu]),nu,-2);
+	tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
+	tmp = U2[nu]*Cshift(tmp,nu,2);
+	Stap+= Cshift(tmp, mu, 1);

-	    //      |  |              
-	    //          
-	    //      |  | 
-	    //       -- 
+	//      |  |              
+	//          
+	//      |  | 
+	//       -- 
 	
-	    tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
-	    tmp = adj(U2[nu])*tmp;
-	    tmp = Cshift(tmp,nu,-2);
-	    Stap+=Cshift(tmp, mu, 1);
-	  }}
+	tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
+	tmp = adj(U2[nu])*tmp;
+	tmp = Cshift(tmp,nu,-2);
+	Stap+=Cshift(tmp, mu, 1);
+    }}


-      }
+  }

-      static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
-      {
-	RectStapleUnoptimised(Stap,Umu,mu);
-      }
-      static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
-			     std::vector<GaugeMat> &U2,
-			     std::vector<GaugeMat> &U, int mu)
-      {
-	if ( Gimpl::isPeriodicGaugeField() ){ 
-	  RectStapleOptimised(Stap,U2,U,mu);
-	} else {
-	  RectStapleUnoptimised(Stap,Umu,mu);
-	}
-      }
+  static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
+  { // Three-argument form: always forwards to the unoptimised implementation.
+    RectStapleUnoptimised(Stap,Umu,mu);
+  }
+  static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
+			 std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U, int mu)
+  {
+    // The (U2,U) path is taken only when Gimpl reports a periodic gauge field.
+    if ( !Gimpl::isPeriodicGaugeField() ){
+      RectStapleUnoptimised(Stap,Umu,mu);
+      return;
+    }
+    RectStapleOptimised(Stap,U2,U,mu);
+  }

-      static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
-	GridBase *grid = Umu._grid;
+  static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
+    GridBase *grid = Umu._grid;

-	std::vector<GaugeMat> U(4,grid);
-	for(int d=0;d<Nd;d++){
-	  U[d] = PeekIndex<LorentzIndex>(Umu,d);
-	}
+    std::vector<GaugeMat> U(Nd,grid);
+    for(int d=0;d<Nd;d++){
+      U[d] = PeekIndex<LorentzIndex>(Umu,d);
+    }

-	Stap=zero;
+    Stap=zero;

-	for(int nu=0;nu<Nd;nu++){
-	  if ( nu!=mu) {
-	    //           __ ___ 
-	    //          |    __ |
-	    //
-	    Stap+= Gimpl::ShiftStaple(
-				      Gimpl::CovShiftForward (U[mu],mu,
-							      Gimpl::CovShiftForward (U[nu],nu,
-										      Gimpl::CovShiftBackward(U[mu],mu,
-													      Gimpl::CovShiftBackward(U[mu],mu,
-																      Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
+    for(int nu=0;nu<Nd;nu++){
+      if ( nu!=mu) {
+    //           __ ___ 
+    //          |    __ |
+    //
+    Stap+= Gimpl::ShiftStaple(
+		  Gimpl::CovShiftForward (U[mu],mu,
+		  Gimpl::CovShiftForward (U[nu],nu,
+		  Gimpl::CovShiftBackward(U[mu],mu,
+                  Gimpl::CovShiftBackward(U[mu],mu,
+		  Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);

-	    //              __ 
-	    //          |__ __ |
+    //              __ 
+    //          |__ __ |

-	    Stap+= Gimpl::ShiftStaple(
-				      Gimpl::CovShiftForward (U[mu],mu,
-							      Gimpl::CovShiftBackward(U[nu],nu,
-										      Gimpl::CovShiftBackward(U[mu],mu,
-													      Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
+    Stap+= Gimpl::ShiftStaple(
+                  Gimpl::CovShiftForward (U[mu],mu,
+		  Gimpl::CovShiftBackward(U[nu],nu,
+		  Gimpl::CovShiftBackward(U[mu],mu,
+                  Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);

-	    //           __ 
-	    //          |__ __ |
+    //           __ 
+    //          |__ __ |

-	    Stap+= Gimpl::ShiftStaple(
-				      Gimpl::CovShiftBackward(U[nu],nu,
-							      Gimpl::CovShiftBackward(U[mu],mu,
-										      Gimpl::CovShiftBackward(U[mu],mu,
-													      Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
+    Stap+= Gimpl::ShiftStaple(
+		  Gimpl::CovShiftBackward(U[nu],nu,
+		  Gimpl::CovShiftBackward(U[mu],mu,
+		  Gimpl::CovShiftBackward(U[mu],mu,
+		  Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);

-	    //           __ ___ 
-	    //          |__    |
+    //           __ ___ 
+    //          |__    |

-	    Stap+= Gimpl::ShiftStaple(
-				      Gimpl::CovShiftForward (U[nu],nu,
-							      Gimpl::CovShiftBackward(U[mu],mu,
-										      Gimpl::CovShiftBackward(U[mu],mu,
-													      Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
+    Stap+= Gimpl::ShiftStaple(
+		   Gimpl::CovShiftForward (U[nu],nu,
+	           Gimpl::CovShiftBackward(U[mu],mu,
+                   Gimpl::CovShiftBackward(U[mu],mu,
+                   Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);

-	    //       --    
-	    //      |  |              
-	    //          
-	    //      |  | 
+     //       --    
+     //      |  |              
+     //          
+     //      |  | 
     
-	    Stap+= Gimpl::ShiftStaple(
-				      Gimpl::CovShiftForward(U[nu],nu,
-							     Gimpl::CovShiftForward(U[nu],nu,
-										    Gimpl::CovShiftBackward(U[mu],mu,
-													    Gimpl::CovShiftBackward(U[nu],nu,
-																    Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
+    Stap+= Gimpl::ShiftStaple(
+		   Gimpl::CovShiftForward(U[nu],nu,
+		   Gimpl::CovShiftForward(U[nu],nu,
+                   Gimpl::CovShiftBackward(U[mu],mu,
+                   Gimpl::CovShiftBackward(U[nu],nu,
+		   Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);


-	    //      |  |              
-	    //          
-	    //      |  | 
-	    //       -- 
+     //      |  |              
+     //          
+     //      |  | 
+     //       -- 
     
-	    Stap+= Gimpl::ShiftStaple(
-				      Gimpl::CovShiftBackward(U[nu],nu,
-							      Gimpl::CovShiftBackward(U[nu],nu,
-										      Gimpl::CovShiftBackward(U[mu],mu,
-													      Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
-	  }}
-      }
+    Stap+= Gimpl::ShiftStaple(
+		   Gimpl::CovShiftBackward(U[nu],nu,
+		   Gimpl::CovShiftBackward(U[nu],nu,
+                   Gimpl::CovShiftBackward(U[mu],mu,
+                   Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
+    }}
+  }


-    };
+};


-    typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
-    typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
-    typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
-    typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
+ typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
+ typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
+ typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
+ typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;

-  }}
+}}

-#endif
+#endif