Merge branch 'develop' into feature/staggered-comms-compute

Conflicts: lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
2026-06-22 03:33:17 +01:00 · 2018-05-21 13:07:29 +01:00
parent 0e127b1fc7 a61e0df54b
commit 7fbbb31a50
238 changed files with 18155 additions and 7613 deletions
@@ -39,6 +39,7 @@ namespace QCD {
    static const int Zdir = 2;
    static const int Tdir = 3;

+  
    static const int Xp = 0;
    static const int Yp = 1;
    static const int Zp = 2;
@@ -420,15 +421,16 @@ namespace QCD {
    //////////////////////////////////////////////
    // Fermion <-> propagator assignments
    //////////////////////////////////////////////
-    template <class Prop, class Ferm>
-    void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
+    //template <class Prop, class Ferm>
+    template <class Fimpl>
+      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
    {
-        for(int j = 0; j < Ns; ++j)
+      for(int j = 0; j < Ns; ++j)
        {
            auto pjs = peekSpin(p, j, s);
            auto fj  = peekSpin(f, j);
            
-            for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
            {
                pokeColour(pjs, peekColour(fj, i), i, c);
            }
@@ -436,15 +438,16 @@ namespace QCD {
        }
    }
    
-    template <class Prop, class Ferm>
-    void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
+    //template <class Prop, class Ferm>
+    template <class Fimpl>
+      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
    {
        for(int j = 0; j < Ns; ++j)
        {
            auto pjs = peekSpin(p, j, s);
            auto fj  = peekSpin(f, j);
            
-            for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
            {
                pokeColour(fj, peekColour(pjs, i, c), i);
            }
@@ -492,41 +495,17 @@ namespace QCD {
      return traceIndex<ColourIndex>(lhs);
    }

+    //////////////////////////////////////////
+    // Current types
+    //
+    // Serializable enumeration naming the kind of current requested through
+    // the `Current curr_type` argument of ContractConservedCurrent() and
+    // SeqConservedCurrent(): Vector = 0, Axial = 1, Tadpole = 2.
+    // NOTE(review): `undef` is presumably the name GRID_SERIALIZABLE_ENUM
+    // gives to the "unset" value -- confirm against the macro definition.
+    //////////////////////////////////////////
+    GRID_SERIALIZABLE_ENUM(Current, undef,
+                           Vector,  0,
+                           Axial,   1,
+                           Tadpole, 2);
+
 }   //namespace QCD
 } // Grid

-/*
-<<<<<<< HEAD
-#include <Grid/qcd/utils/SpaceTimeGrid.h>
-#include <Grid/qcd/spin/Dirac.h>
-#include <Grid/qcd/spin/TwoSpinor.h>
-#include <Grid/qcd/utils/LinalgUtils.h>
-#include <Grid/qcd/utils/CovariantCshift.h>
-
-// Include representations  
-#include <Grid/qcd/utils/SUn.h>
-#include <Grid/qcd/utils/SUnAdjoint.h>
-#include <Grid/qcd/utils/SUnTwoIndex.h>
-#include <Grid/qcd/representations/hmc_types.h>
-
-// Scalar field
-#include <Grid/qcd/utils/ScalarObjs.h>
-
-#include <Grid/qcd/action/Actions.h>
-
-#include <Grid/qcd/smearing/Smearing.h>
-
-#include <Grid/qcd/hmc/integrators/Integrator.h>
-#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
-#include <Grid/qcd/observables/hmc_observable.h>
-#include <Grid/qcd/hmc/HMC.h>
-
-
-//#include <Grid/qcd/modules/mods.h>
-=======
-
->>>>>>> develop
-*/


 #endif
@@ -52,6 +52,35 @@ namespace QCD {
 { 
 }

+///////////////////////////////////////////////////////////////
+// Physical surface field utilities
+///////////////////////////////////////////////////////////////
+// Reduce a 5d solution to the 4d field handed back to the caller.
+//   solution5d : input; checked to live on FermionGrid()
+//   exported4d : output; checked to live on GaugeGrid()
+// Slice 0 of a 5d temporary is assembled from slice 0 and slice Ls-1 of
+// solution5d and then extracted into exported4d.
+// NOTE(review): going by the helper names, the s=0 contribution is taken
+// through the "pminus" projection and the s=Ls-1 contribution through the
+// "pplus" projection -- confirm against axpby_ssp_pminus/pplus.
+template<class Impl>  
+void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+{
+  int Ls = this->Ls;
+  FermionField tmp(this->FermionGrid());
+  tmp = solution5d;
+  conformable(solution5d._grid,this->FermionGrid());
+  conformable(exported4d._grid,this->GaugeGrid());
+  // tmp[s=0]  = 0*solution5d[0] + 1*(pminus of solution5d[0])
+  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
+  // tmp[s=0] += 1*(pplus of solution5d[Ls-1])
+  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
+  // Only slice 0 of tmp is read back out.
+  ExtractSlice(exported4d, tmp, 0, 0);
+}
+// Build a 5d source from a 4d input field.
+//   input4d    : input; checked to live on GaugeGrid()
+//   imported5d : output; checked to live on FermionGrid()
+// A zeroed 5d temporary receives input4d on slices 0 and Ls-1, each of
+// those two slices is then overwritten in place by the pplus / pminus
+// helper respectively, and Dminus() of the result is written to imported5d.
+// NOTE(review): presumably slice 0 keeps the "plus" chirality and slice
+// Ls-1 the "minus" chirality -- confirm against axpby_ssp_pplus/pminus.
+template<class Impl>  
+void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+{
+  int Ls = this->Ls;
+  FermionField tmp(this->FermionGrid());
+  conformable(imported5d._grid,this->FermionGrid());
+  conformable(input4d._grid   ,this->GaugeGrid());
+  tmp = zero;
+  InsertSlice(input4d, tmp, 0   , 0);
+  InsertSlice(input4d, tmp, Ls-1, 0);
+  // In-place: the zero coefficient drops the unprojected term.
+  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
+  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
+  Dminus(tmp,imported5d);
+}
 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
@@ -73,7 +102,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  this->DW(psi,tmp_f,DaggerYes);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-conj(c[s]) D_W^dag) psi ; tmp_f was filled with DaggerYes
  }
 }

@@ -83,8 +83,13 @@ namespace Grid {
      virtual void   M5D   (const FermionField &psi, FermionField &chi);
      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);

+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
      virtual void   Dminus(const FermionField &psi, FermionField &chi);
      virtual void   DminusDag(const FermionField &psi, FermionField &chi);
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);

      /////////////////////////////////////////////////////
      // Instantiate different versions depending on Impl
@@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
-	a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
      }}
    {
      int lexa = s1+LLs*site;
@@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
-	a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
      }}
    {
      int lexa = s1+LLs*site;
@@ -295,6 +295,27 @@ namespace Grid {
      assert((Ls&0x1)==1); // Odd Ls required
    }

+    // Extract the physical 4d solution from a 5d continued-fraction solution.
+    //   solution5d : input; checked to live on FermionGrid()
+    //   exported4d : output; checked to live on GaugeGrid()
+    // The s = Ls-1 slice of solution5d is copied into exported4d.
+    // The last argument of ExtractSlice is the direction orthogonal to the
+    // slice (the fifth dimension, direction 0 of the 5d grid, exactly as the
+    // CayleyFermion5D version passes it); it is not a slice index, so it has
+    // to be 0 and not Ls-1.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      int Ls = this->Ls;
+      conformable(solution5d._grid,this->FermionGrid());
+      conformable(exported4d._grid,this->GaugeGrid());
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
+    }
+    // Build a 5d source from a 4d input field.
+    //   input4d    : input; checked to live on GaugeGrid()
+    //   imported5d : output; checked to live on FermionGrid()
+    // A zeroed 5d temporary receives input4d on slice s = Ls-1, is multiplied
+    // by Gamma5, and Dminus() of the result is written to imported5d.
+    // The last argument of InsertSlice is the direction orthogonal to the
+    // slice (the fifth dimension, direction 0 of the 5d grid, exactly as the
+    // CayleyFermion5D version passes it); it is not a slice index, so it has
+    // to be 0 and not Ls-1.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      int Ls = this->Ls;
+      conformable(imported5d._grid,this->FermionGrid());
+      conformable(input4d._grid   ,this->GaugeGrid());
+      FermionField tmp(this->FermionGrid());
+      tmp=zero;
+      InsertSlice(input4d, tmp, Ls-1, 0);
+      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
+      this->Dminus(tmp,imported5d);
+    }
+
    FermOpTemplateInstantiate(ContinuedFractionFermion5D);

  }
@@ -65,6 +65,14 @@ namespace Grid {
      // Efficient support for multigrid coarsening
      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
+      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
+      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+
      // Constructors
      ContinuedFractionFermion5D(GaugeField &_Umu,
 				 GridCartesian         &FiveDimGrid,
@@ -475,7 +475,7 @@ namespace QCD {
                        }
                        a0 = a0 + incr;
                        a1 = a1 + incr;
-                        a2 = a2 + sizeof(Simd::scalar_type);
+                        a2 = a2 + sizeof(typename Simd::scalar_type);
                    }
                }

@@ -50,11 +50,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////

 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>     // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
-//#include <Grid/qcd/action/fermion/CloverFermion.h>
+
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
@@ -104,10 +106,33 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermi
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;

+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
+
+// Twisted mass fermion
 typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;

+// Clover fermions
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
+
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
+
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
+
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+
+// Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
@@ -70,7 +70,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>

 #define TwoIndexFermOpTemplateInstantiate(A) \
  template class A<WilsonTwoIndexSymmetricImplF>; \
-  template class A<WilsonTwoIndexSymmetricImplD>; 
+  template class A<WilsonTwoIndexSymmetricImplD>; \
+  template class A<WilsonTwoIndexAntiSymmetricImplF>; \
+  template class A<WilsonTwoIndexAntiSymmetricImplD>;

 #define FermOp5dVecTemplateInstantiate(A) \
  template class A<DomainWallVec5dImplF>;	\
@@ -47,6 +47,7 @@ namespace Grid {
      INHERIT_IMPL_TYPES(Impl);

      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
+      virtual ~FermionOperator(void) = default;

      virtual FermionField &tmp(void) = 0;

@@ -115,6 +116,34 @@ namespace Grid {
      ///////////////////////////////////////////////
      virtual void ImportGauge(const GaugeField & _U)=0;

+      //////////////////////////////////////////////////////////////////////
+      // Conserved currents, either contract at sink or insert sequentially.
+      //
+      // Both are pure virtual: every concrete fermion operator must supply
+      // them (possibly as a "not implemented" stub).
+      //   curr_type : which current (see the Current enumeration)
+      //   mu        : direction index of the current
+      // NOTE(review): q_in_* / q_out look like input and output propagators,
+      // mom a momentum and [tmin,tmax] a time window for the sequential
+      // insertion -- names only; confirm against an implementation.
+      //////////////////////////////////////////////////////////////////////
+      virtual void ContractConservedCurrent(PropagatorField &q_in_1,
+                                            PropagatorField &q_in_2,
+                                            PropagatorField &q_out,
+                                            Current curr_type,
+                                            unsigned int mu)=0;
+      virtual void SeqConservedCurrent(PropagatorField &q_in, 
+                                       PropagatorField &q_out,
+                                       Current curr_type,
+                                       unsigned int mu,
+                                       std::vector<Real> mom,
+                                       unsigned int tmin, 
+                                       unsigned int tmax)=0;
+      ///////////////////////////////////////////////
+      // Physical field import/export
+      //
+      // Default (trivial) implementations: each one simply copies its input
+      // to its output. Operators that need a non-trivial mapping override
+      // these virtuals.
+      ///////////////////////////////////////////////
+      // chi = psi
+      virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; }
+      // chi = psi
+      virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
+      // imported = input
+      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
+      {
+	imported = input;
+      };
+      // exported = solution
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported)
+      {
+	exported=solution;
+      };
    };

  }
@@ -164,6 +164,7 @@ namespace QCD {
    public:

    static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
    static const bool LsVectorised=false;
    static const int Nhcs = Options::Nhcs;

@@ -212,6 +213,13 @@ namespace QCD {
                         StencilImpl &St) {
      mult(&phi(), &U(mu), &chi());
    }
+    
+    // Propagator analogue of multLink: phi = U(mu) * chi for one site,
+    // using the mu-th link of the doubled gauge field.
+    inline void multLinkProp(SitePropagator &phi,
+                             const SiteDoubledGaugeField &U,
+                             const SitePropagator &chi,
+                             int mu) {
+       mult(&phi(), &U(mu), &chi());
+    }
      
    template <class ref>
    inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -254,8 +262,22 @@ namespace QCD {
      GaugeLinkField link(mat._grid);
      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
      PokeIndex<LorentzIndex>(mat,link,mu);
-    }   
+    }  
+    
+    // mat = outerProduct(B, A); unlike InsertForce4D above, no spin trace
+    // is taken, so the result is a full propagator field.
+    inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
+      mat = outerProduct(B,A); 
+    }  
+
+    // mat = trace of P over the spin index, leaving a gauge-link field.
+    inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+      mat = TraceIndex<SpinIndex>(P); 
+    }
      
+    // Copy the first Nd Lorentz components of the doubled gauge field into
+    // mat[0..Nd-1].
+    //   mat : output; must already hold at least Nd link fields, because the
+    //         entries are assigned in place and never appended
+    //   Uds : doubled gauge field to read from
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+      // Indexing past the end of the vector would be undefined behaviour.
+      assert(static_cast<int>(mat.size()) >= Nd);
+      for (int mu = 0; mu < Nd; mu++) {
+        mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
+      }
+    }
+
+
    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
      
      int Ls=Btilde._grid->_fdimensions[0];
@@ -277,27 +299,28 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////////////////
  // Single flavour four spinors with colour index, 5d redblack
  ////////////////////////////////////////////////////////////////////////////////////
-template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
-class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
  public:

-  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
  INHERIT_GIMPL_TYPES(Gimpl);

-  static const int Dimension = Nrepresentation;
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
  static const bool LsVectorised=true;
  static const int Nhcs = Options::Nhcs;
      
  typedef typename Options::_Coeff_t Coeff_t;      
  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
  
-  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
-  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
-  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
-  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
-  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
-  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
-  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
  
  typedef iImplSpinor<Simd>            SiteSpinor;
  typedef iImplPropagator<Simd>        SitePropagator;
@@ -333,14 +356,27 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
                       StencilImpl &St) {
    SiteGaugeLink UU;
-    for (int i = 0; i < Nrepresentation; i++) {
-      for (int j = 0; j < Nrepresentation; j++) {
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
        vsplat(UU()()(i, j), U(mu)()(i, j));
      }
    }
    mult(&phi(), &UU(), &chi());
  }
-      
+
+  // Propagator analogue of multLink for this implementation: every
+  // colour-matrix element of U(mu) is first splatted into a SiteGaugeLink
+  // (same element-by-element vsplat loop as multLink), then phi = UU * chi.
+  inline void multLinkProp(SitePropagator &phi,
+                           const SiteDoubledGaugeField &U,
+                           const SitePropagator &chi,
+                           int mu) {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+  }
+
  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
  {
    SiteScalarGaugeField  ScalarUmu;
@@ -373,6 +409,19 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
    assert(0);
  }

+  // Not implemented for this implementation: always trips assert(0).
+  // If NDEBUG is defined the assert compiles away and `mat` is left untouched.
+  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+    assert(0);
+  } 
+
+  // Not implemented for this implementation: always trips assert(0).
+  // If NDEBUG is defined the assert compiles away and `mat` is left untouched.
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0);
+  }
+
+  // Not implemented for this implementation: always trips assert(0).
+  // If NDEBUG is defined the assert compiles away and `mat` is left untouched.
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0);
+  }
+
+
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {

    assert(0);
@@ -425,25 +474,26 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
-template <class S, int Nrepresentation, class Options=CoeffReal>
-class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
+template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:

- static const int Dimension = Nrepresentation;
+ static const int Dimension = Representation::Dimension;
+ static const bool isFundamental = Representation::isFundamental;
 static const int Nhcs = Options::Nhcs;
 static const bool LsVectorised=false;

- typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
+ typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
 INHERIT_GIMPL_TYPES(Gimpl);

 typedef typename Options::_Coeff_t Coeff_t;
 typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
      
- template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>,   Ngp>;
- template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>,   Ngp>;
- template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>,  Ngp>;
- template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
- template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
+ template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;

 typedef iImplSpinor<Simd>            SiteSpinor;
 typedef iImplPropagator<Simd>        SitePropagator;
@@ -537,7 +587,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
   }
   
 }
-
+    // Fixme: Gparity prop * link
+    // Placeholder only: always trips assert(0). If NDEBUG is defined the
+    // assert compiles away and `phi` is left untouched.
+    inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U,
+                             const SitePropagator &chi, int mu)
+    {
+        assert(0);
+    }

 template <class ref>
 inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -611,6 +666,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
   return;
 }
      
+ // Not implemented for G-parity: always trips assert(0); the intended body
+ // is kept below as a comment. If NDEBUG is defined the assert compiles away
+ // and `mat` is left untouched.
+ inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+   //mat = outerProduct(Btilde, A);
+   assert(0);
+  }
+
+  // Not implemented for G-parity: always trips assert(0). The commented-out
+  // sketch below is disabled code, not active behaviour. If NDEBUG is
+  // defined the assert compiles away and `mat` is left untouched.
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0);
+    /*
+    auto tmp = TraceIndex<SpinIndex>(P);
+    parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+      mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
+    }
+    */
+  }
+
+  // Not implemented for G-parity: always trips assert(0).
+  // If NDEBUG is defined the assert compiles away and `mat` is left untouched.
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0);
+  }
+  
 inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {

   int Ls = Btilde._grid->_fdimensions[0];
@@ -640,6 +714,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:

    typedef RealD  _Coeff_t ;
    static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
    static const bool LsVectorised=false;
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
      
@@ -758,8 +833,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
      GaugeLinkField link(mat._grid);
      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
      PokeIndex<LorentzIndex>(mat,link,mu);
-    }   
-      
+    } 
+          
    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
      assert (0); 
      // Must never hit
@@ -775,6 +850,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
    public:

    static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
    static const bool LsVectorised=true;
    typedef RealD   Coeff_t ;
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@@ -951,29 +1027,33 @@ typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation, CoeffReal > Wilso
 typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF;  // Float
 typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD;  // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
+typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR;   // Real.. whichever prec
+typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float
+typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double
+
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
 
-typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
 
-typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
-typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF;  // Float
-typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD;  // Double
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
 
-typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
-typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
-typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double

 typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
 typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
@@ -569,6 +569,31 @@ void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
 }


+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented.
+// The stub below always trips assert(0). If NDEBUG is defined the assert
+// compiles away and the call returns without writing q_out.
+////////////////////////////////////////////////////////
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                        PropagatorField &q_in_2,
+                                                        PropagatorField &q_out,
+                                                        Current curr_type,
+                                                        unsigned int mu)
+{
+    assert(0);
+}
+
+// Not yet implemented: always trips assert(0). If NDEBUG is defined the
+// assert compiles away and the call returns without writing q_out.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu, 
+                                                         std::vector<Real> mom,
+                                                         unsigned int tmin,
+                                                         unsigned int tmax)
+{
+    assert(0);
+}
+
 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);

  //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
@@ -179,6 +179,22 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS

  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                Current curr_type,
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           Current curr_type,
+                           unsigned int mu, 
+                           std::vector<Real> mom,
+                           unsigned int tmin,
+                           unsigned int tmax);
 };

 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
@@ -619,6 +619,30 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
  MooeeInv(in, out);
 }

+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented: the bodies below are assert(0) placeholders.
+////////////////////////////////////////////////////////
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                         PropagatorField &q_in_2,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu)
+{
+    assert(0);  // Stub: inputs are ignored and q_out is left untouched.
+}
+
+template <class Impl>  // Sequential conserved-current insertion for the 5D staggered action (placeholder).
+void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                          PropagatorField &q_out,
+                                                          Current curr_type,
+                                                          unsigned int mu, 
+                                                          std::vector<Real> mom,
+                                                          unsigned int tmin,
+                                                          unsigned int tmax)
+{
+    assert(0);  // Stub: no argument is read and q_out is never written.
+}

 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
 FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
@@ -212,6 +212,21 @@ namespace QCD {
    // Comms buffer
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities
+    ///////////////////////////////////////////////////////////////
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type,
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in,
+                             PropagatorField &q_out,
+                             Current curr_type,
+                             unsigned int mu, 
+                             std::vector<Real> mom,
+                             unsigned int tmin,
+                             unsigned int tmax);
  };

 }}
@@ -853,7 +853,7 @@ namespace QCD {

              a0 = a0 + incr;
              a1 = a1 + incr;
-              a2 = a2 + sizeof(Simd::scalar_type);
+              a2 = a2 + sizeof(typename Simd::scalar_type);
            }
          }

@@ -396,6 +396,27 @@ namespace Grid {
      amax=zolo_hi;
    }

+    // Extract a 4d field from a 5d solution.
+    //   solution5d : must be conformable with FermionGrid()
+    //   exported4d : must be conformable with GaugeGrid(); receives the extracted slice
+    // NOTE(review): Ls-1 is passed for BOTH trailing ExtractSlice arguments; confirm
+    // against the ExtractSlice signature that the last one is meant to be Ls-1 as well.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      conformable(solution5d._grid, this->FermionGrid());
+      conformable(exported4d._grid, this->GaugeGrid());
+
+      const int ls_minus_1 = this->Ls - 1;
+      ExtractSlice(exported4d, solution5d, ls_minus_1, ls_minus_1);
+    }
+    // Build a 5d source from a 4d input field.
+    //   input4d    : must be conformable with GaugeGrid()
+    //   imported5d : must be conformable with FermionGrid(); receives Dminus applied to
+    //                Gamma5 times the zero-padded 5d field holding input4d
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      conformable(imported5d._grid, this->FermionGrid());
+      conformable(input4d._grid,    this->GaugeGrid());
+
+      const int ls_minus_1 = this->Ls - 1;
+      Gamma G5(Gamma::Algebra::Gamma5);
+
+      // Zero everywhere, then place the 4d input with InsertSlice
+      FermionField source5d(this->FermionGrid());
+      source5d = zero;
+      InsertSlice(input4d, source5d, ls_minus_1, ls_minus_1);
+
+      source5d = G5 * source5d;
+      this->Dminus(source5d, imported5d);
+    }
+
      // Constructors
    template<class Impl>
    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
@@ -70,6 +70,12 @@ namespace Grid {
      // Efficient support for multigrid coarsening
      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+
      // Constructors
      PartialFractionFermion5D(GaugeField &_Umu,
 			       GridCartesian         &FiveDimGrid,
@@ -0,0 +1,243 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+
+    Copyright (C) 2017
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/Eigen/Dense>
+#include <Grid/qcd/spin/Dirac.h>
+
+namespace Grid
+{
+namespace QCD
+{
+
+// Full (*NOT* even-odd) operator: out = Dhop(in) + Mooee(in).
+// Returns norm2 of the result.
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+{
+  // Wilson hopping term is written straight into out
+  out.checkerboard = in.checkerboard;
+  this->Dhop(in, out, DaggerNo);
+
+  // Clover term is evaluated into a scratch field and then accumulated
+  FermionField clover_part(out._grid);
+  Mooee(in, clover_part);
+  out += clover_part;
+
+  return norm2(out);
+}
+
+// Adjoint of the full operator: out = Dhop^dag(in) + MooeeDag(in).
+// Returns norm2 of the result.
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+{
+  // Daggered Wilson hopping term is written straight into out
+  out.checkerboard = in.checkerboard;
+  this->Dhop(in, out, DaggerYes);
+
+  // Daggered clover term is evaluated into a scratch field and then accumulated
+  FermionField clover_part(out._grid);
+  MooeeDag(in, clover_part);
+  out += clover_part;
+
+  return norm2(out);
+}
+
+// Import a gauge field and rebuild every cached clover field:
+//   1. forward _Umu to WilsonFermion<Impl>::ImportGauge,
+//   2. assemble CloverTerm from the six field-strength components plus diag_mass,
+//   3. invert CloverTerm site by site (through Eigen) into CloverTermInv,
+//   4. store even/odd checkerboard copies of the term, its inverse and their adjoints.
+template <class Impl>
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+{
+  WilsonFermion<Impl>::ImportGauge(_Umu);
+  GridBase *grid = _Umu._grid;
+
+  // Field strength terms, mu > nu
+  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid);
+  typename Impl::GaugeLinkField Ex(grid), Ey(grid), Ez(grid);
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Clover operator in colour and spin space; the (anisotropic) clover
+  // coefficients are folded in here: csw_r on the B terms, csw_t on the E terms.
+  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += diag_mass;
+
+  const int lvol = _Umu._grid->lSites();
+  const int DimRep = Impl::Dimension;
+  const int block = Ns * DimRep; // linear size of the per-site spin x colour matrix
+
+  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(block, block);
+  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(block, block);
+
+  std::vector<int> lcoor;
+  typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
+
+  for (int site = 0; site < lvol; ++site)
+  {
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    EigenCloverOp = Eigen::MatrixXcd::Zero(block, block);
+    peekLocalSite(Qx, CloverTerm, lcoor);
+    Qxinv = zero;
+
+    // Flatten (spin j, colour a | spin k, colour b) into a dense matrix
+    for (int j = 0; j < Ns; ++j)
+    {
+      for (int k = 0; k < Ns; ++k)
+      {
+        for (int a = 0; a < DimRep; ++a)
+        {
+          for (int b = 0; b < DimRep; ++b)
+          {
+            EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
+          }
+        }
+      }
+    }
+
+    EigenInvCloverOp = EigenCloverOp.inverse();
+
+    // Unflatten the inverse back into the site tensor
+    for (int j = 0; j < Ns; ++j)
+    {
+      for (int k = 0; k < Ns; ++k)
+      {
+        for (int a = 0; a < DimRep; ++a)
+        {
+          for (int b = 0; b < DimRep; ++b)
+          {
+            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
+          }
+        }
+      }
+    }
+
+    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
+  }
+
+  // Even / odd copies of the term ...
+  pickCheckerboard(Even, CloverTermEven, CloverTerm);
+  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
+
+  // ... of its adjoint ...
+  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+
+  // ... of its inverse ...
+  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
+  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
+
+  // ... and of the adjoint of the inverse
+  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+}
+
+// Clover term applied to in: no dagger, no inverse.
+template <class Impl>
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  const int dag = DaggerNo;
+  const int inv = InverseNo;
+  this->MooeeInternal(in, out, dag, inv);
+}
+
+// Clover term applied to in: daggered, no inverse.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  const int dag = DaggerYes;
+  const int inv = InverseNo;
+  this->MooeeInternal(in, out, dag, inv);
+}
+
+// Clover term applied to in: no dagger, inverse.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  const int dag = DaggerNo;
+  const int inv = InverseYes;
+  this->MooeeInternal(in, out, dag, inv);
+}
+
+// Clover term applied to in: daggered and inverse.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  const int dag = DaggerYes;
+  const int inv = InverseYes;
+  this->MooeeInternal(in, out, dag, inv);
+}
+
+// Apply the clover operator selected by (dag, inv) to in and store it in out.
+//   dag : non-zero selects the adjoint operator
+//   inv : non-zero selects the inverse operator
+// On a checkerboarded grid the cached even/odd field matching in.checkerboard is
+// used directly; on a full grid CloverTerm / CloverTermInv is used and the adjoint
+// is taken on the fly when dag is set.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+{
+  out.checkerboard = in.checkerboard;
+  CloverFieldType *Clover;
+  assert(in.checkerboard == Odd || in.checkerboard == Even);
+
+  // Full (non-checkerboarded) lattice: only the un-daggered fields are cached
+  if (!in._grid->_isCheckerBoarded)
+  {
+    Clover = (inv) ? &CloverTermInv : &CloverTerm;
+    if (dag)
+    {
+      out = adj(*Clover) * in;
+    }
+    else
+    {
+      out = *Clover * in;
+    }
+    return;
+  }
+
+  // Checkerboarded lattice: pick the cached field for this parity
+  const bool odd_parity = (in.checkerboard == Odd);
+  if (dag)
+  {
+    if (odd_parity)
+      Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
+    else
+      Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
+  }
+  else
+  {
+    if (odd_parity)
+      Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
+    else
+      Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
+  }
+  out = *Clover * in;
+
+} // MooeeInternal
+
+
+// Derivative parts: MooDeriv is a placeholder that asserts unconditionally.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+{
+  assert(0);  // not implemented: mat is never written and X, Y, dag are ignored
+}
+
+// Derivative parts: MeeDeriv is a placeholder that asserts unconditionally.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  assert(0); // not implemented yet: mat is never written and U, V, dag are ignored
+}
+
+FermOpTemplateInstantiate(WilsonCloverFermion);
+AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
+//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
+}
+}
@@ -0,0 +1,366 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
+
+    Copyright (C) 2017
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: David Preti <>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
+#define GRID_QCD_WILSON_CLOVER_FERMION_H
+
+#include <Grid/Grid.h>
+
+namespace Grid
+{
+namespace QCD
+{
+
+///////////////////////////////////////////////////////////////////
+// Wilson Clover
+//
+// Operator ( with anisotropy coefficients):
+//
+// Q =   1 + (Nd-1)/xi_0 + m
+//     + W_t + (nu/xi_0) * W_s
+//     - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_s/xi_0) * sum_ss (sigma_ss F_ss)  ]
+//
+// s spatial, t temporal directions.
+// where W_t and W_s are the temporal and spatial components of the
+// Wilson Dirac operator
+//
+// csw_r = csw_t to recover the isotropic version
+//////////////////////////////////////////////////////////////////
+
+template <class Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>
+{
+public:
+  // Types definitions
+  INHERIT_IMPL_TYPES(Impl);
+  template <typename vtype>
+  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
+  typedef iImplClover<Simd> SiteCloverType;
+  typedef Lattice<SiteCloverType> CloverFieldType;
+
+public:
+  typedef WilsonFermion<Impl> WilsonBase;
+
+  virtual void Instantiatable(void){};
+  // Constructor.
+  //   _Umu              : gauge field, imported at the end of construction
+  //   Fgrid / Hgrid     : full and red-black grids; the full-lattice clover fields
+  //                       live on Fgrid, the even/odd ones on Hgrid
+  //   _mass             : bare mass, enters diag_mass
+  //   _csw_r / _csw_t   : clover coefficients; stored internally multiplied by 1/2
+  //                       (csw_r additionally divided by xi_0 when anisotropic)
+  //   clover_anisotropy : anisotropy parameters, also forwarded to WilsonFermion
+  //   impl_p            : implementation parameters forwarded to WilsonFermion
+  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                      GridRedBlackCartesian &Hgrid,
+                      const RealD _mass,
+                      const RealD _csw_r = 0.0,
+                      const RealD _csw_t = 0.0,
+                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
+                      const ImplParams &impl_p = ImplParams())
+      : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy),
+        CloverTerm(&Fgrid),
+        CloverTermInv(&Fgrid),
+        CloverTermEven(&Hgrid),
+        CloverTermOdd(&Hgrid),
+        CloverTermInvEven(&Hgrid),
+        CloverTermInvOdd(&Hgrid),
+        CloverTermDagEven(&Hgrid),
+        CloverTermDagOdd(&Hgrid),
+        CloverTermInvDagEven(&Hgrid),
+        CloverTermInvDagOdd(&Hgrid)
+  {
+    assert(Nd == 4); // require 4 dimensions
+
+    const bool anisotropic = clover_anisotropy.isAnisotropic;
+
+    csw_t = _csw_t * 0.5;
+    csw_r = anisotropic ? _csw_r * 0.5 / clover_anisotropy.xi_0
+                        : _csw_r * 0.5;
+    diag_mass = anisotropic
+                    ? _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0)
+                    : 4.0 + _mass;
+
+    // A vanishing coefficient is allowed but is most likely a configuration slip
+    if (csw_r == 0)
+    {
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
+    }
+    if (csw_t == 0)
+    {
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
+    }
+
+    // Needs csw_r, csw_t and diag_mass, so it must come last
+    ImportGauge(_Umu);
+  }
+
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);
+
+  virtual void Mooee(const FermionField &in, FermionField &out);
+  virtual void MooeeDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInv(const FermionField &in, FermionField &out);
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
+
+  //virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+
+  void ImportGauge(const GaugeField &_Umu);
+
+  // Derivative parts unpreconditioned pseudofermions
+  //
+  // Force of the unpreconditioned operator: the Wilson hopping-term derivative
+  // (DhopDeriv) plus the clover-term derivative, accumulated plane by plane.
+  //   force : output gauge-field force; overwritten
+  //   X, Y  : fermion fields; must share a grid with each other and with force
+  //   dag   : forwarded to DhopDeriv
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  {
+    conformable(X._grid, Y._grid);
+    conformable(X._grid, force._grid);
+    GaugeLinkField force_mu(force._grid), lambda(force._grid);
+    GaugeField clover_force(force._grid);
+    PropagatorField Lambda(force._grid);
+
+    // Guido: Here we are hitting some performance issues:
+    // need to extract the components of the DoubledGaugeField
+    // for each call
+    // Possible solution
+    // Create a vector object to store them? (cons: wasting space)
+    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
+
+    Impl::extractLinkField(U, this->Umu);
+
+    force = zero;
+    // Derivative of the Wilson hopping term
+    this->DhopDeriv(force, X, Y, dag);
+
+    ///////////////////////////////////////////////////////////
+    // Clover term derivative
+    ///////////////////////////////////////////////////////////
+    Impl::outerProductImpl(Lambda, X, Y);
+    //std::cout << "Lambda:" << Lambda << std::endl;
+
+    Gamma::Algebra sigma[] = {
+        Gamma::Algebra::SigmaXY,
+        Gamma::Algebra::SigmaXZ,
+        Gamma::Algebra::SigmaXT,
+        Gamma::Algebra::MinusSigmaXY,
+        Gamma::Algebra::SigmaYZ,
+        Gamma::Algebra::SigmaYT,
+        Gamma::Algebra::MinusSigmaXZ,
+        Gamma::Algebra::MinusSigmaYZ,
+        Gamma::Algebra::SigmaZT,
+        Gamma::Algebra::MinusSigmaXT,
+        Gamma::Algebra::MinusSigmaYT,
+        Gamma::Algebra::MinusSigmaZT};
+
+    /*
+      sigma_{\mu \nu}=
+      | 0         sigma[0]  sigma[1]  sigma[2] |
+      | sigma[3]    0       sigma[4]  sigma[5] |
+      | sigma[6]  sigma[7]     0      sigma[8] |
+      | sigma[9]  sigma[10] sigma[11]   0      |
+    */
+
+    // count walks the off-diagonal entries of the table above in row-major order
+    int count = 0;
+    clover_force = zero;
+    for (int mu = 0; mu < 4; mu++)
+    {
+      force_mu = zero;
+      for (int nu = 0; nu < 4; nu++)
+      {
+        if (mu == nu)
+          continue;
+
+        // Planes that contain the temporal direction carry csw_t, purely spatial
+        // planes carry csw_r, matching the weights used to build CloverTerm.
+        // mu and nu run over 0..3, so the temporal index is Tdir; the previous
+        // comparison against 4 could never be true and csw_t was never applied.
+        const RealD factor = (mu == Tdir || nu == Tdir) ? 2.0 * csw_t
+                                                        : 2.0 * csw_r;
+
+        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
+        count++;
+      }
+
+      pokeLorentz(clover_force, U[mu] * force_mu, mu);
+    }
+    //clover_force *= csw;
+    force += clover_force;
+  }
+
+  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis; the result is the sum of eight staple terms (four added, four subtracted) with lambda or adj(lambda) inserted at different positions
+  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
+  {
+    conformable(lambda._grid, U[0]._grid);
+    GaugeLinkField out(lambda._grid), tmp(lambda._grid);  // out accumulates the result; tmp holds a link already multiplied by lambda / adj(lambda)
+    // insertion in upper staple
+    // please check redundancy of shift operations
+
+    // C1+ : lambda multiplies U[nu] from the left before the forward transport along nu
+    tmp = lambda * U[nu];
+    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C2+ : adj(lambda), shifted along mu, multiplies U[mu] from the right
+    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C3+ : adj(lambda), shifted along nu, multiplies U[nu] from the right (innermost link)
+    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
+
+    // C4+ : unmodified staple, multiplied by lambda from the right
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
+
+    // insertion in lower staple
+    // C1- : lambda shifted along mu multiplies the unmodified staple from the left
+    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C2- : adj(lambda) multiplies the outermost U[nu] from the left
+    tmp = adj(lambda) * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C3- : lambda multiplies the innermost U[nu] from the left
+    tmp = lambda * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
+
+    // C4- : unmodified staple, multiplied by lambda from the right
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
+
+    return out;
+  }
+
+private:
+  // here fixing the 4 dimensions, make it more general?
+
+  RealD csw_r;                                               // Clover coefficient - spatial
+  RealD csw_t;                                               // Clover coefficient - temporal
+  RealD diag_mass;                                           // Mass term
+  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+
+  // eventually these can be compressed into 6x6 blocks instead of the 12x12
+  // using the DeGrand-Rossi basis for the gamma matrices
+  // Spin embedding of the YZ component F: spin entries (0,1), (1,0), (2,3) and
+  // (3,2) are all set to -i*F; every other entry stays zero.
+  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    const int nsites = CloverTerm._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < nsites; ++ss)
+    {
+      const auto minusIF = timesMinusI(F._odata[ss]()());
+      T._odata[ss]()(0, 1) = minusIF;
+      T._odata[ss]()(1, 0) = minusIF;
+      T._odata[ss]()(2, 3) = minusIF;
+      T._odata[ss]()(3, 2) = minusIF;
+    }
+
+    return T;
+  }
+
+  // Spin embedding of the XZ component F: entries (0,1) and (2,3) are set to -F,
+  // entries (1,0) and (3,2) to +F; every other entry stays zero.
+  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    const int nsites = CloverTerm._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < nsites; ++ss)
+    {
+      const auto plusF = F._odata[ss]()();
+      const auto minusF = -F._odata[ss]()();
+      T._odata[ss]()(0, 1) = minusF;
+      T._odata[ss]()(1, 0) = plusF;
+      T._odata[ss]()(2, 3) = minusF;
+      T._odata[ss]()(3, 2) = plusF;
+    }
+
+    return T;
+  }
+
+  // Spin embedding of the XY component F: diagonal entries (0,0) and (2,2) are
+  // set to -i*F, (1,1) and (3,3) to +i*F; every other entry stays zero.
+  CloverFieldType fillCloverXY(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    const int nsites = CloverTerm._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < nsites; ++ss)
+    {
+      const auto minusIF = timesMinusI(F._odata[ss]()());
+      const auto plusIF = timesI(F._odata[ss]()());
+      T._odata[ss]()(0, 0) = minusIF;
+      T._odata[ss]()(1, 1) = plusIF;
+      T._odata[ss]()(2, 2) = minusIF;
+      T._odata[ss]()(3, 3) = plusIF;
+    }
+
+    return T;
+  }
+
+  // Spin embedding of the XT component F: entries (0,1) and (1,0) are set to
+  // +i*F, entries (2,3) and (3,2) to -i*F; every other entry stays zero.
+  CloverFieldType fillCloverXT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    const int nsites = CloverTerm._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < nsites; ++ss)
+    {
+      const auto plusIF = timesI(F._odata[ss]()());
+      const auto minusIF = timesMinusI(F._odata[ss]()());
+      T._odata[ss]()(0, 1) = plusIF;
+      T._odata[ss]()(1, 0) = plusIF;
+      T._odata[ss]()(2, 3) = minusIF;
+      T._odata[ss]()(3, 2) = minusIF;
+    }
+
+    return T;
+  }
+
+  // Spin embedding of the YT component F: entries (0,1) and (3,2) are set to -F,
+  // entries (1,0) and (2,3) to +F; every other entry stays zero.
+  CloverFieldType fillCloverYT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    const int nsites = CloverTerm._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < nsites; ++ss)
+    {
+      const auto plusF = (F._odata[ss]()());
+      const auto minusF = -(F._odata[ss]()());
+      T._odata[ss]()(0, 1) = minusF;
+      T._odata[ss]()(1, 0) = plusF;
+      T._odata[ss]()(2, 3) = plusF;
+      T._odata[ss]()(3, 2) = minusF;
+    }
+
+    return T;
+  }
+
+  // Spin embedding of the ZT component F: diagonal entries (0,0) and (3,3) are
+  // set to +i*F, (1,1) and (2,2) to -i*F; every other entry stays zero.
+  CloverFieldType fillCloverZT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F._grid);
+    T = zero;
+    const int nsites = CloverTerm._grid->oSites();
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < nsites; ++ss)
+    {
+      const auto plusIF = timesI(F._odata[ss]()());
+      const auto minusIF = timesMinusI(F._odata[ss]()());
+      T._odata[ss]()(0, 0) = plusIF;
+      T._odata[ss]()(1, 1) = minusIF;
+      T._odata[ss]()(2, 2) = minusIF;
+      T._odata[ss]()(3, 3) = plusIF;
+    }
+
+    return T;
+  }
+};
+}
+}
+
+#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
@@ -69,39 +69,47 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
-    projector::Proj(buf[o],in,mu,dag);
+  inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
+    SiteHalfSpinor tmp;  // projection target: a local, instead of writing buf[o] directly as before
+    projector::Proj(tmp,in,mu,dag);
+    vstream(buf[o],tmp);  // the only write to buf[o]; presumably a streaming store - confirm against vstream
   }

  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Exchange(SiteHalfSpinor *mp,
-                       SiteHalfSpinor *vp0,
-                       SiteHalfSpinor *vp1,
+  inline void Exchange(SiteHalfSpinor * __restrict__ mp,
+                       const SiteHalfSpinor * __restrict__ vp0,
+                       const SiteHalfSpinor * __restrict__ vp1,
 		       Integer type,Integer o){
-    exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+    SiteHalfSpinor tmp1;  // receives the first exchange() output
+    SiteHalfSpinor tmp2;  // receives the second exchange() output
+    exchange(tmp1,tmp2,vp0[o],vp1[o],type);  // results land in locals first, then are written out below
+    vstream(mp[2*o  ],tmp1);  // outputs occupy two consecutive slots, 2*o and 2*o+1
+    vstream(mp[2*o+1],tmp2);
   }

  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  inline void Decompress(SiteHalfSpinor *out,
-			 SiteHalfSpinor *in, Integer o) {    
+  inline void Decompress(SiteHalfSpinor * __restrict__ out,
+			 SiteHalfSpinor * __restrict__ in, Integer o) {    
     assert(0);  // no decompression step in this compressor: out and in are never touched
   }

  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
-  inline void CompressExchange(SiteHalfSpinor *out0,
-			       SiteHalfSpinor *out1,
-			       const SiteSpinor *in,
+  inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
+			       SiteHalfSpinor * __restrict__ out1,
+			       const SiteSpinor * __restrict__ in,
 			       Integer j,Integer k, Integer m,Integer type){
     SiteHalfSpinor temp1, temp2,temp3,temp4;  // temp1/temp2: projections of in[k], in[m]; temp3/temp4: exchange outputs
     projector::Proj(temp1,in[k],mu,dag);
     projector::Proj(temp2,in[m],mu,dag);
-    exchange(out0[j],out1[j],temp1,temp2,type);
+    exchange(temp3,temp4,temp1,temp2,type);  // exchange into locals rather than directly into out0[j] / out1[j]
+    vstream(out0[j],temp3);
+    vstream(out1[j],temp4);
   }

  /*****************************************************/
@@ -265,7 +273,6 @@ public:
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
  }
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

  WilsonStencil(GridBase *grid,
 		int npoints,
@@ -47,7 +47,8 @@ int WilsonFermionStatic::HandOptDslash;
 template <class Impl>
 WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                                   GridRedBlackCartesian &Hgrid, RealD _mass,
-                                   const ImplParams &p)
+                                   const ImplParams &p,
+                                   const WilsonAnisotropyCoefficients &anis)
    : Kernels(p),
      _grid(&Fgrid),
      _cbgrid(&Hgrid),
@@ -60,16 +61,41 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      Umu(&Fgrid),
      UmuEven(&Hgrid),
      UmuOdd(&Hgrid),
-      _tmp(&Hgrid)
+      _tmp(&Hgrid),
+      anisotropyCoeff(anis)
 {
  // Allocate the required comms buffer
  ImportGauge(_Umu);
+  if  (anisotropyCoeff.isAnisotropic){
+    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+  } else {
+    diag_mass = 4.0 + mass;
+  }
+
+
 }

 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
  GaugeField HUmu(_Umu._grid);
-  HUmu = _Umu * (-0.5);
+
+  //Here multiply the anisotropy coefficients
+  if (anisotropyCoeff.isAnisotropic)
+  {
+
+    for (int mu = 0; mu < Nd; mu++)
+    {
+      GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
+      if (mu != anisotropyCoeff.t_direction)
+        U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+
+      PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
+    }
+  }
+  else
+  {
+    HUmu = _Umu * (-0.5);
+  }
  Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
  pickCheckerboard(Even, UmuEven, Umu);
  pickCheckerboard(Odd, UmuOdd, Umu);
@@ -83,14 +109,14 @@ template <class Impl>
 RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
   out.checkerboard = in.checkerboard;
   Dhop(in, out, DaggerNo);  // hopping term goes into out first
-  return axpy_norm(out, 4 + mass, in, out);
+  return axpy_norm(out, diag_mass, in, out);  // diag_mass replaces the hard-coded 4 + mass as the coefficient of in
 }

 template <class Impl>
 RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
   out.checkerboard = in.checkerboard;
   Dhop(in, out, DaggerYes);  // daggered hopping term goes into out first
-  return axpy_norm(out, 4 + mass, in, out);
+  return axpy_norm(out, diag_mass, in, out);  // diag_mass replaces the hard-coded 4 + mass as the coefficient of in
 }

 template <class Impl>
@@ -114,7 +140,7 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
 template <class Impl>
 void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
   out.checkerboard = in.checkerboard;
-  typename FermionField::scalar_type scal(4.0 + mass);
+  typename FermionField::scalar_type scal(diag_mass);  // diagonal coefficient now comes from diag_mass instead of 4.0 + mass
   out = scal * in;
 }

@@ -127,7 +153,7 @@ void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
 template<class Impl>
 void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
   out.checkerboard = in.checkerboard;
-  out = (1.0/(4.0+mass))*in;
+  out = (1.0/(diag_mass))*in;  // inverse of the diagonal coefficient; no guard against diag_mass == 0
 }
  
 template<class Impl>
@@ -204,7 +230,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,

  FermionField Btilde(B._grid);
  FermionField Atilde(B._grid);
-  Atilde = A;
+  Atilde = A;//redundant

  st.HaloExchange(B, compressor);

@@ -429,6 +455,112 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
 };
 /*Change ends */

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially.
+ ******************************************************************************/
+/**
+ * Contracts two propagators through the Wilson conserved current in
+ * direction mu, writing the site-wise result into q_out (zeroed first).
+ *
+ * q_in_1, q_in_2 : propagator fields; must be conformable with _grid.
+ * q_out          : output field; must be conformable with _grid.
+ * curr_type      : not used by this 4D implementation.
+ * mu             : direction index handed to Cshift and the site kernels.
+ */
+template <class Impl>
+void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                   PropagatorField &q_in_2,
+                                                   PropagatorField &q_out,
+                                                   Current curr_type,
+                                                   unsigned int mu)
+{
+    conformable(_grid, q_in_1._grid);
+    conformable(_grid, q_in_2._grid);
+    conformable(_grid, q_out._grid);
+    PropagatorField tmp1(_grid), tmp2(_grid);
+    q_out = zero;
+
+    // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
+    // Inefficient comms method but not performance critical.
+    tmp1 = Cshift(q_in_1, mu, 1);
+    tmp2 = Cshift(q_in_2, mu, 1);
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Both kernels accumulate into the same output site.
+        Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
+                                                 q_in_2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+        Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
+                                                 tmp2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+    }
+}
+
+/**
+ * Sequential insertion of the Wilson conserved current in direction mu.
+ *
+ * The input propagator is multiplied by the phase
+ * exp(2*pi*i * sum_nu mom[nu] * x_nu / L_nu), summed over the first Nd - 1
+ * directions, and the forward/backward site kernels are applied only on
+ * SIMD sites that contain a timeslice in [tmin, tmax].
+ *
+ * q_in      : input propagator; must be conformable with _grid.
+ * q_out     : output propagator (zeroed first); conformable with _grid.
+ * curr_type : not used by this 4D implementation.
+ * mu        : current direction; when mu == Tp the backward time window is
+ *             shifted by one slice.
+ * mom       : momentum components; at least Nd - 1 entries are read.
+ * tmin,tmax : inclusive timeslice window for the insertion.
+ */
+template <class Impl>
+void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
+                                              PropagatorField &q_out,
+                                              Current curr_type,
+                                              unsigned int mu,
+                                              std::vector<Real> mom,
+                                              unsigned int tmin, 
+                                              unsigned int tmax)
+{
+    conformable(_grid, q_in._grid);
+    conformable(_grid, q_out._grid);
+    // The momentum loop below reads mom[0 .. Nd-2]; reject shorter vectors
+    // instead of reading out of bounds.
+    assert(static_cast<int>(mom.size()) >= Nd - 1);
+    Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
+    ComplexD i(0.0,1.0);
+    PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    // Time extent of the grid this operator lives on (not the command-line
+    // default lattice), so the wrap-around test below matches `coords`.
+    unsigned int LLt    = _grid->_fdimensions[Tp];
+
+    // Momentum projection. The loop index is named nu so that it does not
+    // shadow the current direction mu.
+    ph = zero;
+    for(unsigned int nu = 0; nu < Nd - 1; nu++)
+    {
+        LatticeCoordinate(coor, nu);
+        ph = ph + mom[nu]*coor*((1./(_grid->_fdimensions[nu])));
+    }
+    ph = exp((RealD)(2*M_PI)*i*ph);
+
+    q_out = zero;
+    LatticeInteger coords(_grid);
+    LatticeCoordinate(coords, Tp);
+
+    // Need q(x + mu) and q(x - mu).
+    tmp = Cshift(q_in, mu, 1);
+    tmpFwd = tmp*ph;
+    tmp = ph*q_in;
+    tmpBwd = Cshift(tmp, mu, -1);
+
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Compute the sequential conserved current insertion only if our simd
+        // object contains a timeslice we need.
+        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
+                             (coords._odata[sU] <= tmax));
+        Integer timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU], 
+                                                q_out._odata[sU], 
+                                                Umu, sU, mu, t_mask);
+        }
+
+        // Repeat for backward direction.
+        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
+                      (coords._odata[sU] <= (tmax + tshift)));
+
+        // If tmax is the last timeslice and the window is shifted in time
+        // (mu == Tp), the shifted window wraps around: include timeslice 0.
+        unsigned int t0 = 0;
+        if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+
+        timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU], 
+                                                q_out._odata[sU], 
+                                                Umu, sU, mu, t_mask);
+        }
+    }
+}
+
 FermOpTemplateInstantiate(WilsonFermion);
 AdjointFermOpTemplateInstantiate(WilsonFermion);
 TwoIndexFermOpTemplateInstantiate(WilsonFermion);
@@ -44,6 +44,21 @@ class WilsonFermionStatic {
  static const int npoint = 8;
 };

+ // Serialisable parameter set describing lattice anisotropy for the Wilson
+ // operator. Members:
+ //   isAnisotropic : enables the anisotropic link rescaling.
+ //   t_direction   : index of the direction treated as temporal.
+ //   xi_0, nu      : coefficients; the gauge-link setup code in this patch
+ //                   scales links with mu != t_direction by nu / xi_0.
+ // The default constructor yields the isotropic case: isAnisotropic = false,
+ // t_direction = Nd - 1, xi_0 = nu = 1.0.
+ struct WilsonAnisotropyCoefficients: Serializable
+ {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
+  bool, isAnisotropic,
+  int, t_direction,
+  double, xi_0,
+  double, nu);
+
+  WilsonAnisotropyCoefficients():
+    isAnisotropic(false), 
+    t_direction(Nd-1), 
+    xi_0(1.0), 
+    nu(1.0){}
+};
+
 template <class Impl>
 class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 public:
@@ -65,8 +80,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
-  RealD M(const FermionField &in, FermionField &out);
-  RealD Mdag(const FermionField &in, FermionField &out);
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);

  /////////////////////////////////////////////////////////
  // half checkerboard operations
@@ -123,8 +138,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-                GridRedBlackCartesian &Hgrid, RealD _mass,
-                const ImplParams &p = ImplParams());
+                GridRedBlackCartesian &Hgrid, RealD _mass, 
+                const ImplParams &p = ImplParams(), 
+                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );

  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);
@@ -138,6 +154,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  virtual RealD Mass(void) { return mass; }
  virtual int   isTrivialEE(void) { return 1; };
  RealD mass;
+  RealD diag_mass;

  GridBase *_grid;
  GridBase *_cbgrid;
@@ -154,6 +171,24 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
+
+  WilsonAnisotropyCoefficients anisotropyCoeff;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                Current curr_type,
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           Current curr_type,
+                           unsigned int mu, 
+                           std::vector<Real> mom,
+                           unsigned int tmin,
+                           unsigned int tmax);
 };

 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@@ -12,6 +12,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Andrew Lawson <andrew.lawson1991@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -701,6 +702,168 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe

 }

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially.
+ ******************************************************************************/
+
+// Helper macro to reverse the order of the SIMD lanes of a site object.
+// qSite is extracted into Nsimd scalar objects, entries i and Nsimd - i - 1
+// are swapped, and the result is merged into qSiteRev. It expects a type
+// named SitePropagator to be visible where the macro is expanded.
+// Fixme: slow, generic implementation.
+#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
+{ \
+    std::vector<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
+    extract(qSite, qSiteVec); \
+    for (int i = 0; i < Nsimd / 2; ++i) \
+    { \
+        typename SitePropagator::scalar_object tmp = qSiteVec[i]; \
+        qSiteVec[i] = qSiteVec[Nsimd - i - 1]; \
+        qSiteVec[Nsimd - i - 1] = tmp; \
+    } \
+    merge(qSiteRev, qSiteVec); \
+}
+
+/**
+ * 5D version of the conserved current contraction.
+ *
+ * q_in_1, q_in_2 live on FermionGrid(); q_out lives on _FourDimGrid and is
+ * zeroed first. For every 4D outer site sU the loop walks q_in_1 upwards in
+ * the fifth dimension (sF1) while walking q_in_2 downwards (sF2), so slice s
+ * of q_in_1 is paired with slice LLs - 1 - s of q_in_2, and all slices are
+ * accumulated into the single 4D site q_out._odata[sU].
+ * When curr_type is Current::Axial, the sign of the contribution is
+ * switched for s < LLs / 2.
+ */
+template <class Impl>
+void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                     PropagatorField &q_in_2,
+                                                     PropagatorField &q_out,
+                                                     Current curr_type,
+                                                     unsigned int mu)
+{
+    conformable(q_in_1._grid, FermionGrid());
+    conformable(q_in_1._grid, q_in_2._grid);
+    conformable(_FourDimGrid, q_out._grid);
+    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
+    unsigned int LLs = q_in_1._grid->_rdimensions[0];
+    q_out = zero;
+
+    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
+    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
+    tmp1 = Cshift(q_in_1, mu + 1, 1);
+    tmp2 = Cshift(q_in_2, mu + 1, 1);
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        unsigned int sF1 = sU * LLs;
+        unsigned int sF2 = (sU + 1) * LLs - 1;
+
+        for (unsigned int s = 0; s < LLs; ++s)
+        {
+            bool axial_sign = ((curr_type == Current::Axial) && \
+                               (s < (LLs / 2)));
+            SitePropagator qSite2, qmuSite2;
+
+            // If vectorised in 5th dimension, reverse q2 vector to match up
+            // sites correctly.
+            if (Impl::LsVectorised)
+            {
+                REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
+                REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
+            }
+            else
+            {
+                qSite2   = q_in_2._odata[sF2];
+                qmuSite2 = tmp2._odata[sF2];
+            }
+            Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1], 
+                                                     qSite2, 
+                                                     q_out._odata[sU],
+                                                     Umu, sU, mu, axial_sign);
+            Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
+                                                     qmuSite2,
+                                                     q_out._odata[sU],
+                                                     Umu, sU, mu, axial_sign);
+            sF1++;
+            sF2--;
+        }
+    }
+}
+
+
+/**
+ * 5D sequential insertion of the conserved current in 4D direction mu.
+ *
+ * The input propagator is multiplied by the phase
+ * exp(2*pi*i * sum_nu mom[nu] * x_nu / L_nu), summed over the first Nd - 1
+ * four-dimensional directions, and the forward/backward site kernels are
+ * applied to every fifth-dimension slice of each 4D outer site whose SIMD
+ * object contains a timeslice in [tmin, tmax].
+ *
+ * q_in      : input propagator on FermionGrid().
+ * q_out     : output propagator (zeroed first); conformable with q_in.
+ * curr_type : for Current::Axial the sign is switched for s < LLs / 2.
+ * mu        : 4D current direction; when mu == Tp the backward time window
+ *             is shifted by one slice.
+ * mom       : momentum components; at least Nd - 1 entries are read.
+ * tmin,tmax : inclusive timeslice window for the insertion.
+ */
+template <class Impl>
+void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
+                                                PropagatorField &q_out,
+                                                Current curr_type, 
+                                                unsigned int mu,
+                                                std::vector<Real> mom,
+                                                unsigned int tmin, 
+                                                unsigned int tmax)
+{
+    conformable(q_in._grid, FermionGrid());
+    conformable(q_in._grid, q_out._grid);
+    // The momentum loop below reads mom[0 .. Nd-2]; reject shorter vectors
+    // instead of reading out of bounds.
+    assert(static_cast<int>(mom.size()) >= Nd - 1);
+    Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
+    PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
+                    tmp(FermionGrid());
+    ComplexD i(0.0, 1.0);
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    unsigned int LLs = q_in._grid->_rdimensions[0];
+    // Time extent of the 4D grid this operator lives on (not the
+    // command-line default lattice), so the wrap-around test below matches
+    // `coords`.
+    unsigned int LLt    = _FourDimGrid->_fdimensions[Tp];
+
+    // Momentum projection.
+    ph = zero;
+    for(unsigned int nu = 0; nu < Nd - 1; nu++)
+    {
+        // Shift coordinate lattice index by 1 to account for 5th dimension.
+        LatticeCoordinate(coor, nu + 1);
+        ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
+    }
+    ph = exp((RealD)(2*M_PI)*i*ph);
+
+    q_out = zero;
+    LatticeInteger coords(_FourDimGrid);
+    LatticeCoordinate(coords, Tp);
+
+    // Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
+    // by one.
+    tmp = Cshift(q_in, mu + 1, 1);
+    tmpFwd = tmp*ph;
+    tmp = ph*q_in;
+    tmpBwd = Cshift(tmp, mu + 1, -1);
+
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Compute the sequential conserved current insertion only if our simd
+        // object contains a timeslice we need.
+        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
+                             (coords._odata[sU] <= tmax));
+        Integer timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            unsigned int sF = sU * LLs;
+            for (unsigned int s = 0; s < LLs; ++s)
+            {
+                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
+                Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sF], 
+                                                    q_out._odata[sF], Umu, sU,
+                                                    mu, t_mask, axial_sign);
+                ++sF;
+            }
+        }
+
+        // Repeat for backward direction.
+        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
+                      (coords._odata[sU] <= (tmax + tshift)));
+
+        // If tmax is the last timeslice and the window is shifted in time
+        // (mu == Tp), the shifted window wraps around: include timeslice 0.
+        unsigned int t0 = 0;
+        if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+
+        timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            unsigned int sF = sU * LLs;
+            for (unsigned int s = 0; s < LLs; ++s)
+            {
+                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
+                Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sF], 
+                                                    q_out._odata[sF], Umu, sU,
+                                                    mu, t_mask, axial_sign);
+                ++sF;
+            }
+        }
+    }
+}
+
 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
  
@@ -214,6 +214,21 @@ namespace QCD {
    // Comms buffer
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities
+    ///////////////////////////////////////////////////////////////
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type, 
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in,
+                             PropagatorField &q_out,
+                             Current curr_type,
+                             unsigned int mu,
+                             std::vector<Real> mom,
+                             unsigned int tmin,
+                             unsigned int tmax);
  };

 }}
@@ -281,6 +281,172 @@ void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal
  vstream(out._odata[sF], result);
 }

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially. Common to both 4D and 5D.
+ ******************************************************************************/
+// N.B. Functions below assume a -1/2 factor within U.
+// WilsonCurrentFwd expands to (expr - g[mu]*expr), WilsonCurrentBwd to
+// (expr + g[mu]*expr). The expr argument is parenthesised so that a compound
+// expression (e.g. a + b) is multiplied by the gamma matrix as a whole.
+#define WilsonCurrentFwd(expr, mu) ((expr) - Gamma::gmu[mu]*(expr))
+#define WilsonCurrentBwd(expr, mu) ((expr) + Gamma::gmu[mu]*(expr))
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteFwd
+ * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_1 shifted in +ve mu direction.
+ *        - The link at outer site sU, index mu, is applied to q_in_1 through
+ *          Impl::multLinkProp; q_in_2 enters as g5 * adj(q_in_2) * g5.
+ *        - The result is accumulated into q_out: added by default,
+ *          subtracted when switch_sign is true.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
+                                                  const SitePropagator &q_in_1,
+                                                  const SitePropagator &q_in_2,
+                                                  SitePropagator &q_out,
+                                                  DoubledGaugeField &U,
+                                                  unsigned int sU,
+                                                  unsigned int mu,
+                                                  bool switch_sign)
+{
+    SitePropagator result, tmp;
+    Gamma g5(Gamma::Algebra::Gamma5);
+    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
+    result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
+    if (switch_sign)
+    {
+        q_out -= result;
+    }
+    else
+    {
+        q_out += result;
+    }
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteBwd
+ * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_2 shifted in +ve mu direction.
+ *        - The link at outer site sU, index mu + Nd, is applied to q_in_1
+ *          through Impl::multLinkProp; q_in_2 enters as g5 * adj(q_in_2) * g5.
+ *        - The result is accumulated into q_out with the opposite sign to
+ *          the forward kernel: subtracted by default, added when switch_sign
+ *          is true.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
+                                                  const SitePropagator &q_in_1,
+                                                  const SitePropagator &q_in_2,
+                                                  SitePropagator &q_out,
+                                                  DoubledGaugeField &U,
+                                                  unsigned int sU,
+                                                  unsigned int mu,
+                                                  bool switch_sign)
+{
+    SitePropagator result, tmp;
+    Gamma g5(Gamma::Algebra::Gamma5);
+    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
+    result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
+    if (switch_sign)
+    {
+        q_out += result;
+    }
+    else
+    {
+        q_out -= result;
+    }
+}
+
+// G-parity requires a more specialised implementation: for the G-parity
+// implementations listed below, both site contraction kernels are explicitly
+// specialised to abort via assert(0).
+// Parameter names follow the WilsonKernels declarations (sU before mu).
+#define NO_CURR_SITE(Impl) \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ \
+    assert(0); \
+} \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ \
+    assert(0); \
+}
+
+NO_CURR_SITE(GparityWilsonImplF);
+NO_CURR_SITE(GparityWilsonImplD);
+NO_CURR_SITE(GparityWilsonImplFH);
+NO_CURR_SITE(GparityWilsonImplDF);
+
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteFwd
+ * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in +ve mu direction.
+ *        - The link at outer site sU, index mu, is applied to q_in through
+ *          Impl::multLinkProp.
+ *        - t_mask selects which entries of the site object are kept; the
+ *          others are replaced by 0.*result before accumulation.
+ *        - The result is added to q_out by default, subtracted when
+ *          switch_sign is true.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeField &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vInteger t_mask,
+                                                     bool switch_sign)
+{
+    SitePropagator result;
+    Impl::multLinkProp(result, U._odata[sU], q_in, mu);
+    result = WilsonCurrentFwd(result, mu);
+
+    // Zero any unwanted timeslice entries.
+    result = predicatedWhere(t_mask, result, 0.*result);
+
+    if (switch_sign)
+    {
+        q_out -= result;
+    }
+    else
+    {
+        q_out += result;
+    }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteBwd
+ * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in -ve mu direction.
+ *        - The link at outer site sU, index mu + Nd, is applied to q_in
+ *          through Impl::multLinkProp.
+ *        - t_mask selects which entries of the site object are kept; the
+ *          others are replaced by 0.*result before accumulation.
+ *        - The result is subtracted from q_out by default, added when
+ *          switch_sign is true (opposite to the forward kernel).
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeField &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vInteger t_mask,
+                                                     bool switch_sign)
+{
+    SitePropagator result;
+    Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
+    result = WilsonCurrentBwd(result, mu);
+
+    // Zero any unwanted timeslice entries.
+    result = predicatedWhere(t_mask, result, 0.*result);
+
+    if (switch_sign)
+    {
+        q_out += result;
+    }
+    else
+    {
+        q_out -= result;
+    }
+}
+
 FermOpTemplateInstantiate(WilsonKernels);
 AdjointFermOpTemplateInstantiate(WilsonKernels);
 TwoIndexFermOpTemplateInstantiate(WilsonKernels);
@@ -55,7 +55,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
 public:

  template <bool EnableBool = true>
-  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
+  typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
  {
@@ -99,7 +99,7 @@ public:
  }
     
  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
+  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 	   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
    // no kernel choice  
@@ -116,7 +116,7 @@ public:
  }
     
  template <bool EnableBool = true>
-  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
+  typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 	      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
 {
@@ -161,7 +161,7 @@ public:
  }

  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
+  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {

@@ -180,6 +180,38 @@ public:
  void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
      
+  //////////////////////////////////////////////////////////////////////////////
+  // Utilities for inserting Wilson conserved current.
+  //////////////////////////////////////////////////////////////////////////////
+  void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,
+                                       unsigned int mu,
+                                       bool switch_sign = false);
+  void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,
+                                       unsigned int mu,
+                                       bool switch_sign = false);
+  void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,
+                                  unsigned int mu,
+                                  vInteger t_mask,
+                                  bool switch_sign = false);
+  void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,
+                                  unsigned int mu,
+                                  vInteger t_mask,
+                                  bool switch_sign = false);
+
 private:
     // Specialised variants
  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -30,181 +30,60 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define REGISTER

-#define LOAD_CHIMU_BODY(F)			\
-  Chimu_00=ref(F)(0)(0);			\
-  Chimu_01=ref(F)(0)(1);			\
-  Chimu_02=ref(F)(0)(2);			\
-  Chimu_10=ref(F)(1)(0);			\
-  Chimu_11=ref(F)(1)(1);			\
-  Chimu_12=ref(F)(1)(2);			\
-  Chimu_20=ref(F)(2)(0);			\
-  Chimu_21=ref(F)(2)(1);			\
-  Chimu_22=ref(F)(2)(2);			\
-  Chimu_30=ref(F)(3)(0);			\
-  Chimu_31=ref(F)(3)(1);			\
-  Chimu_32=ref(F)(3)(2)
+// Loads the twelve components of the SiteSpinor at in._odata[offset] into
+// the variables Chimu_<s><c> (first index 0..3, second index 0..2). The
+// names `in`, `offset` and Chimu_* must be visible where this is expanded.
+#define LOAD_CHIMU \
+  {const SiteSpinor & ref (in._odata[offset]);	\
+    Chimu_00=ref()(0)(0);\
+    Chimu_01=ref()(0)(1);\
+    Chimu_02=ref()(0)(2);\
+    Chimu_10=ref()(1)(0);\
+    Chimu_11=ref()(1)(1);\
+    Chimu_12=ref()(1)(2);\
+    Chimu_20=ref()(2)(0);\
+    Chimu_21=ref()(2)(1);\
+    Chimu_22=ref()(2)(2);\
+    Chimu_30=ref()(3)(0);\
+    Chimu_31=ref()(3)(1);\
+    Chimu_32=ref()(3)(2);}

-#define LOAD_CHIMU(DIR,F,PERM)						\
-  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
-
-#define LOAD_CHI_BODY(F)				\
-    Chi_00 = ref(F)(0)(0);\
-    Chi_01 = ref(F)(0)(1);\
-    Chi_02 = ref(F)(0)(2);\
-    Chi_10 = ref(F)(1)(0);\
-    Chi_11 = ref(F)(1)(1);\
-    Chi_12 = ref(F)(1)(2)
-
-#define LOAD_CHI(DIR,F,PERM)					\
-  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
-
-
-//G-parity implementations using in-place intrinsic ops
-
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-//0h,1l -> 1l,0h
-//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
-//Pulled fermion through forwards face, GPBC on upper component
-//Need 0= 0l 1h   1= 1l 0h
-//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
-//Pulled fermion through backwards face, GPBC on lower component
-//Need 0= 1l 0h   1= 0l 1h
-
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(1)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-//0l 0h -> 0h 0l
-//1l 1h, 0h 0l -> 1l 0h, 1h 0l
-#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(0)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-
-
-
-#define LOAD_CHI_SETUP(DIR,F)						\
-  g = F;								\
-  direction = st._directions[DIR];				\
-  distance = st._distances[DIR];				\
-  sl = st._grid->_simd_layout[direction];			\
-  inplace_twist = 0;						\
-  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
-    if(sl == 1){							\
-      g = (F+1) % 2;							\
-    }else{								\
-      inplace_twist = 1;						\
-    }									\
-  }  
-
-#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
-  { const SiteSpinor &ref(in._odata[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHIMU_BODY(g);						\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      }else{								\
-	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      } \
-    } \
-  }
-
-
-#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
-  { const SiteHalfSpinor &ref(buf[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHI_BODY(g);							\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }else{								\
-	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }									\
-    }									\
-  }
-
-
-#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
-#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
+#define LOAD_CHI\
+  {const SiteHalfSpinor &ref(buf[offset]);	\
+    Chi_00 = ref()(0)(0);\
+    Chi_01 = ref()(0)(1);\
+    Chi_02 = ref()(0)(2);\
+    Chi_10 = ref()(1)(0);\
+    Chi_11 = ref()(1)(1);\
+    Chi_12 = ref()(1)(2);}

 // To splat or not to splat depends on the implementation
-#define MULT_2SPIN_BODY \
-  Impl::loadLinkElement(U_00,ref()(0,0));	\
-  Impl::loadLinkElement(U_10,ref()(1,0));	\
-  Impl::loadLinkElement(U_20,ref()(2,0));	\
-  Impl::loadLinkElement(U_01,ref()(0,1));	\
-  Impl::loadLinkElement(U_11,ref()(1,1));	\
-  Impl::loadLinkElement(U_21,ref()(2,1));	\
-  UChi_00 = U_00*Chi_00;			\
-  UChi_10 = U_00*Chi_10;			\
-  UChi_01 = U_10*Chi_00;			\
-  UChi_11 = U_10*Chi_10;			\
-  UChi_02 = U_20*Chi_00;			\
-  UChi_12 = U_20*Chi_10;			\
-  UChi_00+= U_01*Chi_01;			\
-  UChi_10+= U_01*Chi_11;			\
-  UChi_01+= U_11*Chi_01;			\
-  UChi_11+= U_11*Chi_11;			\
-  UChi_02+= U_21*Chi_01;			\
-  UChi_12+= U_21*Chi_11;			\
-  Impl::loadLinkElement(U_00,ref()(0,2));	\
-  Impl::loadLinkElement(U_10,ref()(1,2));	\
-  Impl::loadLinkElement(U_20,ref()(2,2));	\
-  UChi_00+= U_00*Chi_02;			\
-  UChi_10+= U_00*Chi_12;			\
-  UChi_01+= U_10*Chi_02;			\
-  UChi_11+= U_10*Chi_12;			\
-  UChi_02+= U_20*Chi_02;			\
-  UChi_12+= U_20*Chi_12
-
-
-#define MULT_2SPIN(A,F)					\
-  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
-
-#define MULT_2SPIN_GPARITY(A,F)				\
-  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
+#define MULT_2SPIN(A)\
+  {auto & ref(U._odata[sU](A));			\
+   Impl::loadLinkElement(U_00,ref()(0,0));	\
+   Impl::loadLinkElement(U_10,ref()(1,0));	\
+   Impl::loadLinkElement(U_20,ref()(2,0));	\
+   Impl::loadLinkElement(U_01,ref()(0,1));	\
+   Impl::loadLinkElement(U_11,ref()(1,1));	\
+   Impl::loadLinkElement(U_21,ref()(2,1));	\
+    UChi_00 = U_00*Chi_00;\
+    UChi_10 = U_00*Chi_10;\
+    UChi_01 = U_10*Chi_00;\
+    UChi_11 = U_10*Chi_10;\
+    UChi_02 = U_20*Chi_00;\
+    UChi_12 = U_20*Chi_10;\
+    UChi_00+= U_01*Chi_01;\
+    UChi_10+= U_01*Chi_11;\
+    UChi_01+= U_11*Chi_01;\
+    UChi_11+= U_11*Chi_11;\
+    UChi_02+= U_21*Chi_01;\
+    UChi_12+= U_21*Chi_11;\
+    Impl::loadLinkElement(U_00,ref()(0,2));	\
+    Impl::loadLinkElement(U_10,ref()(1,2));	\
+    Impl::loadLinkElement(U_20,ref()(2,2));	\
+    UChi_00+= U_00*Chi_02;\
+    UChi_10+= U_00*Chi_12;\
+    UChi_01+= U_10*Chi_02;\
+    UChi_11+= U_10*Chi_12;\
+    UChi_02+= U_20*Chi_02;\
+    UChi_12+= U_20*Chi_12;}


 #define PERMUTE_DIR(dir)			\
@@ -428,87 +307,84 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_31-= UChi_11;	\
  result_32-= UChi_12;

-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
+    LOAD_CHIMU;					\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
    }						\
  } else {					\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
+    LOAD_CHI;					\
  }						\
-  MULT_2SPIN_IMPL(DIR,F);			\
+  MULT_2SPIN(DIR);				\
  RECON;					

-
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
+#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
+    LOAD_CHIMU;					\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
    }						\
  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
+    LOAD_CHI;					\
  }						\
  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN_IMPL(DIR,F);			\
+    MULT_2SPIN(DIR);				\
    RECON;					\
  }

-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
+#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-    MULT_2SPIN_IMPL(DIR,F);			\
+    LOAD_CHI;					\
+    MULT_2SPIN(DIR);				\
    RECON;					\
    nmu++;					\
  }

-#define HAND_RESULT(ss,F)			\
+#define HAND_RESULT(ss)				\
  {						\
    SiteSpinor & ref (out._odata[ss]);		\
-    vstream(ref(F)(0)(0),result_00);		\
-    vstream(ref(F)(0)(1),result_01);		\
-    vstream(ref(F)(0)(2),result_02);		\
-    vstream(ref(F)(1)(0),result_10);		\
-    vstream(ref(F)(1)(1),result_11);		\
-    vstream(ref(F)(1)(2),result_12);		\
-    vstream(ref(F)(2)(0),result_20);		\
-    vstream(ref(F)(2)(1),result_21);		\
-    vstream(ref(F)(2)(2),result_22);		\
-    vstream(ref(F)(3)(0),result_30);		\
-    vstream(ref(F)(3)(1),result_31);		\
-    vstream(ref(F)(3)(2),result_32);		\
+    vstream(ref()(0)(0),result_00);		\
+    vstream(ref()(0)(1),result_01);		\
+    vstream(ref()(0)(2),result_02);		\
+    vstream(ref()(1)(0),result_10);		\
+    vstream(ref()(1)(1),result_11);		\
+    vstream(ref()(1)(2),result_12);		\
+    vstream(ref()(2)(0),result_20);		\
+    vstream(ref()(2)(1),result_21);		\
+    vstream(ref()(2)(2),result_22);		\
+    vstream(ref()(3)(0),result_30);		\
+    vstream(ref()(3)(1),result_31);		\
+    vstream(ref()(3)(2),result_32);		\
  }

-#define HAND_RESULT_EXT(ss,F)			\
+#define HAND_RESULT_EXT(ss)			\
  if (nmu){					\
    SiteSpinor & ref (out._odata[ss]);		\
-    ref(F)(0)(0)+=result_00;		\
-    ref(F)(0)(1)+=result_01;		\
-    ref(F)(0)(2)+=result_02;		\
-    ref(F)(1)(0)+=result_10;		\
-    ref(F)(1)(1)+=result_11;		\
-    ref(F)(1)(2)+=result_12;		\
-    ref(F)(2)(0)+=result_20;		\
-    ref(F)(2)(1)+=result_21;		\
-    ref(F)(2)(2)+=result_22;		\
-    ref(F)(3)(0)+=result_30;		\
-    ref(F)(3)(1)+=result_31;		\
-    ref(F)(3)(2)+=result_32;		\
+    ref()(0)(0)+=result_00;		\
+    ref()(0)(1)+=result_01;		\
+    ref()(0)(2)+=result_02;		\
+    ref()(1)(0)+=result_10;		\
+    ref()(1)(1)+=result_11;		\
+    ref()(1)(2)+=result_12;		\
+    ref()(2)(0)+=result_20;		\
+    ref()(2)(1)+=result_21;		\
+    ref()(2)(2)+=result_22;		\
+    ref()(3)(0)+=result_30;		\
+    ref()(3)(1)+=result_31;		\
+    ref()(3)(2)+=result_32;		\
  }


@@ -587,18 +463,15 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
  int offset,local,perm, ptype;
  StencilEntry *SE;

-#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss);
 }

 template<class Impl>
@@ -612,19 +485,16 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub

  StencilEntry *SE;
  int offset,local,perm, ptype;
-
-#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+  
+  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
+  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss);
 }

 template<class Impl> void 
@@ -639,20 +509,16 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa

  int offset,local,perm, ptype;
  StencilEntry *SE;
-
-#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss);
 }

 template<class Impl>
@@ -666,20 +532,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D

  StencilEntry *SE;
  int offset,local,perm, ptype;
-
-#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
-  ZERO_RESULT;							\
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_RESULT(ss,F)
-  
-  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss);
 }

 template<class Impl> void 
@@ -695,20 +557,16 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
  int offset,local,perm, ptype;
  StencilEntry *SE;
  int nmu=0;
-
-#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-
-  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT_EXT(ss);
 }

 template<class Impl>
@@ -723,193 +581,18 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
  StencilEntry *SE;
  int offset,local,perm, ptype;
  int nmu=0;
-
-#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-
-  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT_EXT(ss);
 }

-  ////////////////////////////////////////////////
-  // Specialise Gparity to simple implementation
-  ////////////////////////////////////////////////
-#define HAND_SPECIALISE_EMPTY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,			\
-				    LebesgueOrder &lo,			\
-				    DoubledGaugeField &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionField &in,		\
-				    FermionField &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,			\
-				    LebesgueOrder &lo,			\
-				    DoubledGaugeField &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionField &in,		\
-				    FermionField &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,			\
-				    LebesgueOrder &lo,			\
-				    DoubledGaugeField &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionField &in,		\
-				    FermionField &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,			\
-				    LebesgueOrder &lo,			\
-				    DoubledGaugeField &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionField &in,		\
-				    FermionField &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,	       	\
-				    LebesgueOrder &lo,			\
-				    DoubledGaugeField &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionField &in,		\
-				    FermionField &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,	       	\
-				    LebesgueOrder &lo,			\
-				    DoubledGaugeField &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionField &in,		\
-				    FermionField &out){ assert(0); }	\
-
-
-
-#define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-				    int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-					    int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    StencilEntry *SE;							\
-    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    int nmu=0;								\
-    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    int nmu=0;								\
-    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }
-
-
-HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
-
-
-
-
-
-
-
-
-
-
-  
 ////////////// Wilson ; uses this implementation /////////////////////

 #define INSTANTIATE_THEM(A) \
@@ -930,8 +613,6 @@ INSTANTIATE_THEM(WilsonImplF);
 INSTANTIATE_THEM(WilsonImplD);
 INSTANTIATE_THEM(ZWilsonImplF);
 INSTANTIATE_THEM(ZWilsonImplD);
-INSTANTIATE_THEM(GparityWilsonImplF);
-INSTANTIATE_THEM(GparityWilsonImplD);
 INSTANTIATE_THEM(DomainWallVec5dImplF);
 INSTANTIATE_THEM(DomainWallVec5dImplD);
 INSTANTIATE_THEM(ZDomainWallVec5dImplF);
@@ -940,11 +621,11 @@ INSTANTIATE_THEM(WilsonImplFH);
 INSTANTIATE_THEM(WilsonImplDF);
 INSTANTIATE_THEM(ZWilsonImplFH);
 INSTANTIATE_THEM(ZWilsonImplDF);
-INSTANTIATE_THEM(GparityWilsonImplFH);
-INSTANTIATE_THEM(GparityWilsonImplDF);
 INSTANTIATE_THEM(DomainWallVec5dImplFH);
 INSTANTIATE_THEM(DomainWallVec5dImplDF);
 INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
 INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);

 }}
@@ -0,0 +1,878 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+#define REGISTER
+// LOAD_CHIMU_BODY(F): copy the 4 spin x 3 colour components of ref(F) into Chimu_<spin><colour>; needs `ref` in scope; no trailing ';'.
+#define LOAD_CHIMU_BODY(F)			\
+  Chimu_00=ref(F)(0)(0);			\
+  Chimu_01=ref(F)(0)(1);			\
+  Chimu_02=ref(F)(0)(2);			\
+  Chimu_10=ref(F)(1)(0);			\
+  Chimu_11=ref(F)(1)(1);			\
+  Chimu_12=ref(F)(1)(2);			\
+  Chimu_20=ref(F)(2)(0);			\
+  Chimu_21=ref(F)(2)(1);			\
+  Chimu_22=ref(F)(2)(2);			\
+  Chimu_30=ref(F)(3)(0);			\
+  Chimu_31=ref(F)(3)(1);			\
+  Chimu_32=ref(F)(3)(2)
+// LOAD_CHIMU: load the full spinor at in._odata[offset] into Chimu_*. DIR and PERM are unused; the signature matches LOAD_CHIMU_GPARITY.
+#define LOAD_CHIMU(DIR,F,PERM)						\
+  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
+// LOAD_CHI_BODY(F): copy the 2 spin x 3 colour components of ref(F) into Chi_<spin><colour>; needs `ref` in scope; no trailing ';'.
+#define LOAD_CHI_BODY(F)				\
+    Chi_00 = ref(F)(0)(0);\
+    Chi_01 = ref(F)(0)(1);\
+    Chi_02 = ref(F)(0)(2);\
+    Chi_10 = ref(F)(1)(0);\
+    Chi_11 = ref(F)(1)(1);\
+    Chi_12 = ref(F)(1)(2)
+// LOAD_CHI: load the half spinor at buf[offset] into Chi_*. DIR and PERM are unused; the signature matches LOAD_CHI_GPARITY.
+#define LOAD_CHI(DIR,F,PERM)					\
+  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
+
+
+//G-parity implementations using in-place intrinsic ops
+
+//1l 1h -> 1h 1l
+//0l 0h , 1h 1l -> 0l 1h 0h,1l
+//0h,1l -> 1l,0h
+//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
+//Pulled fermion through forwards face, GPBC on upper component
+//Need 0= 0l 1h   1= 1l 0h
+//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
+//Pulled fermion through backwards face, GPBC on lower component
+//Need 0= 1l 0h   1= 0l 1h
+// DO_TWIST_0L_1H: permute flavour 1 of ref into tmp1, exchange it with flavour 0 into tmp2/tmp3, keep tmp2 in INTO. F is unused; tmp1-3 are clobbered.
+//1l 1h -> 1h 1l
+//0l 0h , 1h 1l -> 0l 1h 0h,1l
+#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
+  permute##PERM(tmp1, ref(1)(S)(C));				\
+  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
+  INTO = tmp2;
+// DO_TWIST_1L_0H: permute flavour 0 of ref into tmp1, exchange it with flavour 1 into tmp2/tmp3, keep tmp2 in INTO. F is unused; tmp1-3 are clobbered.
+//0l 0h -> 0h 0l
+//1l 1h, 0h 0l -> 1l 0h, 1h 0l
+#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
+  permute##PERM(tmp1, ref(0)(S)(C));				\
+  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
+  INTO = tmp2;
+
+
+// LOAD_CHI_SETUP(DIR,F): choose how flavour F is fetched for leg DIR. Default is g = F with inplace_twist = 0.
+// If SE->_around_the_world and Params.twists[DIR % 4] are both set: simd layout 1 flips g to the other flavour, otherwise inplace_twist = 1.
+#define LOAD_CHI_SETUP(DIR,F)						\
+  g = F;								\
+  direction = st._directions[DIR];				\
+  distance = st._distances[DIR];				\
+  sl = st._grid->_simd_layout[direction];			\
+  inplace_twist = 0;						\
+  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
+    if(sl == 1){							\
+      g = (F+1) % 2;							\
+    }else{								\
+      inplace_twist = 1;						\
+    }									\
+  }  
+// G-parity full-spinor load: plain load of flavour g unless inplace_twist, else each Chimu_* is built from both flavours via DO_TWIST_*. Only U_* are scratch here: Chimu_2x/3x are #defined to UChi_*.
+#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
+  { const SiteSpinor &ref(in._odata[offset]);				\
+    LOAD_CHI_SETUP(DIR,F);						\
+    if(!inplace_twist){							\
+      LOAD_CHIMU_BODY(g);						\
+    }else{								\
+      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
+	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
+	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
+      }else{								\
+	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
+      } \
+    } \
+  }
+// G-parity half-spinor load from buf[offset]: plain load of flavour g unless inplace_twist, else each Chi_* is built from both flavours via DO_TWIST_*.
+// Scratch registers are U_* and UChi_*; both are overwritten afterwards by the link multiply.
+#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
+  { const SiteHalfSpinor &ref(buf[offset]);				\
+    LOAD_CHI_SETUP(DIR,F);						\
+    if(!inplace_twist){							\
+      LOAD_CHI_BODY(g);							\
+    }else{								\
+      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
+	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
+	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
+	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
+      }else{								\
+	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
+	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
+      }									\
+    }									\
+  }
+
+// Loader names passed by HAND_SPECIALISE_GPARITY; both simply forward to the in-place-twist loaders.
+#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
+#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
+
+// MULT_2SPIN_BODY: UChi_sc = sum over c' of U(c,c')*Chi_sc' for s=0,1; the third colour column reuses U_00/U_10/U_20. To splat or not to splat depends on the implementation (Impl::loadLinkElement).
+#define MULT_2SPIN_BODY \
+  Impl::loadLinkElement(U_00,ref()(0,0));	\
+  Impl::loadLinkElement(U_10,ref()(1,0));	\
+  Impl::loadLinkElement(U_20,ref()(2,0));	\
+  Impl::loadLinkElement(U_01,ref()(0,1));	\
+  Impl::loadLinkElement(U_11,ref()(1,1));	\
+  Impl::loadLinkElement(U_21,ref()(2,1));	\
+  UChi_00 = U_00*Chi_00;			\
+  UChi_10 = U_00*Chi_10;			\
+  UChi_01 = U_10*Chi_00;			\
+  UChi_11 = U_10*Chi_10;			\
+  UChi_02 = U_20*Chi_00;			\
+  UChi_12 = U_20*Chi_10;			\
+  UChi_00+= U_01*Chi_01;			\
+  UChi_10+= U_01*Chi_11;			\
+  UChi_01+= U_11*Chi_01;			\
+  UChi_11+= U_11*Chi_11;			\
+  UChi_02+= U_21*Chi_01;			\
+  UChi_12+= U_21*Chi_11;			\
+  Impl::loadLinkElement(U_00,ref()(0,2));	\
+  Impl::loadLinkElement(U_10,ref()(1,2));	\
+  Impl::loadLinkElement(U_20,ref()(2,2));	\
+  UChi_00+= U_00*Chi_02;			\
+  UChi_10+= U_00*Chi_12;			\
+  UChi_01+= U_10*Chi_02;			\
+  UChi_11+= U_10*Chi_12;			\
+  UChi_02+= U_20*Chi_02;			\
+  UChi_12+= U_20*Chi_12
+
+// MULT_2SPIN(A,F): multiply Chi_* by the link U._odata[sU](A) into UChi_*; F is unused in this non-G-parity form.
+#define MULT_2SPIN(A,F)					\
+  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
+// MULT_2SPIN_GPARITY(A,F): as MULT_2SPIN but the link is indexed by flavour first, U._odata[sU](F)(A).
+#define MULT_2SPIN_GPARITY(A,F)				\
+  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
+
+// PERMUTE_DIR(dir): apply permute<dir> in place to all six Chi_* registers (dir is token-pasted onto `permute`).
+#define PERMUTE_DIR(dir)			\
+      permute##dir(Chi_00,Chi_00);\
+      permute##dir(Chi_01,Chi_01);\
+      permute##dir(Chi_02,Chi_02);\
+      permute##dir(Chi_10,Chi_10);\
+      permute##dir(Chi_11,Chi_11);\
+      permute##dir(Chi_12,Chi_12);
+
+//      hspin(0)=fspin(0)+timesI(fspin(3));
+//      hspin(1)=fspin(1)+timesI(fspin(2));
+#define XP_PROJ \
+    Chi_00 = Chimu_00+timesI(Chimu_30);\
+    Chi_01 = Chimu_01+timesI(Chimu_31);\
+    Chi_02 = Chimu_02+timesI(Chimu_32);\
+    Chi_10 = Chimu_10+timesI(Chimu_20);\
+    Chi_11 = Chimu_11+timesI(Chimu_21);\
+    Chi_12 = Chimu_12+timesI(Chimu_22);
+
+#define YP_PROJ \
+    Chi_00 = Chimu_00-Chimu_30;\
+    Chi_01 = Chimu_01-Chimu_31;\
+    Chi_02 = Chimu_02-Chimu_32;\
+    Chi_10 = Chimu_10+Chimu_20;\
+    Chi_11 = Chimu_11+Chimu_21;\
+    Chi_12 = Chimu_12+Chimu_22;
+
+#define ZP_PROJ \
+  Chi_00 = Chimu_00+timesI(Chimu_20);		\
+  Chi_01 = Chimu_01+timesI(Chimu_21);		\
+  Chi_02 = Chimu_02+timesI(Chimu_22);		\
+  Chi_10 = Chimu_10-timesI(Chimu_30);		\
+  Chi_11 = Chimu_11-timesI(Chimu_31);		\
+  Chi_12 = Chimu_12-timesI(Chimu_32);
+
+#define TP_PROJ \
+  Chi_00 = Chimu_00+Chimu_20;		\
+  Chi_01 = Chimu_01+Chimu_21;		\
+  Chi_02 = Chimu_02+Chimu_22;		\
+  Chi_10 = Chimu_10+Chimu_30;		\
+  Chi_11 = Chimu_11+Chimu_31;		\
+  Chi_12 = Chimu_12+Chimu_32;
+
+
+//      hspin(0)=fspin(0)-timesI(fspin(3));
+//      hspin(1)=fspin(1)-timesI(fspin(2));
+#define XM_PROJ \
+    Chi_00 = Chimu_00-timesI(Chimu_30);\
+    Chi_01 = Chimu_01-timesI(Chimu_31);\
+    Chi_02 = Chimu_02-timesI(Chimu_32);\
+    Chi_10 = Chimu_10-timesI(Chimu_20);\
+    Chi_11 = Chimu_11-timesI(Chimu_21);\
+    Chi_12 = Chimu_12-timesI(Chimu_22);
+
+#define YM_PROJ \
+    Chi_00 = Chimu_00+Chimu_30;\
+    Chi_01 = Chimu_01+Chimu_31;\
+    Chi_02 = Chimu_02+Chimu_32;\
+    Chi_10 = Chimu_10-Chimu_20;\
+    Chi_11 = Chimu_11-Chimu_21;\
+    Chi_12 = Chimu_12-Chimu_22;
+
+#define ZM_PROJ \
+  Chi_00 = Chimu_00-timesI(Chimu_20);		\
+  Chi_01 = Chimu_01-timesI(Chimu_21);		\
+  Chi_02 = Chimu_02-timesI(Chimu_22);		\
+  Chi_10 = Chimu_10+timesI(Chimu_30);		\
+  Chi_11 = Chimu_11+timesI(Chimu_31);		\
+  Chi_12 = Chimu_12+timesI(Chimu_32);
+
+#define TM_PROJ \
+  Chi_00 = Chimu_00-Chimu_20;		\
+  Chi_01 = Chimu_01-Chimu_21;		\
+  Chi_02 = Chimu_02-Chimu_22;		\
+  Chi_10 = Chimu_10-Chimu_30;		\
+  Chi_11 = Chimu_11-Chimu_31;		\
+  Chi_12 = Chimu_12-Chimu_32;
+
+//      fspin(0)=hspin(0);
+//      fspin(1)=hspin(1);
+//      fspin(2)=timesMinusI(hspin(1));
+//      fspin(3)=timesMinusI(hspin(0));
+#define XP_RECON\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesMinusI(UChi_10);\
+  result_21 = timesMinusI(UChi_11);\
+  result_22 = timesMinusI(UChi_12);\
+  result_30 = timesMinusI(UChi_00);\
+  result_31 = timesMinusI(UChi_01);\
+  result_32 = timesMinusI(UChi_02);
+
+#define XP_RECON_ACCUM\
+  result_00+=UChi_00;\
+  result_01+=UChi_01;\
+  result_02+=UChi_02;\
+  result_10+=UChi_10;\
+  result_11+=UChi_11;\
+  result_12+=UChi_12;\
+  result_20-=timesI(UChi_10);\
+  result_21-=timesI(UChi_11);\
+  result_22-=timesI(UChi_12);\
+  result_30-=timesI(UChi_00);\
+  result_31-=timesI(UChi_01);\
+  result_32-=timesI(UChi_02);
+
+#define XM_RECON\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesI(UChi_10);\
+  result_21 = timesI(UChi_11);\
+  result_22 = timesI(UChi_12);\
+  result_30 = timesI(UChi_00);\
+  result_31 = timesI(UChi_01);\
+  result_32 = timesI(UChi_02);
+
+#define XM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_10);\
+  result_21+= timesI(UChi_11);\
+  result_22+= timesI(UChi_12);\
+  result_30+= timesI(UChi_00);\
+  result_31+= timesI(UChi_01);\
+  result_32+= timesI(UChi_02);
+
+#define YP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_10;\
+  result_21+= UChi_11;\
+  result_22+= UChi_12;\
+  result_30-= UChi_00;\
+  result_31-= UChi_01;\
+  result_32-= UChi_02;
+
+#define YM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_10;\
+  result_21-= UChi_11;\
+  result_22-= UChi_12;\
+  result_30+= UChi_00;\
+  result_31+= UChi_01;\
+  result_32+= UChi_02;
+
+#define ZP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= timesI(UChi_00);			\
+  result_21-= timesI(UChi_01);			\
+  result_22-= timesI(UChi_02);			\
+  result_30+= timesI(UChi_10);			\
+  result_31+= timesI(UChi_11);			\
+  result_32+= timesI(UChi_12);
+
+#define ZM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_00);			\
+  result_21+= timesI(UChi_01);			\
+  result_22+= timesI(UChi_02);			\
+  result_30-= timesI(UChi_10);			\
+  result_31-= timesI(UChi_11);			\
+  result_32-= timesI(UChi_12);
+
+#define TP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_00;			\
+  result_21+= UChi_01;			\
+  result_22+= UChi_02;			\
+  result_30+= UChi_10;			\
+  result_31+= UChi_11;			\
+  result_32+= UChi_12;
+
+#define TM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_00;	\
+  result_21-= UChi_01;	\
+  result_22-= UChi_02;	\
+  result_30-= UChi_10;	\
+  result_31-= UChi_11;	\
+  result_32-= UChi_12;
+// HAND_STENCIL_LEG: one stencil leg. Local entry: load full spinor, PROJ, permute if SE->_permute. Otherwise: load half spinor from buf. Then link multiply and RECON.
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				\
+    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else {					\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
+  }						\
+  MULT_2SPIN_IMPL(DIR,F);			\
+  RECON;					
+
+// HAND_STENCIL_LEG_INT: interior variant of HAND_STENCIL_LEG; a leg that is neither local nor st.same_node[DIR] is skipped entirely (no load, multiply or RECON).
+#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				\
+    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else if ( st.same_node[DIR] ) {		\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
+  }						\
+  if (local || st.same_node[DIR] ) {		\
+    MULT_2SPIN_IMPL(DIR,F);			\
+    RECON;					\
+  }
+// HAND_STENCIL_LEG_EXT: exterior variant; only legs that are neither local nor same-node are processed, and each one increments nmu. PROJ and LOAD_CHIMU_IMPL are unused.
+#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
+    MULT_2SPIN_IMPL(DIR,F);			\
+    RECON;					\
+    nmu++;					\
+  }
+// HAND_RESULT(ss,F): write the twelve result_* registers to flavour F of out._odata[ss] with vstream (overwrites the site).
+#define HAND_RESULT(ss,F)			\
+  {						\
+    SiteSpinor & ref (out._odata[ss]);		\
+    vstream(ref(F)(0)(0),result_00);		\
+    vstream(ref(F)(0)(1),result_01);		\
+    vstream(ref(F)(0)(2),result_02);		\
+    vstream(ref(F)(1)(0),result_10);		\
+    vstream(ref(F)(1)(1),result_11);		\
+    vstream(ref(F)(1)(2),result_12);		\
+    vstream(ref(F)(2)(0),result_20);		\
+    vstream(ref(F)(2)(1),result_21);		\
+    vstream(ref(F)(2)(2),result_22);		\
+    vstream(ref(F)(3)(0),result_30);		\
+    vstream(ref(F)(3)(1),result_31);		\
+    vstream(ref(F)(3)(2),result_32);		\
+  }
+// HAND_RESULT_EXT(ss,F): if at least one exterior leg ran (nmu != 0), add result_* onto flavour F of out._odata[ss]; otherwise leave the site untouched.
+#define HAND_RESULT_EXT(ss,F)			\
+  if (nmu){					\
+    SiteSpinor & ref (out._odata[ss]);		\
+    ref(F)(0)(0)+=result_00;		\
+    ref(F)(0)(1)+=result_01;		\
+    ref(F)(0)(2)+=result_02;		\
+    ref(F)(1)(0)+=result_10;		\
+    ref(F)(1)(1)+=result_11;		\
+    ref(F)(1)(2)+=result_12;		\
+    ref(F)(2)(0)+=result_20;		\
+    ref(F)(2)(1)+=result_21;		\
+    ref(F)(2)(2)+=result_22;		\
+    ref(F)(3)(0)+=result_30;		\
+    ref(F)(3)(1)+=result_31;		\
+    ref(F)(3)(2)+=result_32;		\
+  }
+
+// HAND_DECLARATIONS(a): declare the Simd temporaries used by the kernels: 12 result_*, 6 Chi_*, 6 UChi_*, 6 U_*. The argument is ignored.
+#define HAND_DECLARATIONS(a)			\
+  Simd result_00;				\
+  Simd result_01;				\
+  Simd result_02;				\
+  Simd result_10;				\
+  Simd result_11;				\
+  Simd result_12;				\
+  Simd result_20;				\
+  Simd result_21;				\
+  Simd result_22;				\
+  Simd result_30;				\
+  Simd result_31;				\
+  Simd result_32;				\
+  Simd Chi_00;					\
+  Simd Chi_01;					\
+  Simd Chi_02;					\
+  Simd Chi_10;					\
+  Simd Chi_11;					\
+  Simd Chi_12;					\
+  Simd UChi_00;					\
+  Simd UChi_01;					\
+  Simd UChi_02;					\
+  Simd UChi_10;					\
+  Simd UChi_11;					\
+  Simd UChi_12;					\
+  Simd U_00;					\
+  Simd U_10;					\
+  Simd U_20;					\
+  Simd U_01;					\
+  Simd U_11;					\
+  Simd U_21;
+// ZERO_RESULT: set all twelve result_* registers to `zero`; used by the Int/Ext kernels, whose legs all accumulate.
+#define ZERO_RESULT				\
+  result_00=zero;				\
+  result_01=zero;				\
+  result_02=zero;				\
+  result_10=zero;				\
+  result_11=zero;				\
+  result_12=zero;				\
+  result_20=zero;				\
+  result_21=zero;				\
+  result_22=zero;				\
+  result_30=zero;				\
+  result_31=zero;				\
+  result_32=zero;			
+// Chimu_* are not separate variables: spins 0,1 alias Chi_* and spins 2,3 alias UChi_*, so a *_PROJ macro overwrites Chi_* in place from the loaded full spinor.
+#define Chimu_00 Chi_00
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 UChi_00
+#define Chimu_21 UChi_01
+#define Chimu_22 UChi_02
+#define Chimu_30 UChi_10
+#define Chimu_31 UChi_11
+#define Chimu_32 UChi_12
+
+namespace Grid {
+namespace QCD {
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// Hand-unrolled hopping term for site ss: eight stencil legs are combined in result_* and then streamed to out._odata[ss].
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Permute index per direction: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype; // scratch written by every HAND_STENCIL_LEG expansion
+  StencilEntry *SE;
+  // The first leg uses XM_RECON (plain assignment), so result_* needs no zeroing; the other seven legs accumulate.
+#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+  // Generic (single-flavour) case: the flavour argument is empty, so accesses expand to ref()(spin)(colour).
+  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Daggered kernel for site ss: same leg structure as HandDhopSite, with the P and M projector/reconstruct macros swapped.
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype; // scratch written by every HAND_STENCIL_LEG expansion
+  // The first leg uses XP_RECON (plain assignment), so result_* needs no zeroing; the other seven legs accumulate.
+#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+  // Generic (single-flavour) case: empty flavour argument.
+  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// Interior kernel for site ss: only legs that are local or st.same_node[DIR] contribute; the result overwrites out._odata[ss].
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Permute index per direction: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype; // scratch written by every HAND_STENCIL_LEG_INT expansion
+  StencilEntry *SE;
+  // Any leg may be skipped, so result_* is zeroed first and every leg uses an *_ACCUM reconstruct.
+#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+  // Generic (single-flavour) case: empty flavour argument.
+  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Daggered interior kernel: as HandDhopSiteInt with the P and M projector/reconstruct macros swapped.
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype; // scratch written by every HAND_STENCIL_LEG_INT expansion
+  // Any leg may be skipped, so result_* is zeroed first and every leg uses an *_ACCUM reconstruct.
+#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
+  ZERO_RESULT;							\
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_RESULT(ss,F)
+  // Generic (single-flavour) case: empty flavour argument.
+  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// Exterior kernel for site ss: only legs that are neither local nor same-node contribute, and the sum is added onto out._odata[ss].
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Permute index per direction: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype; // scratch written by every HAND_STENCIL_LEG_EXT expansion
+  StencilEntry *SE;
+  int nmu=0; // number of exterior legs processed; HAND_RESULT_EXT does nothing when it stays 0
+  // result_* is zeroed first; every processed leg accumulates into it and bumps nmu.
+#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT_EXT(ss,F)
+  // Generic (single-flavour) case: empty flavour argument.
+  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Daggered exterior kernel: as HandDhopSiteExt with the P and M projector/reconstruct macros swapped.
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype; // scratch written by every HAND_STENCIL_LEG_EXT expansion
+  int nmu=0; // number of exterior legs processed; HAND_RESULT_EXT does nothing when it stays 0
+  // result_* is zeroed first; every processed leg accumulates into it and bumps nmu.
+#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT_EXT(ss,F)
+  // Generic (single-flavour) case: empty flavour argument.
+  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
+}
+// HAND_SPECIALISE_GPARITY(IMPL): explicit specialisations of the six hand kernels for IMPL; each runs its HAND_DOP_SITE* macro once per flavour (F=0, then F=1) with the G-parity loaders. Ext variants reset nmu between flavours.
+#define HAND_SPECIALISE_GPARITY(IMPL)					\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+				    int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    StencilEntry *SE;							\
+    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+					    int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
+    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+						     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
+    StencilEntry *SE;							\
+    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+							     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+						     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    StencilEntry *SE;							\
+    int nmu=0;								\
+    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    nmu = 0;								\
+    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+							     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    int nmu=0;								\
+    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    nmu = 0;								\
+    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }
+
+
+HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
+
+
+
+
+
+
+
+
+
+
+  
+////////////// Explicit instantiation of the six hand kernels; in this file only the G-parity Wilson impls use it /////////////////////
+// INSTANTIATE_THEM(A): emits `template void WilsonKernels<A>::...` for HandDhopSite, Dag, Int, DagInt, Ext and DagExt.
+#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+					     int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						int ss,int sU,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionField &in, FermionField &out); 
+
+INSTANTIATE_THEM(GparityWilsonImplF);
+INSTANTIATE_THEM(GparityWilsonImplD);
+INSTANTIATE_THEM(GparityWilsonImplFH);
+INSTANTIATE_THEM(GparityWilsonImplDF);
+}}
@@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {

    RealD factor = 0.5 * beta / RealD(Nc);

-    //GaugeLinkField Umu(U._grid);
+    GaugeLinkField Umu(U._grid);
    GaugeLinkField dSdU_mu(U._grid);
    for (int mu = 0; mu < Nd; mu++) {
-      //Umu = PeekIndex<LorentzIndex>(U, mu);
+      Umu = PeekIndex<LorentzIndex>(U, mu);

      // Staple in direction mu
-      //WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
-      //dSdU_mu = Ta(Umu * dSdU_mu) * factor;
-
-  
-      WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
-      dSdU_mu = Ta(dSdU_mu) * factor;
+      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+      dSdU_mu = Ta(Umu * dSdU_mu) * factor;

      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
@@ -16,12 +16,12 @@ class ScalarImplTypes {
    typedef iImplField<Simd> SiteField;
    typedef SiteField        SitePropagator;
    typedef SiteField        SiteComplex;
-    
+
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
    typedef Field              FermionField;
    typedef Field              PropagatorField;
-    
+
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
      gaussian(pRNG, P);
    }
@@ -47,54 +47,60 @@ class ScalarImplTypes {
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
      U = 1.0;
    }
-    
+
    static void MomentumSpacePropagator(Field &out, RealD m)
    {
      GridBase           *grid = out._grid;
      Field              kmu(grid), one(grid);
      const unsigned int nd    = grid->_ndimension;
      std::vector<int>   &l    = grid->_fdimensions;
-      
+      // result: out = 1 / (m^2 + sum_mu khat_mu^2), with khat_mu = 2 sin(pi n_mu / L_mu)
      one = Complex(1.0,0.0);
      out = m*m;
      for(int mu = 0; mu < nd; mu++)
      {
        Real twoPiL = M_PI*2./l[mu];
-        
+        // kmu first receives the lattice coordinate along mu, then is overwritten by khat_mu
        LatticeCoordinate(kmu,mu);
        kmu = 2.*sin(.5*twoPiL*kmu);
        out = out + kmu*kmu;
      }
      out = one/out;
    }
-    
+
    static void FreePropagator(const Field &in, Field &out,
                               const Field &momKernel)
    {
      FFT   fft((GridCartesian *)in._grid);
      Field inFT(in._grid);
-      
+      // forward FFT of in over all dimensions, multiply by momKernel, backward FFT into out
      fft.FFT_all_dim(inFT, in, FFT::forward);
      inFT = inFT*momKernel;
      fft.FFT_all_dim(out, inFT, FFT::backward);
    }
-    
+
    static void FreePropagator(const Field &in, Field &out, RealD m)
    {
      Field momKernel(in._grid);
-      
+      // build the momentum-space kernel for mass m, then apply it with the three-Field overload
      MomentumSpacePropagator(momKernel, m);
      FreePropagator(in, out, momKernel);
    }
-    
+
  };

+  #ifdef  USE_FFT_ACCELERATION
+  #ifndef FFT_MASS
+  #error  "USE_FFT_ACCELERATION is defined but not FFT_MASS"
+  #endif
+  #endif
+  
  template <class S, unsigned int N>
  class ScalarAdjMatrixImplTypes {
  public:
    typedef S Simd;
    typedef QCD::SU<N> Group;
-    
+
    template <typename vtype>
    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
    template <typename vtype>
@@ -103,24 +109,119 @@ class ScalarImplTypes {
    typedef iImplField<Simd>   SiteField;
    typedef SiteField          SitePropagator;
    typedef iImplComplex<Simd> SiteComplex;
-    
+
    typedef Lattice<SiteField>   Field;
    typedef Lattice<SiteComplex> ComplexField;
    typedef Field                FermionField;
    typedef Field                PropagatorField;

-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+    // Adds sum_mu khat_mu^2, with khat_mu = 2 sin(pi n_mu / L_mu), onto `out`; `out` is NOT reset here.
+    static void MomentaSquare(ComplexField &out)
+    {
+      GridBase *lattice = out._grid;
+      ComplexField khat(lattice);
+      const std::vector<int> &extent = lattice->FullDimensions();
+      for (int dir = 0; dir < lattice->Nd(); ++dir)
+      {
+        const Real twoPiOverL = M_PI * 2.0 / extent[dir];
+        LatticeCoordinate(khat, dir); // khat <- lattice coordinate along dir
+        khat = 2.0 * sin(0.5 * twoPiOverL * khat);
+        out += khat * khat;
+      }
+    }
+
+    // Fills `out` with 1 / (m^2 + khat^2), khat^2 being the sum accumulated by MomentaSquare.
+    static void MomentumSpacePropagator(ComplexField &out, RealD m)
+    {
+      out = m * m;        // start from the mass term ...
+      MomentaSquare(out); // ... and add the lattice momentum squared on top
+      ComplexField unity(out._grid);
+      unity = Complex(1.0, 0.0);
+      out = unity / out;
+    }
+
+    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
+    { // fills P with Gaussian Lie-algebra noise; with USE_FFT_ACCELERATION the noise is reweighted in momentum space
+#ifndef USE_FFT_ACCELERATION
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
+#else
+      // Fourier-accelerated momenta: P = FFT^-1[ sqrt(khat^2 + M^2) * FFT[Gaussian noise] ], M = FFT_MASS
+      Field Pgaussian(P._grid), Pp(P._grid);
+      ComplexField p2(P._grid); p2 = zero;
+      RealD M = FFT_MASS;
+      // plain Gaussian noise in position space
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
+      // forward FFT, multiply by sqrt(khat^2 + M^2), backward FFT into P
+      FFT theFFT((GridCartesian*)P._grid);
+      theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
+      MomentaSquare(p2);
+      p2 += M * M;
+      p2 = sqrt(p2);
+      Pp *= p2;
+      theFFT.FFT_all_dim(P, Pp, FFT::backward);
+
+#endif //USE_FFT_ACCELERATION
    }

    static inline Field projectForce(Field& P) {return P;}

-    static inline void update_field(Field& P, Field& U, double ep) {
-      U += P*ep;
+    static inline void update_field(Field &P, Field &U, double ep)
+    { // advances U by ep times the momentum P; with USE_FFT_ACCELERATION P is first filtered in momentum space
+#ifndef USE_FFT_ACCELERATION
+      double t0=usecond(); 
+      U += P * ep;
+      double t1=usecond();
+      double total_time = (t1-t0)/1e6;
+      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << total_time << std::endl; 
+#else
+      // Fourier-accelerated update:
+      //   P(x) -> P(p) by a forward FFT; P'(p) = P(p)/(M^2+p^2), with M taken from the FFT_MASS macro;
+      //   P'(p) -> P'(x) by a backward FFT; finally U += P'(x)*ep.
+      // NOTE(review): p2 and first_call below are function-local statics, so the kernel is built once,
+      // on the grid of the first call, and reused by every later call -- confirm a single grid is intended.
+
+      Field Pp(U._grid), P_FFT(U._grid);     
+      static ComplexField p2(U._grid);
+      RealD M = FFT_MASS;
+      
+      FFT theFFT((GridCartesian*)U._grid);
+      theFFT.FFT_all_dim(Pp, P, FFT::forward);
+
+      static bool first_call = true;
+      if (first_call)
+      {
+        // avoid recomputing
+        MomentumSpacePropagator(p2, M);
+        first_call = false;
+      }
+      Pp *= p2;
+      theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
+      U += P_FFT * ep;
+
+#endif //USE_FFT_ACCELERATION
    }

-    static inline RealD FieldSquareNorm(Field& U) {
-      return (TensorRemove(sum(trace(U*U))).real());
+    static inline RealD FieldSquareNorm(Field &U)
+    { // real part of sum_x tr(U*U); with USE_FFT_ACCELERATION the sum is taken in momentum space with the 1/(M^2+p^2) weight
+#ifndef USE_FFT_ACCELERATION
+      return (TensorRemove(sum(trace(U * U))).real());
+#else
+      // In case of Fourier acceleration we have to:
+      // compute adj(U(p))*U(p)/(M^2+p^2)   (Parseval theorem)
+      // 1 FFT needed U(x) -> U(p)
+      // M is taken from the FFT_MASS macro
+      // NOTE: the kernel p2 is rebuilt on every call here
+      FFT theFFT((GridCartesian*)U._grid);
+      Field Up(U._grid);
+
+      theFFT.FFT_all_dim(Up, U, FFT::forward);
+      RealD M = FFT_MASS;
+      ComplexField p2(U._grid);
+      MomentumSpacePropagator(p2, M);
+      Field Up2 = Up * p2;
+      // from the definition of the DFT we need to divide by the volume
+      return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
+#endif //USE_FFT_ACCELERATION
    }

    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -146,7 +247,7 @@ class ScalarImplTypes {
  typedef ScalarImplTypes<vComplex> ScalarImplCR;
  typedef ScalarImplTypes<vComplexF> ScalarImplCF;
  typedef ScalarImplTypes<vComplexD> ScalarImplCD;
-    
+
  // Hardcoding here the size of the matrices
  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
@@ -155,7 +256,7 @@ class ScalarImplTypes {
  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
-  
+
  //}
 }

@@ -30,119 +30,179 @@ directory
 #ifndef SCALAR_INT_ACTION_H
 #define SCALAR_INT_ACTION_H

-
 // Note: this action can completely absorb the ScalarAction for real float fields
 // use the scalarObjs to generalise the structure

-namespace Grid {
-  // FIXME drop the QCD namespace everywhere here
+namespace Grid
+{
+// FIXME drop the QCD namespace everywhere here

-  template <class Impl, int Ndim >
-  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
-  public:
-    INHERIT_FIELD_TYPES(Impl);
-  private:
-    RealD mass_square;
-    RealD lambda;
+template <class Impl, int Ndim>
+class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
+{
+public:
+  INHERIT_FIELD_TYPES(Impl);

+private:
+  RealD mass_square;
+  RealD lambda;
+  RealD g;
+  const unsigned int N = Impl::Group::Dimension;

-    typedef typename Field::vector_object vobj;
-    typedef CartesianStencil<vobj,vobj> Stencil;
+  typedef typename Field::vector_object vobj;
+  typedef CartesianStencil<vobj, vobj> Stencil;

-    SimpleCompressor<vobj> compressor;
-    int npoint = 2*Ndim;
-    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
-    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
+  SimpleCompressor<vobj> compressor;
+  int npoint = 2 * Ndim;
+  std::vector<int> directions;    // direction of each of the 2*Ndim stencil points (filled in the constructor)
+  std::vector<int> displacements; // shift of each stencil point: +1 for the first Ndim points, -1 for the rest

-
-  public:
-
-    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
-      for (int mu = 0 ; mu < Ndim; mu++){
-		directions[mu]         = mu; directions[mu+Ndim]    = mu;
-		displacements[mu]      =  1; displacements[mu+Ndim] = -1;
-      }
+public:
+  // ms: mass squared, l: quartic coupling lambda, gval: coupling g (S and deriv are scaled by N/g). Initialisers follow declaration order.
+  ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), directions(2 * Ndim, 0), displacements(2 * Ndim, 0)
+  {
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      directions[mu] = directions[mu + Ndim] = mu; // stencil points mu and mu+Ndim both act along mu
+      displacements[mu] = 1;                       // forward neighbour
+      displacements[mu + Ndim] = -1;               // backward neighbour
+    }
+  }

-    virtual std::string LogParameters() {
-      std::stringstream sstream;
-      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
-      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
-      return sstream.str();
-    }
+  virtual std::string LogParameters() // one GridLogMessage-prefixed line per parameter: lambda, mass_square, g
+  {
+    std::stringstream report;
+    report << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl
+           << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl
+           << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
+    return report.str();
+  }

-    virtual std::string action_name() {return "ScalarAction";}
+  virtual std::string action_name() { return "ScalarAction"; } // fixed identifier string of this action

-    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // no-op: this action does nothing on refresh

-    virtual RealD S(const Field &p) {
-      assert(p._grid->Nd() == Ndim);
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
-      phisquared = p*p;
-      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
-      for (int mu = 0; mu < Ndim; mu++) {
-	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
-	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-	  int permute_type;
-	  StencilEntry *SE;
-	  vobj temp2;
-	  const vobj *temp, *t_p;
-	    
-	  SE = phiStencil.GetEntry(permute_type, mu, i);
-	  t_p  = &p._odata[i];
-	  if ( SE->_is_local ) {
-	    temp = &p._odata[SE->_offset];
-	    if ( SE->_permute ) {
-	      permute(temp2, *temp, permute_type);
-	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
-	    } else {
-	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
-	    }
-	  } else {
-	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
-	  }
-	}
-	//  action -= pshift*p + p*pshift;
-      }
-      // NB the trace in the algebra is normalised to 1/2
-      // minus sign coming from the antihermitian fields
-      return -(TensorRemove(sum(trace(action)))).real();
-    };
-
-    virtual void deriv(const Field &p, Field &force) {
-      assert(p._grid->Nd() == Ndim);
-      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
-      // move this outside
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-      
-      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      for (int point = 0; point < npoint; point++) {
-	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-	  const vobj *temp;
-	  vobj temp2;
-	  int permute_type;
-	  StencilEntry *SE;
-	  SE = phiStencil.GetEntry(permute_type, point, i);
-	  
-	  if ( SE->_is_local ) {
-	    temp = &p._odata[SE->_offset];
-	    if ( SE->_permute ) {
-	      permute(temp2, *temp, permute_type);
-	      force._odata[i] -= temp2;
-	    } else {
-	      force._odata[i] -= *temp;
-	    }
-	  } else {
-	    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
-	  }
-	}
+  // Action of the field p: -(N/g) * Re sum_x tr[(2*Ndim + mass_square) p^2 - lambda p^4 - sum_mu (p(x+mu) p(x) + p(x) p(x+mu))].
+  // The forward-neighbour products come from a stencil instead of Cshift; only stencil points mu in [0,Ndim) are visited.
+  // NOTE(review): phiStencil is a function-local static, so it is built from the grid of the first call and reused.
+  virtual RealD S(const Field &p)
+  {
+    assert(p._grid->Nd() == Ndim);
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    phiStencil.HaloExchange(p, compressor);
+    Field action(p._grid), phisquared(p._grid); // the never-used pshift lattice temporary is no longer allocated
+    phisquared = p * p;
+    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      // stencil equivalent of: pshift = Cshift(p, mu, +1); action -= pshift*p + p*pshift;
+      parallel_for(int i = 0; i < p._grid->oSites(); i++)
+      {
+        int permute_type;
+        StencilEntry *SE = phiStencil.GetEntry(permute_type, mu, i);
+        const vobj *t_p = &p._odata[i];
+        if (SE->_is_local)
+        {
+          const vobj *temp = &p._odata[SE->_offset];
+          if (SE->_permute)
+          {
+            vobj temp2; // receives the permuted neighbour
+            permute(temp2, *temp, permute_type);
+            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
+          }
+          else
+          {
+            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
+          }
+        }
+        else
+        {
+          // neighbour is not local: read it from the stencil's communication buffer
+          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
+        }
+      }
+    }
+    // NB the trace in the algebra is normalised to 1/2
+    // minus sign coming from the antihermitian fields
+    return -(TensorRemove(sum(trace(action)))).real() * N / g;
+  }
-  
-}  // namespace Grid

-#endif  // SCALAR_INT_ACTION_H
+  // Force for the field p: force = (N/g) * [(2*Ndim + mass_square) p - 2 lambda p^3 - sum of the 2*Ndim stencil neighbours of p].
+  // Also logs, through GridLogIntegrator, the time spent in each phase and the corresponding flop rates.
+  // NOTE(review): phiStencil is a function-local static, so it is built from the grid of the first call and reused.
+  virtual void deriv(const Field &p, Field &force)
+  {
+    double t0 = usecond();
+    assert(p._grid->Nd() == Ndim);
+    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
+    double interm_t = usecond();
+    // move this outside
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    phiStencil.HaloExchange(p, compressor);
+    double halo_t = usecond();
+    int chunk = 128;
+    //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+    // inverting the order of the loops slows down the code(! g++ 7)
+    // cannot try to reduce the number of  force writes by factor npoint...
+    // use cache blocking
+    for (int point = 0; point < npoint; point++)
+    {
+#pragma omp parallel 
+{
+        int permute_type; // declared inside the parallel region, hence private to each thread
+        StencilEntry *SE;
+        const vobj *temp;
+#pragma omp for schedule(static, chunk)
+      for (int i = 0; i < p._grid->oSites(); i++)
+      {
+        SE = phiStencil.GetEntry(permute_type, point, i);
+        // prefetch next p?
+
+        if (SE->_is_local)
+        {
+          temp = &p._odata[SE->_offset];
+
+          if (SE->_permute)
+          {
+            vobj temp2;
+            permute(temp2, *temp, permute_type);
+            force._odata[i] -= temp2;
+          }
+          else
+          {
+            force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
+          }
+        }
+        else
+        {
+          force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+        }
+      }
+
+    }
+  }
+  force *= N / g;
+
+  double t1 = usecond();
+  double total_time = (t1 - t0) / 1e6;
+  double interm_time = (interm_t - t0) / 1e6;
+  double halo_time = (halo_t - interm_t) / 1e6;
+  double stencil_time = (t1 - halo_t) / 1e6;
+  std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
+  std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
+  std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
+  std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
+  // Promote to double BEFORE multiplying: sites * (14*N^3 + ...) done in 32-bit integer arithmetic wraps on large lattices.
+  const double sites = p._grid->gSites();
+  double flops = sites * (14.0 * N * N * N + 18.0 * N * N + 2.0);
+  double flops_no_stencil = sites * (14.0 * N * N * N + 6.0 * N * N + 2.0);
+  double Gflops = flops / (total_time * 1e9);
+  double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
+  std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
+  std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
+}
+};
+
+} // namespace Grid
+
+#endif // SCALAR_INT_ACTION_H
@@ -211,7 +211,7 @@ typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
    ScalarAdjGenericHMCRunner;

 template <int Colours> 
-using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;

 }  // namespace QCD
 }  // namespace Grid
@@ -48,6 +48,22 @@ with this program; if not, write to the Free Software Foundation, Inc.,
    }                                                                    \
  }

+// Defines Load<NAME>Checkpointer(Params_, M_): installs a NAME##CPModule built from the parameters and metadata; exits with status 1 if a checkpointer is already loaded.
+#define RegisterLoadCheckPointerMetadataFunction(NAME)                   \
+  template < class Metadata >                                            \
+  void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \
+    if (have_CheckPointer) {                                             \
+      std::cout << GridLogError << "Checkpointer already loaded "        \
+                << std::endl;                                            \
+      exit(1);                                                           \
+    }                                                                    \
+    std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME      \
+              << std::endl;                                              \
+    CP = std::unique_ptr<CheckpointerBaseModule>(                        \
+      new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_)); \
+    have_CheckPointer = true;                                            \
+  }
+
 namespace Grid {
 namespace QCD {

@@ -77,7 +93,7 @@ class HMCResourceManager {
  bool have_CheckPointer;

  // NOTE: operator << is not overloaded for std::vector<string> 
-  // so thsi function is necessary
+  // so this function is necessary
  void output_vector_string(const std::vector<std::string> &vs){
    for (auto &i: vs)
      std::cout << i << " ";
@@ -254,6 +270,7 @@ class HMCResourceManager {
  RegisterLoadCheckPointerFunction(Nersc);
  #ifdef HAVE_LIME
  RegisterLoadCheckPointerFunction(ILDG);
+  RegisterLoadCheckPointerMetadataFunction(Scidac);
  #endif

  ////////////////////////////////////////////////////////
@@ -76,6 +76,14 @@ class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
    }
 	} 

+  // Logs through GridLogError and aborts the run when `filename` cannot be opened for reading.
+  void check_filename(const std::string &filename){
+    std::ifstream probe(filename.c_str());
+    if(probe.good()) return;
+    std::cout << GridLogError << "Filename " << filename << " not found. Aborting. " << std::endl;
+    abort();
+  }
+
  virtual void initialize(const CheckpointerParameters &Params) = 0;

  virtual void CheckpointRestore(int traj, typename Impl::Field &U,
@@ -93,6 +93,9 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
    std::string config, rng;
    this->build_filenames(traj, Params, config, rng);
+    this->check_filename(rng);
+    this->check_filename(config);
+

    BinarySimpleMunger<sobj_double, sobj> munge;

@@ -136,6 +136,20 @@ class ILDGCPModule: public CheckPointerModule< ImplementationPolicy> {

 };

+// Checkpointer module for the Scidac format; M is the metadata handed to the ScidacHmcCheckpointer it creates.
+template<class ImplementationPolicy, class Metadata>
+class ScidacCPModule: public CheckPointerModule< ImplementationPolicy> {
+  typedef CheckPointerModule< ImplementationPolicy> CPBase;
+  Metadata M;
+  // acquire resource: build the checkpointer from the stored parameters and the metadata
+  virtual void initialize(){
+     this->CheckPointPtr.reset(new ScidacHmcCheckpointer<ImplementationPolicy, Metadata>(this->Par_, M));
+  }
+public:
+  ScidacCPModule(typename CPBase::APar Par, Metadata M_): CPBase(Par), M(M_) {} // base first, then M: the order they are really initialised in
+  // NOTE(review): this constructor names Parametrized<...> directly instead of CPBase -- confirm it is a direct base of this class
+  template <class ReaderClass> ScidacCPModule(Reader<ReaderClass>& Reader) : Parametrized<typename CPBase::APar>(Reader), M(Reader){}
+};
 #endif


@@ -34,6 +34,7 @@ directory
 #include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h>
 #include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h>
+#include <Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h>
 //#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>


@@ -74,10 +74,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    if ((traj % Params.saveInterval) == 0) {
      std::string config, rng;
      this->build_filenames(traj, Params, config, rng);
-      
+      GridBase *grid = U._grid;
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      IldgWriter _IldgWriter;
+      IldgWriter _IldgWriter(grid->IsBoss());
      _IldgWriter.open(config);
      _IldgWriter.writeConfiguration(U, traj, config, config);
      _IldgWriter.close();
@@ -95,6 +95,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
                         GridParallelRNG &pRNG) {
    std::string config, rng;
    this->build_filenames(traj, Params, config, rng);
+    this->check_filename(rng);
+    this->check_filename(config);
+
+    

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
@@ -69,6 +69,9 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> {
                         GridParallelRNG &pRNG) {
    std::string config, rng;
    this->build_filenames(traj, Params, config, rng);
+    this->check_filename(rng);
+    this->check_filename(config);
+

    FieldMetaData header;
    NerscIO::readRNGState(sRNG, pRNG, header, rng);
@@ -0,0 +1,125 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/hmc/checkpointers/ScidacCheckpointer.h
+
+Copyright (C) 2018
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef SCIDAC_CHECKPOINTER
+#define SCIDAC_CHECKPOINTER
+
+#ifdef HAVE_LIME
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+namespace Grid {
+namespace QCD {
+
+// For generic fields: saves/restores the configuration as a Scidac field record (with a Metadata
+// record attached) and the RNG state through BinaryIO.
+template <class Implementation, class Metadata>
+class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
+ private:
+  typedef typename Implementation::Field Field;
+  CheckpointerParameters Params;
+  Metadata MData;  // written alongside every configuration
+
+ public:
+  //INHERIT_GIMPL_TYPES(Implementation);
+
+  // Without a metadata argument MData is default-constructed.
+  ScidacHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
+  ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata& M_):MData(M_) { initialize(Params_); }
+
+  // Stores the parameters and exits with status 1 when Params.format is not one of the four IEEE names.
+  void initialize(const CheckpointerParameters &Params_) {
+    Params = Params_;
+
+    const bool single_prec = (Params.format == std::string("IEEE32BIG")) ||
+                             (Params.format == std::string("IEEE32"));
+    const bool double_prec = (Params.format == std::string("IEEE64BIG")) ||
+                             (Params.format == std::string("IEEE64"));
+    if (single_prec || double_prec) return;  // format is valid
+
+    std::cout << GridLogError << "Unrecognized file format " << Params.format
+              << std::endl;
+    std::cout << GridLogError
+              << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
+              << std::endl;
+    exit(1);
+  }
+
+  // Every Params.saveInterval trajectories: write the RNG state, then the configuration with MData.
+  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+    if ((traj % Params.saveInterval) != 0) return;
+
+    std::string config, rng;
+    this->build_filenames(traj, Params, config, rng);
+
+    // NOTE(review): these checksums are only passed to the RNG write, so they presumably describe the RNG file
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
+
+    ScidacWriter writer(U._grid->IsBoss());
+    writer.open(config);
+    writer.writeScidacFieldRecord(U, MData);
+    writer.close();
+
+    std::cout << GridLogMessage << "Written Scidac Configuration on " << config
+              << " checksum " << std::hex << nersc_csum<<"/"
+              << scidac_csuma<<"/" << scidac_csumb
+              << std::dec << std::endl;
+  }
+
+  // Restores the RNG state and the configuration of trajectory traj; check_filename is called on both files first.
+  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
+                         GridParallelRNG &pRNG) {
+    std::string config, rng;
+    this->build_filenames(traj, Params, config, rng);
+    this->check_filename(rng);
+    this->check_filename(config);
+
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
+
+    Metadata stored_metadata;  // read back from the file; not compared against MData
+    ScidacReader reader;
+    reader.open(config);
+    reader.readScidacFieldRecord(U,stored_metadata);  // format from the header
+    reader.close();
+
+    std::cout << GridLogMessage << "Read Scidac Configuration from " << config
+              << " checksum " << std::hex
+              << nersc_csum<<"/" << scidac_csuma<<"/" << scidac_csumb
+              << std::dec << std::endl;
+  }
+};
+}
+}
+
+#endif  // HAVE_LIME
+#endif  // SCIDAC_CHECKPOINTER
@@ -114,18 +114,26 @@ class Integrator {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing

-   for (int a = 0; a < as[level].actions.size(); ++a) {
+    for (int a = 0; a < as[level].actions.size(); ++a) {
+      double start_full = usecond();
      Field force(U._grid);
      conformable(U._grid, Mom._grid);
+
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
+      double start_force = usecond();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta

      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
+      double end_force = usecond();
      Real force_abs = std::sqrt(norm2(force)/U._grid->gSites());
-      std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
      Mom -= force * ep; 
+      double end_full = usecond();
+      double time_full  = (end_full - start_full) / 1e3;
+      double time_force = (end_force - start_force) / 1e3;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
    }

    // Force from the other representations
@@ -92,6 +92,19 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  PlaquetteMod(): ObsBase(NoParameters()){}
 };

+// HMC observable module wrapping PolyakovLogger<Impl>; it takes no parameters.
+template < class Impl >
+class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
+  using ObsBase = ObservableModule<PolyakovLogger<Impl>, NoParameters>;
+  using ObsBase::ObsBase; // inherit the base-class constructors
+  // acquire resource: (re)create the logger held through ObservablePtr
+  virtual void initialize(){
+    this->ObservablePtr.reset(new PolyakovLogger<Impl>());
+  }
+  public:
+  PolyakovMod(): ObsBase(NoParameters()){}
+};
+

 template < class Impl >
 class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
@@ -45,5 +45,7 @@ class HmcObservable {

 #include "plaquette.h"
 #include "topological_charge.h"
+#include "polyakov_loop.h"
+

 #endif  //  HMC_OBSERVABLE_H
@@ -0,0 +1,68 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/modules/polyakov_line.h
+
+Copyright (C) 2017
+
+Author: David Preti <david.preti@csic.es>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef HMC_POLYAKOV_H
+#define HMC_POLYAKOV_H
+
+namespace Grid {
+namespace QCD {
+
+// Observable that logs the average Polyakov loop each time TrajectoryComplete is called.
+// This is only defined for a gauge theory.
+template <class Impl>
+class PolyakovLogger : public HmcObservable<typename Impl::Field> {
+ public:
+  // Forces Impl to be a gauge-field implementation:
+  // if it is not, the compiler will complain here.
+  INHERIT_GIMPL_TYPES(Impl);
+
+  // necessary for HmcObservable compatibility
+  typedef typename Impl::Field Field;
+
+  // Prints "Polyakov Loop: [ traj ] value" using digits10+1 digits of Real, then restores
+  // the precision std::cout had before the print. sRNG and pRNG are not used.
+  void TrajectoryComplete(int traj,
+                          Field &U,
+                          GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+    const ComplexD loop_value = WilsonLoops<Impl>::avgPolyakovLoop(U);
+    const int saved_precision = std::cout.precision();
+    const int print_precision = std::numeric_limits<Real>::digits10 + 1;
+
+    std::cout << GridLogMessage << std::setprecision(print_precision)
+              << "Polyakov Loop: [ " << traj << " ] "<< loop_value << std::endl;
+
+    std::cout.precision(saved_precision);
+  }
+};
+
+}  // namespace QCD
+}  // namespace Grid
+
+#endif  // HMC_POLYAKOV_H
@@ -23,6 +23,7 @@ class AdjointRep {
  typedef typename SU_Adjoint<ncolour>::LatticeAdjMatrix LatticeMatrix;
  typedef typename SU_Adjoint<ncolour>::LatticeAdjField LatticeField;
  static const int Dimension = ncolour * ncolour - 1;
+  static const bool isFundamental = false;

  LatticeField U;

@@ -19,6 +19,7 @@ template <int ncolour>
 class FundamentalRep {
 public:
  static const int Dimension = ncolour;
+  static const bool isFundamental = true;

  // typdef to be used by the Representations class in HMC to get the
  // types for the higher representation fields
@@ -29,6 +29,7 @@ class TwoIndexRep {
  typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
  typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
  static const int Dimension = ncolour * (ncolour + S) / 2;
+  static const bool isFundamental = false;

  LatticeField U;

@@ -6,30 +6,33 @@
 #ifndef GAUGE_CONFIG_
 #define GAUGE_CONFIG_

-namespace Grid {
+namespace Grid
+{

-namespace QCD {
+namespace QCD
+{

-  //trivial class for no smearing
-  template< class Impl >
-class NoSmearing {
+// Trivial smearing policy: nothing is smeared, every accessor returns the thin field.
+template <class Impl>
+class NoSmearing
+{
 public:
  INHERIT_FIELD_TYPES(Impl);

-  Field* ThinField;
+  Field *ThinField; // address of the field passed to set_Field(); NULL until then

-  NoSmearing(): ThinField(NULL) {}
+  NoSmearing() : ThinField(NULL) {}

-  void set_Field(Field& U) { ThinField = &U; }
+  void set_Field(Field &U) { ThinField = &U; } // stores the address of U, no copy is made

-  void smeared_force(Field&) const {}
+  void smeared_force(Field &) const {} // no-op: the argument is left untouched

-  Field& get_SmearedU() { return *ThinField; }
+  Field &get_SmearedU() { return *ThinField; } // dereferences ThinField; call set_Field() first

-  Field& get_U(bool smeared = false) {
+  Field &get_U(bool smeared = false) // 'smeared' is ignored: the thin field is always returned
+  {
    return *ThinField;
  }
-
 };

 /*!
@@ -44,32 +47,36 @@ public:
  It stores a list of smeared configurations.
 */
 template <class Gimpl>
-class SmearedConfiguration {
- public:
+class SmearedConfiguration
+{
+public:
  INHERIT_GIMPL_TYPES(Gimpl);

- private:
+private:
  const unsigned int smearingLevels;
  Smear_Stout<Gimpl> StoutSmearing;
  std::vector<GaugeField> SmearedSet;

  // Member functions
  //====================================================================
-  void fill_smearedSet(GaugeField& U) {
-    ThinLinks = &U;  // attach the smearing routine to the field U
+  void fill_smearedSet(GaugeField &U)
+  {
+    ThinLinks = &U; // attach the smearing routine to the field U

    // check the pointer is not null
    if (ThinLinks == NULL)
      std::cout << GridLogError
                << "[SmearedConfiguration] Error in ThinLinks pointer\n";

-    if (smearingLevels > 0) {
+    if (smearingLevels > 0)
+    {
      std::cout << GridLogDebug
                << "[SmearedConfiguration] Filling SmearedSet\n";
      GaugeField previous_u(ThinLinks->_grid);

      previous_u = *ThinLinks;
-      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
+      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
+      {
        StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
        previous_u = SmearedSet[smearLvl];

@@ -81,9 +88,10 @@ class SmearedConfiguration {
    }
  }
  //====================================================================
-  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
-                                  const GaugeField& GaugeK) const {
-    GridBase* grid = GaugeK._grid;
+  GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime,
+                                  const GaugeField &GaugeK) const
+  {
+    GridBase *grid = GaugeK._grid;
    GaugeField C(grid), SigmaK(grid), iLambda(grid);
    GaugeLinkField iLambda_mu(grid);
    GaugeLinkField iQ(grid), e_iQ(grid);
@@ -94,7 +102,8 @@ class SmearedConfiguration {
    SigmaK = zero;
    iLambda = zero;

-    for (int mu = 0; mu < Nd; mu++) {
+    for (int mu = 0; mu < Nd; mu++)
+    {
      Cmu = peekLorentz(C, mu);
      GaugeKmu = peekLorentz(GaugeK, mu);
      SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
@@ -104,20 +113,22 @@ class SmearedConfiguration {
      pokeLorentz(iLambda, iLambda_mu, mu);
    }
    StoutSmearing.derivative(SigmaK, iLambda,
-                             GaugeK);  // derivative of SmearBase
+                             GaugeK); // derivative of SmearBase
    return SigmaK;
  }

  /*! @brief Returns smeared configuration at level 'Level' */
-  const GaugeField& get_smeared_conf(int Level) const {
+  const GaugeField &get_smeared_conf(int Level) const // Level indexes SmearedSet directly; no bounds check
+  {
    return SmearedSet[Level];
  }

  //====================================================================
-  void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
-                   const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
-                   const GaugeLinkField& GaugeK) const {
-    GridBase* grid = iQ._grid;
+  void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ,
+                   const GaugeLinkField &iQ, const GaugeLinkField &Sigmap,
+                   const GaugeLinkField &GaugeK) const
+  {
+    GridBase *grid = iQ._grid;
    GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
    GaugeLinkField unity(grid);
    unity = 1.0;
@@ -206,15 +217,15 @@ class SmearedConfiguration {
  }

  //====================================================================
- public:
-  GaugeField*
-      ThinLinks; /*!< @brief Pointer to the thin
-                                                         links configuration */
+public:
+  GaugeField *
+      ThinLinks; /* Pointer to the thin links configuration */

-  /*! @brief Standard constructor */
-  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
-                       Smear_Stout<Gimpl>& Stout)
-      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
+  /* Standard constructor: stores Nsmear and Stout, and fills SmearedSet with Nsmear fields built on UGrid */
+  SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear,
+                       Smear_Stout<Gimpl> &Stout)
+      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
+  {
    for (unsigned int i = 0; i < smearingLevels; ++i)
-      SmearedSet.push_back(*(new GaugeField(UGrid)));
+      SmearedSet.push_back(GaugeField(UGrid)); // temporary instead of a never-deleted 'new' (was leaking one field per level)
  }
@@ -223,21 +234,29 @@ class SmearedConfiguration {
  SmearedConfiguration()
      : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}

-
-  
  // attach the smeared routines to the thin links U and fill the smeared set
-  void set_Field(GaugeField& U) { fill_smearedSet(U); }
+  void set_Field(GaugeField &U)
+  {
+    // Attach U, refill the smeared set and log the elapsed time.
+    const double t_begin = usecond();
+    fill_smearedSet(U);
+    const double elapsed_ms = (usecond() - t_begin) / 1e3;
+    std::cout << GridLogMessage << "Smearing in " << elapsed_ms << " ms" << std::endl;
+  }

  //====================================================================
-  void smeared_force(GaugeField& SigmaTilde) const {
-    if (smearingLevels > 0) {
+  void smeared_force(GaugeField &SigmaTilde) const
+  {
+    if (smearingLevels > 0)
+    {
+      double start = usecond();
      GaugeField force = SigmaTilde; // actually = U*SigmaTilde
      GaugeLinkField tmp_mu(SigmaTilde._grid);

-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
        // to get just SigmaTilde
-        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
-                 peekLorentz(force, mu);
+        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu);
        pokeLorentz(force, tmp_mu, mu);
      }

@@ -246,33 +265,43 @@ class SmearedConfiguration {

      force = AnalyticSmearedForce(force, *ThinLinks);

-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
        tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
        pokeLorentz(SigmaTilde, tmp_mu, mu);
      }
-    }  // if smearingLevels = 0 do nothing
+      double end = usecond();
+      double time = (end - start)/ 1e3;
+      std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;  
+    } // if smearingLevels = 0 do nothing
  }
  //====================================================================

-  GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
+  GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; } // last level; smearingLevels is unsigned, so this index wraps when it is 0

-  GaugeField& get_U(bool smeared = false) {
+  GaugeField &get_U(bool smeared = false)
+  {
    // get the config, thin links by default
-    if (smeared) {
-      if (smearingLevels) {
+    if (smeared)
+    {
+      if (smearingLevels)
+      {
        RealD impl_plaq =
            WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
        std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
                  << std::endl;
        return get_SmearedU();
-
-      } else {
+      }
+      else
+      {
        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
        std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                  << std::endl;
        return *ThinLinks;
      }
-    } else {
+    }
+    else
+    {
      RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
      std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                << std::endl;
@@ -173,8 +173,8 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
        std::cout << "Time to evolve " << diff.count() << " s\n";
        #endif
        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-            << step << "  "
-            << energyDensityPlaquette(step,out) << std::endl;
+		  << step << "  " << tau(step) << "  " 
+		  << energyDensityPlaquette(step,out) << std::endl;
         if( step % measure_interval == 0){
         std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
            << step << "  " 
@@ -193,8 +193,8 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
        //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
        evolve_step_adaptive(out, maxTau);
        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-            << step << "  "
-            << energyDensityPlaquette(out) << std::endl;
+		  << step << "  " << taus << "  "
+		  << energyDensityPlaquette(out) << std::endl;
         if( step % measure_interval == 0){
         std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
            << step << "  " 
@@ -746,7 +746,7 @@ template<typename GaugeField,typename GaugeMat>
    }
  }
  template<typename GaugeField>
-  static void ColdConfiguration(GridParallelRNG &pRNG,GaugeField &out){
+  static void ColdConfiguration(GaugeField &out){
    typedef typename GaugeField::vector_type vector_type;
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;
@@ -757,6 +757,10 @@ template<typename GaugeField,typename GaugeMat>
      PokeIndex<LorentzIndex>(out,Umu,mu);
    }
  }
+  template<typename GaugeField> // overload that still accepts an RNG; pRNG is not used
+  static void ColdConfiguration(GridParallelRNG &pRNG,GaugeField &out){
+    ColdConfiguration<GaugeField>(out); // forward to the one-argument overload
+  }

  template<typename LatticeMatrixType>
  static void taProj( const LatticeMatrixType &in,  LatticeMatrixType &out){
@@ -123,6 +123,28 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }

+
+  //////////////////////////////////////////////////
+  // Average Polyakov loop: sum over the lattice of trace(P), normalised by
+  // Nc*X*Y*Z*T, where P is built from the temporal links by T-1 covariant
+  // forward shifts. Assumes Nd=4 with the temporal direction at index 3.
+  //////////////////////////////////////////////////
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {
+    GaugeMat Ut(Umu._grid), P(Umu._grid);
+    ComplexD out;
+    const auto &dims = Umu._grid->GlobalDimensions(); // read as X,Y,Z,T at 0..3
+    const int T = dims[3];
+    Ut = peekLorentz(Umu,3); //Select temporal direction
+    P = Ut;
+    for (int t=1;t<T;t++){
+      P = Gimpl::CovShiftForward(Ut,3,P);
+    }
+    // Form the volume factor in floating point: the int product Nc*X*Y*Z*T can overflow.
+    RealD norm = 1.0/(RealD(Nc)*dims[0]*dims[1]*dims[2]*dims[3]);
+    out = sum(trace(P))*norm;
+    return out;
+  }
+
  //////////////////////////////////////////////////
  // average over traced single links
  //////////////////////////////////////////////////
@@ -190,6 +212,7 @@ public:


 // For the force term
+/*
 static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    GridBase *grid = Umu._grid;
    std::vector<GaugeMat> U(Nd, grid);
@@ -203,7 +226,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
-        // this is ~10% faster than the Staple
+        // this is ~10% faster than the Staple  -- PAB: so what it gives the WRONG answers for other BC's!
        tmp1 = Cshift(U[nu], mu, 1);
        tmp2 = Cshift(U[mu], nu, 1);
        staple += tmp1* adj(U[nu]*tmp2);
@@ -213,7 +236,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    }
    staple = U[mu]*staple;
 }
-
+*/
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
@@ -291,9 +314,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    }
  }

-  //////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, lower part
-  //////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////
  static void StapleLower(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                          int nu) {
    if (nu != mu) {
@@ -315,7 +338,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      //
      staple = Gimpl::ShiftStaple(
          Gimpl::CovShiftBackward(U[nu], nu,
-                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+          mu);
+
    }
  }

@@ -325,7 +350,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){
      // Fmn +--<--+  Ut +--<--+
      //     |     |     |     |
-      //  (x)+-->--+     +-->--+(x)
+      //  (x)+-->--+     +-->--+(x)  - h.c.
      //     |     |     |     |
      //     +--<--+     +--<--+

@@ -335,7 +360,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      GaugeMat v = Vup - Vdn;
      GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
      GaugeMat vu = v*u;
-      FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
+      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
+      FS = (u*v + Cshift(vu, mu, -1));
+      FS = 0.125*(FS - adj(FS));
  }

  static Real TopologicalCharge(GaugeLorentz &U){
@@ -360,6 +387,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    return TensorRemove(Tq).real();
  }

+
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////