Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion

Peter's GPU branch changes merged with A2A CI code
2025-10-29 19:14:33 +00:00 · 2019-09-30 16:53:44 +01:00
parent 25150eb2e0 b473405652
commit d1daab601a
785 changed files with 41312 additions and 51680 deletions
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -27,114 +27,112 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_QCD_BASE_H
-#define GRID_QCD_BASE_H
-namespace Grid{
-namespace QCD {
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-    static const int Xdir = 0;
-    static const int Ydir = 1;
-    static const int Zdir = 2;
-    static const int Tdir = 3;
+NAMESPACE_BEGIN(Grid);

-  
-    static const int Xp = 0;
-    static const int Yp = 1;
-    static const int Zp = 2;
-    static const int Tp = 3;
-    static const int Xm = 4;
-    static const int Ym = 5;
-    static const int Zm = 6;
-    static const int Tm = 7;
+static constexpr int Xdir = 0;
+static constexpr int Ydir = 1;
+static constexpr int Zdir = 2;
+static constexpr int Tdir = 3;

-    static const int Nc=3;
-    static const int Ns=4;
-    static const int Nd=4;
-    static const int Nhs=2; // half spinor
-    static const int Nds=8; // double stored gauge field
-    static const int Ngp=2; // gparity index range
+static constexpr int Xp = 0;
+static constexpr int Yp = 1;
+static constexpr int Zp = 2;
+static constexpr int Tp = 3;
+static constexpr int Xm = 4;
+static constexpr int Ym = 5;
+static constexpr int Zm = 6;
+static constexpr int Tm = 7;

-    //////////////////////////////////////////////////////////////////////////////
-    // QCD iMatrix types
-    // Index conventions:                            Lorentz x Spin x Colour
-    // note: static const int or constexpr will work for type deductions
-    //       with the intel compiler (up to version 17)
-    //////////////////////////////////////////////////////////////////////////////
-    #define ColourIndex  2
-    #define SpinIndex    1
-    #define LorentzIndex 0
+static constexpr int Nc=3;
+static constexpr int Ns=4;
+static constexpr int Nd=4;
+static constexpr int Nhs=2; // half spinor
+static constexpr int Nds=8; // double stored gauge field
+static constexpr int Ngp=2; // gparity index range

-    // Also should make these a named enum type
-    static const int DaggerNo=0;
-    static const int DaggerYes=1;
-    static const int InverseNo=0;
-    static const int InverseYes=1;
+//////////////////////////////////////////////////////////////////////////////
+// QCD iMatrix types
+// Index conventions:                            Lorentz x Spin x Colour
+// note: static const int or constexpr will work for type deductions
+//       with the intel compiler (up to version 17)
+//////////////////////////////////////////////////////////////////////////////
+#define ColourIndex  (2)
+#define SpinIndex    (1)
+#define LorentzIndex (0)

-    // Useful traits is this a spin index
-    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
+// Also should make these a named enum type
+static constexpr int DaggerNo=0;
+static constexpr int DaggerYes=1;
+static constexpr int InverseNo=0;
+static constexpr int InverseYes=1;

-    const int SpinorIndex = 2;
-    template<typename T> struct isSpinor {
-      static const bool value = (SpinorIndex==T::TensorLevel);
-    };
-    template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
-    template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
+// Useful trait: is this a spinor index?
+//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

-    // ChrisK very keen to add extra space for Gparity doubling.
-    //
-    // Also add domain wall index, in a way where Wilson operator 
-    // naturally distributes across the 5th dimensions.
-    //
-    // That probably makes for GridRedBlack4dCartesian grid.
+static constexpr int SpinorIndex = 2; // isSpinor<T> compares this against T::TensorLevel
+template<typename T> struct isSpinor {
+  static constexpr bool value = (SpinorIndex==T::TensorLevel);
+};
+template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
+template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;

-    // s,sp,c,spc,lc
+// ChrisK very keen to add extra space for Gparity doubling.
+//
+// Also add domain wall index, in a way where Wilson operator 
+// naturally distributes across the 5th dimensions.
+//
+// That probably makes for GridRedBlack4dCartesian grid.

-    template<typename vtype> using iSinglet                     = iScalar<iScalar<iScalar<vtype> > >;
-    template<typename vtype> using iSpinMatrix                  = iScalar<iMatrix<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourMatrix                = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
-    template<typename vtype> using iSpinColourMatrix            = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
-    template<typename vtype> using iLorentzColourMatrix         = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-    template<typename vtype> using iDoubleStoredColourMatrix    = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
-    template<typename vtype> using iSpinVector                  = iScalar<iVector<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourVector                = iScalar<iScalar<iVector<vtype, Nc> > >;
-    template<typename vtype> using iSpinColourVector            = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
-    template<typename vtype> using iHalfSpinVector              = iScalar<iVector<iScalar<vtype>, Nhs> >;
-    template<typename vtype> using iHalfSpinColourVector        = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+// s,sp,c,spc,lc
+
+template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
+template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
+template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
+template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
+template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
+template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
+template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
+template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
+template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;


-    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
-    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
+template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;

-    // Spin matrix
-    typedef iSpinMatrix<Complex  >          SpinMatrix;
-    typedef iSpinMatrix<ComplexF >          SpinMatrixF;
-    typedef iSpinMatrix<ComplexD >          SpinMatrixD;
+// Spin matrix
+typedef iSpinMatrix<Complex  >          SpinMatrix;
+typedef iSpinMatrix<ComplexF >          SpinMatrixF;
+typedef iSpinMatrix<ComplexD >          SpinMatrixD;

-    typedef iSpinMatrix<vComplex >          vSpinMatrix;
-    typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
-    typedef iSpinMatrix<vComplexD>          vSpinMatrixD;
+typedef iSpinMatrix<vComplex >          vSpinMatrix;
+typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
+typedef iSpinMatrix<vComplexD>          vSpinMatrixD;

-    // Colour Matrix
-    typedef iColourMatrix<Complex  >        ColourMatrix;
-    typedef iColourMatrix<ComplexF >        ColourMatrixF;
-    typedef iColourMatrix<ComplexD >        ColourMatrixD;
+// Colour Matrix
+typedef iColourMatrix<Complex  >        ColourMatrix;
+typedef iColourMatrix<ComplexF >        ColourMatrixF;
+typedef iColourMatrix<ComplexD >        ColourMatrixD;

-    typedef iColourMatrix<vComplex >        vColourMatrix;
-    typedef iColourMatrix<vComplexF>        vColourMatrixF;
-    typedef iColourMatrix<vComplexD>        vColourMatrixD;
+typedef iColourMatrix<vComplex >        vColourMatrix;
+typedef iColourMatrix<vComplexF>        vColourMatrixF;
+typedef iColourMatrix<vComplexD>        vColourMatrixD;
+
+// SpinColour matrix
+typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
+typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
+typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
+
+typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
+typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
+typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;

-    // SpinColour matrix
-    typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
-    typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
-    typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
-    
-    typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
-    typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
-    typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
-    
    // SpinColourSpinColour matrix
    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
@@ -153,383 +151,379 @@ namespace QCD {
    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;

-    // LorentzColour
-    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
-    typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
-    typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
+// LorentzColour
+typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
+typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
+typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;

-    typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
-    typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
-    typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
+typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
+typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
+typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;

-    // DoubleStored gauge field
-    typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
-    typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
-    typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
+// DoubleStored gauge field
+typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
+typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
+typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;

-    typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
-    typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
-    typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
+typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
+typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
+typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;

-    // Spin vector
-    typedef iSpinVector<Complex >           SpinVector;
-    typedef iSpinVector<ComplexF>           SpinVectorF;
-    typedef iSpinVector<ComplexD>           SpinVectorD;
+// Spin vector
+typedef iSpinVector<Complex >           SpinVector;
+typedef iSpinVector<ComplexF>           SpinVectorF;
+typedef iSpinVector<ComplexD>           SpinVectorD;

-    typedef iSpinVector<vComplex >           vSpinVector;
-    typedef iSpinVector<vComplexF>           vSpinVectorF;
-    typedef iSpinVector<vComplexD>           vSpinVectorD;
+typedef iSpinVector<vComplex >           vSpinVector;
+typedef iSpinVector<vComplexF>           vSpinVectorF;
+typedef iSpinVector<vComplexD>           vSpinVectorD;

-    // Colour vector
-    typedef iColourVector<Complex >         ColourVector;
-    typedef iColourVector<ComplexF>         ColourVectorF;
-    typedef iColourVector<ComplexD>         ColourVectorD;
+// Colour vector
+typedef iColourVector<Complex >         ColourVector;
+typedef iColourVector<ComplexF>         ColourVectorF;
+typedef iColourVector<ComplexD>         ColourVectorD;

-    typedef iColourVector<vComplex >         vColourVector;
-    typedef iColourVector<vComplexF>         vColourVectorF;
-    typedef iColourVector<vComplexD>         vColourVectorD;
+typedef iColourVector<vComplex >         vColourVector;
+typedef iColourVector<vComplexF>         vColourVectorF;
+typedef iColourVector<vComplexD>         vColourVectorD;

-    // SpinColourVector
-    typedef iSpinColourVector<Complex >     SpinColourVector;
-    typedef iSpinColourVector<ComplexF>     SpinColourVectorF;
-    typedef iSpinColourVector<ComplexD>     SpinColourVectorD;
+// SpinColourVector
+typedef iSpinColourVector<Complex >     SpinColourVector;
+typedef iSpinColourVector<ComplexF>     SpinColourVectorF;
+typedef iSpinColourVector<ComplexD>     SpinColourVectorD;

-    typedef iSpinColourVector<vComplex >     vSpinColourVector;
-    typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
-    typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;
+typedef iSpinColourVector<vComplex >     vSpinColourVector;
+typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
+typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;

-    // HalfSpin vector
-    typedef iHalfSpinVector<Complex >       HalfSpinVector;
-    typedef iHalfSpinVector<ComplexF>       HalfSpinVectorF;
-    typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;
+// HalfSpin vector
+typedef iHalfSpinVector<Complex >       HalfSpinVector;
+typedef iHalfSpinVector<ComplexF>       HalfSpinVectorF;
+typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;

-    typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
-    typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
-    typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;
+typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
+typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
+typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;

-    // HalfSpinColour vector
-    typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
-    typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
-    typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
+// HalfSpinColour vector
+typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
+typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
+typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
    
-    typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
-    typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
-    typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
+typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
+typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
+typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
    
-    // singlets
-    typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
-    typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
-    typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tensor singlet complex type.
+// singlets
+typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
+typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
+typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tensor singlet complex type.

-    typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
-    typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
-    typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure
+typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
+typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
+typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure

-    typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
-    typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
-    typedef iSinglet<RealD>            TRealD;       // Shouldn't need these; can I make it work without?
+typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
+typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
+typedef iSinglet<RealD>            TRealD;       // Shouldn't need these; can I make it work without?

-    typedef iSinglet<vReal >           vTReal;      
-    typedef iSinglet<vRealF>           vTRealF;      
-    typedef iSinglet<vRealD>           vTRealD;      
+typedef iSinglet<vReal >           vTReal;      
+typedef iSinglet<vRealF>           vTRealF;      
+typedef iSinglet<vRealD>           vTRealD;      

-    typedef iSinglet<vInteger>         vTInteger;
-    typedef iSinglet<Integer >         TInteger;
+typedef iSinglet<vInteger>         vTInteger;
+typedef iSinglet<Integer >         TInteger;


-    // Lattices of these
-    typedef Lattice<vColourMatrix>          LatticeColourMatrix;
-    typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
-    typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;
+// Lattices of these
+typedef Lattice<vColourMatrix>          LatticeColourMatrix;
+typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
+typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;

-    typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
-    typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
-    typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;
+typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
+typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
+typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;

-    typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
-    typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
-    typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
+typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
+typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
+typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;

-    typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
-    typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
-    typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;
+typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
+typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
+typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;

-    typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
-    typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
-    typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
+typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
+typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
+typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;

-    // DoubleStored gauge field
-    typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
-    typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
-    typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
+// DoubleStored gauge field
+typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
+typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
+typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;

-    typedef Lattice<vSpinVector>            LatticeSpinVector;
-    typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
-    typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;
+typedef Lattice<vSpinVector>            LatticeSpinVector;
+typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
+typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;

-    typedef Lattice<vColourVector>          LatticeColourVector;
-    typedef Lattice<vColourVectorF>         LatticeColourVectorF;
-    typedef Lattice<vColourVectorD>         LatticeColourVectorD;
+typedef Lattice<vColourVector>          LatticeColourVector;
+typedef Lattice<vColourVectorF>         LatticeColourVectorF;
+typedef Lattice<vColourVectorD>         LatticeColourVectorD;

-    typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
-    typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
-    typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;
+typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
+typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
+typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;

-    typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
-    typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
-    typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;
+typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
+typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
+typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;

-    typedef Lattice<vHalfSpinColourVector>  LatticeHalfSpinColourVector;
-    typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
-    typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
+typedef Lattice<vHalfSpinColourVector>  LatticeHalfSpinColourVector;
+typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
+typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;

-    typedef Lattice<vTReal>            LatticeReal;
-    typedef Lattice<vTRealF>           LatticeRealF;
-    typedef Lattice<vTRealD>           LatticeRealD;
+typedef Lattice<vTReal>            LatticeReal;
+typedef Lattice<vTRealF>           LatticeRealF;
+typedef Lattice<vTRealD>           LatticeRealD;

-    typedef Lattice<vTComplex>         LatticeComplex;
-    typedef Lattice<vTComplexF>        LatticeComplexF;
-    typedef Lattice<vTComplexD>        LatticeComplexD;
+typedef Lattice<vTComplex>         LatticeComplex;
+typedef Lattice<vTComplexF>        LatticeComplexF;
+typedef Lattice<vTComplexD>        LatticeComplexD;

-    typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
+typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"


-    ///////////////////////////////////////////
-    // Physical names for things
-    ///////////////////////////////////////////
-    typedef LatticeHalfSpinColourVector  LatticeHalfFermion;
-    typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
-    typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
+///////////////////////////////////////////
+// Physical names for things
+///////////////////////////////////////////
+typedef LatticeHalfSpinColourVector  LatticeHalfFermion;
+typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
+typedef LatticeHalfSpinColourVectorD LatticeHalfFermionD; // D name aliases the ...D type, like LatticeFermionD

-    typedef LatticeSpinColourVector      LatticeFermion;
-    typedef LatticeSpinColourVectorF     LatticeFermionF;
-    typedef LatticeSpinColourVectorD     LatticeFermionD;
+typedef LatticeSpinColourVector      LatticeFermion;
+typedef LatticeSpinColourVectorF     LatticeFermionF;
+typedef LatticeSpinColourVectorD     LatticeFermionD;

-    typedef LatticeSpinColourMatrix                LatticePropagator;
-    typedef LatticeSpinColourMatrixF               LatticePropagatorF;
-    typedef LatticeSpinColourMatrixD               LatticePropagatorD;
+typedef LatticeSpinColourMatrix                LatticePropagator;
+typedef LatticeSpinColourMatrixF               LatticePropagatorF;
+typedef LatticeSpinColourMatrixD               LatticePropagatorD;

-    typedef LatticeLorentzColourMatrix             LatticeGaugeField;
-    typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
-    typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;
+typedef LatticeLorentzColourMatrix             LatticeGaugeField;
+typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
+typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;

-    typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
-    typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
-    typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;
+typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
+typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
+typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;

-    template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
+template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;

-    // Uhgg... typing this hurt  ;)
-    // (my keyboard got burning hot when I typed this, must be the anti-Fermion)
-    typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
-    typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
-    typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    
+// Uhgg... typing this hurt  ;)
+// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
+typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
+typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
+typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    

-    typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
-    typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
-    typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 
+typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
+typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
+typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 

-    //////////////////////////////////////////////////////////////////////////////
-    // Peek and Poke named after physics attributes
-    //////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+// Peek and Poke named after physics attributes
+//////////////////////////////////////////////////////////////////////////////

-    //spin
-    template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i);
-    }
-    template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i,j);
-    }
-    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i);
-    }
-    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i,j);
-    }
-    //colour
-    template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i);
-    }
-    template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i,j);
-    }
-    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i);
-    }
-    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i,j);
-    }
-    //lorentz
-    template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
-    {
-      return PeekIndex<LorentzIndex>(rhs,i);
-    }
-    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
-    {
-      return PeekIndex<LorentzIndex>(rhs,i);
-    }
+//spin
+template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i);
+}
+template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i,j);
+}
+template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i);
+}
+template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i,j);
+}
+//colour
+template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i);
+}
+template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i,j);
+}
+template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i);
+}
+template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i,j);
+}
+//lorentz
+template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
+{
+  return PeekIndex<LorentzIndex>(rhs,i);
+}
+template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
+{
+  return PeekIndex<LorentzIndex>(rhs,i);
+}

-    //////////////////////////////////////////////
-    // Poke lattice
-    //////////////////////////////////////////////
-    template<class vobj> 
-      void pokeColour(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
+//////////////////////////////////////////////
+// Poke lattice
+//////////////////////////////////////////////
+template<class vobj> 
+void pokeColour(Lattice<vobj> &lhs,
+		const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0))> & rhs,
+		int i)
+{
+  PokeIndex<ColourIndex>(lhs,rhs,i);
+}
+template<class vobj> 
+void pokeColour(Lattice<vobj> &lhs,
+		const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0,0))> & rhs,
+		int i,int j)
+{
+  PokeIndex<ColourIndex>(lhs,rhs,i,j);
+}
+template<class vobj> 
+void pokeSpin(Lattice<vobj> &lhs,
+              const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0))> & rhs,
              int i)
-    {
-      PokeIndex<ColourIndex>(lhs,rhs,i);
-    }
-    template<class vobj> 
-      void pokeColour(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
+{
+  PokeIndex<SpinIndex>(lhs,rhs,i);
+}
+template<class vobj> 
+void pokeSpin(Lattice<vobj> &lhs,
+              const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0,0))> & rhs,
              int i,int j)
-    {
-      PokeIndex<ColourIndex>(lhs,rhs,i,j);
-    }
-    template<class vobj> 
-      void pokeSpin(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
-              int i)
-    {
-      PokeIndex<SpinIndex>(lhs,rhs,i);
-    }
-    template<class vobj> 
-      void pokeSpin(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
-              int i,int j)
-    {
-      PokeIndex<SpinIndex>(lhs,rhs,i,j);
-    }
-    template<class vobj> 
-      void pokeLorentz(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
-              int i)
-    {
-      PokeIndex<LorentzIndex>(lhs,rhs,i);
-    }
+{
+  PokeIndex<SpinIndex>(lhs,rhs,i,j);
+}
+template<class vobj> 
+void pokeLorentz(Lattice<vobj> &lhs,
+		 const Lattice<decltype(peekIndex<LorentzIndex>(vobj(),0))> & rhs,
+		 int i)
+{
+  PokeIndex<LorentzIndex>(lhs,rhs,i);
+}

-    //////////////////////////////////////////////
-    // Poke scalars
-    //////////////////////////////////////////////
-    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
-    {
-      pokeIndex<SpinIndex>(lhs,rhs,i);
-    }
-    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
-    {
-      pokeIndex<SpinIndex>(lhs,rhs,i,j);
-    }
+//////////////////////////////////////////////
+// Poke scalars
+//////////////////////////////////////////////
+template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
+{
+  pokeIndex<SpinIndex>(lhs,rhs,i);
+}
+template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
+{
+  pokeIndex<SpinIndex>(lhs,rhs,i,j);
+}

-    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
-    {
-      pokeIndex<ColourIndex>(lhs,rhs,i);
-    }
-    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
-    {
-      pokeIndex<ColourIndex>(lhs,rhs,i,j);
-    }
+template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
+{
+  pokeIndex<ColourIndex>(lhs,rhs,i);
+}
+template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
+{
+  pokeIndex<ColourIndex>(lhs,rhs,i,j);
+}

-    template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
-    {
-      pokeIndex<LorentzIndex>(lhs,rhs,i);
-    }
+template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
+{
+  pokeIndex<LorentzIndex>(lhs,rhs,i);
+}

-    //////////////////////////////////////////////
-    // Fermion <-> propagator assignements
-    //////////////////////////////////////////////
+//////////////////////////////////////////////
+// Fermion <-> propagator assignments
+//////////////////////////////////////////////
    //template <class Prop, class Ferm>
    template <class Fimpl>
      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
+{
+  for(int j = 0; j < Ns; ++j)
    {
-      for(int j = 0; j < Ns; ++j)
-        {
-            auto pjs = peekSpin(p, j, s);
-            auto fj  = peekSpin(f, j);
+      auto pjs = peekSpin(p, j, s);
+      auto fj  = peekSpin(f, j);
            
            for(int i = 0; i < Fimpl::Dimension; ++i)
-            {
-                pokeColour(pjs, peekColour(fj, i), i, c);
-            }
-            pokeSpin(p, pjs, j, s);
-        }
+	{
+	  pokeColour(pjs, peekColour(fj, i), i, c);
+	}
+      pokeSpin(p, pjs, j, s);
    }
+}
    
    //template <class Prop, class Ferm>
    template <class Fimpl>
      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
+{
+  for(int j = 0; j < Ns; ++j)
    {
-        for(int j = 0; j < Ns; ++j)
-        {
-            auto pjs = peekSpin(p, j, s);
-            auto fj  = peekSpin(f, j);
+      auto pjs = peekSpin(p, j, s);
+      auto fj  = peekSpin(f, j);
            
            for(int i = 0; i < Fimpl::Dimension; ++i)
-            {
-                pokeColour(fj, peekColour(pjs, i, c), i);
-            }
-            pokeSpin(f, fj, j);
-        }
+	{
+	  pokeColour(fj, peekColour(pjs, i, c), i);
+	}
+      pokeSpin(f, fj, j);
    }
+}
    
-    //////////////////////////////////////////////
-    // transpose array and scalar
-    //////////////////////////////////////////////
-    template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
-      return transposeIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
-      return transposeIndex<ColourIndex>(lhs);
-    }
-    template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
-      return transposeIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
-      return transposeIndex<ColourIndex>(lhs);
-    }
+//////////////////////////////////////////////
+// transpose array and scalar
+//////////////////////////////////////////////
+template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
+  return transposeIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
+  return transposeIndex<ColourIndex>(lhs);
+}
+template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
+  return transposeIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
+  return transposeIndex<ColourIndex>(lhs);
+}

-    //////////////////////////////////////////
-    // Trace lattice and non-lattice
-    //////////////////////////////////////////
-    template<int Index,class vobj>
-    inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))>
-    {
-      return traceIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj>
-    inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))>
-    {
-      return traceIndex<ColourIndex>(lhs);
-    }
-    template<int Index,class vobj>
-    inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
-    {
-      return traceIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj>
-    inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
-    {
-      return traceIndex<ColourIndex>(lhs);
-    }
+//////////////////////////////////////////
+// Trace lattice and non-lattice
+//////////////////////////////////////////
+template<int Index,class vobj>
+inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
+{
+  return traceIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj>
+inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
+{
+  return traceIndex<ColourIndex>(lhs);
+}
+template<int Index,class vobj> // Index is unused and not deducible: callers must supply it explicitly
+inline auto traceSpin(const vobj &lhs) -> decltype(traceIndex<SpinIndex>(lhs)) // non-lattice overload: result is the traced object itself, not a Lattice
+{
+  return traceIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj> // Index is unused and not deducible: callers must supply it explicitly
+inline auto traceColour(const vobj &lhs) -> decltype(traceIndex<ColourIndex>(lhs)) // non-lattice overload: result is the traced object itself, not a Lattice
+{
+  return traceIndex<ColourIndex>(lhs);
+}

-    //////////////////////////////////////////
-    // Current types
-    //////////////////////////////////////////
-    GRID_SERIALIZABLE_ENUM(Current, undef,
-                           Vector,  0,
-                           Axial,   1,
-                           Tadpole, 2);
+//////////////////////////////////////////
+// Current types
+//////////////////////////////////////////
+GRID_SERIALIZABLE_ENUM(Current, undef,
+		       Vector,  0,
+		       Axial,   1,
+		       Tadpole, 2);

-}   //namespace QCD
-} // Grid
+NAMESPACE_END(Grid);

-
-
-#endif
--- a/Grid/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@@ -37,14 +37,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // Abstract base interface
 ////////////////////////////////////////////
 #include <Grid/qcd/action/ActionCore.h>
+NAMESPACE_CHECK(ActionCore);
 ////////////////////////////////////////////////////////////////////////
 // Fermion actions; prevent coupling fermion.cc files to other headers
 ////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/FermionCore.h>
+NAMESPACE_CHECK(FermionCore);
 #include <Grid/qcd/action/fermion/Fermion.h>
+NAMESPACE_CHECK(Fermion);
 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
 #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
+NAMESPACE_CHECK(PseudoFermion);

 #endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -27,19 +27,18 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */

 #ifndef ACTION_BASE_H
 #define ACTION_BASE_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <class GaugeField >
 class Action 
 {

- public:
+public:
  bool is_smeared = false;
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
@@ -50,7 +49,6 @@ class Action
  virtual ~Action(){}
 };

-}
-}
+NAMESPACE_END(Grid);

 #endif // ACTION_BASE_H
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -31,29 +31,37 @@ directory
 #define QCD_ACTION_CORE

 #include <Grid/qcd/action/ActionBase.h>
+NAMESPACE_CHECK(ActionBase);
 #include <Grid/qcd/action/ActionSet.h>
+NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
+NAMESPACE_CHECK(ActionParams);

 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/gauge/Gauge.h>
+NAMESPACE_CHECK(Gauge);

 ////////////////////////////////////////////
 // Fermion prereqs
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/FermionCore.h>
+NAMESPACE_CHECK(ActionFermionCore);

 ////////////////////////////////////////////
 // Scalar Actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/scalar/Scalar.h>
+NAMESPACE_CHECK(Scalar);

 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
 #include <Grid/qcd/utils/Metric.h>
+NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
+NAMESPACE_CHECK(CovariantLaplacian);



--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -27,37 +27,35 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */

 #ifndef GRID_QCD_ACTION_PARAMS_H
 #define GRID_QCD_ACTION_PARAMS_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  // These can move into a params header and be given MacroMagic serialisation
-  struct GparityWilsonImplParams {
-    bool overlapCommsCompute;
-    std::vector<int> twists;
-    GparityWilsonImplParams() : twists(Nd, 0), overlapCommsCompute(false){};
-  };
+// These can move into a params header and be given MacroMagic serialisation
+struct GparityWilsonImplParams {
+  Coordinate twists;
+  GparityWilsonImplParams() : twists(Nd, 0) {};
+};
  
-  struct WilsonImplParams {
-    bool overlapCommsCompute;
-    std::vector<Real> twist_n_2pi_L;
-    std::vector<Complex> boundary_phases;
-    WilsonImplParams() : overlapCommsCompute(false) {
-      boundary_phases.resize(Nd, 1.0);
+struct WilsonImplParams {
+  bool overlapCommsCompute; // set to false by both constructors
+  AcceleratorVector<Real,Nd> twist_n_2pi_L; // resized to Nd entries of 0.0 by both constructors
+  AcceleratorVector<Complex,Nd> boundary_phases; // Nd entries of 1.0 by default, or a copy of phi
+  WilsonImplParams() : overlapCommsCompute(false) { // keep the flag initialised, as the pre-refactor constructor did
+    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
-    };
-    WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) {
-      twist_n_2pi_L.resize(Nd, 0.0);
-    }
  };
+  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : overlapCommsCompute(false), boundary_phases(phi) { // initialisers listed in declaration order
+    twist_n_2pi_L.resize(Nd, 0.0);
+  }
+};

-  struct StaggeredImplParams {
-    StaggeredImplParams()  {};
-  };
+struct StaggeredImplParams {
+  StaggeredImplParams()  {};
+};
  
  struct OneFlavourRationalParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams, 
@@ -69,10 +67,10 @@ namespace QCD {
 				    int,   precision,
 				    int,   BoundsCheckFreq);
    
-    // MaxIter and tolerance, vectors??
+  // MaxIter and tolerance, vectors??
    
-    // constructor 
-    OneFlavourRationalParams(	RealD _lo      = 0.0, 
+  // constructor 
+  OneFlavourRationalParams(	RealD _lo      = 0.0, 
 				RealD _hi      = 1.0, 
 				int _maxit     = 1000,
 				RealD tol      = 1.0e-8, 
@@ -88,11 +86,6 @@ namespace QCD {
        BoundsCheckFreq(_BoundsCheckFreq){};
  };
  
-  
-}
-}
-
-
-
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/ActionSet.h
+++ b/Grid/qcd/action/ActionSet.h
@@ -26,14 +26,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef ACTION_SET_H
 #define ACTION_SET_H

-namespace Grid {
-
-// Should drop this namespace here
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 //////////////////////////////////
 // Indexing of tuple types
@@ -62,7 +59,7 @@ struct Index<T, std::tuple<U, Types...>> {

 template <class Field, class Repr = NoHirep >
 struct ActionLevel {
- public:
+public:
  unsigned int multiplier;

  // Fundamental repr actions separated because of the smearing
@@ -77,7 +74,7 @@ struct ActionLevel {
  std::vector<ActPtr>& actions;

  explicit ActionLevel(unsigned int mul = 1) : 
-  actions(std::get<0>(actions_hirep)), multiplier(mul) {
+    actions(std::get<0>(actions_hirep)), multiplier(mul) {
    // initialize the hirep vectors to zero.
    // apply(this->resize, actions_hirep, 0); //need a working resize
    assert(mul >= 1);
@@ -87,7 +84,7 @@ struct ActionLevel {
  void push_back(Action<GenField>* ptr) {
    // insert only in the correct vector
    std::get< Index < GenField, action_hirep_types>::value >(actions_hirep).push_back(ptr);
-  };
+  }

  template <class ActPtr>
  static void resize(ActPtr ap, unsigned int n) {
@@ -110,7 +107,6 @@ struct ActionLevel {
 template <class GaugeField, class R>
 using ActionSet = std::vector<ActionLevel<GaugeField, R> >;

-} // QCD
-} // Grid
+NAMESPACE_END(Grid);

 #endif  // ACTION_SET_H
--- a/Grid/qcd/action/fermion/AbstractEOFAFermion.h
+++ b/Grid/qcd/action/fermion/AbstractEOFAFermion.h
@@ -26,75 +26,75 @@ with this program; if not, write to the Free Software Foundation, Inc.,

 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef  GRID_QCD_ABSTRACT_EOFA_FERMION_H
 #define  GRID_QCD_ABSTRACT_EOFA_FERMION_H

 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  // DJM: Abstract base class for EOFA fermion types.
-  // Defines layout of additional EOFA-specific parameters and operators.
-  // Use to construct EOFA pseudofermion actions that are agnostic to
-  // Shamir / Mobius / etc., and ensure that no one can construct EOFA
-  // pseudofermion action with non-EOFA fermion type.
-  template<class Impl>
-  class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+// DJM: Abstract base class for EOFA fermion types.
+// Defines layout of additional EOFA-specific parameters and operators.
+// Use to construct EOFA pseudofermion actions that are agnostic to
+// Shamir / Mobius / etc., and ensure that no one can construct EOFA
+// pseudofermion action with non-EOFA fermion type.
+template<class Impl>
+class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    public:
-      // Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
-      RealD mq1;
-      RealD mq2;
-      RealD mq3;
-      RealD shift;
-      int pm;
+public:
+  // Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
+  RealD mq1;
+  RealD mq2;
+  RealD mq3;
+  RealD shift;
+  int pm;

-      RealD alpha; // Mobius scale
-      RealD k;     // EOFA normalization constant
+  RealD alpha; // Mobius scale
+  RealD k;     // EOFA normalization constant

-      virtual void Instantiatable(void) = 0;
+  virtual void Instantiatable(void) = 0;

-      // EOFA-specific operations
-      // Force user to implement in derived classes
-      virtual void  Omega    (const FermionField& in, FermionField& out, int sign, int dag) = 0;
-      virtual void  Dtilde   (const FermionField& in, FermionField& out) = 0;
-      virtual void  DtildeInv(const FermionField& in, FermionField& out) = 0;
+  // EOFA-specific operations
+  // Force user to implement in derived classes
+  virtual void  Omega    (const FermionField& in, FermionField& out, int sign, int dag) = 0;
+  virtual void  Dtilde   (const FermionField& in, FermionField& out) = 0;
+  virtual void  DtildeInv(const FermionField& in, FermionField& out) = 0;

-      // Implement derivatives in base class:
-      // for EOFA both DWF and Mobius just need d(Dw)/dU
-      virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
-        this->DhopDeriv(mat, U, V, dag);
-      };
-      virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
-        this->DhopDerivOE(mat, U, V, dag);
-      };
-      virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
-        this->DhopDerivEO(mat, U, V, dag);
-      };
-
-      // Recompute 5D coefficients for different value of shift constant
-      // (needed for heatbath loop over poles)
-      virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
-
-      // Constructors
-      AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
-        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
-        : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
-          _mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
-      {
-        int Ls = this->Ls;
-        this->alpha = _b + _c;
-        this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
-                    ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
-                    ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
-      };
+  // Implement derivatives in base class:
+  // for EOFA both DWF and Mobius just need d(Dw)/dU
+  virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
+    this->DhopDeriv(mat, U, V, dag);
  };
-}}
+  virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
+    this->DhopDerivOE(mat, U, V, dag);
+  };
+  virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
+    this->DhopDerivEO(mat, U, V, dag);
+  };
+
+  // Recompute 5D coefficients for different value of shift constant
+  // (needed for heatbath loop over poles)
+  virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
+
+  // Constructors
+  AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+		      GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+		      RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
+		      RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
+    : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
+			    _mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
+  {
+    int Ls = this->Ls;
+    this->alpha = _b + _c;
+    this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
+      ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
+      ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
+  };
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -24,203 +24,146 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_CAYLEY_FERMION_H
-#define  GRID_QCD_CAYLEY_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class CayleyFermion5D : public WilsonFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-     template<typename T> struct switcheroo   {
-       static inline int iscomplex()  { return 0; }
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out);
+  virtual RealD  Mdag (const FermionField &in, FermionField &out);

-       template<class vec>
-       static inline vec mult(vec a, vec b) {
-	 return real_mult(a,b);
-       }
-     };
-     template<> struct switcheroo<ComplexD> {
-       static inline int iscomplex()  { return 1; }
+  // half checkerboard operations
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  virtual void   Meo5D (const FermionField &psi, FermionField &chi);

-       template<class vec>
-       static inline vec mult(vec a, vec b) {
-	 return a*b;
-       }
-     };
-     template<> struct switcheroo<ComplexF> {
-       static inline int iscomplex()  { return 1; }
-       template<class vec>
-       static inline vec mult(vec a, vec b) {
-	 return a*b;
-       }
-     };
+  virtual void   M5D   (const FermionField &psi, FermionField &chi);
+  virtual void   M5Ddag(const FermionField &psi, FermionField &chi);

+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  virtual void Dminus(const FermionField &psi, FermionField &chi);
+  virtual void DminusDag(const FermionField &psi, FermionField &chi);
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
+  virtual void ImportUnphysicalFermion(const FermionField &solution5d, FermionField &exported4d);

-    template<class Impl>
-    class CayleyFermion5D : public WilsonFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  ///////////////////////////////////////////////////////////////
+  // Support for MADWF tricks
+  ///////////////////////////////////////////////////////////////
+  RealD Mass(void) { return mass; };
+  void  SetMass(RealD _mass) { 
+    mass=_mass; 
+    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
+  } ;
+  void  P(const FermionField &psi, FermionField &chi);
+  void  Pdag(const FermionField &psi, FermionField &chi);
+  
+  /////////////////////////////////////////////////////
+  // Instantiate different versions depending on Impl
+  /////////////////////////////////////////////////////
+  void M5D(const FermionField &psi,
+	   const FermionField &phi,
+	   FermionField &chi,
+	   Vector<Coeff_t> &lower,
+	   Vector<Coeff_t> &diag,
+	   Vector<Coeff_t> &upper);

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  void M5Ddag(const FermionField &psi,
+	      const FermionField &phi,
+	      FermionField &chi,
+	      Vector<Coeff_t> &lower,
+	      Vector<Coeff_t> &diag,
+	      Vector<Coeff_t> &upper);

-      // half checkerboard operations
-      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
-      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
+  virtual void   Instantiatable(void)=0;

-      virtual void   M5D   (const FermionField &psi, FermionField &chi);
-      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      virtual void   Dminus(const FermionField &psi, FermionField &chi);
-      virtual void   DminusDag(const FermionField &psi, FermionField &chi);
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
-      virtual void ImportUnphysicalFermion(const FermionField &solution5d, FermionField &exported4d);
+  // Efficient support for multigrid coarsening
+  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

-      ///////////////////////////////////////////////////////////////
-      // Support for MADWF tricks
-      ///////////////////////////////////////////////////////////////
-      RealD Mass(void) { return mass; };
-      void  SetMass(RealD _mass) { 
-	mass=_mass; 
-	SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
-      } ;
-      void  P(const FermionField &psi, FermionField &chi);
-      void  Pdag(const FermionField &psi, FermionField &chi);
+  void   Meooe5D       (const FermionField &in, FermionField &out);
+  void   MeooeDag5D    (const FermionField &in, FermionField &out);

-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField &psi,
-	       const FermionField &phi,
-	       FermionField &chi,
-	       std::vector<Coeff_t> &lower,
-	       std::vector<Coeff_t> &diag,
-	       std::vector<Coeff_t> &upper);
+  //    protected:
+  RealD mass;

-      void M5Ddag(const FermionField &psi,
-		  const FermionField &phi,
-		  FermionField &chi,
-		  std::vector<Coeff_t> &lower,
-		  std::vector<Coeff_t> &diag,
-		  std::vector<Coeff_t> &upper);
+  // Save arguments to SetCoefficientsInternal
+  Vector<Coeff_t> _gamma;
+  RealD                _zolo_hi;
+  RealD                _b;
+  RealD                _c;

-      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
-      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
+  // Cayley form Moebius (tanh and zolotarev)
+  Vector<Coeff_t> omega;
+  Vector<Coeff_t> bs;    // S dependent coeffs
+  Vector<Coeff_t> cs;
+  Vector<Coeff_t> as;
+  // For preconditioning Cayley form
+  Vector<Coeff_t> bee;
+  Vector<Coeff_t> cee;
+  Vector<Coeff_t> aee;
+  Vector<Coeff_t> beo;
+  Vector<Coeff_t> ceo;
+  Vector<Coeff_t> aeo;
+  // LDU factorisation of the eeoo matrix
+  Vector<Coeff_t> lee;
+  Vector<Coeff_t> leem;
+  Vector<Coeff_t> uee;
+  Vector<Coeff_t> ueem;
+  Vector<Coeff_t> dee;

-      void MooeeInternalAsm(const FermionField &in, FermionField &out,
-			    int LLs, int site,
-			    Vector<iSinglet<Simd> > &Matp,
-			    Vector<iSinglet<Simd> > &Matm);
-      void MooeeInternalZAsm(const FermionField &in, FermionField &out,
-			    int LLs, int site,
-			    Vector<iSinglet<Simd> > &Matp,
-			    Vector<iSinglet<Simd> > &Matm);
+  // Matrices of 5d ee inverse params
+  Vector<iSinglet<Simd> >  MatpInv;
+  Vector<iSinglet<Simd> >  MatmInv;
+  Vector<iSinglet<Simd> >  MatpInvDag;
+  Vector<iSinglet<Simd> >  MatmInvDag;

+  // Constructors
+  CayleyFermion5D(GaugeField &_Umu,
+		  GridCartesian         &FiveDimGrid,
+		  GridRedBlackCartesian &FiveDimRedBlackGrid,
+		  GridCartesian         &FourDimGrid,
+		  GridRedBlackCartesian &FourDimRedBlackGrid,
+		  RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

-      virtual void   Instantiatable(void)=0;
+  void CayleyReport(void);
+  void CayleyZeroCounters(void);

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  double M5Dflops;
+  double M5Dcalls;
+  double M5Dtime;

-      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  double MooeeInvFlops;
+  double MooeeInvCalls;
+  double MooeeInvTime;

-      void   Meooe5D       (const FermionField &in, FermionField &out);
-      void   MeooeDag5D    (const FermionField &in, FermionField &out);
+protected:
+  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
+  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
+  virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
+};

-      //    protected:
-      RealD mass;
+NAMESPACE_END(Grid);

-      // Save arguments to SetCoefficientsInternal
-      std::vector<Coeff_t> _gamma;
-      RealD                _zolo_hi;
-      RealD                _b;
-      RealD                _c;
-
-      // Cayley form Moebius (tanh and zolotarev)
-      std::vector<Coeff_t> omega;
-      std::vector<Coeff_t> bs;    // S dependent coeffs
-      std::vector<Coeff_t> cs;
-      std::vector<Coeff_t> as;
-      // For preconditioning Cayley form
-      std::vector<Coeff_t> bee;
-      std::vector<Coeff_t> cee;
-      std::vector<Coeff_t> aee;
-      std::vector<Coeff_t> beo;
-      std::vector<Coeff_t> ceo;
-      std::vector<Coeff_t> aeo;
-      // LDU factorisation of the eeoo matrix
-      std::vector<Coeff_t> lee;
-      std::vector<Coeff_t> leem;
-      std::vector<Coeff_t> uee;
-      std::vector<Coeff_t> ueem;
-      std::vector<Coeff_t> dee;
-
-      // Matrices of 5d ee inverse params
-      Vector<iSinglet<Simd> >  MatpInv;
-      Vector<iSinglet<Simd> >  MatmInv;
-      Vector<iSinglet<Simd> >  MatpInvDag;
-      Vector<iSinglet<Simd> >  MatmInvDag;
-
-      // Constructors
-      CayleyFermion5D(GaugeField &_Umu,
-		      GridCartesian         &FiveDimGrid,
-		      GridRedBlackCartesian &FiveDimRedBlackGrid,
-		      GridCartesian         &FourDimGrid,
-		      GridRedBlackCartesian &FourDimRedBlackGrid,
-		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
-
-
-
-     void CayleyReport(void);
-     void CayleyZeroCounters(void);
-
-     double M5Dflops;
-     double M5Dcalls;
-     double M5Dtime;
-
-     double MooeeInvFlops;
-     double MooeeInvCalls;
-     double MooeeInvTime;
-
-    protected:
-      virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
-      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
-      virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
-    };
-
-  }
-}
-#define INSTANTIATE_DPERP(A)\
-template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
-template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
-template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
-template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
-
-#undef  CAYLEY_DPERP_DENSE
-#define  CAYLEY_DPERP_CACHE
-#undef  CAYLEY_DPERP_LINALG
-#define CAYLEY_DPERP_VEC
-
-#endif
--- a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -1,249 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-  // Pminus fowards
-  // Pplus  backwards..
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
-				const FermionField &phi, 
-				FermionField &chi,
-				std::vector<Coeff_t> &lower,
-				std::vector<Coeff_t> &diag,
-				std::vector<Coeff_t> &upper)
-{
-  int Ls =this->Ls;
-  GridBase *grid=psi._grid;
-  assert(phi.checkerboard == psi.checkerboard);
-  chi.checkerboard=psi.checkerboard;
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    for(int s=0;s<Ls;s++){
-      auto tmp = psi._odata[0];
-      if ( s==0 ) {
- 	                            spProj5m(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	                    spProj5p(tmp,psi._odata[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) {
-	                            spProj5m(tmp,psi._odata[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
- 	                    spProj5p(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { 
-	                            spProj5m(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	                    spProj5p(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  }
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
-				   const FermionField &phi, 
-				   FermionField &chi,
-				   std::vector<Coeff_t> &lower,
-				   std::vector<Coeff_t> &diag,
-				   std::vector<Coeff_t> &upper)
-{
-  int Ls =this->Ls;
-  GridBase *grid=psi._grid;
-  assert(phi.checkerboard == psi.checkerboard);
-  chi.checkerboard=psi.checkerboard;
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    auto tmp = psi._odata[0];
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	spProj5p(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi._odata[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) {
-	spProj5p(tmp,psi._odata[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { 
-	spProj5p(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  }
-  M5Dtime+=usecond();
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-{
-  GridBase *grid=psi._grid;
-  int Ls=this->Ls;
-
-  chi.checkerboard=psi.checkerboard;
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    auto tmp = psi._odata[0];
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss]=psi[ss]; // chi[0]=psi[0]
-    for(int s=1;s<Ls;s++){
-                            spProj5p(tmp,chi[ss+s-1]);  
-      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
-    }
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-                                   spProj5m(tmp,chi[ss+s]);    
-      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
-    }
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-                                                spProj5p(tmp,chi[ss+Ls-1]); 
-      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
-    }	
-    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
-      
-    // Apply U^{-1}
-    for (int s=Ls-2;s>=0;s--){
-                            spProj5m(tmp,chi[ss+s+1]);  
-      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
-    }
-  }
-
-  MooeeInvTime+=usecond();
-
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  GridBase *grid=psi._grid;
-  int Ls=this->Ls;
-
-  assert(psi.checkerboard == psi.checkerboard);
-  chi.checkerboard=psi.checkerboard;
-
-  std::vector<Coeff_t> ueec(Ls);
-  std::vector<Coeff_t> deec(Ls);
-  std::vector<Coeff_t> leec(Ls);
-  std::vector<Coeff_t> ueemc(Ls);
-  std::vector<Coeff_t> leemc(Ls);
-  for(int s=0;s<ueec.size();s++){
-    ueec[s] = conjugate(uee[s]);
-    deec[s] = conjugate(dee[s]);
-    leec[s] = conjugate(lee[s]);
-    ueemc[s]= conjugate(ueem[s]);
-    leemc[s]= conjugate(leem[s]);
-  }
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-
-    auto tmp = psi._odata[0];
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss]=psi[ss];
-    for (int s=1;s<Ls;s++){
-                            spProj5m(tmp,chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s]-ueec[s-1]*tmp;
-    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-                                   spProj5p(tmp,chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      spProj5m(tmp,chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/deec[s])*chi[ss+s]-(leemc[s]/deec[Ls-1])*tmp;
-    }	
-    chi[ss+Ls-1]= (1.0/deec[Ls-1])*chi[ss+Ls-1];
-  
-    // Apply L^{-dagger}
-    for (int s=Ls-2;s>=0;s--){
-      spProj5p(tmp,chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - leec[s]*tmp;
-    }
-  }
-
-  MooeeInvTime+=usecond();
-
-}
-
-#ifdef CAYLEY_DPERP_CACHE
-  INSTANTIATE_DPERP(WilsonImplF);
-  INSTANTIATE_DPERP(WilsonImplD);
-  INSTANTIATE_DPERP(GparityWilsonImplF);
-  INSTANTIATE_DPERP(GparityWilsonImplD);
-  INSTANTIATE_DPERP(ZWilsonImplF);
-  INSTANTIATE_DPERP(ZWilsonImplD);
-
-  INSTANTIATE_DPERP(WilsonImplFH);
-  INSTANTIATE_DPERP(WilsonImplDF);
-  INSTANTIATE_DPERP(GparityWilsonImplFH);
-  INSTANTIATE_DPERP(GparityWilsonImplDF);
-  INSTANTIATE_DPERP(ZWilsonImplFH);
-  INSTANTIATE_DPERP(ZWilsonImplDF);
-#endif
-
-}}
--- a/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -1,828 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-namespace Grid {
-namespace QCD {  
-  /*
-   * Dense matrix versions of routines
-   */
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
-}
-  
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
-				const FermionField &phi, 
-				FermionField &chi,
-				std::vector<Coeff_t> &lower,
-				std::vector<Coeff_t> &diag,
-				std::vector<Coeff_t> &upper)
-{
-  GridBase *grid=psi._grid;
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0];
-  const int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.checkerboard == psi.checkerboard);
-
-  chi.checkerboard=psi.checkerboard;
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-  for(int i=0;i<nsimd;i++){ //inner
-    int s  = o+i*LLs;
-    int ss = o*nsimd+i;
-    u_p[ss] = upper[s];
-    l_p[ss] = lower[s];
-    d_p[ss] = diag[s];
-  }}
-
-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  assert(Nc==3);
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-#if 0
-      alignas(64) SiteHalfSpinor hp;
-      alignas(64) SiteHalfSpinor hm;
-      alignas(64) SiteSpinor fp;
-      alignas(64) SiteSpinor fm;
-
-      for(int v=0;v<LLs;v++){
-
-	int vp=(v+1)%LLs;
-	int vm=(v+LLs-1)%LLs;
-
-	spProj5m(hp,psi[ss+vp]);
-	spProj5p(hm,psi[ss+vm]);
-
-	if ( vp<=v ) rotate(hp,hp,1);
-	if ( vm>=v ) rotate(hm,hm,nsimd-1);
-	
-	hp=0.5*hp;
-        hm=0.5*hm;
-
-	spRecon5m(fp,hp);
-	spRecon5p(fm,hm);
-
-	chi[ss+v] = d[v]*phi[ss+v];
-	chi[ss+v] = chi[ss+v]     +u[v]*fp;
-	chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-      }
-#else
-      for(int v=0;v<LLs;v++){
-
-	vprefetch(psi[ss+v+LLs]);
-
-	int vp= (v==LLs-1) ? 0     : v+1;
-	int vm= (v==0    ) ? LLs-1 : v-1;
-	
-	Simd hp_00 = psi[ss+vp]()(2)(0); 
-	Simd hp_01 = psi[ss+vp]()(2)(1); 
-	Simd hp_02 = psi[ss+vp]()(2)(2); 
-	Simd hp_10 = psi[ss+vp]()(3)(0); 
-	Simd hp_11 = psi[ss+vp]()(3)(1); 
-	Simd hp_12 = psi[ss+vp]()(3)(2); 
-	
-	Simd hm_00 = psi[ss+vm]()(0)(0); 
-	Simd hm_01 = psi[ss+vm]()(0)(1); 
-	Simd hm_02 = psi[ss+vm]()(0)(2); 
-	Simd hm_10 = psi[ss+vm]()(1)(0); 
-	Simd hm_11 = psi[ss+vm]()(1)(1); 
-	Simd hm_12 = psi[ss+vm]()(1)(2); 
-
-	if ( vp<=v ) {
-	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-	}
-	if ( vm>=v ) {
-	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-	}
-
-	// Can force these to real arithmetic and save 2x.
-	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
-	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
-	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-	vstream(chi[ss+v]()(0)(0),p_00);
-	vstream(chi[ss+v]()(0)(1),p_01);
-	vstream(chi[ss+v]()(0)(2),p_02);
-	vstream(chi[ss+v]()(1)(0),p_10);
-	vstream(chi[ss+v]()(1)(1),p_11);
-	vstream(chi[ss+v]()(1)(2),p_12);
-	vstream(chi[ss+v]()(2)(0),p_20);
-	vstream(chi[ss+v]()(2)(1),p_21);
-	vstream(chi[ss+v]()(2)(2),p_22);
-	vstream(chi[ss+v]()(3)(0),p_30);
-	vstream(chi[ss+v]()(3)(1),p_31);
-	vstream(chi[ss+v]()(3)(2),p_32);
-
-      }
-#endif
-  }
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
-				   const FermionField &phi, 
-				   FermionField &chi,
-				   std::vector<Coeff_t> &lower,
-				   std::vector<Coeff_t> &diag,
-				   std::vector<Coeff_t> &upper)
-{
-  GridBase *grid=psi._grid;
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0];
-  int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.checkerboard == psi.checkerboard);
-
-  chi.checkerboard=psi.checkerboard;
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-  for(int i=0;i<nsimd;i++){ //inner
-    int s  = o+i*LLs;
-    int ss = o*nsimd+i;
-    u_p[ss] = upper[s];
-    l_p[ss] = lower[s];
-    d_p[ss] = diag[s];
-  }}
-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-#if 0
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0;v<LLs;v++){
-
-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
-
-      spProj5p(hp,psi[ss+vp]);
-      spProj5m(hm,psi[ss+vm]);
-
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
-      
-      hp=hp*0.5;
-      hm=hm*0.5;
-      spRecon5p(fp,hp);
-      spRecon5m(fm,hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-#else
-      for(int v=0;v<LLs;v++){
-
-	vprefetch(psi[ss+v+LLs]);
-
-	int vp= (v==LLs-1) ? 0     : v+1;
-	int vm= (v==0    ) ? LLs-1 : v-1;
-	
-	Simd hp_00 = psi[ss+vp]()(0)(0); 
-	Simd hp_01 = psi[ss+vp]()(0)(1); 
-	Simd hp_02 = psi[ss+vp]()(0)(2); 
-	Simd hp_10 = psi[ss+vp]()(1)(0); 
-	Simd hp_11 = psi[ss+vp]()(1)(1); 
-	Simd hp_12 = psi[ss+vp]()(1)(2); 
-	
-	Simd hm_00 = psi[ss+vm]()(2)(0); 
-	Simd hm_01 = psi[ss+vm]()(2)(1); 
-	Simd hm_02 = psi[ss+vm]()(2)(2); 
-	Simd hm_10 = psi[ss+vm]()(3)(0); 
-	Simd hm_11 = psi[ss+vm]()(3)(1); 
-	Simd hm_12 = psi[ss+vm]()(3)(2); 
-
-	if ( vp<=v ) {
-	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-	}
-	if ( vm>=v ) {
-	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-	}
-
-	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
-	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
-	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-
-	vstream(chi[ss+v]()(0)(0),p_00);
-	vstream(chi[ss+v]()(0)(1),p_01);
-	vstream(chi[ss+v]()(0)(2),p_02);
-	vstream(chi[ss+v]()(1)(0),p_10);
-	vstream(chi[ss+v]()(1)(1),p_11);
-	vstream(chi[ss+v]()(1)(2),p_12);
-	vstream(chi[ss+v]()(2)(0),p_20);
-	vstream(chi[ss+v]()(2)(1),p_21);
-	vstream(chi[ss+v]()(2)(2),p_22);
-	vstream(chi[ss+v]()(3)(0),p_30);
-	vstream(chi[ss+v]()(3)(1),p_31);
-	vstream(chi[ss+v]()(3)(2),p_32);
-      }
-#endif
-  }
-  M5Dtime+=usecond();
-}
-
-
-#ifdef AVX512 
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
-#include <simd/Intel512single.h>
-#endif 
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
-					     int LLs, int site,
-					     Vector<iSinglet<Simd> > &Matp,
-					     Vector<iSinglet<Simd> > &Matm)
-{
-#ifndef AVX512
-  {
-  SiteHalfSpinor BcastP;
-  SiteHalfSpinor BcastM;
-  SiteHalfSpinor SiteChiP;
-  SiteHalfSpinor SiteChiM;
-
-  // Ls*Ls * 2 * 12 * vol flops
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-        int s=s2+l*LLs;
-	int lex=s2+LLs*site;
-	
-	if ( s2==0 && l==0) {
-	  SiteChiP=zero;
-	  SiteChiM=zero;
-	}
-	
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
-	}}
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	}}
-
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
-	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
-	}}
-
-    }}
-    {
-      int lex = s1+LLs*site;
-      for(int sp=0;sp<2;sp++){
-      for(int co=0;co<Nc;co++){
-	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-      }}
-    }
-  }
-
-  }
-#else
-  {
-  // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0   %%zmm13
-#define BCAST1   %%zmm14
-#define BCAST2   %%zmm15
-#define BCAST3   %%zmm16
-#define BCAST4   %%zmm17
-#define BCAST5   %%zmm18
-#define BCAST6   %%zmm19
-#define BCAST7   %%zmm20
-#define BCAST8   %%zmm21
-#define BCAST9   %%zmm22
-#define BCAST10  %%zmm23
-#define BCAST11  %%zmm24
-
-  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      int lex=s2+LLs*site;
-      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-      uint64_t a2 = (uint64_t)&psi[lex];
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	if ( (s2+l)==0 ) {
-	  asm (
-  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
-  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
-  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
-		   VBCASTCDUP(0,%2,BCAST0)   
-		   VBCASTCDUP(1,%2,BCAST1)   
-		   VBCASTCDUP(2,%2,BCAST2)   
-		   VBCASTCDUP(3,%2,BCAST3)   
-		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
-		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
-		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
-		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
-		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
-		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
-		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
-		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
-		   VMULMEM (0,%1,BCAST8,Chi_22)         
-		   VMULMEM (0,%1,BCAST9,Chi_30)
-		   VMULMEM (0,%1,BCAST10,Chi_31)       
-		   VMULMEM (0,%1,BCAST11,Chi_32)
-		   : : "r" (a0), "r" (a1), "r" (a2)  );
-	} else { 
-	  asm (
-		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
-		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
-		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
-		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
-		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
-		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
-		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
-		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
-		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
-		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
-		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
-		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
-		   : : "r" (a0), "r" (a1), "r" (a2)  );
-	}
-	a0 = a0+incr;
-	a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type);
-      }}
-    {
-      int lexa = s1+LLs*site;
-      asm (
-	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
-	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
-	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
-	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
-	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-    }
-  }
-  }
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-};
-
-  // Z-mobius version
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
-					     int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
-{
-#ifndef AVX512
-  {
-  SiteHalfSpinor BcastP;
-  SiteHalfSpinor BcastM;
-  SiteHalfSpinor SiteChiP;
-  SiteHalfSpinor SiteChiM;
-
-  // Ls*Ls * 2 * 12 * vol flops
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-        int s=s2+l*LLs;
-	int lex=s2+LLs*site;
-	
-	if ( s2==0 && l==0) {
-	  SiteChiP=zero;
-	  SiteChiM=zero;
-	}
-	
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
-	}}
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	}}
-
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
-	  SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
-	}}
-
-
-    }}
-    {
-      int lex = s1+LLs*site;
-      for(int sp=0;sp<2;sp++){
-      for(int co=0;co<Nc;co++){
-	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-      }}
-    }
-  }
-
-  }
-#else
-  {
-  // pointers
-  //  MASK_REGS;
-#define Chi_00 %zmm0
-#define Chi_01 %zmm1
-#define Chi_02 %zmm2
-#define Chi_10 %zmm3
-#define Chi_11 %zmm4
-#define Chi_12 %zmm5
-#define Chi_20 %zmm6
-#define Chi_21 %zmm7
-#define Chi_22 %zmm8
-#define Chi_30 %zmm9
-#define Chi_31 %zmm10
-#define Chi_32 %zmm11
-#define pChi_00 %%zmm0
-#define pChi_01 %%zmm1
-#define pChi_02 %%zmm2
-#define pChi_10 %%zmm3
-#define pChi_11 %%zmm4
-#define pChi_12 %%zmm5
-#define pChi_20 %%zmm6
-#define pChi_21 %%zmm7
-#define pChi_22 %%zmm8
-#define pChi_30 %%zmm9
-#define pChi_31 %%zmm10
-#define pChi_32 %%zmm11
-
-#define BCAST_00   %zmm12
-#define  SHUF_00   %zmm13
-#define BCAST_01   %zmm14
-#define  SHUF_01   %zmm15
-#define BCAST_02   %zmm16
-#define  SHUF_02   %zmm17
-#define BCAST_10   %zmm18
-#define  SHUF_10   %zmm19
-#define BCAST_11   %zmm20
-#define  SHUF_11   %zmm21
-#define BCAST_12   %zmm22
-#define  SHUF_12   %zmm23
-
-#define Mp  %zmm24
-#define Mps %zmm25
-#define Mm  %zmm26
-#define Mms %zmm27
-#define N 8
-  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      int lex=s2+LLs*site;
-      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-      uint64_t a2 = (uint64_t)&psi[lex];
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	if ( (s2+l)==0 ) {
-	  LOAD64(%r8,a0);
-	  LOAD64(%r9,a1);
-	  LOAD64(%r10,a2);
-	  asm (
-	       VLOAD(0,%r8,Mp)// i r
-	       VLOAD(0,%r9,Mm)
-	       VSHUF(Mp,Mps)  // r i 
-	       VSHUF(Mm,Mms)
-	       VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
-	       VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
-
-	       VMULIDUP(0*N,%r10,Mps,Chi_00)
-	       VMULIDUP(1*N,%r10,Mps,Chi_01)
-	       VMULIDUP(2*N,%r10,Mps,Chi_02)
-	       VMULIDUP(3*N,%r10,Mps,Chi_10)
-	       VMULIDUP(4*N,%r10,Mps,Chi_11)
-	       VMULIDUP(5*N,%r10,Mps,Chi_12)
-
-	       VMULIDUP(6*N ,%r10,Mms,Chi_20)
-	       VMULIDUP(7*N ,%r10,Mms,Chi_21)
-	       VMULIDUP(8*N ,%r10,Mms,Chi_22)
-	       VMULIDUP(9*N ,%r10,Mms,Chi_30)
-	       VMULIDUP(10*N,%r10,Mms,Chi_31)
-	       VMULIDUP(11*N,%r10,Mms,Chi_32)
-
-	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
-	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
-	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-	       VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
-	       VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
-	       VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
-	       VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
-	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-	       );
-	} else { 
-	  LOAD64(%r8,a0);
-	  LOAD64(%r9,a1);
-	  LOAD64(%r10,a2);
-	  asm (
-	       VLOAD(0,%r8,Mp)
-	       VSHUF(Mp,Mps)
-
-	       VLOAD(0,%r9,Mm)
-	       VSHUF(Mm,Mms)
-
-	       VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
-	       VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
-	       VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
-	       VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
-	       VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
-	       VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
-
-	       VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
-	       VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
-	       VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
-	       VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
-	       VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
-	       VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
-
-	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
-	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
-	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-	       VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
-	       VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
-	       VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
-	       VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
-	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-	       );
-	}
-	a0 = a0+incr;
-	a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type);
-      }}
-    {
-      int lexa = s1+LLs*site;
-      /*
-      SiteSpinor tmp;
-      asm (
-	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	       : : "r" ((uint64_t)&tmp) : "memory" );
-      */
-
-      asm (
-	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      //      if ( 1 || (site==0) ) { 
-      //	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
-      //      }
-    }
-  }
-  }
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-};
-
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
-{
-  int Ls=this->Ls;
-  int LLs = psi._grid->_rdimensions[0];
-  int vol = psi._grid->oSites()/LLs;
-
-  chi.checkerboard=psi.checkerboard;
-  
-  Vector<iSinglet<Simd> >  Matp;
-  Vector<iSinglet<Simd> >  Matm;
-  Vector<iSinglet<Simd> >  *_Matp;
-  Vector<iSinglet<Simd> >  *_Matm;
-  
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if ( inv && dag ) { 
-    _Matp = &MatpInvDag;
-    _Matm = &MatmInvDag;
-  }
-  if ( inv && (!dag) ) { 
-    _Matp = &MatpInv;
-    _Matm = &MatmInv;
-  } 
-  if ( !inv ) {
-    MooeeInternalCompute(dag,inv,Matp,Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-  assert(_Matp->size()==Ls*LLs);
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  if ( switcheroo<Coeff_t>::iscomplex() ) {
-    parallel_for(auto site=0;site<vol;site++){
-      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
-    }
-  } else { 
-    parallel_for(auto site=0;site<vol;site++){
-      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
-    }
-  }
-  MooeeInvTime+=usecond();
-}
-
-INSTANTIATE_DPERP(DomainWallVec5dImplD);
-INSTANTIATE_DPERP(DomainWallVec5dImplF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplFH);
-
-template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-template void CayleyFermion5D<DomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-
-
-}}
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -1,323 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
-
-namespace Grid {
-  namespace QCD {
-
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
-    {
-      SetCoefficientsZolotarev(1.0/scale,zdata);
-    }
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
-    {
-      // How to check Ls matches??
-      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-      int Ls = this->Ls;
-      assert(zdata->db==Ls);// Beta has Ls coeffs
-
-      R=(1+this->mass)/(1-this->mass);
-
-      Beta.resize(Ls);
-      cc.resize(Ls);
-      cc_d.resize(Ls);
-      sqrt_cc.resize(Ls);
-      for(int i=0; i < Ls ; i++){
-	Beta[i] = zdata -> beta[i];
-	cc[i] = 1.0/Beta[i];
-	cc_d[i]=sqrt(cc[i]);
-      }
-    
-      cc_d[Ls-1]=1.0;
-      for(int i=0; i < Ls-1 ; i++){
-	sqrt_cc[i]= sqrt(cc[i]*cc[i+1]);
-      }    
-      sqrt_cc[Ls-2]=sqrt(cc[Ls-2]);
-
-
-      ZoloHiInv =1.0/zolo_hi;
-      dw_diag = (4.0-this->M5)*ZoloHiInv;
-    
-      See.resize(Ls);
-      Aee.resize(Ls);
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	Aee[s] = sign * Beta[s] * dw_diag;
-	sign   = - sign;
-      }
-      Aee[Ls-1] += R;
-    
-      See[0] = Aee[0];
-      for(int s=1;s<Ls;s++){
-	See[s] = Aee[s] - 1.0/See[s-1];
-      }
-      for(int s=0;s<Ls;s++){
-	std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
-      }
-    }
-
-
-
-    template<class Impl>
-    RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      FermionField D(psi._grid);
-
-      this->DW(psi,D,DaggerNo); 
-
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==0 ) {
-	  ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-	} else if ( s==(Ls-1) ){
-	  RealD R=(1.0+mass)/(1.0-mass);
-	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
-	  ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
-  	  axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-	}
-	sign=-sign; 
-      }
-      return norm2(chi);
-    }
-    template<class Impl>
-    RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
-    {
-      // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
-      // The rest of matrix is symmetric.
-      // Can ignore "dag"
-      return M(psi,chi);
-    }
-    template<class Impl>
-    void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-      int Ls = this->Ls;
-
-      this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
-
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==(Ls-1) ){
-	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-	}
-	sign=-sign; 
-      }
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      // Apply 4d dslash
-      if ( psi.checkerboard == Odd ) {
-	this->DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
-      } else {
-	this->DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
-      }
-      
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==(Ls-1) ){
-	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-	}
-	sign=-sign; 
-      }
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-    {
-      this->Meooe(psi,chi);
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==0 ) {
-	  ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-	} else if ( s==(Ls-1) ){
-	  // Drop the CC here.
-	  double R=(1+mass)/(1-mass);
-	  ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
-	  ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
-	  axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-	}
-	sign=-sign; 
-      }
-    }
-
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-    {
-      this->Mooee(psi,chi);
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      // Apply Linv
-      axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); 
-      for(int s=1;s<Ls;s++){
-	axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
-      }
-      // Apply Dinv
-      for(int s=0;s<Ls;s++){
-	ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
-      }
-      // Apply Uinv = (Linv)^T
-      axpby_ssp(chi,1.0/cc_d[Ls-1],chi,0.0,chi,Ls-1,Ls-1);
-      for(int s=Ls-2;s>=0;s--){
-	axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
-      }
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-    {
-      this->MooeeInv(psi,chi);
-    }
-
-  // force terms; five routines; default to Dhop on diagonal
-    template<class Impl>
-   void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int sign=1;
-    for(int s=0;s<Ls;s++){
-      if ( s==(Ls-1) ){
-	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-      } else {
-	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-      }
-      sign=-sign; 
-    }
-    this->DhopDeriv(mat,D,V,DaggerNo); 
-  };
-    template<class Impl>
-   void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int sign=1;
-    for(int s=0;s<Ls;s++){
-      if ( s==(Ls-1) ){
-	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-      } else {
-	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-      }
-      sign=-sign; 
-    }
-    this->DhopDerivOE(mat,D,V,DaggerNo); 
-  };
-  template<class Impl>
-  void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int sign=1;
-    for(int s=0;s<Ls;s++){
-      if ( s==(Ls-1) ){
-	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-      } else {
-	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-      }
-      sign=-sign; 
-    }
-    this->DhopDerivEO(mat,D,V,DaggerNo); 
-  };
-    
-    // Constructors
-    template<class Impl>
-    ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
-							   GaugeField &_Umu,
-							   GridCartesian         &FiveDimGrid,
-							   GridRedBlackCartesian &FiveDimRedBlackGrid,
-							   GridCartesian         &FourDimGrid,
-							   GridRedBlackCartesian &FourDimRedBlackGrid,
-							   RealD _mass,RealD M5,const ImplParams &p) :
-      WilsonFermion5D<Impl>(_Umu,
-			    FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid,M5,p),
-      mass(_mass)
-    {
-      int Ls = this->Ls;
-      assert((Ls&0x1)==1); // Odd Ls required
-    }
-
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      int Ls = this->Ls;
-      conformable(solution5d._grid,this->FermionGrid());
-      conformable(exported4d._grid,this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
-    }
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      int Ls = this->Ls;
-      conformable(imported5d._grid,this->FermionGrid());
-      conformable(input4d._grid   ,this->GaugeGrid());
-      FermionField tmp(this->FermionGrid());
-      tmp=zero;
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
-      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
-      this->Dminus(tmp,imported5d);
-    }
-
-    FermOpTemplateInstantiate(ContinuedFractionFermion5D);
-
-  }
-}
-
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,46 +24,44 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_CONTINUED_FRACTION_H
 #define  GRID_QCD_CONTINUED_FRACTION_H

 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out);
+  virtual RealD  Mdag (const FermionField &in, FermionField &out);

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  // half checkerboard operaions
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);

-      // half checkerboard operaions
-      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  //      virtual void   Instantiatable(void)=0;
+  virtual void   Instantiatable(void) =0;

-      //      virtual void   Instantiatable(void)=0;
-      virtual void   Instantiatable(void) =0;
-
-      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  // Efficient support for multigrid coarsening
+  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      ///////////////////////////////////////////////////////////////
      // Physical surface field utilities
@@ -73,35 +71,34 @@ namespace Grid {
      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

-      // Constructors
-      ContinuedFractionFermion5D(GaugeField &_Umu,
-				 GridCartesian         &FiveDimGrid,
-				 GridRedBlackCartesian &FiveDimRedBlackGrid,
-				 GridCartesian         &FourDimGrid,
-				 GridRedBlackCartesian &FourDimRedBlackGrid,
-				 RealD _mass,RealD M5,const ImplParams &p= ImplParams());
+  // Constructors
+  ContinuedFractionFermion5D(GaugeField &_Umu,
+			     GridCartesian         &FiveDimGrid,
+			     GridRedBlackCartesian &FiveDimRedBlackGrid,
+			     GridCartesian         &FourDimGrid,
+			     GridRedBlackCartesian &FourDimRedBlackGrid,
+			     RealD _mass,RealD M5,const ImplParams &p= ImplParams());

-    protected:
+protected:

-      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
-      void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);;
+  void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
+  void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);;

-      // Cont frac
-      RealD dw_diag;
-      RealD mass;
-      RealD R;
-      RealD ZoloHiInv;
-      std::vector<double> Beta;
-      std::vector<double> cc;;
-      std::vector<double> cc_d;;
-      std::vector<double> sqrt_cc;
-      std::vector<double> See;
-      std::vector<double> Aee;
+  // Cont frac
+  RealD dw_diag;
+  RealD mass;
+  RealD R;
+  RealD ZoloHiInv;
+  Vector<double> Beta;
+  Vector<double> cc;;
+  Vector<double> cc_d;;
+  Vector<double> sqrt_cc;
+  Vector<double> See;
+  Vector<double> Aee;

-    };
+};


-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
@@ -1,438 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    template<class Impl>
-    DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
-      GaugeField            &_Umu,
-      GridCartesian         &FiveDimGrid,
-      GridRedBlackCartesian &FiveDimRedBlackGrid,
-      GridCartesian         &FourDimGrid,
-      GridRedBlackCartesian &FourDimRedBlackGrid,
-      RealD _mq1, RealD _mq2, RealD _mq3,
-      RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
-    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-        _shift, _pm, _M5, 1.0, 0.0, p)
-    {
-        RealD eps = 1.0;
-        Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
-        assert(zdata->n == this->Ls);
-
-        std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-
-        Approx::zolotarev_free(zdata);
-    }
-
-    /***************************************************************
-     * Additional EOFA operators only called outside the inverter.
-     * Since speed is not essential, simple axpby-style
-     * implementations should be fine.
-     ***************************************************************/
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-    {
-        int Ls = this->Ls;
-
-        Din = zero;
-        if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
-        else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-        else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
-        else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-    }
-
-    // This is just the identity for DWF
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-    // This is just the identity for DWF
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-    /*****************************************************************************************************/
-
-    template<class Impl>
-    RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        FermionField Din(psi._grid);
-
-        this->Meooe5D(psi, Din);
-        this->DW(Din, chi, DaggerNo);
-        axpby(chi, 1.0, 1.0, chi, psi);
-        this->M5D(psi, chi);
-        return(norm2(chi));
-    }
-
-    template<class Impl>
-    RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        FermionField Din(psi._grid);
-
-        this->DW(psi, Din, DaggerYes);
-        this->MeooeDag5D(Din, chi);
-        this->M5Ddag(psi, chi);
-        axpby(chi, 1.0, 1.0, chi, psi);
-        return(norm2(chi));
-    }
-
-    /********************************************************************
-     * Performance critical fermion operators called inside the inverter
-     ********************************************************************/
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD shift = this->shift;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-
-        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-        Coeff_t shiftp(0.0), shiftm(0.0);
-        if(shift != 0.0){
-          if(pm == 1){ shiftp = shift*(mq3-mq2); }
-          else{ shiftm = -shift*(mq3-mq2); }
-        }
-
-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
-
-        #if(0)
-            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-            for(int i=0; i<diag.size(); ++i){
-                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-            }
-            for(int i=0; i<upper.size(); ++i){
-                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-            }
-            for(int i=0; i<lower.size(); ++i){
-                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-            }
-        #endif
-
-        this->M5D(psi, chi, chi, lower, diag, upper);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD shift = this->shift;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-
-        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-        Coeff_t shiftp(0.0), shiftm(0.0);
-        if(shift != 0.0){
-          if(pm == 1){ shiftp = shift*(mq3-mq2); }
-          else{ shiftm = -shift*(mq3-mq2); }
-        }
-
-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
-
-        #if(0)
-            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-            for(int i=0; i<diag.size(); ++i){
-                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-            }
-            for(int i=0; i<upper.size(); ++i){
-                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-            }
-            for(int i=0; i<lower.size(); ++i){
-                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-            }
-        #endif
-
-        this->M5Ddag(psi, chi, chi, lower, diag, upper);
-    }
-
-    // half checkerboard operations
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        std::vector<Coeff_t> diag = this->bee;
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
-
-        for(int s=0; s<Ls; s++){
-          upper[s] = -this->cee[s];
-          lower[s] = -this->cee[s];
-        }
-        upper[Ls-1] = this->dm;
-        lower[0]    = this->dp;
-
-        this->M5D(psi, psi, chi, lower, diag, upper);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        std::vector<Coeff_t> diag = this->bee;
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
-
-        for(int s=0; s<Ls; s++){
-          upper[s] = -this->cee[s];
-          lower[s] = -this->cee[s];
-        }
-        upper[Ls-1] = this->dp;
-        lower[0]    = this->dm;
-
-        this->M5Ddag(psi, psi, chi, lower, diag, upper);
-    }
-
-    /****************************************************************************************/
-
-    //Zolo
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
-    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-        RealD shift = this->shift;
-
-        ////////////////////////////////////////////////////////
-        // Constants for the preconditioned matrix Cayley form
-        ////////////////////////////////////////////////////////
-        this->bs.resize(Ls);
-        this->cs.resize(Ls);
-        this->aee.resize(Ls);
-        this->aeo.resize(Ls);
-        this->bee.resize(Ls);
-        this->beo.resize(Ls);
-        this->cee.resize(Ls);
-        this->ceo.resize(Ls);
-
-        for(int i=0; i<Ls; ++i){
-          this->bee[i] = 4.0 - this->M5 + 1.0;
-          this->cee[i] = 1.0;
-        }
-
-        for(int i=0; i<Ls; ++i){
-          this->aee[i] = this->cee[i];
-          this->bs[i] = this->beo[i] = 1.0;
-          this->cs[i] = this->ceo[i] = 0.0;
-        }
-
-        //////////////////////////////////////////
-        // EOFA shift terms
-        //////////////////////////////////////////
-        if(pm == 1){
-          this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-          this->dm = mq1*this->cee[Ls-1];
-        } else if(this->pm == -1) {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-        } else {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1];
-        }
-
-        //////////////////////////////////////////
-        // LDU decomposition of eeoo
-        //////////////////////////////////////////
-        this->dee.resize(Ls+1);
-        this->lee.resize(Ls);
-        this->leem.resize(Ls);
-        this->uee.resize(Ls);
-        this->ueem.resize(Ls);
-
-        for(int i=0; i<Ls; ++i){
-
-          if(i < Ls-1){
-
-            this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-            this->leem[i] = this->dm/this->bee[i];
-            for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-            this->dee[i] = this->bee[i];
-
-            this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-            this->ueem[i] = this->dp / this->bee[0];
-            for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
-          } else {
-
-            this->lee[i]  = 0.0;
-            this->leem[i] = 0.0;
-            this->uee[i]  = 0.0;
-            this->ueem[i] = 0.0;
-
-          }
-        }
-
-        {
-          Coeff_t delta_d = 1.0 / this->bee[0];
-          for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-          this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-          this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-        }
-
-        int inv = 1;
-        this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-        this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-    }
-
-    // Recompute Cayley-form coefficients for different shift
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-    {
-        this->shift = new_shift;
-        Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-        Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        int Ls = this->Ls;
-
-        GridBase* grid = this->FermionRedBlackGrid();
-        int LLs = grid->_rdimensions[0];
-
-        if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-        for(int s=0; s<Ls; s++){
-            Pplus(s,s)  = this->bee[s];
-            Pminus(s,s) = this->bee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pminus(s,s+1) = -this->cee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pplus(s+1,s) = -this->cee[s+1];
-        }
-
-        Pplus (0,Ls-1) = this->dp;
-        Pminus(Ls-1,0) = this->dm;
-
-        Eigen::MatrixXcd PplusMat ;
-        Eigen::MatrixXcd PminusMat;
-
-        #if(0)
-            std::cout << GridLogMessage << "Pplus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pplus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-            std::cout << GridLogMessage << "Pminus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pminus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-        #endif
-
-        if(inv) {
-            PplusMat  = Pplus.inverse();
-            PminusMat = Pminus.inverse();
-        } else {
-            PplusMat  = Pplus;
-            PminusMat = Pminus;
-        }
-
-        if(dag){
-            PplusMat.adjointInPlace();
-            PminusMat.adjointInPlace();
-        }
-
-        typedef typename SiteHalfSpinor::scalar_type scalar_type;
-        const int Nsimd = Simd::Nsimd();
-        Matp.resize(Ls*LLs);
-        Matm.resize(Ls*LLs);
-
-        for(int s2=0; s2<Ls; s2++){
-        for(int s1=0; s1<LLs; s1++){
-            int istride = LLs;
-            int ostride = 1;
-            Simd Vp;
-            Simd Vm;
-            scalar_type *sp = (scalar_type*) &Vp;
-            scalar_type *sm = (scalar_type*) &Vm;
-            for(int l=0; l<Nsimd; l++){
-                if(switcheroo<Coeff_t>::iscomplex()) {
-                    sp[l] = PplusMat (l*istride+s1*ostride,s2);
-                    sm[l] = PminusMat(l*istride+s1*ostride,s2);
-                } else {
-                    // if real
-                    scalar_type tmp;
-                    tmp = PplusMat (l*istride+s1*ostride,s2);
-                    sp[l] = scalar_type(tmp.real(),tmp.real());
-                    tmp = PminusMat(l*istride+s1*ostride,s2);
-                    sm[l] = scalar_type(tmp.real(),tmp.real());
-                }
-            }
-            Matp[LLs*s2+s1] = Vp;
-            Matm[LLs*s2+s1] = Vm;
-        }}
-    }
-
-    FermOpTemplateInstantiate(DomainWallEOFAFermion);
-    GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
@@ -26,90 +26,65 @@ with this program; if not, write to the Free Software Foundation, Inc.,

 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
-/*  END LEGAL */
-#ifndef  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
-#define  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
+			   /*  END LEGAL */
+#pragma once

 #include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  template<class Impl>
-  class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
-  {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+template<class Impl>
+class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    public:
-      // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
-      // for red-black preconditioned Shamir EOFA
-      Coeff_t dm;
-      Coeff_t dp;
+public:
+  // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
+  // for red-black preconditioned Shamir EOFA
+  Coeff_t dm;
+  Coeff_t dp;

-      virtual void Instantiatable(void) {};
+  virtual void Instantiatable(void) {};

-      // EOFA-specific operations
-      virtual void  Omega      (const FermionField& in, FermionField& out, int sign, int dag);
-      virtual void  Dtilde     (const FermionField& in, FermionField& out);
-      virtual void  DtildeInv  (const FermionField& in, FermionField& out);
+  // EOFA-specific operations
+  virtual void  Omega      (const FermionField& in, FermionField& out, int sign, int dag);
+  virtual void  Dtilde     (const FermionField& in, FermionField& out);
+  virtual void  DtildeInv  (const FermionField& in, FermionField& out);

-      // override multiply
-      virtual RealD M          (const FermionField& in, FermionField& out);
-      virtual RealD Mdag       (const FermionField& in, FermionField& out);
+  // override multiply
+  virtual RealD M          (const FermionField& in, FermionField& out);
+  virtual RealD Mdag       (const FermionField& in, FermionField& out);

-      // half checkerboard operations
-      virtual void  Mooee      (const FermionField& in, FermionField& out);
-      virtual void  MooeeDag   (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv   (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag(const FermionField& in, FermionField& out);
+  // half checkerboard operations
+  virtual void  Mooee      (const FermionField& in, FermionField& out);
+  virtual void  MooeeDag   (const FermionField& in, FermionField& out);
+  virtual void  MooeeInv   (const FermionField& in, FermionField& out);
+  virtual void  MooeeInvDag(const FermionField& in, FermionField& out);

-      virtual void   M5D       (const FermionField& psi, FermionField& chi);
-      virtual void   M5Ddag    (const FermionField& psi, FermionField& chi);
+  virtual void   M5D       (const FermionField& psi, FermionField& chi);
+  virtual void   M5Ddag    (const FermionField& psi, FermionField& chi);

-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  /////////////////////////////////////////////////////
+  // Instantiate different versions depending on Impl
+  /////////////////////////////////////////////////////
+  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
+  virtual void RefreshShiftCoefficients(RealD new_shift);

-      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+  // Constructors
+  DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+			GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+			RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
+			RealD _M5, const ImplParams& p=ImplParams());

-      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+protected:
+  void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
+};

-      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+NAMESPACE_END(Grid);

-      virtual void RefreshShiftCoefficients(RealD new_shift);
-
-      // Constructors
-      DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
-        RealD _M5, const ImplParams& p=ImplParams());
-
-    protected:
-      void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
-  };
-}}
-
-#define INSTANTIATE_DPERP_DWF_EOFA(A)\
-template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
-template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
-
-#undef  DOMAIN_WALL_EOFA_DPERP_DENSE
-#define DOMAIN_WALL_EOFA_DPERP_CACHE
-#undef  DOMAIN_WALL_EOFA_DPERP_LINALG
-#define DOMAIN_WALL_EOFA_DPERP_VEC
-
-#endif
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
@@ -1,248 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-    // Pminus fowards
-    // Pplus  backwards..
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        int Ls = this->Ls;
-        GridBase* grid = psi._grid;
-
-        assert(phi.checkerboard == psi.checkerboard);
-        chi.checkerboard = psi.checkerboard;
-        // Flops = 6.0*(Nc*Ns) *Ls*vol
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-            for(int s=0; s<Ls; s++){
-                auto tmp = psi._odata[0];
-                if(s==0) {
-                    spProj5m(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+Ls-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else if(s==(Ls-1)) {
-                    spProj5m(tmp, psi._odata[ss+0]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else {
-                    spProj5m(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                }
-            }
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        int Ls = this->Ls;
-        GridBase* grid = psi._grid;
-        assert(phi.checkerboard == psi.checkerboard);
-        chi.checkerboard=psi.checkerboard;
-
-        // Flops = 6.0*(Nc*Ns) *Ls*vol
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-            auto tmp = psi._odata[0];
-            for(int s=0; s<Ls; s++){
-                if(s==0) {
-                    spProj5p(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5m(tmp, psi._odata[ss+Ls-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else if(s==(Ls-1)) {
-                    spProj5p(tmp, psi._odata[ss+0]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5m(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else {
-                    spProj5p(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5m(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                }
-            }
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        GridBase* grid = psi._grid;
-        int Ls = this->Ls;
-
-        chi.checkerboard = psi.checkerboard;
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-
-            auto tmp1 = psi._odata[0];
-            auto tmp2 = psi._odata[0];
-
-            // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-            // Apply (L^{\prime})^{-1}
-            chi[ss] = psi[ss]; // chi[0]=psi[0]
-            for(int s=1; s<Ls; s++){
-                spProj5p(tmp1, chi[ss+s-1]);
-                chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-            }
-
-            // L_m^{-1}
-            for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-                spProj5m(tmp1, chi[ss+s]);
-                chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-            }
-
-            // U_m^{-1} D^{-1}
-            for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-                spProj5p(tmp1, chi[ss+Ls-1]);
-                chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
-            }
-            spProj5m(tmp2, chi[ss+Ls-1]);
-            chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
-
-            // Apply U^{-1}
-            for(int s=Ls-2; s>=0; s--){
-                spProj5m(tmp1, chi[ss+s+1]);
-                chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        GridBase* grid = psi._grid;
-        int Ls = this->Ls;
-
-        assert(psi.checkerboard == psi.checkerboard);
-        chi.checkerboard = psi.checkerboard;
-
-        std::vector<Coeff_t> ueec(Ls);
-        std::vector<Coeff_t> deec(Ls+1);
-        std::vector<Coeff_t> leec(Ls);
-        std::vector<Coeff_t> ueemc(Ls);
-        std::vector<Coeff_t> leemc(Ls);
-
-        for(int s=0; s<ueec.size(); s++){
-            ueec[s]  = conjugate(this->uee[s]);
-            deec[s]  = conjugate(this->dee[s]);
-            leec[s]  = conjugate(this->lee[s]);
-            ueemc[s] = conjugate(this->ueem[s]);
-            leemc[s] = conjugate(this->leem[s]);
-        }
-        deec[Ls] = conjugate(this->dee[Ls]);
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-
-            auto tmp1 = psi._odata[0];
-            auto tmp2 = psi._odata[0];
-
-            // Apply (U^{\prime})^{-dagger}
-            chi[ss] = psi[ss];
-            for(int s=1; s<Ls; s++){
-                spProj5m(tmp1, chi[ss+s-1]);
-                chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-            }
-
-            // U_m^{-\dagger}
-            for(int s=0; s<Ls-1; s++){
-                spProj5p(tmp1, chi[ss+s]);
-                chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-            }
-
-            // L_m^{-\dagger} D^{-dagger}
-            for(int s=0; s<Ls-1; s++){
-                spProj5m(tmp1, chi[ss+Ls-1]);
-                chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-            }
-            spProj5p(tmp2, chi[ss+Ls-1]);
-            chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-            // Apply L^{-dagger}
-            for(int s=Ls-2; s>=0; s--){
-                spProj5p(tmp1, chi[ss+s+1]);
-                chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
@@ -1,159 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-    * Dense matrix versions of routines
-    */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-    {
-        int Ls = this->Ls;
-        int LLs = psi._grid->_rdimensions[0];
-        int vol = psi._grid->oSites()/LLs;
-
-        chi.checkerboard = psi.checkerboard;
-
-        assert(Ls==LLs);
-
-        Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-        Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
-
-        for(int s=0;s<Ls;s++){
-            Pplus(s,s)  = this->bee[s];
-            Pminus(s,s) = this->bee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pminus(s,s+1) = -this->cee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pplus(s+1,s) = -this->cee[s+1];
-        }
-
-        Pplus (0,Ls-1) = this->dp;
-        Pminus(Ls-1,0) = this->dm;
-
-        Eigen::MatrixXd PplusMat ;
-        Eigen::MatrixXd PminusMat;
-
-        if(inv) {
-            PplusMat  = Pplus.inverse();
-            PminusMat = Pminus.inverse();
-        } else {
-            PplusMat  = Pplus;
-            PminusMat = Pminus;
-        }
-
-        if(dag){
-            PplusMat.adjointInPlace();
-            PminusMat.adjointInPlace();
-        }
-
-        // For the non-vectorised s-direction this is simple
-
-        for(auto site=0; site<vol; site++){
-
-            SiteSpinor     SiteChi;
-            SiteHalfSpinor SitePplus;
-            SiteHalfSpinor SitePminus;
-
-            for(int s1=0; s1<Ls; s1++){
-                SiteChi = zero;
-                for(int s2=0; s2<Ls; s2++){
-                    int lex2 = s2 + Ls*site;
-                    if(PplusMat(s1,s2) != 0.0){
-                        spProj5p(SitePplus,psi[lex2]);
-                        accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
-                    }
-                    if(PminusMat(s1,s2) != 0.0){
-                        spProj5m(SitePminus, psi[lex2]);
-                        accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
-                    }
-                }
-                chi[s1+Ls*site] = SiteChi*0.5;
-            }
-        }
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
-
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-        template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-        template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
@@ -1,168 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-    // Pminus fowards
-    // Pplus  backwards
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        Coeff_t one(1.0);
-        int Ls = this->Ls;
-        for(int s=0; s<Ls; s++){
-            if(s==0) {
-              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-              axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
-            } else if (s==(Ls-1)) {
-              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
-              axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
-            } else {
-              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-              axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
-            }
-        }
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        Coeff_t one(1.0);
-        int Ls = this->Ls;
-        for(int s=0; s<Ls; s++){
-            if(s==0) {
-              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
-            } else if (s==(Ls-1)) {
-              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
-              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-            } else {
-              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-            }
-        }
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        Coeff_t one(1.0);
-        Coeff_t czero(0.0);
-        chi.checkerboard = psi.checkerboard;
-        int Ls = this->Ls;
-
-        FermionField tmp(psi._grid);
-
-        // Apply (L^{\prime})^{-1}
-        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-        for(int s=1; s<Ls; s++){
-            axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-        }
-
-        // L_m^{-1}
-        for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-            axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-        }
-
-        // U_m^{-1} D^{-1}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
-        }
-        axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
-        axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);
-
-        // Apply U^{-1}
-        for(int s=Ls-2; s>=0; s--){
-            axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
-        }
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        Coeff_t one(1.0);
-        Coeff_t czero(0.0);
-        chi.checkerboard = psi.checkerboard;
-        int Ls = this->Ls;
-
-        FermionField tmp(psi._grid);
-
-        // Apply (U^{\prime})^{-dagger}
-        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-        for(int s=1; s<Ls; s++){
-            axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
-        }
-
-        // U_m^{-\dagger}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-        }
-
-        // L_m^{-\dagger} D^{-dagger}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-        }
-        axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
-        axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);
-
-        // Apply L^{-dagger}
-        for(int s=Ls-2; s>=0; s--){
-            axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
-        }
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@@ -1,605 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-    * Dense matrix versions of routines
-    */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        GridBase* grid = psi._grid;
-        int Ls  = this->Ls;
-        int LLs = grid->_rdimensions[0];
-        const int nsimd = Simd::Nsimd();
-
-        Vector<iSinglet<Simd> > u(LLs);
-        Vector<iSinglet<Simd> > l(LLs);
-        Vector<iSinglet<Simd> > d(LLs);
-
-        assert(Ls/LLs == nsimd);
-        assert(phi.checkerboard == psi.checkerboard);
-
-        chi.checkerboard = psi.checkerboard;
-
-        // just directly address via type pun
-        typedef typename Simd::scalar_type scalar_type;
-        scalar_type* u_p = (scalar_type*) &u[0];
-        scalar_type* l_p = (scalar_type*) &l[0];
-        scalar_type* d_p = (scalar_type*) &d[0];
-
-        for(int o=0;o<LLs;o++){ // outer
-        for(int i=0;i<nsimd;i++){ //inner
-            int s  = o + i*LLs;
-            int ss = o*nsimd + i;
-            u_p[ss] = upper[s];
-            l_p[ss] = lower[s];
-            d_p[ss] = diag[s];
-        }}
-
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        assert(Nc == 3);
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-            #if 0
-
-                alignas(64) SiteHalfSpinor hp;
-                alignas(64) SiteHalfSpinor hm;
-                alignas(64) SiteSpinor fp;
-                alignas(64) SiteSpinor fm;
-
-                for(int v=0; v<LLs; v++){
-
-                    int vp = (v+1)%LLs;
-                    int vm = (v+LLs-1)%LLs;
-
-                    spProj5m(hp, psi[ss+vp]);
-                    spProj5p(hm, psi[ss+vm]);
-
-                    if (vp <= v){ rotate(hp, hp, 1); }
-                    if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-                    hp = 0.5*hp;
-                    hm = 0.5*hm;
-
-                    spRecon5m(fp, hp);
-                    spRecon5p(fm, hm);
-
-                    chi[ss+v] = d[v]*phi[ss+v];
-                    chi[ss+v] = chi[ss+v] + u[v]*fp;
-                    chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-                }
-
-            #else
-
-                for(int v=0; v<LLs; v++){
-
-                    vprefetch(psi[ss+v+LLs]);
-
-                    int vp = (v==LLs-1) ? 0     : v+1;
-                    int vm = (v==0)     ? LLs-1 : v-1;
-
-                    Simd hp_00 = psi[ss+vp]()(2)(0);
-                    Simd hp_01 = psi[ss+vp]()(2)(1);
-                    Simd hp_02 = psi[ss+vp]()(2)(2);
-                    Simd hp_10 = psi[ss+vp]()(3)(0);
-                    Simd hp_11 = psi[ss+vp]()(3)(1);
-                    Simd hp_12 = psi[ss+vp]()(3)(2);
-
-                    Simd hm_00 = psi[ss+vm]()(0)(0);
-                    Simd hm_01 = psi[ss+vm]()(0)(1);
-                    Simd hm_02 = psi[ss+vm]()(0)(2);
-                    Simd hm_10 = psi[ss+vm]()(1)(0);
-                    Simd hm_11 = psi[ss+vm]()(1)(1);
-                    Simd hm_12 = psi[ss+vm]()(1)(2);
-
-                    if(vp <= v){
-                        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-                        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-                        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-                        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-                        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-                        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-                    }
-
-                    if(vm >= v){
-                        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-                        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-                        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-                        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-                        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-                        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-                    }
-
-                    // Can force these to real arithmetic and save 2x.
-                    Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-                    Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-                    Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-                    Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-                    Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-                    Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-                    Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-                    Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-                    Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-                    Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-                    Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-                    Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-                    vstream(chi[ss+v]()(0)(0), p_00);
-                    vstream(chi[ss+v]()(0)(1), p_01);
-                    vstream(chi[ss+v]()(0)(2), p_02);
-                    vstream(chi[ss+v]()(1)(0), p_10);
-                    vstream(chi[ss+v]()(1)(1), p_11);
-                    vstream(chi[ss+v]()(1)(2), p_12);
-                    vstream(chi[ss+v]()(2)(0), p_20);
-                    vstream(chi[ss+v]()(2)(1), p_21);
-                    vstream(chi[ss+v]()(2)(2), p_22);
-                    vstream(chi[ss+v]()(3)(0), p_30);
-                    vstream(chi[ss+v]()(3)(1), p_31);
-                    vstream(chi[ss+v]()(3)(2), p_32);
-                }
-
-            #endif
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        GridBase* grid = psi._grid;
-        int Ls  = this->Ls;
-        int LLs = grid->_rdimensions[0];
-        int nsimd = Simd::Nsimd();
-
-        Vector<iSinglet<Simd> > u(LLs);
-        Vector<iSinglet<Simd> > l(LLs);
-        Vector<iSinglet<Simd> > d(LLs);
-
-        assert(Ls/LLs == nsimd);
-        assert(phi.checkerboard == psi.checkerboard);
-
-        chi.checkerboard = psi.checkerboard;
-
-        // just directly address via type pun
-        typedef typename Simd::scalar_type scalar_type;
-        scalar_type* u_p = (scalar_type*) &u[0];
-        scalar_type* l_p = (scalar_type*) &l[0];
-        scalar_type* d_p = (scalar_type*) &d[0];
-
-        for(int o=0; o<LLs; o++){ // outer
-        for(int i=0; i<nsimd; i++){ //inner
-            int s  = o + i*LLs;
-            int ss = o*nsimd + i;
-            u_p[ss] = upper[s];
-            l_p[ss] = lower[s];
-            d_p[ss] = diag[s];
-        }}
-
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-        #if 0
-
-            alignas(64) SiteHalfSpinor hp;
-            alignas(64) SiteHalfSpinor hm;
-            alignas(64) SiteSpinor fp;
-            alignas(64) SiteSpinor fm;
-
-            for(int v=0; v<LLs; v++){
-
-                int vp = (v+1)%LLs;
-                int vm = (v+LLs-1)%LLs;
-
-                spProj5p(hp, psi[ss+vp]);
-                spProj5m(hm, psi[ss+vm]);
-
-                if(vp <= v){ rotate(hp, hp, 1); }
-                if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-                hp = hp*0.5;
-                hm = hm*0.5;
-                spRecon5p(fp, hp);
-                spRecon5m(fm, hm);
-
-                chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-                chi[ss+v] = chi[ss+v]     +l[v]*fm;
-            }
-
-        #else
-
-            for(int v=0; v<LLs; v++){
-
-                vprefetch(psi[ss+v+LLs]);
-
-                int vp = (v == LLs-1) ? 0     : v+1;
-                int vm = (v == 0    ) ? LLs-1 : v-1;
-
-                Simd hp_00 = psi[ss+vp]()(0)(0);
-                Simd hp_01 = psi[ss+vp]()(0)(1);
-                Simd hp_02 = psi[ss+vp]()(0)(2);
-                Simd hp_10 = psi[ss+vp]()(1)(0);
-                Simd hp_11 = psi[ss+vp]()(1)(1);
-                Simd hp_12 = psi[ss+vp]()(1)(2);
-
-                Simd hm_00 = psi[ss+vm]()(2)(0);
-                Simd hm_01 = psi[ss+vm]()(2)(1);
-                Simd hm_02 = psi[ss+vm]()(2)(2);
-                Simd hm_10 = psi[ss+vm]()(3)(0);
-                Simd hm_11 = psi[ss+vm]()(3)(1);
-                Simd hm_12 = psi[ss+vm]()(3)(2);
-
-                if (vp <= v){
-                    hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-                    hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-                    hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-                    hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-                    hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-                    hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-                }
-
-                if(vm >= v){
-                    hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-                    hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-                    hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-                    hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-                    hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-                    hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-                }
-
-                Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-                Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-                Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-                Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-                Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-                Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-                Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-                Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-                Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-                Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-                Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-                Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-                vstream(chi[ss+v]()(0)(0), p_00);
-                vstream(chi[ss+v]()(0)(1), p_01);
-                vstream(chi[ss+v]()(0)(2), p_02);
-                vstream(chi[ss+v]()(1)(0), p_10);
-                vstream(chi[ss+v]()(1)(1), p_11);
-                vstream(chi[ss+v]()(1)(2), p_12);
-                vstream(chi[ss+v]()(2)(0), p_20);
-                vstream(chi[ss+v]()(2)(1), p_21);
-                vstream(chi[ss+v]()(2)(2), p_22);
-                vstream(chi[ss+v]()(3)(0), p_30);
-                vstream(chi[ss+v]()(3)(1), p_31);
-                vstream(chi[ss+v]()(3)(2), p_32);
-            }
-        #endif
-
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    #ifdef AVX512
-        #include<simd/Intel512common.h>
-        #include<simd/Intel512avx.h>
-        #include<simd/Intel512single.h>
-    #endif
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
-        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        #ifndef AVX512
-        {
-            SiteHalfSpinor BcastP;
-            SiteHalfSpinor BcastM;
-            SiteHalfSpinor SiteChiP;
-            SiteHalfSpinor SiteChiM;
-
-            // Ls*Ls * 2 * 12 * vol flops
-            for(int s1=0; s1<LLs; s1++){
-
-                for(int s2=0; s2<LLs; s2++){
-                for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-                    int s = s2 + l*LLs;
-                    int lex = s2 + LLs*site;
-
-                    if( s2==0 && l==0 ){
-                        SiteChiP=zero;
-                        SiteChiM=zero;
-                    }
-
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-                    }}
-
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-                    }}
-
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-                        SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-                    }}
-                }}
-
-                {
-                    int lex = s1 + LLs*site;
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-                        vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-                    }}
-                }
-            }
-
-        }
-        #else
-        {
-            // pointers
-            //  MASK_REGS;
-            #define Chi_00 %%zmm1
-            #define Chi_01 %%zmm2
-            #define Chi_02 %%zmm3
-            #define Chi_10 %%zmm4
-            #define Chi_11 %%zmm5
-            #define Chi_12 %%zmm6
-            #define Chi_20 %%zmm7
-            #define Chi_21 %%zmm8
-            #define Chi_22 %%zmm9
-            #define Chi_30 %%zmm10
-            #define Chi_31 %%zmm11
-            #define Chi_32 %%zmm12
-
-            #define BCAST0  %%zmm13
-            #define BCAST1  %%zmm14
-            #define BCAST2  %%zmm15
-            #define BCAST3  %%zmm16
-            #define BCAST4  %%zmm17
-            #define BCAST5  %%zmm18
-            #define BCAST6  %%zmm19
-            #define BCAST7  %%zmm20
-            #define BCAST8  %%zmm21
-            #define BCAST9  %%zmm22
-            #define BCAST10 %%zmm23
-            #define BCAST11 %%zmm24
-
-            int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-            for(int s1=0; s1<LLs; s1++){
-
-                for(int s2=0; s2<LLs; s2++){
-
-                    int lex = s2 + LLs*site;
-                    uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-                    uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-                    uint64_t a2 = (uint64_t) &psi[lex];
-
-                    for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-                        if((s2+l)==0) {
-                            asm(
-                                    VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-                                    VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-                                    VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-                                    VBCASTCDUP(0,%2,BCAST0)
-                                    VBCASTCDUP(1,%2,BCAST1)
-                                    VBCASTCDUP(2,%2,BCAST2)
-                                    VBCASTCDUP(3,%2,BCAST3)
-                                    VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-                                    VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-                                    VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-                                    VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-                                    VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-                                    VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-                                    VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-                                    VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-                                    VMULMEM(0,%1,BCAST8,Chi_22)
-                                    VMULMEM(0,%1,BCAST9,Chi_30)
-                                    VMULMEM(0,%1,BCAST10,Chi_31)
-                                    VMULMEM(0,%1,BCAST11,Chi_32)
-                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
-                        } else {
-                            asm(
-                                    VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-                                    VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-                                    VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-                                    VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-                                    VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-                                    VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-                                    VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-                                    VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-                                    VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-                                    VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-                                    VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-                                    VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
-                        }
-                        a0 = a0 + incr;
-                        a1 = a1 + incr;
-                        a2 = a2 + sizeof(typename Simd::scalar_type);
-                    }
-                }
-
-                {
-                  int lexa = s1+LLs*site;
-                  asm (
-                     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-                     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-                     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-                     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-                     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-                }
-            }
-        }
-
-        #undef Chi_00
-        #undef Chi_01
-        #undef Chi_02
-        #undef Chi_10
-        #undef Chi_11
-        #undef Chi_12
-        #undef Chi_20
-        #undef Chi_21
-        #undef Chi_22
-        #undef Chi_30
-        #undef Chi_31
-        #undef Chi_32
-
-        #undef BCAST0
-        #undef BCAST1
-        #undef BCAST2
-        #undef BCAST3
-        #undef BCAST4
-        #undef BCAST5
-        #undef BCAST6
-        #undef BCAST7
-        #undef BCAST8
-        #undef BCAST9
-        #undef BCAST10
-        #undef BCAST11
-        #endif
-    };
-
-    // Z-mobius version
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-        exit(-1);
-    };
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-    {
-        int Ls  = this->Ls;
-        int LLs = psi._grid->_rdimensions[0];
-        int vol = psi._grid->oSites()/LLs;
-
-        chi.checkerboard = psi.checkerboard;
-
-        Vector<iSinglet<Simd> > Matp;
-        Vector<iSinglet<Simd> > Matm;
-        Vector<iSinglet<Simd> > *_Matp;
-        Vector<iSinglet<Simd> > *_Matm;
-
-        //  MooeeInternalCompute(dag,inv,Matp,Matm);
-        if(inv && dag){
-            _Matp = &this->MatpInvDag;
-            _Matm = &this->MatmInvDag;
-        }
-
-        if(inv && (!dag)){
-            _Matp = &this->MatpInv;
-            _Matm = &this->MatmInv;
-        }
-
-        if(!inv){
-            MooeeInternalCompute(dag, inv, Matp, Matm);
-            _Matp = &Matp;
-            _Matm = &Matm;
-        }
-
-        assert(_Matp->size() == Ls*LLs);
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        if(switcheroo<Coeff_t>::iscomplex()){
-            parallel_for(auto site=0; site<vol; site++){
-                MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-            }
-        } else {
-            parallel_for(auto site=0; site<vol; site++){
-                MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_VEC
-
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-
-        template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-        template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -25,34 +25,33 @@ Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class DomainWallFermion : public CayleyFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class DomainWallFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist, bool fiveD) {
+	FermionField in_k(in.Grid());
+	FermionField prop_k(in.Grid());

-      void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist, bool fiveD) {
-	FermionField in_k(in._grid);
-	FermionField prop_k(in._grid);
-
-	FFT theFFT((GridCartesian *) in._grid);
+	FFT theFFT((GridCartesian *) in.Grid());

 	//phase for boundary condition
-	ComplexField coor(in._grid);
-	ComplexField ph(in._grid);  ph = zero;
-	FermionField in_buf(in._grid); in_buf = zero;
+	ComplexField coor(in.Grid());
+	ComplexField ph(in.Grid());  ph = Zero();
+	FermionField in_buf(in.Grid()); in_buf = Zero();
+	typedef typename Simd::scalar_type Scalar;
 	Scalar ci(0.0,1.0);
 	assert(twist.size() == Nd);//check that twist is Nd
 	assert(boundary.size() == Nd);//check that boundary conditions is Nd
@@ -63,13 +62,12 @@ namespace Grid {
 	  // Shift coordinate lattice index by 1 to account for 5th dimension.
          LatticeCoordinate(coor, nu + shift);
 	  double boundary_phase = ::acos(real(boundary[nu]));
-	  ph = ph + boundary_phase*coor*((1./(in._grid->_fdimensions[nu+shift])));
+	  ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
 	  //momenta for propagator shifted by twist+boundary
 	  twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
 	}
 	in_buf = exp(ci*ph*(-1.0))*in;

-
 	if(fiveD){//FFT only on temporal and spatial dimensions
          std::vector<int> mask(Nd+1,1); mask[0] = 0;
 	  theFFT.FFT_dim_mask(in_k,in_buf,mask,FFT::forward);
@@ -82,7 +80,7 @@ namespace Grid {
 	  theFFT.FFT_all_dim(out,prop_k,FFT::backward);
        }
 	//phase for boundary condition
-	out = out * exp(ci*ph);
+	out = out * exp(Scalar(2.0*M_PI)*ci*ph);
      };

      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
@@ -105,38 +103,37 @@ namespace Grid {
 	FreePropagator(in,out,mass,boundary,twist,fiveD);
      };

-      virtual void   Instantiatable(void) {};
-      // Constructors
-      DomainWallFermion(GaugeField &_Umu,
-			GridCartesian         &FiveDimGrid,
-			GridRedBlackCartesian &FiveDimRedBlackGrid,
-			GridCartesian         &FourDimGrid,
-			GridRedBlackCartesian &FourDimRedBlackGrid,
-			RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructors
+  DomainWallFermion(GaugeField &_Umu,
+		    GridCartesian         &FiveDimGrid,
+		    GridRedBlackCartesian &FiveDimRedBlackGrid,
+		    GridCartesian         &FourDimGrid,
+		    GridRedBlackCartesian &FourDimRedBlackGrid,
+		    RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) : 


-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = 1.0;
+  {
+    RealD eps = 1.0;

-	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
-	assert(zdata->n==this->Ls);
+    Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
+    assert(zdata->n==this->Ls);
 	
-	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
-	// Call base setter
-	this->SetCoefficientsTanh(zdata,1.0,0.0);
-
-	Approx::zolotarev_free(zdata);
-      }
-
-    };
+    //    std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+    // Call base setter
+    this->SetCoefficientsTanh(zdata,1.0,0.0);

+    Approx::zolotarev_free(zdata);
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
@@ -0,0 +1,213 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
+public:
+  // Policy class for Ls-vectorised fermion operators: site/field typedefs plus gauge-link helpers.
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  static const int Nhcs = Options::Nhcs; // vector length of the half "comm" spinor (see iImplHalfCommSpinor)
+      
+  typedef typename Options::_Coeff_t Coeff_t;      
+  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL; // element type of SiteHalfCommSpinor
+  
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef Lattice<SiteSpinor>          FermionField;
+  typedef Lattice<SitePropagator>      PropagatorField;
+
+  /////////////////////////////////////////////////
+  // Make the doubled gauge field a *scalar*
+  /////////////////////////////////////////////////
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef Lattice<SiteDoubledGaugeField>                      DoubledGaugeField;
+      
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+  
+  ImplParams Params;
+
+  DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; // keeps a copy of p in Params
+  // Load one link element into reg by calling vsplat(reg, memory).
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    vsplat(reg, memory);
+  }
+  // Apply the mu-th link of U to chi and store the result in phi (via mult); SE and St are not used in this body.
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, int mu, StencilEntry *SE,
+					  StencilView &St) 
+  {
+#ifdef GPU_VEC
+    // Gauge link is scalarised
+    mult(&phi(), &U(mu), &chi());
+#else
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j)); // fill UU element-wise from the scalar link U(mu)
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+#endif
+  }
+  // Fill Uds site by site: slots 0..3 take the links of Umu, slots 4..7 take adj(Cshift(U_mu, mu, -1)).
+  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
+  {
+    SiteScalarGaugeField  ScalarUmu;
+    SiteDoubledGaugeField ScalarUds;
+    
+    GaugeLinkField U(Umu.Grid());
+    GaugeField  Uadj(Umu.Grid());
+    for (int mu = 0; mu < Nd; mu++) {
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+      U = adj(Cshift(U, mu, -1)); // adjoint of the mu-link after a Cshift by -1 along mu
+      PokeIndex<LorentzIndex>(Uadj, U, mu);
+    }
+    
+    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { // plain serial loop over the local sites of GaugeGrid
+      Coordinate lcoor;
+      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+      
+      peekLocalSite(ScalarUmu, Umu, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); // NOTE(review): literal 4 presumably means Nd - confirm
+      
+      peekLocalSite(ScalarUmu, Uadj, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); // slots 4..7 take the Uadj links
+      
+      pokeLocalSite(ScalarUds, Uds, lcoor);
+    }
+  }
+  // The four hooks below are not implemented for this impl: each body is just assert(0).
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
+  {
+    assert(0); // unimplemented: fails the assertion if called
+  }
+
+  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+    assert(0); // unimplemented: fails the assertion if called
+  } 
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0); // unimplemented: fails the assertion if called
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0); // unimplemented: fails the assertion if called
+  }
+
+  // Also disabled with assert(0); the body kept inside the comment below is pending revision and never runs.
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+
+    assert(0); // unimplemented: fails the assertion if called
+    // Following lines to be revised after Peter's addition of half prec
+    // missing put lane...
+    /*
+      typedef decltype(traceIndex<SpinIndex>(outerProduct(Btilde[0], Atilde[0]))) result_type;
+      unsigned int LLs = Btilde.Grid()->_rdimensions[0];
+      conformable(Atilde.Grid(),Btilde.Grid());
+      GridBase* grid = mat.Grid();
+      GridBase* Bgrid = Btilde.Grid();
+      unsigned int dimU = grid->Nd();
+      unsigned int dimF = Bgrid->Nd();
+      GaugeLinkField tmp(grid); 
+      tmp = Zero();
+    
+      // FIXME 
+      // Current implementation works, thread safe, probably suboptimal
+      // Passing through the local coordinate for grid transformation
+      // the force grid is in general very different from the Ls vectorized grid
+
+      for (int so = 0; so < grid->oSites(); so++) {
+      std::vector<typename result_type::scalar_object> vres(Bgrid->Nsimd());
+      std::vector<int> ocoor;  grid->oCoorFromOindex(ocoor,so); 
+      for (int si = 0; si < tmp.Grid()->iSites(); si++){
+      typename result_type::scalar_object scalar_object; scalar_object = Zero();
+      std::vector<int> local_coor;      
+      std::vector<int> icoor; grid->iCoorFromIindex(icoor,si);
+      grid->InOutCoorToLocalCoor(ocoor, icoor, local_coor);
+      for (int s = 0; s < LLs; s++) {
+      std::vector<int> slocal_coor(dimF);
+      slocal_coor[0] = s;
+      for (int s4d = 1; s4d< dimF; s4d++) slocal_coor[s4d] = local_coor[s4d-1];
+      int sF = Bgrid->oIndexReduced(slocal_coor);  
+      assert(sF < Bgrid->oSites());
+
+      extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), vres); 
+      // sum across the 5d dimension
+      for (auto v : vres) scalar_object += v;  
+      }
+      tmp[so].putlane(scalar_object, si);
+      }
+      }
+      PokeIndex<LorentzIndex>(mat, tmp, mu);
+    */
+  }
+};
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
+ 
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec, CoeffRealHalfComms (presumably lower-precision comms - confirm)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float, CoeffRealHalfComms (presumably lower-precision comms - confirm)
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double, CoeffRealHalfComms (presumably lower-precision comms - confirm)
+ 
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec, CoeffComplex options
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float, CoeffComplex options
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double, CoeffComplex options
+ 
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec, CoeffComplexHalfComms (presumably lower-precision comms - confirm)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float, CoeffComplexHalfComms (presumably lower-precision comms - confirm)
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double, CoeffComplexHalfComms (presumably lower-precision comms - confirm)
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -23,10 +23,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_H
-#define  GRID_QCD_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Explicit explicit template instantiation is still required in the .cc files
@@ -50,12 +49,17 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////

 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>     // 4d wilson like
+NAMESPACE_CHECK(Wilson);
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
+NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+NAMESPACE_CHECK(Wilson5D);

 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+NAMESPACE_CHECK(Staggered);

 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
@@ -63,7 +67,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 #include <Grid/qcd/action/fermion/MobiusFermion.h>
 #include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 #include <Grid/qcd/action/fermion/ZMobiusFermion.h>
-#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
+NAMESPACE_CHECK(DomainWall);
+
 #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
@@ -75,6 +80,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+NAMESPACE_CHECK(Overlap);
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
@@ -84,14 +90,17 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 // Fourier accelerated Pauli Villars inverse support
 ///////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/WilsonTMFermion5D.h>   
+NAMESPACE_CHECK(WilsonTM5);

 ////////////////////////////////////////////////////////////////////////////////
 // Move this group to a DWF specific tools/algorithms subdir? 
 ////////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
 #include <Grid/qcd/action/fermion/FourierAcceleratedPV.h>
 #include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
 #include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 #include <Grid/qcd/action/fermion/MADWF.h>
+NAMESPACE_CHECK(DWFutils);

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@@ -99,8 +108,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 // Cayley 5d
-namespace Grid {
-  namespace QCD {
+NAMESPACE_BEGIN(Grid);

 typedef WilsonFermion<WilsonImplR> WilsonFermionR;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@@ -186,46 +194,6 @@ typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
 typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;

 // Ls vectorised
-typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
-typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
-typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
-
-typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
-typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
-typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
-
-typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;
-
-typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;
-
-typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
-typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
-typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
-
-typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
-typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
-typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
-
-typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
-typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
-typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;
-
-typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
-typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
-typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;
-
-typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
-typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
-typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
-
-typedef ZMobiusFermion<ZDomainWallVec5dImplRL> ZMobiusFermionVec5dRL;
-typedef ZMobiusFermion<ZDomainWallVec5dImplFH> ZMobiusFermionVec5dFH;
-typedef ZMobiusFermion<ZDomainWallVec5dImplDF> ZMobiusFermionVec5dDF;
-
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@@ -318,12 +286,13 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

+#ifndef GRID_NVCC
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
+#endif

-
-  }}
+NAMESPACE_END(Grid);

 ////////////////////
 // Scalar QED actions
@@ -332,4 +301,4 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion
 #include <Grid/qcd/action/scalar/Scalar.h>
 #include <Grid/qcd/action/gauge/Photon.h>

-#endif
+
--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@@ -36,58 +36,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 // Fermion prereqs
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+NAMESPACE_CHECK(Compressor);
 #include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
+NAMESPACE_CHECK(FermionOperatorImpl);
 #include <Grid/qcd/action/fermion/FermionOperator.h>
+NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
-
-#define FermOpStaggeredTemplateInstantiate(A) \
-  template class A<StaggeredImplF>; \
-  template class A<StaggeredImplD>; 
-
-#define FermOpStaggeredVec5dTemplateInstantiate(A) \
-  template class A<StaggeredVec5dImplF>; \
-  template class A<StaggeredVec5dImplD>; 
-
-#define FermOp4dVecTemplateInstantiate(A) \
-  template class A<WilsonImplF>;		\
-  template class A<WilsonImplD>;		\
-  template class A<ZWilsonImplF>;		\
-  template class A<ZWilsonImplD>;		\
-  template class A<GparityWilsonImplF>;		\
-  template class A<GparityWilsonImplD>;		\
-  template class A<WilsonImplFH>;		\
-  template class A<WilsonImplDF>;		\
-  template class A<ZWilsonImplFH>;		\
-  template class A<ZWilsonImplDF>;		\
-  template class A<GparityWilsonImplFH>;		\
-  template class A<GparityWilsonImplDF>;		
-
-
-#define AdjointFermOpTemplateInstantiate(A) \
-  template class A<WilsonAdjImplF>; \
-  template class A<WilsonAdjImplD>; 
-
-#define TwoIndexFermOpTemplateInstantiate(A) \
-  template class A<WilsonTwoIndexSymmetricImplF>; \
-  template class A<WilsonTwoIndexSymmetricImplD>; \
-  template class A<WilsonTwoIndexAntiSymmetricImplF>; \
-  template class A<WilsonTwoIndexAntiSymmetricImplD>;
-
-#define FermOp5dVecTemplateInstantiate(A) \
-  template class A<DomainWallVec5dImplF>;	\
-  template class A<DomainWallVec5dImplD>;	\
-  template class A<ZDomainWallVec5dImplF>;	\
-  template class A<ZDomainWallVec5dImplD>;	\
-  template class A<DomainWallVec5dImplFH>;	\
-  template class A<DomainWallVec5dImplDF>;	\
-  template class A<ZDomainWallVec5dImplFH>;	\
-  template class A<ZDomainWallVec5dImplDF>;	
-
-#define FermOpTemplateInstantiate(A) \
- FermOp4dVecTemplateInstantiate(A) \
- FermOp5dVecTemplateInstantiate(A) 
-
-#define GparityFermOpTemplateInstantiate(A) 
+NAMESPACE_CHECK(Kernels);

 #endif
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -26,86 +26,87 @@ Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_OPERATOR_H
-#define  GRID_QCD_FERMION_OPERATOR_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
-
-    ////////////////////////////////////////////////////////////////
-    // Allow to select  between gauge representation rank bc's, flavours etc.
-    // and single/double precision.
-    ////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+// Allow to select  between gauge representation rank bc's, flavours etc.
+// and single/double precision.
+////////////////////////////////////////////////////////////////
    
-    template<class Impl>
-    class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
-    {
-    public:
+template<class Impl>
+class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
+{
+public:

-      INHERIT_IMPL_TYPES(Impl);
+  INHERIT_IMPL_TYPES(Impl);

-      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
-      virtual ~FermionOperator(void) = default;
+  FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
+  virtual ~FermionOperator(void) = default;

-      virtual FermionField &tmp(void) = 0;
+  virtual FermionField &tmp(void) = 0;

-      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
-      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
+  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
+  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

-      virtual GridBase *FermionGrid(void)         =0;
-      virtual GridBase *FermionRedBlackGrid(void) =0;
-      virtual GridBase *GaugeGrid(void)           =0;
-      virtual GridBase *GaugeRedBlackGrid(void)   =0;
+  virtual GridBase *FermionGrid(void)         =0;
+  virtual GridBase *FermionRedBlackGrid(void) =0;
+  virtual GridBase *GaugeGrid(void)           =0;
+  virtual GridBase *GaugeRedBlackGrid(void)   =0;

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out)=0;
-      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out)=0;
+  virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;

-      // half checkerboard operaions
-      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
-      virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out)=0;
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out)=0;
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out)=0;
+  // half checkerboard operations
+  virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
+  virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out)=0;
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out)=0;
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out)=0;

-      // non-hermitian hopping term; half cb or both
-      virtual void Dhop  (const FermionField &in, FermionField &out,int dag)=0;
-      virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
-      virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
-      virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D
+  // non-hermitian hopping term; half cb or both
+  virtual void Dhop  (const FermionField &in, FermionField &out,int dag)=0;
+  virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
+  virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
+  virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDeriv(mat,U,V,dag);};
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivOE(mat,U,V,dag);};
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivEO(mat,U,V,dag);};
-      virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;}; // Clover can override these
-      virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;};
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDeriv(mat,U,V,dag);};
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivOE(mat,U,V,dag);};
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivEO(mat,U,V,dag);};
+  virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=Zero();}; // Clover can override these
+  virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=Zero();};

-      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
-      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
-      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;


-      virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
-      virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
+  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac


      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

-      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
-	FFT theFFT((GridCartesian *) in._grid);
+      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) 
+      {
+	FFT theFFT((GridCartesian *) in.Grid());

-	FermionField in_k(in._grid);
-	FermionField prop_k(in._grid);
+	typedef typename Simd::scalar_type Scalar;
+
+	FermionField in_k(in.Grid());
+	FermionField prop_k(in.Grid());

 	//phase for boundary condition
-	ComplexField coor(in._grid);
-	ComplexField ph(in._grid);  ph = zero;
-	FermionField in_buf(in._grid); in_buf = zero;
+	ComplexField coor(in.Grid());
+	ComplexField ph(in.Grid());  ph = Zero();
+	FermionField in_buf(in.Grid()); in_buf = Zero();
+
 	Scalar ci(0.0,1.0);
 	assert(twist.size() == Nd);//check that twist is Nd
 	assert(boundary.size() == Nd);//check that boundary conditions is Nd
@@ -113,7 +114,7 @@ namespace Grid {
 	{
          LatticeCoordinate(coor, nu);
 	  double boundary_phase = ::acos(real(boundary[nu]));
-	  ph = ph + boundary_phase*coor*((1./(in._grid->_fdimensions[nu])));
+	  ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu])));
 	  //momenta for propagator shifted by twist+boundary
 	  twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
 	}
@@ -124,43 +125,42 @@ namespace Grid {
 	theFFT.FFT_all_dim(out,prop_k,FFT::backward);

 	//phase for boundary condition
-	out = out * exp(ci*ph);
+        out = out * exp(Scalar(2.0*M_PI)*ci*ph);

      };

      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
-		std::vector<Complex> boundary;
-		for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
-		std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-	        FreePropagator(in,out,mass,boundary,twist);
+	std::vector<Complex> boundary;
+	for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
+	std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
+	FreePropagator(in,out,mass,boundary,twist);
      };

-      ///////////////////////////////////////////////
-      // Updates gauge field during HMC
-      ///////////////////////////////////////////////
-      virtual void ImportGauge(const GaugeField & _U)=0;
+  ///////////////////////////////////////////////
+  // Updates gauge field during HMC
+  ///////////////////////////////////////////////
+  virtual void ImportGauge(const GaugeField & _U)=0;

-      //////////////////////////////////////////////////////////////////////
-      // Conserved currents, either contract at sink or insert sequentially.
-      //////////////////////////////////////////////////////////////////////
-
-      virtual void ContractConservedCurrent(PropagatorField &q_in_1,
-                                            PropagatorField &q_in_2,
-                                            PropagatorField &q_out,
-                                            Current curr_type,
-                                            unsigned int mu)=0;
-      virtual void SeqConservedCurrent(PropagatorField &q_in, 
-                                       PropagatorField &q_out,
-                                       Current curr_type,
-                                       unsigned int mu,
-                                       unsigned int tmin, 
-                                       unsigned int tmax,
-                                       ComplexField &lattice_cmplx)=0;
+  //////////////////////////////////////////////////////////////////////
+  // Conserved currents, either contract at sink or insert sequentially.
+  //////////////////////////////////////////////////////////////////////
+  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
+					PropagatorField &q_in_2,
+					PropagatorField &q_out,
+					Current curr_type,
+					unsigned int mu)=0;
+  virtual void SeqConservedCurrent(PropagatorField &q_in, 
+				   PropagatorField &q_out,
+				   Current curr_type,
+				   unsigned int mu,
+				   unsigned int tmin, 
+				   unsigned int tmax,
+				   ComplexField &lattice_cmplx)=0;

      // Only reimplemented in Wilson5D 
      // Default to just a zero correlation function
-      virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=zero; };
-      virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=zero; };
+  virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=Zero(); };
+  virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=Zero(); };

      ///////////////////////////////////////////////
      // Physical field import/export
@@ -183,9 +183,7 @@ namespace Grid {
      {
 	exported=solution;
      };
-    };
+};

-  }
-}
+NAMESPACE_END(Grid);

-#endif
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
--- a/Grid/qcd/action/fermion/FourierAcceleratedPV.h
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@@ -28,8 +28,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);

  template<typename M>
    void get_real_const_bc(M& m, RealD& _b, RealD& _c) {
@@ -63,8 +63,8 @@ class FourierAcceleratedPV {
   : dwfPV(_dwfPV), Umu(_Umu), cg(_cg), group_in_s(_group_in_s) 
  {
    assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
-    grid5D = QCD::SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu._grid);
-    gridRB5D = QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu._grid);
+    grid5D   = SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu.Grid());
+    gridRB5D = SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu.Grid());
  }

  void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
@@ -72,13 +72,13 @@ class FourierAcceleratedPV {
    GridStopWatch gsw1, gsw2;

    typedef typename Vi::scalar_type Coeff_t;
-    int Ls = dst._grid->_fdimensions[0];
+    int Ls = dst.Grid()->_fdimensions[0];

-    Vi _tmp(dst._grid);
+    Vi _tmp(dst.Grid());
    double phase = M_PI / (double)Ls;
    Coeff_t bzero(0.0,0.0);

-    FFT theFFT((GridCartesian*)dst._grid);
+    FFT theFFT((GridCartesian*)dst.Grid());

    if (!forward) {
      gsw1.Start();
@@ -115,7 +115,7 @@ class FourierAcceleratedPV {
    std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;

    typedef typename Vi::scalar_type Coeff_t;
-    int Ls = _dst._grid->_fdimensions[0];
+    int Ls = _dst.Grid()->_fdimensions[0];

    GridStopWatch gswT;
    gswT.Start();
@@ -126,12 +126,12 @@ class FourierAcceleratedPV {
    
    // U(true) Rightinv TMinv U(false) = Minv

-    Vi _src_diag(_dst._grid);
+    Vi _src_diag(_dst.Grid());
    Vi _src_diag_slice(dwfPV.GaugeGrid());
    Vi _dst_diag_slice(dwfPV.GaugeGrid());
    Vi _src_diag_slices(grid5D);
    Vi _dst_diag_slices(grid5D);
-    Vi _dst_diag(_dst._grid);
+    Vi _dst_diag(_dst.Grid());

    rotatePV(_src,_src_diag,false);

@@ -163,7 +163,7 @@ class FourierAcceleratedPV {
      for (int sidx=0;sidx<group_in_s;sidx++) {

 	int s = sgroup*group_in_s + sidx;
-	int sprime = Ls-s-1;
+	//	int sprime = Ls-s-1;

 	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
 	RealD cosp = ::cos(phase);
@@ -196,7 +196,7 @@ class FourierAcceleratedPV {

      GridStopWatch gsw;
      gsw.Start();
-      _dst_diag_slices = zero; // zero guess
+      _dst_diag_slices = Zero(); // zero guess
      sol(tm,_src_diag_slices,_dst_diag_slices);
      gsw.Stop();
      std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
@@ -212,7 +212,7 @@ class FourierAcceleratedPV {

 	// now rotate with inverse of
 	Coeff_t pA = b + c*cosp;
-	Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
+	Coeff_t pB = - Coeff_t(0.0,1.0)*Coeff_t(c*sinp);
 	Coeff_t pABden = pA*pA - pB*pB;
 	// (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
      
@@ -234,4 +234,5 @@ class FourierAcceleratedPV {
  }

 };
-}}
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -0,0 +1,321 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/GparityWilsonImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
+public:
+
+ static const int Dimension = Representation::Dimension;
+ static const bool isFundamental = Representation::isFundamental;
+ static const int Nhcs = Options::Nhcs;
+ static const bool LsVectorised=false;
+
+ typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
+ INHERIT_GIMPL_TYPES(Gimpl);
+ 
+ typedef typename Options::_Coeff_t Coeff_t;
+ typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
+      
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
+ template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
+
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+
+  typedef Lattice<SiteSpinor> FermionField;
+  typedef Lattice<SitePropagator> PropagatorField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+ 
+  typedef GparityWilsonImplParams ImplParams;
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+      
+  ImplParams Params;
+
+  GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
+
+  // provide the multiply by link that is differentiated between Gparity (with
+  // flavour index) and non-Gparity
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, 
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, 
+					  int mu) 
+  {
+    assert(0);
+  } 
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, 
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, 
+					  int mu, 
+					  StencilEntry *SE,
+					  StencilView &St) 
+  {
+    int direction = St._directions[mu];
+    int distance  = St._distances[mu];
+    int ptype     = St._permute_type[mu];
+    int sl        = St._simd_layout[direction];
+    Coordinate icoor;
+
+#ifdef __CUDA_ARCH__
+    _Spinor tmp;
+
+    const int Nsimd =SiteDoubledGaugeField::Nsimd();
+    int s = SIMTlane(Nsimd);
+    St.iCoorFromIindex(icoor,s);
+
+    int mmu = mu % Nd;
+    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+      
+      int permute_lane = (sl==1) 
+    	|| ((distance== 1)&&(icoor[direction]==1))
+	|| ((distance==-1)&&(icoor[direction]==0));
+
+      if ( permute_lane ) { 
+	tmp(0) = chi(1);
+	tmp(1) = chi(0);
+      } else {
+	tmp(0) = chi(0);
+	tmp(1) = chi(1);
+      }
+
+      auto UU0=coalescedRead(U(0)(mu));
+      auto UU1=coalescedRead(U(1)(mu));
+
+      mult(&phi(0),&UU0,&tmp(0));
+      mult(&phi(1),&UU1,&tmp(1));
+
+    } else {
+
+      auto UU0=coalescedRead(U(0)(mu));
+      auto UU1=coalescedRead(U(1)(mu));
+
+      mult(&phi(0),&UU0,&chi(0));
+      mult(&phi(1),&UU1,&chi(1));
+
+    }
+
+#else
+    typedef _Spinor vobj;
+    typedef typename SiteHalfSpinor::scalar_object sobj;
+    typedef typename SiteHalfSpinor::vector_type   vector_type;
+	
+    vobj vtmp;
+    sobj stmp;
+        
+    const int Nsimd =vector_type::Nsimd();
+    
+    // Fixme X.Y.Z.T hardcode in stencil
+    int mmu = mu % Nd;
+        
+    // assert our assumptions
+    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
+    assert((sl == 1) || (sl == 2));
+
+    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+
+      if ( sl == 2 ) {
+       
+	ExtractBuffer<sobj> vals(Nsimd);
+
+	extract(chi,vals);
+	for(int s=0;s<Nsimd;s++){
+
+	  St.iCoorFromIindex(icoor,s);
+              
+	  assert((icoor[direction]==0)||(icoor[direction]==1));
+              
+	  int permute_lane;
+	  if ( distance == 1) {
+	    permute_lane = icoor[direction]?1:0;
+	  } else {
+	    permute_lane = icoor[direction]?0:1;
+	  }
+              
+	  if ( permute_lane ) { 
+	    stmp(0) = vals[s](1);
+	    stmp(1) = vals[s](0);
+	    vals[s] = stmp;
+	  }
+	}
+	merge(vtmp,vals);
+            
+      } else { 
+	vtmp(0) = chi(1);
+	vtmp(1) = chi(0);
+      }
+      mult(&phi(0),&U(0)(mu),&vtmp(0));
+      mult(&phi(1),&U(1)(mu),&vtmp(1));
+     
+    } else { 
+      mult(&phi(0),&U(0)(mu),&chi(0));
+      mult(&phi(1),&U(1)(mu),&chi(1));
+    }
+#endif   
+  }
+
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    reg = memory;
+  }
+
+  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+  {
+    conformable(Uds.Grid(),GaugeGrid);
+    conformable(Umu.Grid(),GaugeGrid);
+   
+    GaugeLinkField Utmp (GaugeGrid);
+    GaugeLinkField U    (GaugeGrid);
+    GaugeLinkField Uconj(GaugeGrid);
+   
+    Lattice<iScalar<vInteger> > coor(GaugeGrid);
+        
+    for(int mu=0;mu<Nd;mu++){
+          
+      LatticeCoordinate(coor,mu);
+          
+      U     = PeekIndex<LorentzIndex>(Umu,mu);
+      Uconj = conjugate(U);
+     
+      // This phase could come from a simple bc 1,1,-1,1 ..
+      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
+      if ( Params.twists[mu] ) { 
+	Uconj = where(coor==neglink,-Uconj,Uconj);
+      }
+	  
+      auto U_v = U.View();
+      auto Uds_v = Uds.View();
+      auto Uconj_v = Uconj.View();
+      auto Utmp_v= Utmp.View();
+      thread_foreach(ss,U_v,{
+	Uds_v[ss](0)(mu) = U_v[ss]();
+	Uds_v[ss](1)(mu) = Uconj_v[ss]();
+      });
+          
+      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
+      Uconj = adj(Cshift(Uconj,mu,-1));
+ 
+      Utmp = U;
+      if ( Params.twists[mu] ) { 
+	Utmp = where(coor==0,Uconj,Utmp);
+      }
+
+      thread_foreach(ss,Utmp_v,{
+	Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
+      });
+          
+      Utmp = Uconj;
+      if ( Params.twists[mu] ) { 
+	Utmp = where(coor==0,U,Utmp);
+      }
+	  
+      thread_foreach(ss,Utmp_v,{
+        Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
+      });
+          
+    }
+  }
+      
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
+
+    // DhopDir provides U or Uconj depending on coor/flavour.
+    GaugeLinkField link(mat.Grid());
+    // use lorentz for flavour as hack.
+    auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
+    auto link_v = link.View();
+    auto tmp_v = tmp.View();
+    thread_foreach(ss,tmp_v,{
+      link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
+    });
+    PokeIndex<LorentzIndex>(mat, link, mu);
+    return;
+  }
+      
+ inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+   //mat = outerProduct(Btilde, A);
+   assert(0);
+  }
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0);
+    /*
+    auto tmp = TraceIndex<SpinIndex>(P);
+    parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+      mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
+    }
+    */
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0);
+  }
+  
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+
+    int Ls = Btilde.Grid()->_fdimensions[0];
+        
+    GaugeLinkField tmp(mat.Grid());
+    tmp = Zero();
+    auto tmp_v = tmp.View();
+    auto Atilde_v = Atilde.View();
+    auto Btilde_v = Btilde.View();
+    thread_for(ss,tmp.Grid()->oSites(),{
+      for (int s = 0; s < Ls; s++) {
+	int sF = s + Ls * ss;
+	auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+	tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      }
+    });
+    PokeIndex<LorentzIndex>(mat, tmp, mu);
+    return;
+  }
+  
+};
+
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
+ 
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -25,16 +25,14 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef GRID_QCD_IMPR_STAG_FERMION_H
 #define GRID_QCD_IMPR_STAG_FERMION_H

-namespace Grid {
-
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 class ImprovedStaggeredFermionStatic {
- public:
+public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static const int npoint = 16;
@@ -42,7 +40,7 @@ class ImprovedStaggeredFermionStatic {

 template <class Impl>
 class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedStaggeredFermionStatic {
- public:
+public:
  INHERIT_IMPL_TYPES(Impl);
  typedef StaggeredKernels<Impl> Kernels;

@@ -139,7 +137,7 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS

  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
-  void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat);
+  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
@@ -151,7 +149,7 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
  ///////////////////////////////////////////////////////////////

  //    protected:
- public:
+public:
  // any other parameters of action ???
  virtual int   isTrivialEE(void) { return 1; };
  virtual RealD Mass(void) { return mass; }
@@ -188,11 +186,11 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
                                PropagatorField &q_out,
                                Current curr_type,
                                unsigned int mu);
-  void SeqConservedCurrent(PropagatorField &q_in, 
+  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
-                           Current curr_type, 
-                           unsigned int mu,
-                           unsigned int tmin, 
+                           Current curr_type,
+                           unsigned int mu, 
+                           unsigned int tmin,
                           unsigned int tmax,
 			   ComplexField &lattice_cmplx);
 };
@@ -200,6 +198,6 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;

-}
-}
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -1,5 +1,5 @@

-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -25,101 +25,99 @@ Author: AzusaYamaguchi <ayamaguc@staffmail.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
-#define  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  ////////////////////////////////////////////////////////////////////////////////
-  // This is the 4d red black case appropriate to support
-  ////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// This is the 4d red black case appropriate to support
+////////////////////////////////////////////////////////////////////////////////

-    class ImprovedStaggeredFermion5DStatic { 
-    public:
-      // S-direction is INNERMOST and takes no part in the parity.
-      static const std::vector<int> directions;
-      static const std::vector<int> displacements;
-      const int npoint = 16;
-    };
+class ImprovedStaggeredFermion5DStatic { 
+public:
+  // S-direction is INNERMOST and takes no part in the parity.
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+  const int npoint = 16;
+};

-    template<class Impl>
-    class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public ImprovedStaggeredFermion5DStatic 
-    {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-      typedef StaggeredKernels<Impl> Kernels;
+template<class Impl>
+class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public ImprovedStaggeredFermion5DStatic 
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef StaggeredKernels<Impl> Kernels;

-      FermionField _tmp;
-      FermionField &tmp(void) { return _tmp; }
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }

-      ////////////////////////////////////////
-      // Performance monitoring
-      ////////////////////////////////////////
-      void Report(void);
-      void ZeroCounters(void);
-      double DhopTotalTime;
-      double DhopCalls;
-      double DhopCommTime;
-      double DhopComputeTime;
+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
      double DhopComputeTime2;
      double DhopFaceTime;

-      ///////////////////////////////////////////////////////////////
-      // Implement the abstract base
-      ///////////////////////////////////////////////////////////////
-      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
-      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
-      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
-      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+  GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+  GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+  GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}

-      // full checkerboard operations; leave unimplemented as abstract for now
-      RealD  M    (const FermionField &in, FermionField &out);
-      RealD  Mdag (const FermionField &in, FermionField &out);
+  // full checkerboard operations; leave unimplemented as abstract for now
+  RealD  M    (const FermionField &in, FermionField &out);
+  RealD  Mdag (const FermionField &in, FermionField &out);

-      // half checkerboard operations
-      void   Meooe       (const FermionField &in, FermionField &out);
-      void   Mooee       (const FermionField &in, FermionField &out);
-      void   MooeeInv    (const FermionField &in, FermionField &out);
+  // half checkerboard operations
+  void   Meooe       (const FermionField &in, FermionField &out);
+  void   Mooee       (const FermionField &in, FermionField &out);
+  void   MooeeInv    (const FermionField &in, FermionField &out);

-      void   MeooeDag    (const FermionField &in, FermionField &out);
-      void   MooeeDag    (const FermionField &in, FermionField &out);
-      void   MooeeInvDag (const FermionField &in, FermionField &out);
+  void   MeooeDag    (const FermionField &in, FermionField &out);
+  void   MooeeDag    (const FermionField &in, FermionField &out);
+  void   MooeeInvDag (const FermionField &in, FermionField &out);

-      void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
-      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

-      // These can be overridden by fancy 5d chiral action
-      void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  // These can be overridden by fancy 5d chiral action
+  void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      // Implement hopping term non-hermitian hopping term; half cb or both
-      void Dhop  (const FermionField &in, FermionField &out,int dag);
-      void DhopOE(const FermionField &in, FermionField &out,int dag);
-      void DhopEO(const FermionField &in, FermionField &out,int dag);
+  // Implement the non-hermitian hopping term; half cb or both
+  void Dhop  (const FermionField &in, FermionField &out,int dag);
+  void DhopOE(const FermionField &in, FermionField &out,int dag);
+  void DhopEO(const FermionField &in, FermionField &out,int dag);

    
-    ///////////////////////////////////////////////////////////////
-    // New methods added 
-    ///////////////////////////////////////////////////////////////
-    void DerivInternal(StencilImpl & st,
-		       DoubledGaugeField & U,
-		       DoubledGaugeField & UUU,
-		       GaugeField &mat,
-		       const FermionField &A,
-		       const FermionField &B,
-		       int dag);
+  ///////////////////////////////////////////////////////////////
+  // New methods added 
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl & st,
+		     DoubledGaugeField & U,
+		     DoubledGaugeField & UUU,
+		     GaugeField &mat,
+		     const FermionField &A,
+		     const FermionField &B,
+		     int dag);
    
-    void DhopInternal(StencilImpl & st,
-		      LebesgueOrder &lo,
-		      DoubledGaugeField &U,
-		      DoubledGaugeField &UUU,
-		      const FermionField &in, 
-		      FermionField &out,
-		      int dag);
+  void DhopInternal(StencilImpl & st,
+		    LebesgueOrder &lo,
+		    DoubledGaugeField &U,
+		    DoubledGaugeField &UUU,
+		    const FermionField &in, 
+		    FermionField &out,
+		    int dag);
    
    void DhopInternalOverlappedComms(StencilImpl & st,
 		      LebesgueOrder &lo,
@@ -138,17 +136,17 @@ namespace QCD {
 		      int dag);
    
    
-    // Constructors
+  // Constructors
    ////////////////////////////////////////////////////////////////////////////////////////////////
    // Grid internal interface -- Thin link and fat link, with coefficients
    ////////////////////////////////////////////////////////////////////////////////////////////////
-    ImprovedStaggeredFermion5D(GaugeField &_Uthin,
-			       GaugeField &_Ufat,
-			       GridCartesian         &FiveDimGrid,
-			       GridRedBlackCartesian &FiveDimRedBlackGrid,
-			       GridCartesian         &FourDimGrid,
-			       GridRedBlackCartesian &FourDimRedBlackGrid,
-			       double _mass,
+  ImprovedStaggeredFermion5D(GaugeField &_Uthin,
+			     GaugeField &_Ufat,
+			     GridCartesian         &FiveDimGrid,
+			     GridRedBlackCartesian &FiveDimRedBlackGrid,
+			     GridCartesian         &FourDimGrid,
+			     GridRedBlackCartesian &FourDimRedBlackGrid,
+			     double _mass,
 			       RealD _c1, RealD _c2,RealD _u0,
 			       const ImplParams &p= ImplParams());
    ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -160,11 +158,11 @@ namespace QCD {
 			       GridRedBlackCartesian &FourDimRedBlackGrid,
 			       double _mass,
 			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
-			       const ImplParams &p= ImplParams());
-
+			     const ImplParams &p= ImplParams());
+    
    // DoubleStore gauge field in operator
    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
-    void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat);
+  void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
    // Give a reference; can be used to do an assignment or copy back out after import
@@ -173,62 +171,61 @@ namespace QCD {
    DoubledGaugeField &GetUUU(void) { return UUUmu; };
    void CopyGaugeCheckerboards(void);
    
-    ///////////////////////////////////////////////////////////////
-    // Data members require to support the functionality
-    ///////////////////////////////////////////////////////////////
-  public:
-
+  ///////////////////////////////////////////////////////////////
+  // Data members required to support the functionality
+  ///////////////////////////////////////////////////////////////
+public:
+    
    virtual int   isTrivialEE(void) { return 1; };
    virtual RealD Mass(void) { return mass; }
    
-    GridBase *_FourDimGrid;
-    GridBase *_FourDimRedBlackGrid;
-    GridBase *_FiveDimGrid;
-    GridBase *_FiveDimRedBlackGrid;
+  GridBase *_FourDimGrid;
+  GridBase *_FourDimRedBlackGrid;
+  GridBase *_FiveDimGrid;
+  GridBase *_FiveDimRedBlackGrid;
    
-    RealD mass;
-    RealD c1;
-    RealD c2;
-    RealD u0;
-    int Ls;
+  RealD mass;
+  RealD c1;
+  RealD c2;
+  RealD u0;
+  int Ls;
    
-    //Defines the stencils for even and odd
-    StencilImpl Stencil; 
-    StencilImpl StencilEven; 
-    StencilImpl StencilOdd; 
+  //Defines the stencils for even and odd
+  StencilImpl Stencil; 
+  StencilImpl StencilEven; 
+  StencilImpl StencilOdd; 
    
-    // Copy of the gauge field , with even and odd subsets
-    DoubledGaugeField Umu;
-    DoubledGaugeField UmuEven;
-    DoubledGaugeField UmuOdd;
+  // Copy of the gauge field, with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;

-    DoubledGaugeField UUUmu;
-    DoubledGaugeField UUUmuEven;
-    DoubledGaugeField UUUmuOdd;
+  DoubledGaugeField UUUmu;
+  DoubledGaugeField UUUmuEven;
+  DoubledGaugeField UUUmuOdd;
    
-    LebesgueOrder Lebesgue;
-    LebesgueOrder LebesgueEvenOdd;
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
    
-    // Comms buffer
-    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  // Comms buffer
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
-    ///////////////////////////////////////////////////////////////
-    // Conserved current utilities
-    ///////////////////////////////////////////////////////////////
-    void ContractConservedCurrent(PropagatorField &q_in_1,
-                                  PropagatorField &q_in_2,
-                                  PropagatorField &q_out,
-                                  Current curr_type,
-                                  unsigned int mu);
-    void SeqConservedCurrent(PropagatorField &q_in, 
-                             PropagatorField &q_out,
-                             Current curr_type, 
-                             unsigned int mu,
-                             unsigned int tmin, 
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+				PropagatorField &q_in_2,
+				PropagatorField &q_out,
+				Current curr_type,
+				unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+			   PropagatorField &q_out,
+			   Current curr_type,
+			   unsigned int mu, 
+			   unsigned int tmin,
                             unsigned int tmax,
                 	     ComplexField &lattice_cmplx);
-  };
+};

-}}
+NAMESPACE_END(Grid);

-#endif
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -27,8 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0>
 inline void convert(const Fieldi &from,Fieldo &to) 
@@ -109,7 +108,7 @@ class MADWF
    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;

    defect = b;
-    sol5=zero;
+    sol5=Zero();
    for (int i=0;i<maxiter;i++) {

      ///////////////////////////////////////
@@ -122,7 +121,7 @@ class MADWF
      ////////////////////////////////////////////////
      // Solve the inner system with surface term c0
      ////////////////////////////////////////////////
-      ci = zero;  
+      ci = Zero();  
      convert(c0,c0i); // Possible precison change
      InsertSlice(c0i,ci,0, 0);

@@ -190,4 +189,4 @@ class MADWF

 };

-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
@@ -1,502 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  template<class Impl>
-    MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-      GaugeField            &_Umu,
-      GridCartesian         &FiveDimGrid,
-      GridRedBlackCartesian &FiveDimRedBlackGrid,
-      GridCartesian         &FourDimGrid,
-      GridRedBlackCartesian &FourDimRedBlackGrid,
-      RealD _mq1, RealD _mq2, RealD _mq3,
-      RealD _shift, int _pm, RealD _M5,
-      RealD _b, RealD _c, const ImplParams &p) :
-    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-        _shift, _pm, _M5, _b, _c, p)
-    {
-      int Ls = this->Ls;
-
-      RealD eps = 1.0;
-      Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
-      assert(zdata->n == this->Ls);
-
-      std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
-        ",c=" << _c << ") with Ls=" << Ls << std::endl;
-      this->SetCoefficientsTanh(zdata, _b, _c);
-      std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
-        ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
-        ",pm=" << _pm << ")" << std::endl;
-
-      Approx::zolotarev_free(zdata);
-
-      if(_shift != 0.0){
-        SetCoefficientsPrecondShiftOps();
-      } else {
-        Mooee_shift.resize(Ls, 0.0);
-        MooeeInv_shift_lc.resize(Ls, 0.0);
-        MooeeInv_shift_norm.resize(Ls, 0.0);
-        MooeeInvDag_shift_lc.resize(Ls, 0.0);
-        MooeeInvDag_shift_norm.resize(Ls, 0.0);
-      }
-    }
-
-    /****************************************************************
-     * Additional EOFA operators only called outside the inverter.  
-     * Since speed is not essential, simple axpby-style
-     * implementations should be fine.
-     ***************************************************************/
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-    {
-      int Ls = this->Ls;
-      RealD alpha = this->alpha;
-
-      Din = zero;
-      if((sign == 1) && (dag == 0)) { // \Omega_{+}
-        for(int s=0; s<Ls; ++s){
-          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
-        }
-      } else if((sign == -1) && (dag == 0)) { // \Omega_{-}
-        for(int s=0; s<Ls; ++s){
-          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
-        }
-      } else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
-        for(int sp=0; sp<Ls; ++sp){
-          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
-        }
-      } else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
-        for(int sp=0; sp<Ls; ++sp){
-          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
-        }
-      }
-    }
-
-    // This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
-    // It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-    {
-      int Ls    = this->Ls;
-      RealD b   = 0.5 * ( 1.0 + this->alpha );
-      RealD c   = 0.5 * ( 1.0 - this->alpha );
-      RealD mq1 = this->mq1;
-
-      for(int s=0; s<Ls; ++s){
-        if(s == 0) {
-          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
-          axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
-        } else if(s == (Ls-1)) {
-          axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
-          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
-        } else {
-          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
-          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
-        }
-      }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-      RealD m = this->mq1;
-      RealD c = 0.5 * this->alpha;
-      RealD d = 0.5;
-
-      RealD DtInv_p(0.0), DtInv_m(0.0);
-      RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
-      FermionField tmp(this->FermionGrid());
-
-      for(int s=0; s<Ls; ++s){
-      for(int sp=0; sp<Ls; ++sp){
-
-        DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-        DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-        DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-        DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-
-        if(sp == 0){
-          axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
-          axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
-        } else {
-          axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
-          axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
-        }
-
-      }}
-    }
-
-    /*****************************************************************************************************/
-
-    template<class Impl>
-    RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      FermionField Din(psi._grid);
-
-      this->Meooe5D(psi, Din);
-      this->DW(Din, chi, DaggerNo);
-      axpby(chi, 1.0, 1.0, chi, psi);
-      this->M5D(psi, chi);
-      return(norm2(chi));
-    }
-
-    template<class Impl>
-    RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      FermionField Din(psi._grid);
-
-      this->DW(psi, Din, DaggerYes);
-      this->MeooeDag5D(Din, chi);
-      this->M5Ddag(psi, chi);
-      axpby(chi, 1.0, 1.0, chi, psi);
-      return(norm2(chi));
-    }
-
-    /********************************************************************
-     * Performance critical fermion operators called inside the inverter
-     ********************************************************************/
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag(Ls,1.0);
-      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag(Ls,1.0);
-      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    // half checkerboard operations
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      // coefficients of Mooee
-      std::vector<Coeff_t> diag = this->bee;
-      std::vector<Coeff_t> upper(Ls);
-      std::vector<Coeff_t> lower(Ls);
-      for(int s=0; s<Ls; s++){
-        upper[s] = -this->cee[s];
-        lower[s] = -this->cee[s];
-      }
-      upper[Ls-1] *= -this->mq1;
-      lower[0]    *= -this->mq1;
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      // coefficients of MooeeDag
-      std::vector<Coeff_t> diag = this->bee;
-      std::vector<Coeff_t> upper(Ls);
-      std::vector<Coeff_t> lower(Ls);
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          upper[s] = -this->cee[s+1];
-          lower[s] = this->mq1*this->cee[Ls-1];
-        } else if(s==(Ls-1)) {
-          upper[s] = this->mq1*this->cee[0];
-          lower[s] = -this->cee[s-1];
-        } else {
-          upper[s] = -this->cee[s+1];
-          lower[s] = -this->cee[s-1];
-        }
-      }
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    /****************************************************************************************/
-
-    // Computes coefficients for applying Cayley preconditioned shift operators
-    //  (Mooee + \Delta) --> Mooee_shift
-    //  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-    //  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-    // For the latter two cases, the operation takes the form
-    //  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-    //      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-    {
-      int   Ls    = this->Ls;
-      int   pm    = this->pm;
-      RealD alpha = this->alpha;
-      RealD k     = this->k;
-      RealD mq1   = this->mq1;
-      RealD shift = this->shift;
-
-      // Initialize
-      Mooee_shift.resize(Ls);
-      MooeeInv_shift_lc.resize(Ls);
-      MooeeInv_shift_norm.resize(Ls);
-      MooeeInvDag_shift_lc.resize(Ls);
-      MooeeInvDag_shift_norm.resize(Ls);
-
-      // Construct Mooee_shift
-      int idx(0);
-      Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
-                  ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-      for(int s=0; s<Ls; ++s){
-        idx = (pm == 1) ? (s) : (Ls-1-s);
-        Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-      }
-
-      // Tridiagonal solve for MooeeInvDag_shift_lc
-      {
-        Coeff_t m(0.0);
-        std::vector<Coeff_t> d = Mooee_shift;
-        std::vector<Coeff_t> u(Ls,0.0);
-        std::vector<Coeff_t> y(Ls,0.0);
-        std::vector<Coeff_t> q(Ls,0.0);
-        if(pm == 1){ u[0] = 1.0; }
-        else{ u[Ls-1] = 1.0; }
-
-        // Tridiagonal matrix algorithm + Sherman-Morrison formula
-        //
-        // We solve
-        //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-        // where Mooee' is the tridiagonal part of Mooee_{+}, and
-        // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-        // so that the outer-product u \otimes v gives the (0,Ls-1)
-        // entry of Mooee_{+}.
-        //
-        // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-        // and then construct the solution to the original system
-        //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-        if(pm == 1){
-          for(int s=1; s<Ls; ++s){
-            m = -this->cee[s] / this->bee[s-1];
-            d[s] -= m*d[s-1];
-            u[s] -= m*u[s-1];
-          }
-        }
-        y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-        q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-        for(int s=Ls-2; s>=0; --s){
-          if(pm == 1){
-            y[s] = d[s] / this->bee[s];
-            q[s] = u[s] / this->bee[s];
-          } else {
-            y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-            q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-          }
-        }
-
-        // Construct MooeeInvDag_shift_lc
-        for(int s=0; s<Ls; ++s){
-          if(pm == 1){
-            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-              (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-          } else {
-            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-              (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-          }
-        }
-
-        // Compute remaining coefficients
-        N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
-        for(int s=0; s<Ls; ++s){
-
-          // MooeeInv_shift_lc
-          if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
-          else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
-
-          // MooeeInv_shift_norm
-          MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
-
-          // MooeeInvDag_shift_norm
-          if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
-          else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
-        }
-      }
-    }
-
-    // Recompute coefficients for a different value of shift constant
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-    {
-      this->shift = new_shift;
-      if(new_shift != 0.0){
-        SetCoefficientsPrecondShiftOps();
-      } else {
-        int Ls = this->Ls;
-        Mooee_shift.resize(Ls,0.0);
-        MooeeInv_shift_lc.resize(Ls,0.0);
-        MooeeInv_shift_norm.resize(Ls,0.0);
-        MooeeInvDag_shift_lc.resize(Ls,0.0);
-        MooeeInvDag_shift_norm.resize(Ls,0.0);
-      }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-      Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-      int Ls = this->Ls;
-
-      GridBase* grid = this->FermionRedBlackGrid();
-      int LLs = grid->_rdimensions[0];
-
-      if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-      Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-      Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-      for(int s=0; s<Ls; s++){
-        Pplus(s,s)  = this->bee[s];
-        Pminus(s,s) = this->bee[s];
-      }
-
-      for(int s=0; s<Ls-1; s++){
-        Pminus(s,s+1) = -this->cee[s];
-        Pplus(s+1,s) = -this->cee[s+1];
-      }
-
-      Pplus (0,Ls-1) = this->mq1*this->cee[0];
-      Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-
-      if(this->shift != 0.0){
-        RealD c = 0.5 * this->alpha;
-        RealD d = 0.5;
-        RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-        if(this->pm == 1) {
-          for(int s=0; s<Ls; ++s){
-            Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-          }
-        } else {
-          for(int s=0; s<Ls; ++s){
-            Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-          }
-        }
-      }
-
-      Eigen::MatrixXcd PplusMat ;
-      Eigen::MatrixXcd PminusMat;
-
-      if(inv) {
-        PplusMat  = Pplus.inverse();
-        PminusMat = Pminus.inverse();
-      } else {
-        PplusMat  = Pplus;
-        PminusMat = Pminus;
-      }
-
-      if(dag){
-        PplusMat.adjointInPlace();
-        PminusMat.adjointInPlace();
-      }
-
-      typedef typename SiteHalfSpinor::scalar_type scalar_type;
-      const int Nsimd = Simd::Nsimd();
-      Matp.resize(Ls*LLs);
-      Matm.resize(Ls*LLs);
-
-      for(int s2=0; s2<Ls; s2++){
-      for(int s1=0; s1<LLs; s1++){
-        int istride = LLs;
-        int ostride = 1;
-        Simd Vp;
-        Simd Vm;
-        scalar_type *sp = (scalar_type*) &Vp;
-        scalar_type *sm = (scalar_type*) &Vm;
-        for(int l=0; l<Nsimd; l++){
-          if(switcheroo<Coeff_t>::iscomplex()) {
-            sp[l] = PplusMat (l*istride+s1*ostride,s2);
-            sm[l] = PminusMat(l*istride+s1*ostride,s2);
-          } else {
-            // if real
-            scalar_type tmp;
-            tmp = PplusMat (l*istride+s1*ostride,s2);
-            sp[l] = scalar_type(tmp.real(),tmp.real());
-            tmp = PminusMat(l*istride+s1*ostride,s2);
-            sm[l] = scalar_type(tmp.real(),tmp.real());
-          }
-        }
-        Matp[LLs*s2+s1] = Vp;
-        Matm[LLs*s2+s1] = Vm;
-      }}
-  }
-
-  FermOpTemplateInstantiate(MobiusEOFAFermion);
-  GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h
@@ -26,108 +26,79 @@ with this program; if not, write to the Free Software Foundation, Inc.,

 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef  GRID_QCD_MOBIUS_EOFA_FERMION_H
 #define  GRID_QCD_MOBIUS_EOFA_FERMION_H

 #include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  template<class Impl>
-  class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
-  {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+template<class Impl>
+class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    public:
-      // Shift operator coefficients for red-black preconditioned Mobius EOFA
-      std::vector<Coeff_t> Mooee_shift;
-      std::vector<Coeff_t> MooeeInv_shift_lc;
-      std::vector<Coeff_t> MooeeInv_shift_norm;
-      std::vector<Coeff_t> MooeeInvDag_shift_lc;
-      std::vector<Coeff_t> MooeeInvDag_shift_norm;
+public:
+  // Shift operator coefficients for red-black preconditioned Mobius EOFA
+  Vector<Coeff_t> Mooee_shift;
+  Vector<Coeff_t> MooeeInv_shift_lc;
+  Vector<Coeff_t> MooeeInv_shift_norm;
+  Vector<Coeff_t> MooeeInvDag_shift_lc;
+  Vector<Coeff_t> MooeeInvDag_shift_norm;

-      virtual void Instantiatable(void) {};
+  virtual void Instantiatable(void) {};

-      // EOFA-specific operations
-      virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
-      virtual void  Dtilde           (const FermionField& in, FermionField& out);
-      virtual void  DtildeInv        (const FermionField& in, FermionField& out);
+  // EOFA-specific operations
+  virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
+  virtual void  Dtilde           (const FermionField& in, FermionField& out);
+  virtual void  DtildeInv        (const FermionField& in, FermionField& out);

-      // override multiply
-      virtual RealD M                (const FermionField& in, FermionField& out);
-      virtual RealD Mdag             (const FermionField& in, FermionField& out);
+  // override multiply
+  virtual RealD M                (const FermionField& in, FermionField& out);
+  virtual RealD Mdag             (const FermionField& in, FermionField& out);

-      // half checkerboard operations
-      virtual void  Mooee            (const FermionField& in, FermionField& out);
-      virtual void  MooeeDag         (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv         (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);
+  // half checkerboard operations
+  virtual void  Mooee            (const FermionField& in, FermionField& out);
+  virtual void  MooeeDag         (const FermionField& in, FermionField& out);
+  virtual void  MooeeInv         (const FermionField& in, FermionField& out);
+  virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
+  virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
+  virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);

-      virtual void   M5D             (const FermionField& psi, FermionField& chi);
-      virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);
+  virtual void   M5D             (const FermionField& psi, FermionField& chi);
+  virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);

-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  /////////////////////////////////////////////////////
+  // Instantiate different versions depending on Impl
+  /////////////////////////////////////////////////////
+  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-        std::vector<Coeff_t>& shift_coeffs);
+  void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
+		 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+		 Vector<Coeff_t>& shift_coeffs);

-      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-        std::vector<Coeff_t>& shift_coeffs);
+  void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
+		    Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+		    Vector<Coeff_t>& shift_coeffs);

-      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
+  virtual void RefreshShiftCoefficients(RealD new_shift);

-      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+  // Constructors
+  MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+		    GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+		    RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
+		    RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());

-      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+protected:
+  void SetCoefficientsPrecondShiftOps(void);
+};

-      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
-
-      virtual void RefreshShiftCoefficients(RealD new_shift);
-
-      // Constructors
-      MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
-        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());
-
-    protected:
-      void SetCoefficientsPrecondShiftOps(void);
-  };
-}}
-
-#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
-template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
-template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
-template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
-
-#undef  MOBIUS_EOFA_DPERP_DENSE
-#define MOBIUS_EOFA_DPERP_CACHE
-#undef  MOBIUS_EOFA_DPERP_LINALG
-#define MOBIUS_EOFA_DPERP_VEC
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
@@ -1,429 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
-  {
-    int Ls = this->Ls;
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      for(int s=0; s<Ls; s++){
-        auto tmp = psi._odata[0];
-        if(s==0){
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5m(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-    std::vector<Coeff_t> &shift_coeffs)
-  {
-    int Ls = this->Ls;
-    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      for(int s=0; s<Ls; s++){
-        auto tmp = psi._odata[0];
-        if(s==0){
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5m(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
-        else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
-        chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
-  {
-    int Ls = this->Ls;
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      auto tmp = psi._odata[0];
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5p(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-    std::vector<Coeff_t> &shift_coeffs)
-  {
-    int Ls = this->Ls;
-    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      chi[ss+Ls-1] = zero;
-      auto tmp = psi._odata[0];
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5p(tmp, psi._odata[ss+0]);
-          chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
-        else{ spProj5m(tmp, psi._odata[ss+s]); }
-        chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
-  {
-    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
-
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp = psi._odata[0];
-
-      // Apply (L^{\prime})^{-1}
-      chi[ss] = psi[ss]; // chi[0]=psi[0]
-      for(int s=1; s<Ls; s++){
-        spProj5p(tmp, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
-      }
-
-      // L_m^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-        spProj5m(tmp, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
-      }
-
-      // U_m^{-1} D^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-        spProj5p(tmp, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-      // Apply U^{-1}
-      for(int s=Ls-2; s>=0; s--){
-        spProj5m(tmp, chi[ss+s+1]);
-        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
-  {
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp1        = psi._odata[0];
-      auto tmp2        = psi._odata[0];
-      auto tmp2_spProj = psi._odata[0];
-
-      // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
-      chi[ss] = psi[ss]; // chi[0]=psi[0]
-      tmp2 = MooeeInv_shift_lc[0]*psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5p(tmp1, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-        tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
-      }
-      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-      else{ spProj5m(tmp2_spProj, tmp2); }
-
-      // L_m^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-        spProj5m(tmp1, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-      }
-
-      // U_m^{-1} D^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-        spProj5p(tmp1, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
-      }
-      // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-
-      // Apply U^{-1} and add shift term
-      for(int s=Ls-2; s>=0; s--){
-        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-        spProj5m(tmp1, chi[ss+s]);
-        chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
-  {
-    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
-
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp = psi._odata[0];
-
-      // Apply (U^{\prime})^{-dag}
-      chi[ss] = psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5m(tmp, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
-      }
-
-      // U_m^{-\dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5p(tmp, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
-      }
-
-      // L_m^{-\dag} D^{-dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5m(tmp, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-      // Apply L^{-dag}
-      for(int s=Ls-2; s>=0; s--){
-        spProj5p(tmp, chi[ss+s+1]);
-        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
-  {
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp1        = psi._odata[0];
-      auto tmp2        = psi._odata[0];
-      auto tmp2_spProj = psi._odata[0];
-
-      // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
-      chi[ss] = psi[ss];
-      tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5m(tmp1, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
-        tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
-      }
-      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-      else{ spProj5m(tmp2_spProj, tmp2); }
-
-      // U_m^{-\dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5p(tmp1, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
-      }
-
-      // L_m^{-\dag} D^{-dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5m(tmp1, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-      spProj5p(tmp1, chi[ss+Ls-1]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
-
-      // Apply L^{-dag}
-      for(int s=Ls-2; s>=0; s--){
-        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
-        spProj5p(tmp1, chi[ss+s]);
-        chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_CACHE
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
@@ -1,184 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  /*
-  * Dense matrix versions of routines
-  */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-  {
-    int Ls = this->Ls;
-    int LLs = psi._grid->_rdimensions[0];
-    int vol = psi._grid->oSites()/LLs;
-
-    int pm      = this->pm;
-    RealD shift = this->shift;
-    RealD alpha = this->alpha;
-    RealD k     = this->k;
-    RealD mq1   = this->mq1;
-
-    chi.checkerboard = psi.checkerboard;
-
-    assert(Ls==LLs);
-
-    Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-    Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
-
-    for(int s=0;s<Ls;s++){
-        Pplus(s,s)  = this->bee[s];
-        Pminus(s,s) = this->bee[s];
-    }
-
-    for(int s=0; s<Ls-1; s++){
-        Pminus(s,s+1) = -this->cee[s];
-    }
-
-    for(int s=0; s<Ls-1; s++){
-        Pplus(s+1,s) = -this->cee[s+1];
-    }
-    Pplus (0,Ls-1) = mq1*this->cee[0];
-    Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
-
-    if(shift != 0.0){
-      Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-      for(int s=0; s<Ls; ++s){
-        if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
-        else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
-      }
-    }
-
-    Eigen::MatrixXd PplusMat ;
-    Eigen::MatrixXd PminusMat;
-
-    if(inv){
-      PplusMat  = Pplus.inverse();
-      PminusMat = Pminus.inverse();
-    } else {
-      PplusMat  = Pplus;
-      PminusMat = Pminus;
-    }
-
-    if(dag){
-      PplusMat.adjointInPlace();
-      PminusMat.adjointInPlace();
-    }
-
-    // For the non-vectorised s-direction this is simple
-
-    for(auto site=0; site<vol; site++){
-
-        SiteSpinor     SiteChi;
-        SiteHalfSpinor SitePplus;
-        SiteHalfSpinor SitePminus;
-
-        for(int s1=0; s1<Ls; s1++){
-            SiteChi = zero;
-            for(int s2=0; s2<Ls; s2++){
-                int lex2 = s2 + Ls*site;
-                if(PplusMat(s1,s2) != 0.0){
-                    spProj5p(SitePplus,psi[lex2]);
-                    accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
-                }
-                if(PminusMat(s1,s2) != 0.0){
-                    spProj5m(SitePminus, psi[lex2]);
-                    accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
-                }
-            }
-            chi[s1+Ls*site] = SiteChi*0.5;
-        }
-    }
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_DENSE
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-    template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
@@ -1,290 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-  // Pminus fowards
-  // Pplus  backwards
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
-
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    FermionField tmp(psi._grid);
-
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    axpby_ssp(tmp, czero, tmp, this->MooeeInv_shift_lc[0], psi, 0, 0);
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-      axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply U^{-1} and add shift term
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
-      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
-
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    // Apply (U^{\prime})^{-dagger}
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    FermionField tmp(psi._grid);
-
-    // Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    axpby_ssp(tmp, czero, tmp, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
-      axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply L^{-dagger} and add shift
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
-      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
-    }
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_LINALG
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
@@ -1,983 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  /*
-  * Dense matrix versions of routines
-  */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    GridBase* grid  = psi._grid;
-    int Ls          = this->Ls;
-    int LLs         = grid->_rdimensions[0];
-    const int nsimd = Simd::Nsimd();
-
-    Vector<iSinglet<Simd>> u(LLs);
-    Vector<iSinglet<Simd>> l(LLs);
-    Vector<iSinglet<Simd>> d(LLs);
-
-    assert(Ls/LLs == nsimd);
-    assert(phi.checkerboard == psi.checkerboard);
-
-    chi.checkerboard = psi.checkerboard;
-
-    // just directly address via type pun
-    typedef typename Simd::scalar_type scalar_type;
-    scalar_type* u_p = (scalar_type*) &u[0];
-    scalar_type* l_p = (scalar_type*) &l[0];
-    scalar_type* d_p = (scalar_type*) &d[0];
-
-    for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    assert(Nc == 3);
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-      #if 0
-
-        alignas(64) SiteHalfSpinor hp;
-        alignas(64) SiteHalfSpinor hm;
-        alignas(64) SiteSpinor fp;
-        alignas(64) SiteSpinor fm;
-
-        for(int v=0; v<LLs; v++){
-
-          int vp = (v+1)%LLs;
-          int vm = (v+LLs-1)%LLs;
-
-          spProj5m(hp, psi[ss+vp]);
-          spProj5p(hm, psi[ss+vm]);
-
-          if (vp <= v){ rotate(hp, hp, 1); }
-          if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-          hp = 0.5*hp;
-          hm = 0.5*hm;
-
-          spRecon5m(fp, hp);
-          spRecon5p(fm, hm);
-
-          chi[ss+v] = d[v]*phi[ss+v];
-          chi[ss+v] = chi[ss+v] + u[v]*fp;
-          chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-        }
-
-      #else
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]);
-
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0)     ? LLs-1 : v-1;
-
-          Simd hp_00 = psi[ss+vp]()(2)(0);
-          Simd hp_01 = psi[ss+vp]()(2)(1);
-          Simd hp_02 = psi[ss+vp]()(2)(2);
-          Simd hp_10 = psi[ss+vp]()(3)(0);
-          Simd hp_11 = psi[ss+vp]()(3)(1);
-          Simd hp_12 = psi[ss+vp]()(3)(2);
-
-          Simd hm_00 = psi[ss+vm]()(0)(0);
-          Simd hm_01 = psi[ss+vm]()(0)(1);
-          Simd hm_02 = psi[ss+vm]()(0)(2);
-          Simd hm_10 = psi[ss+vm]()(1)(0);
-          Simd hm_11 = psi[ss+vm]()(1)(1);
-          Simd hm_12 = psi[ss+vm]()(1)(2);
-
-          if(vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          // Can force these to real arithmetic and save 2x.
-          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-        }
-
-      #endif
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    #if 0
-
-      this->M5D(psi, phi, chi, lower, diag, upper);
-
-      // FIXME: possible gain from vectorizing shift operation as well?
-      Coeff_t one(1.0);
-      int Ls = this->Ls;
-      for(int s=0; s<Ls; s++){
-        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-      }
-
-    #else
-
-      GridBase* grid  = psi._grid;
-      int Ls          = this->Ls;
-      int LLs         = grid->_rdimensions[0];
-      const int nsimd = Simd::Nsimd();
-
-      Vector<iSinglet<Simd>> u(LLs);
-      Vector<iSinglet<Simd>> l(LLs);
-      Vector<iSinglet<Simd>> d(LLs);
-      Vector<iSinglet<Simd>> s(LLs);
-
-      assert(Ls/LLs == nsimd);
-      assert(phi.checkerboard == psi.checkerboard);
-
-      chi.checkerboard = psi.checkerboard;
-
-      // just directly address via type pun
-      typedef typename Simd::scalar_type scalar_type;
-      scalar_type* u_p = (scalar_type*) &u[0];
-      scalar_type* l_p = (scalar_type*) &l[0];
-      scalar_type* d_p = (scalar_type*) &d[0];
-      scalar_type* s_p = (scalar_type*) &s[0];
-
-      for(int o=0; o<LLs; o++){ // outer
-      for(int i=0; i<nsimd; i++){ //inner
-        int s   = o + i*LLs;
-        int ss  = o*nsimd + i;
-        u_p[ss] = upper[s];
-        l_p[ss] = lower[s];
-        d_p[ss] = diag[s];
-        s_p[ss] = shift_coeffs[s];
-      }}
-
-      this->M5Dcalls++;
-      this->M5Dtime -= usecond();
-
-      assert(Nc == 3);
-
-      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-        int vs     = (this->pm == 1) ? LLs-1 : 0;
-        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
-        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]);
-
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0)     ? LLs-1 : v-1;
-
-          Simd hp_00 = psi[ss+vp]()(2)(0);
-          Simd hp_01 = psi[ss+vp]()(2)(1);
-          Simd hp_02 = psi[ss+vp]()(2)(2);
-          Simd hp_10 = psi[ss+vp]()(3)(0);
-          Simd hp_11 = psi[ss+vp]()(3)(1);
-          Simd hp_12 = psi[ss+vp]()(3)(2);
-
-          Simd hm_00 = psi[ss+vm]()(0)(0);
-          Simd hm_01 = psi[ss+vm]()(0)(1);
-          Simd hm_02 = psi[ss+vm]()(0)(2);
-          Simd hm_10 = psi[ss+vm]()(1)(0);
-          Simd hm_11 = psi[ss+vm]()(1)(1);
-          Simd hm_12 = psi[ss+vm]()(1)(2);
-
-          if(vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(this->pm == 1 && vs <= v){
-            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-          }
-
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          if(this->pm == -1 && vs >= v){
-            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-          }
-
-          // Can force these to real arithmetic and save 2x.
-          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-        }
-      }
-
-      this->M5Dtime += usecond();
-
-    #endif
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    GridBase* grid = psi._grid;
-    int Ls  = this->Ls;
-    int LLs = grid->_rdimensions[0];
-    int nsimd = Simd::Nsimd();
-
-    Vector<iSinglet<Simd>> u(LLs);
-    Vector<iSinglet<Simd>> l(LLs);
-    Vector<iSinglet<Simd>> d(LLs);
-
-    assert(Ls/LLs == nsimd);
-    assert(phi.checkerboard == psi.checkerboard);
-
-    chi.checkerboard = psi.checkerboard;
-
-    // just directly address via type pun
-    typedef typename Simd::scalar_type scalar_type;
-    scalar_type* u_p = (scalar_type*) &u[0];
-    scalar_type* l_p = (scalar_type*) &l[0];
-    scalar_type* d_p = (scalar_type*) &d[0];
-
-    for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-      #if 0
-
-        alignas(64) SiteHalfSpinor hp;
-        alignas(64) SiteHalfSpinor hm;
-        alignas(64) SiteSpinor fp;
-        alignas(64) SiteSpinor fm;
-
-        for(int v=0; v<LLs; v++){
-
-          int vp = (v+1)%LLs;
-          int vm = (v+LLs-1)%LLs;
-
-          spProj5p(hp, psi[ss+vp]);
-          spProj5m(hm, psi[ss+vm]);
-
-          if(vp <= v){ rotate(hp, hp, 1); }
-          if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-          hp = hp*0.5;
-          hm = hm*0.5;
-          spRecon5p(fp, hp);
-          spRecon5m(fm, hm);
-
-          chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-          chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-        }
-
-      #else
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]);
-
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0    ) ? LLs-1 : v-1;
-
-          Simd hp_00 = psi[ss+vp]()(0)(0);
-          Simd hp_01 = psi[ss+vp]()(0)(1);
-          Simd hp_02 = psi[ss+vp]()(0)(2);
-          Simd hp_10 = psi[ss+vp]()(1)(0);
-          Simd hp_11 = psi[ss+vp]()(1)(1);
-          Simd hp_12 = psi[ss+vp]()(1)(2);
-
-          Simd hm_00 = psi[ss+vm]()(2)(0);
-          Simd hm_01 = psi[ss+vm]()(2)(1);
-          Simd hm_02 = psi[ss+vm]()(2)(2);
-          Simd hm_10 = psi[ss+vm]()(3)(0);
-          Simd hm_11 = psi[ss+vm]()(3)(1);
-          Simd hm_12 = psi[ss+vm]()(3)(2);
-
-          if (vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-
-        }
-
-      #endif
-
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    #if 0
-
-      this->M5Ddag(psi, phi, chi, lower, diag, upper);
-
-      // FIXME: possible gain from vectorizing shift operation as well?
-      Coeff_t one(1.0);
-      int Ls = this->Ls;
-      for(int s=0; s<Ls; s++){
-        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-      }
-
-    #else
-
-      GridBase* grid = psi._grid;
-      int Ls  = this->Ls;
-      int LLs = grid->_rdimensions[0];
-      int nsimd = Simd::Nsimd();
-
-      Vector<iSinglet<Simd>> u(LLs);
-      Vector<iSinglet<Simd>> l(LLs);
-      Vector<iSinglet<Simd>> d(LLs);
-      Vector<iSinglet<Simd>> s(LLs);
-
-      assert(Ls/LLs == nsimd);
-      assert(phi.checkerboard == psi.checkerboard);
-
-      chi.checkerboard = psi.checkerboard;
-
-      // just directly address via type pun
-      typedef typename Simd::scalar_type scalar_type;
-      scalar_type* u_p = (scalar_type*) &u[0];
-      scalar_type* l_p = (scalar_type*) &l[0];
-      scalar_type* d_p = (scalar_type*) &d[0];
-      scalar_type* s_p = (scalar_type*) &s[0];
-
-      for(int o=0; o<LLs; o++){ // outer
-      for(int i=0; i<nsimd; i++){ //inner
-        int s  = o + i*LLs;
-        int ss = o*nsimd + i;
-        u_p[ss] = upper[s];
-        l_p[ss] = lower[s];
-        d_p[ss] = diag[s];
-        s_p[ss] = shift_coeffs[s];
-      }}
-
-      this->M5Dcalls++;
-      this->M5Dtime -= usecond();
-
-      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-        int vs     = (this->pm == 1) ? LLs-1 : 0;
-        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]);
-
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0    ) ? LLs-1 : v-1;
-
-          Simd hp_00 = psi[ss+vp]()(0)(0);
-          Simd hp_01 = psi[ss+vp]()(0)(1);
-          Simd hp_02 = psi[ss+vp]()(0)(2);
-          Simd hp_10 = psi[ss+vp]()(1)(0);
-          Simd hp_11 = psi[ss+vp]()(1)(1);
-          Simd hp_12 = psi[ss+vp]()(1)(2);
-
-          Simd hm_00 = psi[ss+vm]()(2)(0);
-          Simd hm_01 = psi[ss+vm]()(2)(1);
-          Simd hm_02 = psi[ss+vm]()(2)(2);
-          Simd hm_10 = psi[ss+vm]()(3)(0);
-          Simd hm_11 = psi[ss+vm]()(3)(1);
-          Simd hm_12 = psi[ss+vm]()(3)(2);
-
-          if (vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(this->pm == 1 && vs <= v){
-            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-          }
-
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          if(this->pm == -1 && vs >= v){
-            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-          }
-
-          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-
-        }
-
-      }
-
-      this->M5Dtime += usecond();
-
-    #endif
-  }
-
-  #ifdef AVX512
-    #include<simd/Intel512common.h>
-    #include<simd/Intel512avx.h>
-    #include<simd/Intel512single.h>
-  #endif
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
-    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-  {
-    #ifndef AVX512
-      {
-        SiteHalfSpinor BcastP;
-        SiteHalfSpinor BcastM;
-        SiteHalfSpinor SiteChiP;
-        SiteHalfSpinor SiteChiM;
-
-        // Ls*Ls * 2 * 12 * vol flops
-        for(int s1=0; s1<LLs; s1++){
-
-          for(int s2=0; s2<LLs; s2++){
-          for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-            int s = s2 + l*LLs;
-            int lex = s2 + LLs*site;
-
-            if( s2==0 && l==0 ){
-              SiteChiP=zero;
-              SiteChiM=zero;
-            }
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-          }}
-
-          {
-            int lex = s1 + LLs*site;
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-              vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-            }}
-          }
-        }
-      }
-    #else
-      {
-        // pointers
-        //  MASK_REGS;
-        #define Chi_00 %%zmm1
-        #define Chi_01 %%zmm2
-        #define Chi_02 %%zmm3
-        #define Chi_10 %%zmm4
-        #define Chi_11 %%zmm5
-        #define Chi_12 %%zmm6
-        #define Chi_20 %%zmm7
-        #define Chi_21 %%zmm8
-        #define Chi_22 %%zmm9
-        #define Chi_30 %%zmm10
-        #define Chi_31 %%zmm11
-        #define Chi_32 %%zmm12
-
-        #define BCAST0  %%zmm13
-        #define BCAST1  %%zmm14
-        #define BCAST2  %%zmm15
-        #define BCAST3  %%zmm16
-        #define BCAST4  %%zmm17
-        #define BCAST5  %%zmm18
-        #define BCAST6  %%zmm19
-        #define BCAST7  %%zmm20
-        #define BCAST8  %%zmm21
-        #define BCAST9  %%zmm22
-        #define BCAST10 %%zmm23
-        #define BCAST11 %%zmm24
-
-        int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-
-        for(int s1=0; s1<LLs; s1++){
-
-          for(int s2=0; s2<LLs; s2++){
-
-            int lex = s2 + LLs*site;
-            uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-            uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-            uint64_t a2 = (uint64_t) &psi[lex];
-
-            for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-
-              if((s2+l)==0) {
-                asm(
-                      VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-                      VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-                      VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-                      VBCASTCDUP(0,%2,BCAST0)
-                      VBCASTCDUP(1,%2,BCAST1)
-                      VBCASTCDUP(2,%2,BCAST2)
-                      VBCASTCDUP(3,%2,BCAST3)
-                      VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-                      VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-                      VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-                      VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-                      VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-                      VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-                      VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-                      VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-                      VMULMEM(0,%1,BCAST8,Chi_22)
-                      VMULMEM(0,%1,BCAST9,Chi_30)
-                      VMULMEM(0,%1,BCAST10,Chi_31)
-                      VMULMEM(0,%1,BCAST11,Chi_32)
-                      : : "r" (a0), "r" (a1), "r" (a2)                            );
-              } else {
-                asm(
-                      VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-                      VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-                      VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-                      VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-                      VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-                      VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-                      VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-                      VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-                      VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-                      VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-                      VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-                      VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-                      : : "r" (a0), "r" (a1), "r" (a2)                            );
-              }
-
-              a0 = a0 + incr;
-              a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type);
-            }
-          }
-
-          {
-            int lexa = s1+LLs*site;
-            asm (
-               VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-               VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-               VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-               VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-               : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-          }
-        }
-      }
-
-      #undef Chi_00
-      #undef Chi_01
-      #undef Chi_02
-      #undef Chi_10
-      #undef Chi_11
-      #undef Chi_12
-      #undef Chi_20
-      #undef Chi_21
-      #undef Chi_22
-      #undef Chi_30
-      #undef Chi_31
-      #undef Chi_32
-
-      #undef BCAST0
-      #undef BCAST1
-      #undef BCAST2
-      #undef BCAST3
-      #undef BCAST4
-      #undef BCAST5
-      #undef BCAST6
-      #undef BCAST7
-      #undef BCAST8
-      #undef BCAST9
-      #undef BCAST10
-      #undef BCAST11
-
-    #endif
-  };
-
-  // Z-mobius version
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-  {
-    std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-    exit(-1);
-  };
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-  {
-    int Ls  = this->Ls;
-    int LLs = psi._grid->_rdimensions[0];
-    int vol = psi._grid->oSites()/LLs;
-
-    chi.checkerboard = psi.checkerboard;
-
-    Vector<iSinglet<Simd>>   Matp;
-    Vector<iSinglet<Simd>>   Matm;
-    Vector<iSinglet<Simd>>* _Matp;
-    Vector<iSinglet<Simd>>* _Matm;
-
-    //  MooeeInternalCompute(dag,inv,Matp,Matm);
-    if(inv && dag){
-      _Matp = &this->MatpInvDag;
-      _Matm = &this->MatmInvDag;
-    }
-
-    if(inv && (!dag)){
-      _Matp = &this->MatpInv;
-      _Matm = &this->MatmInv;
-    }
-
-    if(!inv){
-      MooeeInternalCompute(dag, inv, Matp, Matm);
-      _Matp = &Matp;
-      _Matm = &Matm;
-    }
-
-    assert(_Matp->size() == Ls*LLs);
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    if(switcheroo<Coeff_t>::iscomplex()){
-      parallel_for(auto site=0; site<vol; site++){
-        MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-      }
-    } else {
-      parallel_for(auto site=0; site<vol; site++){
-        MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_VEC
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-
-    template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusFermion.h
+++ b/Grid/qcd/action/fermion/MobiusFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,57 +24,54 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_MOBIUS_FERMION_H
 #define  GRID_QCD_MOBIUS_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class MobiusFermion : public CayleyFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class MobiusFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void) {};
-      // Constructors
-      MobiusFermion(GaugeField &_Umu,
-		    GridCartesian         &FiveDimGrid,
-		    GridRedBlackCartesian &FiveDimRedBlackGrid,
-		    GridCartesian         &FourDimGrid,
-		    GridRedBlackCartesian &FourDimRedBlackGrid,
-		    RealD _mass,RealD _M5,
-		    RealD b, RealD c,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructors
+  MobiusFermion(GaugeField &_Umu,
+		GridCartesian         &FiveDimGrid,
+		GridRedBlackCartesian &FiveDimRedBlackGrid,
+		GridCartesian         &FourDimGrid,
+		GridRedBlackCartesian &FourDimRedBlackGrid,
+		RealD _mass,RealD _M5,
+		RealD b, RealD c,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = 1.0;
+  {
+    RealD eps = 1.0;

-	std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
-	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
-	assert(zdata->n==this->Ls);
+    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
+    Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
+    assert(zdata->n==this->Ls);
 	
-	// Call base setter
-	this->SetCoefficientsTanh(zdata,b,c);
+    // Call base setter
+    this->SetCoefficientsTanh(zdata,b,c);

-	Approx::zolotarev_free(zdata);
+    Approx::zolotarev_free(zdata);
 
-      }
-
-    };
-
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,58 +24,55 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class MobiusZolotarevFermion : public CayleyFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class MobiusZolotarevFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void) {};
-      // Constructors
-       MobiusZolotarevFermion(GaugeField &_Umu,
-			      GridCartesian         &FiveDimGrid,
-			      GridRedBlackCartesian &FiveDimRedBlackGrid,
-			      GridCartesian         &FourDimGrid,
-			      GridRedBlackCartesian &FourDimRedBlackGrid,
-			      RealD _mass,RealD _M5,
-			      RealD b, RealD c,
-			      RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructors
+  MobiusZolotarevFermion(GaugeField &_Umu,
+			 GridCartesian         &FiveDimGrid,
+			 GridRedBlackCartesian &FiveDimRedBlackGrid,
+			 GridCartesian         &FourDimGrid,
+			 GridRedBlackCartesian &FourDimRedBlackGrid,
+			 RealD _mass,RealD _M5,
+			 RealD b, RealD c,
+			 RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = lo/hi;
+  {
+    RealD eps = lo/hi;

-	Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0);
-	assert(zdata->n==this->Ls);
+    Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0);
+    assert(zdata->n==this->Ls);

-	std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
+    std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
 	
-	// Call base setter
-	this->SetCoefficientsZolotarev(hi,zdata,b,c);
+    // Call base setter
+    this->SetCoefficientsZolotarev(hi,zdata,b,c);
 
-	Approx::zolotarev_free(zdata);
-      }
-
-    };
-
+    Approx::zolotarev_free(zdata);
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,46 +24,44 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
 #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
-
-    template<class Impl>
-    class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+template<class Impl>
+class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

     void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
       this->MomentumSpacePropagatorHw(out,in,_m,twist);
-     };
+  };

-     // Constructors
-    OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
-				   GridCartesian         &FiveDimGrid,
-				   GridRedBlackCartesian &FiveDimRedBlackGrid,
-				   GridCartesian         &FourDimGrid,
-				   GridRedBlackCartesian &FourDimRedBlackGrid,
-				   RealD _mass,RealD _M5,
-				   RealD scale,const ImplParams &p= ImplParams()) :
+  // Constructors
+  OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
+				 GridCartesian         &FiveDimGrid,
+				 GridRedBlackCartesian &FiveDimRedBlackGrid,
+				 GridCartesian         &FourDimGrid,
+				 GridRedBlackCartesian &FourDimRedBlackGrid,
+				 RealD _mass,RealD _M5,
+				 RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      MobiusFermion<Impl>(_Umu,
-			  FiveDimGrid,
-			  FiveDimRedBlackGrid,
-			  FourDimGrid,
-			  FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale,p)
-	{
-	}
-    };
+    // b+c=scale, b-c = 0 <=> b =c = scale/2
+    MobiusFermion<Impl>(_Umu,
+			FiveDimGrid,
+			FiveDimRedBlackGrid,
+			FourDimGrid,
+			FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale,p)
+  {
  }
-}
+};
+
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,45 +24,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
 #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  // Constructors

-      // Constructors
+  OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
+				      GridCartesian         &FiveDimGrid,
+				      GridRedBlackCartesian &FiveDimRedBlackGrid,
+				      GridCartesian         &FourDimGrid,
+				      GridRedBlackCartesian &FourDimRedBlackGrid,
+				      RealD _mass,RealD _M5,
+				      RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
+    // b+c=1.0, b-c = 0 <=> b =c = 1/2
+    MobiusZolotarevFermion<Impl>(_Umu,
+				 FiveDimGrid,
+				 FiveDimRedBlackGrid,
+				 FourDimGrid,
+				 FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi,p)

-    OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
-					GridCartesian         &FiveDimGrid,
-					GridRedBlackCartesian &FiveDimRedBlackGrid,
-					GridCartesian         &FourDimGrid,
-					GridRedBlackCartesian &FourDimRedBlackGrid,
-					RealD _mass,RealD _M5,
-					RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
-      // b+c=1.0, b-c = 0 <=> b =c = 1/2
-      MobiusZolotarevFermion<Impl>(_Umu,
-				   FiveDimGrid,
-				   FiveDimRedBlackGrid,
-				   FourDimGrid,
-				   FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi,p)
+  {}

-      {}
+};

-    };
-
-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,48 +24,47 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
-				     GridCartesian         &FiveDimGrid,
-				     GridRedBlackCartesian &FiveDimRedBlackGrid,
-				     GridCartesian         &FourDimGrid,
-				     GridRedBlackCartesian &FourDimRedBlackGrid,
-				     RealD _mass,RealD _M5,
-				     RealD scale,const ImplParams &p= ImplParams()) :
+  virtual void   Instantiatable(void){};
+  // Constructors
+  OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
+				   GridCartesian         &FiveDimGrid,
+				   GridRedBlackCartesian &FiveDimRedBlackGrid,
+				   GridCartesian         &FourDimGrid,
+				   GridRedBlackCartesian &FourDimRedBlackGrid,
+				   RealD _mass,RealD _M5,
+				   RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D<Impl>(_Umu,
-				       FiveDimGrid,
-				       FiveDimRedBlackGrid,
-				       FourDimGrid,
-				       FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=this->Ls-1;// Even rational order
-	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  this->SetCoefficientsTanh(zdata,scale);
-	  Approx::zolotarev_free(zdata);
-	}
-    };
+    // b+c=scale, b-c = 0 <=> b =c = scale/2
+    ContinuedFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required
+    int nrational=this->Ls-1;// Even rational order
+    Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
+    this->SetCoefficientsTanh(zdata,scale);
+    Approx::zolotarev_free(zdata);
  }
-}
+};
+
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,51 +24,49 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
-					  GridCartesian         &FiveDimGrid,
-					  GridRedBlackCartesian &FiveDimRedBlackGrid,
-					  GridCartesian         &FourDimGrid,
-					  GridRedBlackCartesian &FourDimRedBlackGrid,
-					  RealD _mass,RealD _M5,
-					  RealD lo,RealD hi,const ImplParams &p= ImplParams()):
+  virtual void   Instantiatable(void){};
+  // Constructors
+  OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
+					GridCartesian         &FiveDimGrid,
+					GridRedBlackCartesian &FiveDimRedBlackGrid,
+					GridCartesian         &FourDimGrid,
+					GridRedBlackCartesian &FourDimRedBlackGrid,
+					RealD _mass,RealD _M5,
+					RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D<Impl>(_Umu,
-				       FiveDimGrid,
-				       FiveDimRedBlackGrid,
-				       FourDimGrid,
-				       FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
+    // b+c=scale, b-c = 0 <=> b =c = scale/2
+    ContinuedFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=this->Ls;// Odd rational order
-	  RealD eps = lo/hi;
+    int nrational=this->Ls;// Odd rational order
+    RealD eps = lo/hi;

-	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  this->SetCoefficientsZolotarev(hi,zdata);
-	  Approx::zolotarev_free(zdata);
+    Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
+    this->SetCoefficientsZolotarev(hi,zdata);
+    Approx::zolotarev_free(zdata);

-	}
-    };
  }
-}
+};
+
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,48 +24,46 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
-					    GridCartesian         &FiveDimGrid,
-					    GridRedBlackCartesian &FiveDimRedBlackGrid,
-					    GridCartesian         &FourDimGrid,
-					    GridRedBlackCartesian &FourDimRedBlackGrid,
-					    RealD _mass,RealD _M5,
-					    RealD scale,const ImplParams &p= ImplParams()) :
+  virtual void   Instantiatable(void){};
+  // Constructors
+  OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
+					  GridCartesian         &FiveDimGrid,
+					  GridRedBlackCartesian &FiveDimRedBlackGrid,
+					  GridCartesian         &FourDimGrid,
+					  GridRedBlackCartesian &FourDimRedBlackGrid,
+					  RealD _mass,RealD _M5,
+					  RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D<Impl>(_Umu,
-				     FiveDimGrid,
-				     FiveDimRedBlackGrid,
-				     FourDimGrid,
-				     FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=this->Ls-1;// Even rational order
-	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  this->SetCoefficientsTanh(zdata,scale);
-	  Approx::zolotarev_free(zdata);
-	}
-    };
+    // b+c=scale, b-c = 0 <=> b =c = scale/2
+    PartialFractionFermion5D<Impl>(_Umu,
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required
+    int nrational=this->Ls-1;// Even rational order
+    Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
+    this->SetCoefficientsTanh(zdata,scale);
+    Approx::zolotarev_free(zdata);
  }
-}
+};
+
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,51 +24,50 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
-						 GridCartesian         &FiveDimGrid,
-						 GridRedBlackCartesian &FiveDimRedBlackGrid,
-						 GridCartesian         &FourDimGrid,
-						 GridRedBlackCartesian &FourDimRedBlackGrid,
-						 RealD _mass,RealD _M5,
-						 RealD lo,RealD hi,const ImplParams &p= ImplParams()):
+  virtual void   Instantiatable(void){};
+  // Constructors
+  OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
+					       GridCartesian         &FiveDimGrid,
+					       GridRedBlackCartesian &FiveDimRedBlackGrid,
+					       GridCartesian         &FourDimGrid,
+					       GridRedBlackCartesian &FourDimRedBlackGrid,
+					       RealD _mass,RealD _M5,
+					       RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D<Impl>(_Umu,
-				     FiveDimGrid,
-				     FiveDimRedBlackGrid,
-				     FourDimGrid,
-				     FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
+    // b+c=scale, b-c = 0 <=> b =c = scale/2
+    PartialFractionFermion5D<Impl>(_Umu,
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=this->Ls;// Odd rational order
-	  RealD eps = lo/hi;
+    int nrational=this->Ls;// Odd rational order
+    RealD eps = lo/hi;

-	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  this->SetCoefficientsZolotarev(hi,zdata);
-	  Approx::zolotarev_free(zdata);
+    Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
+    this->SetCoefficientsZolotarev(hi,zdata);
+    Approx::zolotarev_free(zdata);

-	}
-    };
  }
-}
+};
+
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -1,459 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
-
-namespace Grid {
-  namespace QCD {
-
-
-    template<class Impl>
-    void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-      // this does both dag and undag but is trivial; make a common helper routing
-
-      int sign = 1;
-      int Ls = this->Ls;
-
-      this->DhopDir(psi,chi,dir,disp);
-
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-	ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-      }
-      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-
-    }
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      int Ls = this->Ls;
-      int sign = dag ? (-1) : 1;
-
-      if ( psi.checkerboard == Odd ) {
-	this->DhopEO(psi,chi,DaggerNo);
-      } else {
-	this->DhopOE(psi,chi,DaggerNo);
-      }
-
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-	ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-      }
-      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-    }
-
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      // again dag and undag are trivially related
-      int sign = dag ? (-1) : 1;
-      int Ls = this->Ls;
-      
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	
-	// Do each 2x2 block aligned at s and multiplies Dw site diagonal by G5 so Hw
-	ag5xpby_ssp(chi,-dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s  ,s+1); 
-	ag5xpby_ssp(chi, dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s+1,s);
-	axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-      }
-      
-      {
-	RealD R=(1+mass)/(1-mass);
-	//R g5 psi[Ls-1] + p[0] H
-	ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
-	
-	for(int b=0;b<nblock;b++){
-	  int s = 2*b+1;
-	  RealD pp = p[nblock-1-b];
-	  axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-	}
-      }
-    }
-
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      int sign = dag ? (-1) : 1;
-      int Ls = this->Ls;
-
-      FermionField tmp(psi._grid);
-      
-      ///////////////////////////////////////////////////////////////////////////////////////
-      //Linv
-      ///////////////////////////////////////////////////////////////////////////////////////
-      int nblock=(Ls-1)/2;
-
-      axpy(chi,0.0,psi,psi); // Identity piece
-      
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-	RealD coeff2=sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-	axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s);
-	axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
-      }
-      
-      ///////////////////////////////////////////////////////////////////////////////////////
-      //Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
-      // Compute Seeinv (coeff of gamma5)
-      ///////////////////////////////////////////////////////////////////////////////////////
-      RealD R=(1+mass)/(1-mass);
-      RealD Seeinv = R + p[nblock]*dw_diag/amax;
-      for(int b=0;b<nblock;b++){
-	Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
-      }    
-      Seeinv = 1.0/Seeinv;
-      
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	RealD coeff1=dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-	RealD coeff2=amax*sqrt(qq) / ( dw_diag*dw_diag + amax*amax* qq);
-	ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
-	ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
-      }
-      ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
-      
-      ///////////////////////////////////////////////////////////////////////////////////////
-      // Uinv
-      ///////////////////////////////////////////////////////////////////////////////////////
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-	RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-	axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1);
-	axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
-      }
-      axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
-    }
-
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      FermionField D(psi._grid);
-  
-      int Ls = this->Ls;
-      int sign = dag ? (-1) : 1;
-
-      // For partial frac Hw case (b5=c5=1) chroma quirkily computes
-      //
-      // Conventions for partfrac appear to be a mess.
-      // Tony's Nara lectures have
-      //
-      // BlockDiag(  H/p_i  1             | 1       )    
-      //          (  1      p_i H / q_i^2 | 0       )  
-      //           ---------------------------------
-      //           ( -1      0                | R  +p0 H  )
-      //
-      //Chroma     ( -2H    2sqrt(q_i)    |   0         )
-      //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
-      //           ---------------------------------
-      //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
-      //
-      // Edwards/Joo/Kennedy/Wenger
-      //
-      // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
-      // incorporate the approx scale factor. This is obtained by propagating the
-      // scale on "H" out to the off diagonal elements as follows:
-      //
-      // BlockDiag(  H/p_i  1             | 1       ) 
-      //          (  1      p_i H / q_i^2 | 0       )  
-      //           ---------------------------------
-      //          ( -1      0                | R  + p_0 H  )
-      //
-      // becomes:
-      // BlockDiag(  H/ sp_i  1               | 1             ) 
-      //          (  1      sp_i H / s^2q_i^2 | 0             )  
-      //           ---------------------------------
-      //           ( -1      0                | R + p_0/s H   )
-      //
-      //
-      // This is implemented in Chroma by
-      //           p0' = p0/approxMax
-      //           p_i' = p_i*approxMax
-      //           q_i' = q_i*approxMax*approxMax
-      //
-      // After the equivalence transform is applied the matrix becomes
-      // 
-      //Chroma     ( -2H    sqrt(q'_i)    |   0         )
-      //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
-      //           ---------------------------------
-      //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
-      //
-      //     =     ( -2H    sqrt(q_i)amax    |   0              )
-      //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
-      //           ---------------------------------
-      //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
-      //
-
-      this->DW(psi,D,DaggerNo); 
-
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	
-	int s = 2*b;
-	double pp = p[nblock-1-b];
-	double qq = q[nblock-1-b];
-	
-	// Do each 2x2 block aligned at s and
-	ag5xpby_ssp(chi,-1.0*scale,D,amax*sqrt(qq)*scale,psi, s  ,s+1); // Multiplies Dw by G5 so Hw
-	ag5xpby_ssp(chi, 1.0*scale,D,amax*sqrt(qq)*scale,psi, s+1,s);
-	
-	// Pick up last column
-	axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-      }
-	
-      {
-	double R=(1+this->mass)/(1-this->mass);
-	//R g5 psi[Ls] + p[0] H
-	ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
-	for(int b=0;b<nblock;b++){
-	  int s = 2*b+1;
-	  double pp = p[nblock-1-b];
-	  axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-	}
-      }
-
-    }
-
-    template<class Impl>
-    RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
-    {
-      M_internal(in,out,DaggerNo);
-      return norm2(out);
-    }
-    template<class Impl>
-    RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
-    {
-      M_internal(in,out,DaggerYes);
-      return norm2(out);
-    }
-
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
-    {
-      Meooe_internal(in,out,DaggerNo);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
-    {
-      Meooe_internal(in,out,DaggerYes);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
-    {
-      Mooee_internal(in,out,DaggerNo);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
-    {
-      Mooee_internal(in,out,DaggerYes);
-    }
-
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
-    {
-      MooeeInv_internal(in,out,DaggerNo);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
-    {
-      MooeeInv_internal(in,out,DaggerYes);
-    }
-
-
-  // force terms; five routines; default to Dhop on diagonal
-    template<class Impl>
-   void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int nblock=(Ls-1)/2;
-    for(int b=0;b<nblock;b++){
-      int s = 2*b;
-      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-    }
-    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-    this->DhopDeriv(mat,D,V,DaggerNo); 
-  };
-    template<class Impl>
-   void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int nblock=(Ls-1)/2;
-    for(int b=0;b<nblock;b++){
-      int s = 2*b;
-      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-    }
-    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-    this->DhopDerivOE(mat,D,V,DaggerNo); 
-  };
-    template<class Impl>
-   void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int nblock=(Ls-1)/2;
-    for(int b=0;b<nblock;b++){
-      int s = 2*b;
-      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-    }
-    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-    this->DhopDerivEO(mat,D,V,DaggerNo); 
-  };
-
-    template<class Impl>
-    void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
-      SetCoefficientsZolotarev(1.0/scale,zdata);
-    }
-    template<class Impl>
-    void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
-
-      // check on degree matching
-      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-      int Ls = this->Ls;
-
-      assert(Ls == (2*zdata->da -1) );
-
-      // Part frac
-      //      RealD R;
-      R=(1+mass)/(1-mass);
-      dw_diag = (4.0-this->M5);
-
-      //      std::vector<RealD> p; 
-      //      std::vector<RealD> q;
-      p.resize(zdata->da);
-      q.resize(zdata->dd);
-	
-      for(int n=0;n<zdata->da;n++){
-	p[n] = zdata -> alpha[n];
-      }
-      for(int n=0;n<zdata->dd;n++){
-	q[n] = -zdata -> ap[n];
-      }
-      
-      scale= part_frac_chroma_convention ? 2.0 : 1.0; // Chroma conventions annoy me
-
-      amax=zolo_hi;
-    }
-
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      int Ls = this->Ls;
-      conformable(solution5d._grid,this->FermionGrid());
-      conformable(exported4d._grid,this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      int Ls = this->Ls;
-      conformable(imported5d._grid,this->FermionGrid());
-      conformable(input4d._grid   ,this->GaugeGrid());
-      FermionField tmp(this->FermionGrid());
-      tmp=zero;
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
-      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
-      this->Dminus(tmp,imported5d);
-    }
-
-      // Constructors
-    template<class Impl>
-    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
-							     GridCartesian         &FiveDimGrid,
-							     GridRedBlackCartesian &FiveDimRedBlackGrid,
-							     GridCartesian         &FourDimGrid,
-							     GridRedBlackCartesian &FourDimRedBlackGrid,
-							     RealD _mass,RealD M5,
-							     const ImplParams &p) :
-      WilsonFermion5D<Impl>(_Umu,
-			    FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid,M5,p),
-      mass(_mass)
-
-    {
-      int Ls = this->Ls;
-
-      assert((Ls&0x1)==1); // Odd Ls required
-      int nrational=Ls-1;
-
-
-      Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);
-
-      // NB: chroma uses a cast to "float" for the zolotarev range(!?).
-      // this creates a real difference in the operator which I do not like but we can replicate here
-      // to demonstrate compatibility
-      //      RealD eps = (zolo_lo / zolo_hi);
-      //      zdata = bfm_zolotarev(eps,nrational,0);
-      
-      SetCoefficientsTanh(zdata,1.0);
-
-      Approx::zolotarev_free(zdata);
-
-    }
- 
-    FermOpTemplateInstantiate(PartialFractionFermion5D);
-
- }
-}
-
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,51 +24,49 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_PARTIAL_FRACTION_H
 #define  GRID_QCD_PARTIAL_FRACTION_H

 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class PartialFractionFermion5D : public WilsonFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class PartialFractionFermion5D : public WilsonFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
+  const int part_frac_chroma_convention=1;

-      const int part_frac_chroma_convention=1;
+  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
+  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
+  void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
+  void   M_internal(const FermionField &in, FermionField &out,int dag);

-      void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
-      void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
-      void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
-      void   M_internal(const FermionField &in, FermionField &out,int dag);
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out);
+  virtual RealD  Mdag (const FermionField &in, FermionField &out);

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  // half checkerboard operations
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);

-      // half checkerboard operaions
-      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void   Instantiatable(void) =0; // ensure no make-eee

-      virtual void   Instantiatable(void) =0; // ensure no make-eee
-
-      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  // Efficient support for multigrid coarsening
+  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      ///////////////////////////////////////////////////////////////
      // Physical surface field utilities
@@ -76,32 +74,30 @@ namespace Grid {
      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

-      // Constructors
-      PartialFractionFermion5D(GaugeField &_Umu,
-			       GridCartesian         &FiveDimGrid,
-			       GridRedBlackCartesian &FiveDimRedBlackGrid,
-			       GridCartesian         &FourDimGrid,
-			       GridRedBlackCartesian &FourDimRedBlackGrid,
-			       RealD _mass,RealD M5,const ImplParams &p= ImplParams());
+  // Constructors
+  PartialFractionFermion5D(GaugeField &_Umu,
+			   GridCartesian         &FiveDimGrid,
+			   GridRedBlackCartesian &FiveDimRedBlackGrid,
+			   GridCartesian         &FourDimGrid,
+			   GridRedBlackCartesian &FourDimRedBlackGrid,
+			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

-    protected:
+protected:

-      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
-      virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
+  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
+  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

-      // Part frac
-      RealD mass;
-      RealD dw_diag;
-      RealD R;
-      RealD amax;
-      RealD scale;
-      std::vector<double> p; 
-      std::vector<double> q;
+  // Part frac
+  RealD mass;
+  RealD dw_diag;
+  RealD R;
+  RealD amax;
+  RealD scale;
+  Vector<double> p; 
+  Vector<double> q;

-    };
+};

-
-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/PauliVillarsInverters.h
+++ b/Grid/qcd/action/fermion/PauliVillarsInverters.h
@@ -27,8 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template<class Field>
 class PauliVillarsSolverUnprec
@@ -90,6 +89,4 @@ class PauliVillarsSolverFourierAccel
  };
 };

-
-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Reconstruct5Dprop.h
+++ b/Grid/qcd/action/fermion/Reconstruct5Dprop.h
@@ -27,8 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 private:
@@ -131,5 +130,5 @@ template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
  }
 };

-}
-}
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/ScaledShamirFermion.h
+++ b/Grid/qcd/action/fermion/ScaledShamirFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,46 +24,43 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
 #define  GRID_QCD_SCALED_SHAMIR_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ScaledShamirFermion : public MobiusFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class ScaledShamirFermion : public MobiusFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      // Constructors
-    ScaledShamirFermion(GaugeField &_Umu,
-			GridCartesian         &FiveDimGrid,
-			GridRedBlackCartesian &FiveDimRedBlackGrid,
-			GridCartesian         &FourDimGrid,
-			GridRedBlackCartesian &FourDimRedBlackGrid,
-			RealD _mass,RealD _M5,
-//			RealD scale):
-			RealD scale,const ImplParams &p= ImplParams()) :
+  // Constructors
+  ScaledShamirFermion(GaugeField &_Umu,
+		      GridCartesian         &FiveDimGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridCartesian         &FourDimGrid,
+		      GridRedBlackCartesian &FourDimRedBlackGrid,
+		      RealD _mass,RealD _M5,
+		      //			RealD scale):
+		      RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
-      MobiusFermion<Impl>(_Umu,
-		    FiveDimGrid,
-		    FiveDimRedBlackGrid,
-		    FourDimGrid,
-	FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
-//		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
-      {
-      }
-
-    };
-
+    // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
+    MobiusFermion<Impl>(_Umu,
+			FiveDimGrid,
+			FiveDimRedBlackGrid,
+			FourDimGrid,
+			FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
+    //		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
+  {
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
+++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,40 +24,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  _SCHUR_DIAG_TWO_KAPPA_H
-#define  _SCHUR_DIAG_TWO_KAPPA_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  // This is specific to (Z)mobius fermions
-  template<class Matrix, class Field>
-    class KappaSimilarityTransform {
-  public:
-    INHERIT_IMPL_TYPES(Matrix);
-    std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
+// This is specific to (Z)mobius fermions
+template<class Matrix, class Field>
+class KappaSimilarityTransform {
+public:
+  INHERIT_IMPL_TYPES(Matrix);
+  Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;

-    KappaSimilarityTransform (Matrix &zmob) {
-      for (int i=0;i<(int)zmob.bs.size();i++) {
-	Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) );
-	kappa.push_back( k );
-	kappaDag.push_back( conj(k) );
-	kappaInv.push_back( 1.0 / k );
-	kappaInvDag.push_back( 1.0 / conj(k) );
-      }
+  KappaSimilarityTransform (Matrix &zmob) {
+    for (int i=0;i<(int)zmob.bs.size();i++) {
+      Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) );
+      kappa.push_back( k );
+      kappaDag.push_back( conj(k) );
+      kappaInv.push_back( 1.0 / k );
+      kappaInvDag.push_back( 1.0 / conj(k) );
    }
+  }

  template<typename vobj>
-    void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) {
-    GridBase *grid=out._grid;
-    out.checkerboard = in.checkerboard;
+  void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) {
+    GridBase *grid=out.Grid();
+    out.Checkerboard() = in.Checkerboard();
    assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
    int Ls = grid->_rdimensions[0];
-    parallel_for(int ss=0;ss<grid->oSites();ss++){
-      vobj tmp = s[ss % Ls]*in._odata[ss];
-      vstream(out._odata[ss],tmp);
-    }
+    thread_for(ss, grid->oSites(),
+    {
+      vobj tmp = s[ss % Ls]*in[ss];
+      vstream(out[ss],tmp);
+    });
  }

  RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) {
@@ -70,33 +70,33 @@ namespace Grid {
  virtual RealD MInv    (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInv[0]);}
  virtual RealD MInvDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInvDag[0]);}

-  };
+};

-  template<class Matrix,class Field>
-    class SchurDiagTwoKappaOperator :  public SchurOperatorBase<Field> {
-  public:
-    KappaSimilarityTransform<Matrix, Field> _S;
-    SchurDiagTwoOperator<Matrix, Field> _Mat;
+template<class Matrix,class Field>
+class SchurDiagTwoKappaOperator :  public SchurOperatorBase<Field> {
+public:
+  KappaSimilarityTransform<Matrix, Field> _S;
+  SchurDiagTwoOperator<Matrix, Field> _Mat;

-    SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {};
+  SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {};

-    virtual  RealD Mpc      (const Field &in, Field &out) {
-      Field tmp(in._grid);
+  virtual  RealD Mpc      (const Field &in, Field &out) {
+    Field tmp(in.Grid());

-      _S.MInv(in,out);
-      _Mat.Mpc(out,tmp);
-      return _S.M(tmp,out);
+    _S.MInv(in,out);
+    _Mat.Mpc(out,tmp);
+    return _S.M(tmp,out);

-    }
-    virtual  RealD MpcDag   (const Field &in, Field &out){
-      Field tmp(in._grid);
+  }
+  virtual  RealD MpcDag   (const Field &in, Field &out){
+    Field tmp(in.Grid());

-      _S.MDag(in,out);
-      _Mat.MpcDag(out,tmp);
-      return _S.MInvDag(tmp,out);
-    }
-  };
+    _S.MDag(in,out);
+    _Mat.MpcDag(out,tmp);
+    return _S.MInvDag(tmp,out);
+  }
+};
+
+NAMESPACE_END(Grid);

-}

-#endif
--- a/Grid/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/ShamirZolotarevFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,46 +24,43 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      // Constructors
+  // Constructors


-    ShamirZolotarevFermion(GaugeField &_Umu,
-			   GridCartesian         &FiveDimGrid,
-			   GridRedBlackCartesian &FiveDimRedBlackGrid,
-			   GridCartesian         &FourDimGrid,
-			   GridRedBlackCartesian &FourDimRedBlackGrid,
-			   RealD _mass,RealD _M5,
-			   RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
+  ShamirZolotarevFermion(GaugeField &_Umu,
+			 GridCartesian         &FiveDimGrid,
+			 GridRedBlackCartesian &FiveDimRedBlackGrid,
+			 GridCartesian         &FourDimGrid,
+			 GridRedBlackCartesian &FourDimRedBlackGrid,
+			 RealD _mass,RealD _M5,
+			 RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      
-      // b+c = 1; b-c = 1 => b=1, c=0
-      MobiusZolotarevFermion<Impl>(_Umu,
-				   FiveDimGrid,
-				   FiveDimRedBlackGrid,
-				   FourDimGrid,
-				   FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi,p)
+    // b+c = 1; b-c = 1 => b=1, c=0
+    MobiusZolotarevFermion<Impl>(_Umu,
+				 FiveDimGrid,
+				 FiveDimRedBlackGrid,
+				 FourDimGrid,
+				 FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi,p)
      
-      {}
+  {}

-    };
+};

-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -0,0 +1,175 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/StaggeredImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class S, class Representation = FundamentalRepresentation >
+class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > 
+{
+
+public:
+
+  typedef RealD  _Coeff_t ;
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=false;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+      
+  //Necessary?
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+    
+  typedef _Coeff_t Coeff_t;
+
+  INHERIT_GIMPL_TYPES(Gimpl);
+      
+  template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+    
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+  typedef iImplPropagator<Simd>        SitePropagator;
+    
+  typedef Lattice<SiteSpinor>            FermionField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+  typedef Lattice<SitePropagator> PropagatorField;
+    
+  typedef StaggeredImplParams ImplParams;
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+
+  ImplParams Params;
+    
+  StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
+      
+  static accelerator_inline void multLink(SiteSpinor &phi,
+		       const SiteDoubledGaugeField &U,
+		       const SiteSpinor &chi,
+		       int mu)
+  {
+    mult(&phi(), &U(mu), &chi());
+  }
+  static accelerator_inline void multLinkAdd(SiteSpinor &phi,
+			  const SiteDoubledGaugeField &U,
+			  const SiteSpinor &chi,
+			  int mu)
+  {
+    mac(&phi(), &U(mu), &chi());
+  }
+      
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    reg = memory;
+  }
+      
+    inline void InsertGaugeField(DoubledGaugeField &U_ds,
+				 const GaugeLinkField &U,int mu)
+    {
+      PokeIndex<LorentzIndex>(U_ds, U, mu);
+    }
+  inline void DoubleStore(GridBase *GaugeGrid,
+			  DoubledGaugeField &UUUds, // for Naik term
+			  DoubledGaugeField &Uds,
+			  const GaugeField &Uthin,
+			  const GaugeField &Ufat) {
+    conformable(Uds.Grid(), GaugeGrid);
+    conformable(Uthin.Grid(), GaugeGrid);
+    conformable(Ufat.Grid(), GaugeGrid);
+    GaugeLinkField U(GaugeGrid);
+    GaugeLinkField UU(GaugeGrid);
+    GaugeLinkField UUU(GaugeGrid);
+    GaugeLinkField Udag(GaugeGrid);
+    GaugeLinkField UUUdag(GaugeGrid);
+    for (int mu = 0; mu < Nd; mu++) {
+
+      // Staggered Phase.
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
+      Lattice<iScalar<vInteger> > x(GaugeGrid); LatticeCoordinate(x,0);
+      Lattice<iScalar<vInteger> > y(GaugeGrid); LatticeCoordinate(y,1);
+      Lattice<iScalar<vInteger> > z(GaugeGrid); LatticeCoordinate(z,2);
+      Lattice<iScalar<vInteger> > t(GaugeGrid); LatticeCoordinate(t,3);
+
+      Lattice<iScalar<vInteger> > lin_z(GaugeGrid); lin_z=x+y;
+      Lattice<iScalar<vInteger> > lin_t(GaugeGrid); lin_t=x+y+z;
+
+      ComplexField phases(GaugeGrid);	phases=1.0;
+
+      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
+      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
+      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
+
+      // 1 hop based on fat links
+      U      = PeekIndex<LorentzIndex>(Ufat, mu);
+      Udag   = adj( Cshift(U, mu, -1));
+
+      U    = U    *phases;
+      Udag = Udag *phases;
+
+	InsertGaugeField(Uds,U,mu);
+	InsertGaugeField(Uds,Udag,mu+4);
+	//	PokeIndex<LorentzIndex>(Uds, U, mu);
+	//	PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
+
+      // 3 hop based on thin links. Crazy huh ?
+      U  = PeekIndex<LorentzIndex>(Uthin, mu);
+      UU = Gimpl::CovShiftForward(U,mu,U);
+      UUU= Gimpl::CovShiftForward(U,mu,UU);
+	
+      UUUdag = adj( Cshift(UUU, mu, -3));
+
+      UUU    = UUU    *phases;
+      UUUdag = UUUdag *phases;
+
+	InsertGaugeField(UUUds,UUU,mu);
+	InsertGaugeField(UUUds,UUUdag,mu+4);
+
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+    GaugeLinkField link(mat.Grid());
+    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+    PokeIndex<LorentzIndex>(mat,link,mu);
+  }   
+      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+    assert (0); 
+    // Must never hit
+  }
+};
+typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
+typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
+typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD;  // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -26,11 +26,9 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_QCD_STAGGERED_KERNELS_H
-#define GRID_QCD_STAGGERED_KERNELS_H
+#pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid)

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Helper routines that implement Staggered stencil for a single site.
@@ -51,72 +49,69 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   
 public:
    
-   void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
-		      int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
+   void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
+		      int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
-			DoubledGaugeField &U, DoubledGaugeField &UUU, 
+			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionField &in, FermionField &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
-			   DoubledGaugeField &U, DoubledGaugeField &UUU, 
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
-			   const FermionField &in, FermionField &out,int dag);
+			   const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
-			   DoubledGaugeField &U, DoubledGaugeField &UUU,
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
-			   const FermionField &in, FermionField &out,int dag);
+			   const FermionFieldView &in, FermionFieldView &out,int dag);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
-		     DoubledGaugeField &U,DoubledGaugeField &UUU, 
+		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
-		     const FermionField &in, FermionField &out,int dag);
+		     const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
-			DoubledGaugeField &U,DoubledGaugeField &UUU, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionField &in, FermionField &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
-			DoubledGaugeField &U,DoubledGaugeField &UUU, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionField &in, FermionField &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-		    DoubledGaugeField &U,DoubledGaugeField &UUU, 
+		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
-		    const FermionField &in, FermionField &out,int dag);
+		    const FermionFieldView &in, FermionFieldView &out,int dag);
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   // Generic interface; fan out to right routine
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
-		 DoubledGaugeField &U, DoubledGaugeField &UUU, 
+		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
-		 const FermionField &in, FermionField &out, int interior=1,int exterior=1);
+		 const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);

   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
-		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
+		    DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU,
-		    const FermionField &in, FermionField &out, int interior=1,int exterior=1);
+		    const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);

   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
-		 DoubledGaugeField &U, DoubledGaugeField &UUU, 
+		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
-		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
+		 const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
  
 public:

  StaggeredKernels(const ImplParams &p = ImplParams());

 };
-    
-}}
-
-#endif
+NAMESPACE_END(Grid);    
--- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
@@ -0,0 +1,203 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class S, class Representation = FundamentalRepresentation >
+class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+
+public:
+
+  static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  typedef RealD   Coeff_t ;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+      
+  //Necessary?
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+
+
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+
+  // Make the doubled gauge field a *scalar*
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef iImplPropagator<Simd>        SitePropagator;
+
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+  typedef Lattice<SitePropagator> PropagatorField;
+    
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+
+    
+  typedef Lattice<SiteSpinor>            FermionField;
+    
+  typedef StaggeredImplParams ImplParams;
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+    
+  ImplParams Params;
+    
+  StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    vsplat(reg, memory);
+  }
+
+  static accelerator_inline void multLink(SiteHalfSpinor &phi, 
+					  const SiteDoubledGaugeField &U,
+					  const SiteHalfSpinor &chi, 
+					  int mu) 
+  {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+	vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+  }
+  static accelerator_inline void multLinkAdd(SiteHalfSpinor &phi, 
+					     const SiteDoubledGaugeField &U,
+					     const SiteHalfSpinor &chi, 
+					     int mu) 
+  {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+	vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mac(&phi(), &UU(), &chi());
+  }
+      
+  inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
+  { // Copy the link field U into component mu of the doubled gauge field U_ds, one local site at a time.
+    GridBase *GaugeGrid = U_ds.Grid();
+    thread_for(lidx, GaugeGrid->lSites(),{
+	// Per-site temporaries: the link read from U and the doubled-field site being edited.
+	SiteScalarGaugeLink   ScalarU;
+	SiteDoubledGaugeField ScalarUds;
+	// Read-modify-write, so the other components already stored at this site of U_ds are kept.
+	Coordinate lcoor;
+	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+	peekLocalSite(ScalarUds, U_ds, lcoor);
+	
+	peekLocalSite(ScalarU, U, lcoor);
+	ScalarUds(mu) = ScalarU();
+	pokeLocalSite(ScalarUds, U_ds, lcoor); // write back: previously the update stayed in the local and U_ds was never modified
+    });
+  }
+  inline void DoubleStore(GridBase *GaugeGrid,
+			  DoubledGaugeField &UUUds, // for Naik term
+			  DoubledGaugeField &Uds,
+			  const GaugeField &Uthin,
+			  const GaugeField &Ufat) 
+  {
+
+    GridBase * InputGrid = Uthin.Grid();
+    conformable(InputGrid,Ufat.Grid());
+
+    GaugeLinkField U(InputGrid);
+    GaugeLinkField UU(InputGrid);
+    GaugeLinkField UUU(InputGrid);
+    GaugeLinkField Udag(InputGrid);
+    GaugeLinkField UUUdag(InputGrid);
+
+    for (int mu = 0; mu < Nd; mu++) {
+
+      // Staggered Phase.
+      Lattice<iScalar<vInteger> > coor(InputGrid);
+      Lattice<iScalar<vInteger> > x(InputGrid); LatticeCoordinate(x,0);
+      Lattice<iScalar<vInteger> > y(InputGrid); LatticeCoordinate(y,1);
+      Lattice<iScalar<vInteger> > z(InputGrid); LatticeCoordinate(z,2);
+      Lattice<iScalar<vInteger> > t(InputGrid); LatticeCoordinate(t,3);
+
+      Lattice<iScalar<vInteger> > lin_z(InputGrid); lin_z=x+y;
+      Lattice<iScalar<vInteger> > lin_t(InputGrid); lin_t=x+y+z;
+
+      ComplexField phases(InputGrid);	phases=1.0;
+
+      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
+      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
+      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
+
+      // 1 hop based on fat links
+      U      = PeekIndex<LorentzIndex>(Ufat, mu);
+      Udag   = adj( Cshift(U, mu, -1));
+
+      U    = U    *phases;
+      Udag = Udag *phases;
+
+      InsertGaugeField(Uds,U,mu);
+      InsertGaugeField(Uds,Udag,mu+4);
+
+      // 3 hop based on thin links. Crazy huh ?
+      U  = PeekIndex<LorentzIndex>(Uthin, mu);
+      UU = Gimpl::CovShiftForward(U,mu,U);
+      UUU= Gimpl::CovShiftForward(U,mu,UU);
+	
+      UUUdag = adj( Cshift(UUU, mu, -3));
+
+      UUU    = UUU    *phases;
+      UUUdag = UUUdag *phases;
+
+      InsertGaugeField(UUUds,UUU,mu);
+      InsertGaugeField(UUUds,UUUdag,mu+4);
+
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+    assert(0);
+  }   
+      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+    assert (0); 
+  }
+};
+typedef StaggeredVec5dImpl<vComplex,  FundamentalRepresentation > StaggeredVec5dImplR;   // Real.. whichever prec
+typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF;  // Float
+typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD;  // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -27,15 +27,11 @@
    *************************************************************************************/
 /*  END LEGAL */

-#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
-#define GRID_QCD_WILSON_CLOVER_FERMION_H
+#pragma once

 #include <Grid/Grid.h>

-namespace Grid
-{
-namespace QCD
-{
+NAMESPACE_BEGIN(Grid);

 ///////////////////////////////////////////////////////////////////
 // Wilson Clover
@@ -131,22 +127,22 @@ public:
  // Derivative parts unpreconditioned pseudofermions
  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
  {
-    conformable(X._grid, Y._grid);
-    conformable(X._grid, force._grid);
-    GaugeLinkField force_mu(force._grid), lambda(force._grid);
-    GaugeField clover_force(force._grid);
-    PropagatorField Lambda(force._grid);
+    conformable(X.Grid(), Y.Grid());
+    conformable(X.Grid(), force.Grid());
+    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+    GaugeField clover_force(force.Grid());
+    PropagatorField Lambda(force.Grid());

    // Guido: Here we are hitting some performance issues:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
-    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
+    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());

    Impl::extractLinkField(U, this->Umu);

-    force = zero;
+    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);

@@ -179,10 +175,10 @@ public:
    */

    int count = 0;
-    clover_force = zero;
+    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
-      force_mu = zero;
+      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
@@ -212,8 +208,8 @@ public:
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
-    conformable(lambda._grid, U[0]._grid);
-    GaugeLinkField out(lambda._grid), tmp(lambda._grid);
+    conformable(lambda.Grid(), U[0].Grid());
+    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations

@@ -266,102 +262,113 @@ private:
  // using the DeGrand-Rossi basis for the gamma matrices
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
-    }
+      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = -F._odata[i]()();
-      T._odata[i]()(1, 0) = F._odata[i]()();
-      T._odata[i]()(2, 3) = -F._odata[i]()();
-      T._odata[i]()(3, 2) = F._odata[i]()();
-    }
+      T_v[i]()(0, 1) = -F_v[i]()();
+      T_v[i]()(1, 0) = F_v[i]()();
+      T_v[i]()(2, 3) = -F_v[i]()();
+      T_v[i]()(3, 2) = F_v[i]()();
+    });

    return T;
  }

  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
-    {
+    CloverFieldType T(F.Grid());
+    T = Zero();

-      T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(1, 1) = timesI(F._odata[i]()());
-      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
-    }
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
+    {
+      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 1) = timesI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = timesI(F._odata[i]()());
-      T._odata[i]()(1, 0) = timesI(F._odata[i]()());
-      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
-    }
+      T_v[i]()(0, 1) = timesI(F_v[i]()());
+      T_v[i]()(1, 0) = timesI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = -(F._odata[i]()());
-      T._odata[i]()(1, 0) = (F._odata[i]()());
-      T._odata[i]()(2, 3) = (F._odata[i]()());
-      T._odata[i]()(3, 2) = -(F._odata[i]()());
-    }
+      T_v[i]()(0, 1) = -(F_v[i]()());
+      T_v[i]()(1, 0) = (F_v[i]()());
+      T_v[i]()(2, 3) = (F_v[i]()());
+      T_v[i]()(3, 2) = -(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 0) = timesI(F._odata[i]()());
-      T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
-    }
+      T_v[i]()(0, 0) = timesI(F_v[i]()());
+      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });

    return T;
  }
 };
-}
-}
+NAMESPACE_END(Grid);
+
+

-#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -25,13 +25,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_WILSON_COMPRESSOR_H
 #define  GRID_QCD_WILSON_COMPRESSOR_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 /////////////////////////////////////////////////////////////////////////////////////////////
 // optimised versions supporting half precision too
@@ -43,9 +42,9 @@ class WilsonCompressorTemplate;

 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
-  typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
+				typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
 {
- public:
+public:
  
  int mu,dag;  

@@ -62,15 +61,16 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);

-  inline int CommDatumSize(void) {
+  accelerator_inline int CommDatumSize(void) {
    return sizeof(SiteHalfCommSpinor);
  }

  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
-    SiteHalfSpinor tmp;
+  template<class _SiteHalfSpinor, class _SiteSpinor>
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
+    _SiteHalfSpinor tmp;
    projector::Proj(tmp,in,mu,dag);
    vstream(buf[o],tmp);
  }
@@ -78,10 +78,10 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Exchange(SiteHalfSpinor * __restrict__ mp,
-                       const SiteHalfSpinor * __restrict__ vp0,
-                       const SiteHalfSpinor * __restrict__ vp1,
-		       Integer type,Integer o){
+  accelerator_inline void Exchange(SiteHalfSpinor *mp,
+				   const SiteHalfSpinor * __restrict__ vp0,
+				   const SiteHalfSpinor * __restrict__ vp1,
+				   Integer type,Integer o){
    SiteHalfSpinor tmp1;
    SiteHalfSpinor tmp2;
    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
@@ -92,19 +92,21 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  inline void Decompress(SiteHalfSpinor * __restrict__ out,
-			 SiteHalfSpinor * __restrict__ in, Integer o) {    
+  accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
+				     SiteHalfSpinor * __restrict__ in, Integer o) {    
    assert(0);
  }

  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
-  inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
-			       SiteHalfSpinor * __restrict__ out1,
-			       const SiteSpinor * __restrict__ in,
-			       Integer j,Integer k, Integer m,Integer type){
-    SiteHalfSpinor temp1, temp2,temp3,temp4;
+  accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
+					   SiteHalfSpinor * __restrict__ out1,
+					   const SiteSpinor * __restrict__ in,
+					   Integer j,Integer k, Integer m,Integer type)
+  {
+    SiteHalfSpinor temp1, temp2;
+    SiteHalfSpinor temp3, temp4;
    projector::Proj(temp1,in[k],mu,dag);
    projector::Proj(temp2,in[m],mu,dag);
    exchange(temp3,temp4,temp1,temp2,type);
@@ -115,15 +117,15 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  inline bool DecompressionStep(void) { return false; }
+  accelerator_inline bool DecompressionStep(void) { return false; }

 };

 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
-  typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
+				typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
 {
- public:
+public:
  
  int mu,dag;  

@@ -140,15 +142,16 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);

-  inline int CommDatumSize(void) {
+  accelerator_inline int CommDatumSize(void) {
    return sizeof(SiteHalfCommSpinor);
  }

  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
-    SiteHalfSpinor hsp;
+  template<class _SiteHalfSpinor, class _SiteSpinor>
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
+    _SiteHalfSpinor hsp;
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
    projector::Proj(hsp,in,mu,dag);
    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
@@ -157,7 +160,7 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Exchange(SiteHalfSpinor *mp,
+  accelerator_inline void Exchange(SiteHalfSpinor *mp,
                       SiteHalfSpinor *vp0,
                       SiteHalfSpinor *vp1,
 		       Integer type,Integer o){
@@ -172,8 +175,7 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  inline void Decompress(SiteHalfSpinor *out,
-			 SiteHalfSpinor *in, Integer o){
+  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){
    SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
    precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
  }
@@ -181,7 +183,7 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
-  inline void CompressExchange(SiteHalfSpinor *out0,
+  accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
 			       SiteHalfSpinor *out1,
 			       const SiteSpinor *in,
 			       Integer j,Integer k, Integer m,Integer type){
@@ -198,19 +200,19 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  inline bool DecompressionStep(void) { return true; }
+  accelerator_inline bool DecompressionStep(void) { return true; }

 };

 #define DECLARE_PROJ(Projector,Compressor,spProj)			\
  class Projector {							\
  public:								\
-    template<class hsp,class fsp>					\
-    static void Proj(hsp &result,const fsp &in,int mu,int dag){			\
-      spProj(result,in);						\
-    }									\
+  template<class hsp,class fsp>						\
+  static accelerator void Proj(hsp &result,const fsp &in,int mu,int dag){ \
+    spProj(result,in);							\
+  }									\
  };									\
-template<typename HCS,typename HS,typename S> using Compressor = WilsonCompressorTemplate<HCS,HS,S,Projector>;
+  template<typename HCS,typename HS,typename S> using Compressor = WilsonCompressorTemplate<HCS,HS,S,Projector>;

 DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp);
 DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp);
@@ -222,9 +224,9 @@ DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm);
 DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm);

 class WilsonProjector {
- public:
+public:
  template<class hsp,class fsp>
-  static void Proj(hsp &result,const fsp &in,int mu,int dag){
+  static accelerator void Proj(hsp &result,const fsp &in,int mu,int dag){
    int mudag=dag? mu : (mu+Nd)%(2*Nd);
    switch(mudag) {
    case Xp:	spProjXp(result,in);	break;
@@ -243,9 +245,14 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom

 // Fast comms buffer manipulation which should inline right through (avoid direction
 // dependent logic that prevents inlining
-template<class vobj,class cobj>
-class WilsonStencil : public CartesianStencil<vobj,cobj> {
+template<class vobj,class cobj,class Parameters>
+class WilsonStencil : public CartesianStencil<vobj,cobj,Parameters> {
 public:
+
+  typedef CartesianStencil<vobj,cobj,Parameters> Base;
+  typedef typename Base::View_type View_type;
+  typedef typename Base::StencilVector StencilVector;
+
  double timer0;
  double timer1;
  double timer2;
@@ -274,16 +281,40 @@ public:
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
  }

+  std::vector<int> surface_list;
+
  WilsonStencil(GridBase *grid,
 		int npoints,
 		int checkerboard,
 		const std::vector<int> &directions,
-		const std::vector<int> &distances)  
-    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
+		const std::vector<int> &distances,Parameters p)  
+    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
  { 
    ZeroCountersi();
+    surface_list.resize(0);
+    this->same_node.resize(npoints);
  };

+  void BuildSurfaceList(int Ls,int vol4){
+
+    // find same node for SHM
+    // Here we know the distance is 1 for WilsonStencil
+    for(int point=0;point<this->_npoints;point++){
+      this->same_node[point] = this->SameNode(point);
+    }
+    
+    for(int site = 0 ;site< vol4;site++){
+      int local = 1;
+      for(int point=0;point<this->_npoints;point++){
+	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ 
+	  local = 0;
+	}
+      }
+      if(local == 0) { 
+	surface_list.push_back(site);
+      }
+    }
+  }

  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
@@ -292,8 +323,6 @@ public:
    this->HaloExchangeOptGather(source,compress);
    double t1=usecond();
    // Asynchronous MPI calls multidirectional, Isend etc...
-    //    this->CommunicateBegin(reqs);
-    //    this->CommunicateComplete(reqs);
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
    this->Communicate();
    double t2=usecond(); timer1 += t2-t1;
@@ -327,7 +356,7 @@ public:
    this->_grid->StencilBarrier();
    this->mpi3synctime_g+=usecond();

-    assert(source._grid==this->_grid);
+    assert(source.Grid()==this->_grid);
    this->halogtime-=usecond();
    
    this->u_comm_offset=0;
@@ -365,9 +394,10 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    this->halogtime+=usecond();
+    accelerator_barrier();
  }

- };
+};

-}} // namespace close
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -27,16 +27,13 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_QCD_WILSON_FERMION_H
-#define GRID_QCD_WILSON_FERMION_H
+			   /*  END LEGAL */
+#pragma once

-namespace Grid {
-
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 class WilsonFermionStatic {
- public:
+public:
  static int HandOptDslash;  // these are a temporary hack
  static int MortonOrder;
  static const std::vector<int> directions;
@@ -60,8 +57,9 @@ class WilsonFermionStatic {
 };

 template <class Impl>
-class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
- public:
+class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic 
+{
+public:
  INHERIT_IMPL_TYPES(Impl);
  typedef WilsonKernels<Impl> Kernels;

@@ -138,10 +136,10 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-                GridRedBlackCartesian &Hgrid, RealD _mass, 
+                GridRedBlackCartesian &Hgrid, RealD _mass,
                const ImplParams &p = ImplParams(), 
                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
-  
+
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);

@@ -150,7 +148,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  ///////////////////////////////////////////////////////////////

  //    protected:
- public:
+public:
  virtual RealD Mass(void) { return mass; }
  virtual int   isTrivialEE(void) { return 1; };
  RealD mass;
@@ -171,7 +169,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
-
+  
  WilsonAnisotropyCoefficients anisotropyCoeff;
  
  ///////////////////////////////////////////////////////////////
@@ -182,11 +180,11 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
                                PropagatorField &q_out,
                                Current curr_type,
                                unsigned int mu);
-  void SeqConservedCurrent(PropagatorField &q_in, 
-                             PropagatorField &q_out,
-                             Current curr_type, 
-                             unsigned int mu,
-                             unsigned int tmin, 
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           Current curr_type,
+                           unsigned int mu, 
+                           unsigned int tmin,
                             unsigned int tmax,
 			     ComplexField &lattice_cmplx);
 };
@@ -194,7 +192,6 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;

+NAMESPACE_END(Grid);
+

-}
-}
-#endif
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -1,5 +1,5 @@

-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -26,216 +26,215 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_WILSON_FERMION_5D_H
 #define  GRID_QCD_WILSON_FERMION_5D_H

 #include <Grid/perfmon/Stat.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  ////////////////////////////////////////////////////////////////////////////////
-  // This is the 4d red black case appropriate to support
-  //
-  // parity = (x+y+z+t)|2;
-  // generalised five dim fermions like mobius, zolotarev etc..	
-  //
-  // i.e. even even contains fifth dim hopping term.
-  //
-  // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
-  ////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// This is the 4d red black case appropriate to support
+//
+// parity = (x+y+z+t)|2;
+// generalised five dim fermions like mobius, zolotarev etc..	
+//
+// i.e. even even contains fifth dim hopping term.
+//
+// [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
+////////////////////////////////////////////////////////////////////////////////

-    ////////////////////////////////////////////////////////////////////////////////
-    // This is the 4d red black case appropriate to support
-    //
-    // parity = (x+y+z+t)|2;
-    // generalised five dim fermions like mobius, zolotarev etc..	
-    //
-    // i.e. even even contains fifth dim hopping term.
-    //
-    // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
-    ////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// This is the 4d red black case appropriate to support
+//
+// parity = (x+y+z+t)|2;
+// generalised five dim fermions like mobius, zolotarev etc..	
+//
+// i.e. even even contains fifth dim hopping term.
+//
+// [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
+////////////////////////////////////////////////////////////////////////////////

-    class WilsonFermion5DStatic { 
-    public:
-      // S-direction is INNERMOST and takes no part in the parity.
-      static const std::vector<int> directions;
-      static const std::vector<int> displacements;
-      const int npoint = 8;
-    };
+class WilsonFermion5DStatic { 
+public:
+  // S-direction is INNERMOST and takes no part in the parity.
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+  static constexpr int npoint = 8;
+};

-    template<class Impl>
-    class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-     typedef WilsonKernels<Impl> Kernels;
-     PmuStat stat;
+template<class Impl>
+class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef WilsonKernels<Impl> Kernels;
+  PmuStat stat;

-     FermionField _tmp;
-     FermionField &tmp(void) { return _tmp; }
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }

-     void Report(void);
-     void ZeroCounters(void);
-     double DhopCalls;
-     double DhopCommTime;
-     double DhopComputeTime;
-     double DhopComputeTime2;
-     double DhopFaceTime;
-     double DhopTotalTime;
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+  double DhopTotalTime;

-     double DerivCalls;
-     double DerivCommTime;
-     double DerivComputeTime;
-     double DerivDhopComputeTime;
+  double DerivCalls;
+  double DerivCommTime;
+  double DerivComputeTime;
+  double DerivDhopComputeTime;

-      ///////////////////////////////////////////////////////////////
-      // Implement the abstract base
-      ///////////////////////////////////////////////////////////////
-      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
-      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
-      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
-      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+  GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+  GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+  GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}

-      // full checkerboard operations; leave unimplemented as abstract for now
-      virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-      virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+  // full checkerboard operations; leave unimplemented as abstract for now
+  virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+  virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};

-      // half checkerboard operations; leave unimplemented as abstract for now
-      virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+  // half checkerboard operations; leave unimplemented as abstract for now
+  virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};

-      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

-      // These can be overridden by fancy 5d chiral action
-      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  // These can be overridden by fancy 5d chiral action
+  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;

-      // Implement hopping term non-hermitian hopping term; half cb or both
-      // Implement s-diagonal DW
-      void DW    (const FermionField &in, FermionField &out,int dag);
-      void Dhop  (const FermionField &in, FermionField &out,int dag);
-      void DhopOE(const FermionField &in, FermionField &out,int dag);
-      void DhopEO(const FermionField &in, FermionField &out,int dag);
+  // Implement the non-hermitian hopping term; half cb or both
+  // Implement s-diagonal DW
+  void DW    (const FermionField &in, FermionField &out,int dag);
+  void Dhop  (const FermionField &in, FermionField &out,int dag);
+  void DhopOE(const FermionField &in, FermionField &out,int dag);
+  void DhopEO(const FermionField &in, FermionField &out,int dag);

-      // add a DhopComm
-      // -- suboptimal interface will presently trigger multiple comms.
-    void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  // add a DhopComm
+  // -- suboptimal interface will presently trigger multiple comms.
+  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
    
-    ///////////////////////////////////////////////////////////////
-    // New methods added 
-    ///////////////////////////////////////////////////////////////
-    void DerivInternal(StencilImpl & st,
-		       DoubledGaugeField & U,
-		       GaugeField &mat,
-		       const FermionField &A,
-		       const FermionField &B,
-		       int dag);
+  ///////////////////////////////////////////////////////////////
+  // New methods added 
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl & st,
+		     DoubledGaugeField & U,
+		     GaugeField &mat,
+		     const FermionField &A,
+		     const FermionField &B,
+		     int dag);
    
-    void DhopInternal(StencilImpl & st,
-		      LebesgueOrder &lo,
-		      DoubledGaugeField &U,
-		      const FermionField &in, 
-		      FermionField &out,
-		      int dag);
+  void DhopInternal(StencilImpl & st,
+		    LebesgueOrder &lo,
+		    DoubledGaugeField &U,
+		    const FermionField &in, 
+		    FermionField &out,
+		    int dag);

-    void DhopInternalOverlappedComms(StencilImpl & st,
-				     LebesgueOrder &lo,
-				     DoubledGaugeField &U,
-				     const FermionField &in, 
-				     FermionField &out,
-				     int dag);
+  void DhopInternalOverlappedComms(StencilImpl & st,
+				   LebesgueOrder &lo,
+				   DoubledGaugeField &U,
+				   const FermionField &in, 
+				   FermionField &out,
+				   int dag);

-    void DhopInternalSerialComms(StencilImpl & st,
-				 LebesgueOrder &lo,
-				 DoubledGaugeField &U,
-				 const FermionField &in, 
-				 FermionField &out,
-				 int dag);
+  void DhopInternalSerialComms(StencilImpl & st,
+			       LebesgueOrder &lo,
+			       DoubledGaugeField &U,
+			       const FermionField &in, 
+			       FermionField &out,
+			       int dag);
    
-    // Constructors
-    WilsonFermion5D(GaugeField &_Umu,
-		    GridCartesian         &FiveDimGrid,
-		    GridRedBlackCartesian &FiveDimRedBlackGrid,
-		    GridCartesian         &FourDimGrid,
-		    GridRedBlackCartesian &FourDimRedBlackGrid,
-		    double _M5,const ImplParams &p= ImplParams());
+  // Constructors
+  WilsonFermion5D(GaugeField &_Umu,
+		  GridCartesian         &FiveDimGrid,
+		  GridRedBlackCartesian &FiveDimRedBlackGrid,
+		  GridCartesian         &FourDimGrid,
+		  GridRedBlackCartesian &FourDimRedBlackGrid,
+		  double _M5,const ImplParams &p= ImplParams());
    
-    // Constructors
-    /*
-      WilsonFermion5D(int simd, 
-      GaugeField &_Umu,
-      GridCartesian         &FiveDimGrid,
-      GridRedBlackCartesian &FiveDimRedBlackGrid,
-      GridCartesian         &FourDimGrid,
-      double _M5,const ImplParams &p= ImplParams());
-    */
+  // Constructors
+  /*
+    WilsonFermion5D(int simd, 
+    GaugeField &_Umu,
+    GridCartesian         &FiveDimGrid,
+    GridRedBlackCartesian &FiveDimRedBlackGrid,
+    GridCartesian         &FourDimGrid,
+    double _M5,const ImplParams &p= ImplParams());
+  */
    
-    // DoubleStore
-    void ImportGauge(const GaugeField &_Umu);
+  // DoubleStore
+  void ImportGauge(const GaugeField &_Umu);
    
-    ///////////////////////////////////////////////////////////////
-    // Data members require to support the functionality
-    ///////////////////////////////////////////////////////////////
-  public:
+  ///////////////////////////////////////////////////////////////
+  // Data members required to support the functionality
+  ///////////////////////////////////////////////////////////////
+public:
    
-    // Add these to the support from Wilson
-    GridBase *_FourDimGrid;
-    GridBase *_FourDimRedBlackGrid;
-    GridBase *_FiveDimGrid;
-    GridBase *_FiveDimRedBlackGrid;
+  // Add these to the support from Wilson
+  GridBase *_FourDimGrid;
+  GridBase *_FourDimRedBlackGrid;
+  GridBase *_FiveDimGrid;
+  GridBase *_FiveDimRedBlackGrid;
    
-    double                        M5;
-    int Ls;
+  double                        M5;
+  int Ls;
    
-    //Defines the stencils for even and odd
-    StencilImpl Stencil; 
-    StencilImpl StencilEven; 
-    StencilImpl StencilOdd; 
+  //Defines the stencils for even and odd
+  StencilImpl Stencil; 
+  StencilImpl StencilEven; 
+  StencilImpl StencilOdd; 
    
-    // Copy of the gauge field , with even and odd subsets
-    DoubledGaugeField Umu;
-    DoubledGaugeField UmuEven;
-    DoubledGaugeField UmuOdd;
+  // Copy of the gauge field, with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;
    
-    LebesgueOrder Lebesgue;
-    LebesgueOrder LebesgueEvenOdd;
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
    
-    // Comms buffer
-    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  // Comms buffer
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
-    ///////////////////////////////////////////////////////////////
-    // Conserved current utilities
-    ///////////////////////////////////////////////////////////////
-    void ContractConservedCurrent(PropagatorField &q_in_1,
-                                  PropagatorField &q_in_2,
-                                  PropagatorField &q_out,
-                                  Current curr_type, 
-                                  unsigned int mu);
-    void SeqConservedCurrent(PropagatorField &q_in, 
-                             PropagatorField &q_out,
-                             Current curr_type, 
-                             unsigned int mu,
-                             unsigned int tmin, 
-                             unsigned int tmax,
-			     ComplexField &lattice_cmplx);
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+				PropagatorField &q_in_2,
+				PropagatorField &q_out,
+				Current curr_type, 
+				unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+			   PropagatorField &q_out,
+			   Current curr_type,
+			   unsigned int mu,
+			   unsigned int tmin,
+			   unsigned int tmax,
+			   ComplexField &lattice_cmplx);

-    void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
-    void ContractJ5q(FermionField &q_in,ComplexField &J5q);
+  void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
+  void ContractJ5q(FermionField &q_in,ComplexField &J5q);

-  };
+};

-}}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -0,0 +1,226 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+  
+/////////////////////////////////////////////////////////////////////////////
+// Single flavour four spinors with colour index
+/////////////////////////////////////////////////////////////////////////////
+template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
+class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+public:
+
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=false;
+  static const int Nhcs = Options::Nhcs;
+
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+      
+  // True when Dimension == Nc. Original note: "Necessary?" -- NOTE(review): may duplicate the static isFundamental flag above; confirm before removing.
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+    
+  typedef typename Options::_Coeff_t Coeff_t;
+  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
+      
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+    
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+    
+  typedef Lattice<SiteSpinor>            FermionField;
+  typedef Lattice<SitePropagator>        PropagatorField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+    
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+    
+  ImplParams Params;
+
+  WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){ // stores a copy of the parameters in Params
+    assert(Params.boundary_phases.size() == Nd); // DoubleStore indexes boundary_phases[mu] for every mu < Nd
+  };
+
+  template<class _Spinor> // Applies the link U(mu) to chi and writes the result to phi.
+  static accelerator_inline void multLink(_Spinor &phi, // output
+					  const SiteDoubledGaugeField &U, // doubled gauge links; component mu is read
+					  const _Spinor &chi, // input
+					  int mu) 
+  {
+    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &UU, &chi()); // presumably phi = UU * chi -- confirm mult() argument order
+  }
+  template<class _Spinor> // Same operation with the stencil arguments kernels pass to every Impl.
+  static accelerator_inline void multLink(_Spinor &phi,
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi,
+					  int mu,
+					  StencilEntry *SE, // unused in this implementation
+					  StencilView &St)  // unused in this implementation
+  {
+    multLink(phi,U,chi,mu); // forwards to the four-argument overload
+  }
+    
+      
+  template <class ref> // Loads one link element: plain assignment of memory into reg.
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    reg = memory;
+  }
+      
+  // Fill Uds from Umu: slot mu holds the forward link, slot mu+Nd the adjoint of the link shifted by -1 in mu.
+  // The twist (Params.twist_n_2pi_L) and boundary phase (Params.boundary_phases) are applied per direction.
+  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds, const GaugeField &Umu)
+  {
+    typedef typename Simd::scalar_type scalar_type;
+
+    conformable(Uds.Grid(), GaugeGrid);
+    conformable(Umu.Grid(), GaugeGrid);
+
+    GaugeLinkField U(GaugeGrid);
+    GaugeLinkField tmp(GaugeGrid);
+
+    Lattice<iScalar<vInteger> > coor(GaugeGrid);
+    ////////////////////////////////////////////////////
+    // apply any boundary phase or twists
+    ////////////////////////////////////////////////////
+    for (int mu = 0; mu < Nd; mu++) {
+
+      ////////// boundary phase /////////////
+      auto pha = Params.boundary_phases[mu];
+      scalar_type phase( real(pha),imag(pha) );
+
+      int L   = GaugeGrid->GlobalDimensions()[mu];
+      int Lmu = L - 1; // coordinate of the last site in direction mu
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+
+      // apply any twists: the same phase exp(i*theta) multiplies the whole mu-link field
+      RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
+      if ( theta != 0.0) {
+        scalar_type twphase(::cos(theta),::sin(theta));
+        U = twphase*U;
+        std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<twphase <<std::endl;
+      }
+
+      // forward link: the boundary phase multiplies only the sites with coordinate L-1
+      tmp = where(coor == Lmu, phase * U, U);
+      PokeIndex<LorentzIndex>(Uds, tmp, mu);
+
+      // backward link: adjoint of the shifted link, conjugate boundary phase on the sites with coordinate 0
+      U = adj(Cshift(U, mu, -1));
+      U = where(coor == 0, conjugate(phase) * U, U);
+      PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ // writes one Lorentz component of mat
+    GaugeLinkField link(mat.Grid()); // scratch link field on mat's grid
+    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); // outer product of Btilde and A with the spin index traced
+    PokeIndex<LorentzIndex>(mat,link,mu); // store as Lorentz component mu of mat
+  }   
+      
+    inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){ // mat is overwritten
+      mat = outerProduct(B,A); // no trace is taken here; see TraceSpinImpl for the spin trace
+    }  
+
+    inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) { // mat is overwritten; P is only read
+      mat = TraceIndex<SpinIndex>(P); // trace P over its spin index
+    }
+      
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){ // mat must already hold at least Nd entries: it is indexed, never resized
+      for (int mu = 0; mu < Nd; mu++) // only Lorentz components 0..Nd-1 of Uds are copied
+      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
+    }
+
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+    // For each outer site sU, sum the spin-traced outer product over the Ls sites sF = s + Ls*sU, then store the sum as Lorentz component mu of mat.
+    int Ls=Btilde.Grid()->_fdimensions[0]; // extent of dimension 0 of Btilde's grid (presumably the fifth dimension -- confirm)
+    GaugeLinkField tmp(mat.Grid());
+    tmp = Zero(); // accumulator starts from zero
+    auto tmp_v = tmp.View();
+    auto Btilde_v = Btilde.View();
+    auto Atilde_v = Atilde.View();
+    thread_for(sss,tmp.Grid()->oSites(),{
+      int sU=sss;
+      for(int s=0;s<Ls;s++){
+	int sF = s+Ls*sU; // s runs fastest in the Btilde/Atilde site index
+	tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
+      }
+    });
+    PokeIndex<LorentzIndex>(mat,tmp,mu);
+      
+  }
+};
+
+
+typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > WilsonImplR;  // vComplex: whichever prec
+typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
+typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
+
+typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL;  // vComplex: whichever prec, HalfComms options
+typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float, HalfComms options
+typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF;  // Double, HalfComms options
+
+typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // vComplex: whichever prec, complex coefficients
+typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float, complex coefficients
+typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double, complex coefficients
+
+typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // vComplex: whichever prec, complex coefficients, HalfComms options
+typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float, complex coefficients, HalfComms options
+typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double, complex coefficients, HalfComms options
+ 
+typedef WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal > WilsonAdjImplR;   // vComplex: whichever prec
+typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
+typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD;  // Double
+ 
+typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplR;   // vComplex: whichever prec
+typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF;  // Float
+typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD;  // Double
+ 
+typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR;   // vComplex: whichever prec
+typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float
+typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double
+
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/WilsonKernels.cc
+++ b/Grid/qcd/action/fermion/WilsonKernels.cc
@@ -1,455 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-namespace Grid {
-namespace QCD {
-
-int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
-int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
-
-template <class Impl>
-WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
-
-////////////////////////////////////////////
-// Generic implementation; move to different file?
-////////////////////////////////////////////
-  
-#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in._odata[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in._odata[SE->_offset]);			\
-    }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
-  Recon(result, Uchi);
-  
-#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in._odata[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in._odata[SE->_offset]);			\
-    }								\
-  } else if ( st.same_node[Dir] ) {				\
-      chi_p = &buf[SE->_offset];				\
-  }								\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    chi_p = &buf[SE->_offset];					\
-    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-    nmu++;							\
-  }
-
-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
-  if (gamma == Dir) {						\
-    if (SE->_is_local && SE->_permute) {			\
-      spProj(tmp, in._odata[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else if (SE->_is_local) {					\
-      spProj(chi, in._odata[SE->_offset]);			\
-    } else {							\
-      chi = buf[SE->_offset];					\
-    }								\
-    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-  ////////////////////////////////////////////////////////////////////
-  // All legs kernels ; comms then compute
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionField &in, FermionField &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					  SiteHalfSpinor *buf, int sF,
-					  int sU, const FermionField &in, FermionField &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-  ////////////////////////////////////////////////////////////////////
-  // Interior kernels
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-						SiteHalfSpinor *buf, int sF,
-						int sU, const FermionField &in, FermionField &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  result=zero;
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionField &in, FermionField &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  result=zero;
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-////////////////////////////////////////////////////////////////////
-// Exterior kernels
-////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-						SiteHalfSpinor *buf, int sF,
-						int sU, const FermionField &in, FermionField &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=zero;
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out._odata[sF] = out._odata[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionField &in, FermionField &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=zero;
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out._odata[sF] = out._odata[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
-					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
-
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteSpinor result;
-  SiteHalfSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-
-  SE = st.GetEntry(ptype, dir, sF);
-  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
-  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
-  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
-  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
-  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
-  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
-  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
-  vstream(out._odata[sF], result);
-}
-
-/*******************************************************************************
- * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current 
- * sequentially. Common to both 4D and 5D.
- ******************************************************************************/
-// N.B. Functions below assume a -1/2 factor within U.
-#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
-#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteFwd
- * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_1 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeField &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-    SitePropagator result, tmp;
-    Gamma g5(Gamma::Algebra::Gamma5);
-    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
-    result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
-    if (switch_sign)
-    {
-        q_out -= result;
-    }
-    else
-    {
-        q_out += result;
-    }
-}
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteBwd
- * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_2 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeField &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-    SitePropagator result, tmp;
-    Gamma g5(Gamma::Algebra::Gamma5);
-    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
-    result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
-    if (switch_sign)
-    {
-        q_out += result;
-    }
-    else
-    {
-        q_out -= result;
-    }
-}
-
-// G-parity requires more specialised implementation.
-#define NO_CURR_SITE(Impl) \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeField &U,         \
-                                                  unsigned int sU,              \
-                                                  unsigned int mu,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-} \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeField &U,         \
-                                                  unsigned int mu,              \
-                                                  unsigned int sU,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-}
-
-NO_CURR_SITE(GparityWilsonImplF);
-NO_CURR_SITE(GparityWilsonImplD);
-NO_CURR_SITE(GparityWilsonImplFH);
-NO_CURR_SITE(GparityWilsonImplDF);
-
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeField &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-    SitePropagator result;
-    Impl::multLinkProp(result, U._odata[sU], q_in, mu);
-    result = WilsonCurrentFwd(result, mu);
-
-    // Zero any unwanted timeslice entries.
-    result = predicatedWhere(t_mask, result, 0.*result);
-
-    if (switch_sign)
-    {
-        q_out -= result;
-    }
-    else
-    {
-        q_out += result;
-    }
-}
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in -ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeField &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-    SitePropagator result;
-    Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
-    result = WilsonCurrentBwd(result, mu);
-
-    // Zero any unwanted timeslice entries.
-    result = predicatedWhere(t_mask, result, 0.*result);
-
-    if (switch_sign)
-    {
-        q_out += result;
-    }
-    else
-    {
-        q_out -= result;
-    }
-}
-
-FermOpTemplateInstantiate(WilsonKernels);
-AdjointFermOpTemplateInstantiate(WilsonKernels);
-TwoIndexFermOpTemplateInstantiate(WilsonKernels);
-
-}}
-
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -27,19 +27,17 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_QCD_DHOP_H
-#define GRID_QCD_DHOP_H
+			   /*  END LEGAL */
+#pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Helper routines that implement Wilson stencil for a single site.
-  // Common to both the WilsonFermion and WilsonFermion5D
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Helper routines that implement Wilson stencil for a single site.
+// Common to both the WilsonFermion and WilsonFermion5D
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 class WilsonKernelsStatic { 
- public:
+public:
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  enum { CommsAndCompute, CommsThenCompute };
  static int Opt;  
@@ -47,235 +45,123 @@ class WilsonKernelsStatic {
 };
 
 template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
- public:
-   
+public:
+
  INHERIT_IMPL_TYPES(Impl);
  typedef FermionOperator<Impl> Base;
   
 public:

-  template <bool EnableBool = true>
-  typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
-  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
-  {
-    bgq_l1p_optimisation(1);
-    switch(Opt) {
-#if defined(AVX512) || defined (QPX)
-    case OptInlineAsm:
-      if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSite   (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (interior)     WilsonKernels<Impl>::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (exterior)     WilsonKernels<Impl>::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else assert(0);
-      break;
-#endif
-    case OptHandUnroll:
-         for (int site = 0; site < Ns; site++) {
-	   for (int s = 0; s < Ls; s++) {
-	     if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
-	     else if (interior)     WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
-	     else if (exterior)     WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
-	     sF++;
-	   }
-	   sU++;
-         }
-      break;
-    case OptGeneric:
-         for (int site = 0; site < Ns; site++) {
-	   for (int s = 0; s < Ls; s++) {
-	     if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
-	     else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
-	     else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
-	     else assert(0);
-	     sF++;
-	   }
-	   sU++;
-       } 
-      break;
-    default:
-      assert(0);
-    }
-    bgq_l1p_optimisation(0);
-  }
-     
-  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
-  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-	   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
-    // no kernel choice  
-    for (int site = 0; site < Ns; site++) {
-      for (int s = 0; s < Ls; s++) {
-	if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
-	else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
-	else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
-	else assert(0);
-	sF++;
-      }
-      sU++;
-    }
-  }
-     
-  template <bool EnableBool = true>
-  typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
-  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-	      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
-{
-    bgq_l1p_optimisation(1);
-    switch(Opt) {
-#if defined(AVX512) || defined (QPX)
-    case OptInlineAsm:
-      if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSiteDag   (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (interior)     WilsonKernels<Impl>::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (exterior)     WilsonKernels<Impl>::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else assert(0);
-      break;
-#endif
-    case OptHandUnroll:
-      for (int site = 0; site < Ns; site++) {
-	for (int s = 0; s < Ls; s++) {
-	  if(interior&&exterior) WilsonKernels<Impl>::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-	  else if (interior)     WilsonKernels<Impl>::HandDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
-	  else if (exterior)     WilsonKernels<Impl>::HandDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
-	  else assert(0);
-	  sF++;
-	}
-	sU++;
-      }
-      break;
-    case OptGeneric:
-      for (int site = 0; site < Ns; site++) {
-	for (int s = 0; s < Ls; s++) {
-	  if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-	  else if (interior)     WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
-	  else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
-	  else assert(0);
-	  sF++;
-	}
-	sU++;
-      }
-      break;
-    default:
-      assert(0);
-    }
-    bgq_l1p_optimisation(0);
-  }
+  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 int interior=1,int exterior=1) ;

-  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
-  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
+  static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			    int Ls, int Nsite, const FermionField &in, FermionField &out,
+			    int interior=1,int exterior=1) ;

-    for (int site = 0; site < Ns; site++) {
-      for (int s = 0; s < Ls; s++) {
-	if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-	else if (interior)     WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
-	else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
-	else assert(0);
-	sF++;
-      }
-      sU++;
-    }
-  }
+  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
+			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);

-  void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
-      
  //////////////////////////////////////////////////////////////////////////////
  // Utilities for inserting Wilson conserved current.
  //////////////////////////////////////////////////////////////////////////////
-  void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+  static void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
                                       const SitePropagator &q_in_2,
                                       SitePropagator &q_out,
-                                       DoubledGaugeField &U,
+                                       DoubledGaugeFieldView &U,
                                       unsigned int sU,
                                       unsigned int mu,
                                       bool switch_sign = false);
-  void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+
+  static void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
                                       const SitePropagator &q_in_2,
                                       SitePropagator &q_out,
-                                       DoubledGaugeField &U,
+                                       DoubledGaugeFieldView &U,
                                       unsigned int sU,
                                       unsigned int mu,
                                       bool switch_sign = false);
-  void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
+
+  static void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
                                  SitePropagator &q_out,
-                                  DoubledGaugeField &U,
+                                  DoubledGaugeFieldView &U,
                                  unsigned int sU,
                                  unsigned int mu,
-                                  vInteger t_mask,
+                                  vPredicate t_mask,
                                  bool switch_sign = false);
-  void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
+
+  static void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
                                  SitePropagator &q_out,
-                                  DoubledGaugeField &U,
+                                  DoubledGaugeFieldView &U,
                                  unsigned int sU,
                                  unsigned int mu,
-                                  vInteger t_mask,
+                                  vPredicate t_mask,
                                  bool switch_sign = false);

 private:
-     // Specialised variants
-  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
+
+  static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
+				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
      
-  void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
-
-  void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
+  // Specialised variants
+  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			     int sF, int sU, const FermionField &in, FermionField &out);
-
-  void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
+  static accelerator void GenericDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			     int sF, int sU, const FermionField &in, FermionField &out);
-
-
-  void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
-
-  void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
-
-  void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
-
-  void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
-
-  void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
-
-  void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
-
-
-  void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		    int sF, int sU, const FermionField &in, FermionField &out);
-
-  void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
+  static accelerator void GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  void HandDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
-  
-  void HandDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
-  
-  void HandDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
-  
-  void HandDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
-  
-public:
+  static accelerator void GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						       int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

-  WilsonKernels(const ImplParams &p = ImplParams());
+  static void AsmDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			  int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
+  
+  static void AsmDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
+  
+  static void AsmDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
+  
+  static void AsmDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
+  
+  static void AsmDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
+  
+  static void AsmDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);

+// Keep the hand-unrolled variants for now
+  static accelerator void HandDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+				       int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+public:
+  WilsonKernels(const ImplParams &p = ImplParams()) : Base(p) {} // forwards the implementation parameters to the FermionOperator<Impl> base
 };
    
-}}
+NAMESPACE_END(Grid);
+

-#endif
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -1,127 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-namespace Grid {
-namespace QCD {
-
-
-///////////////////////////////////////////////////////////
-// Default to no assembler implementation
-///////////////////////////////////////////////////////////
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-#include <Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h>
-#include <Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h>
-
-#define INSTANTIATE_ASM(A)\
-template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-
-INSTANTIATE_ASM(WilsonImplF);
-INSTANTIATE_ASM(WilsonImplD);
-INSTANTIATE_ASM(ZWilsonImplF);
-INSTANTIATE_ASM(ZWilsonImplD);
-INSTANTIATE_ASM(GparityWilsonImplF);
-INSTANTIATE_ASM(GparityWilsonImplD);
-INSTANTIATE_ASM(DomainWallVec5dImplF);
-INSTANTIATE_ASM(DomainWallVec5dImplD);
-INSTANTIATE_ASM(ZDomainWallVec5dImplF);
-INSTANTIATE_ASM(ZDomainWallVec5dImplD);
-
-INSTANTIATE_ASM(WilsonImplFH);
-INSTANTIATE_ASM(WilsonImplDF);
-INSTANTIATE_ASM(ZWilsonImplFH);
-INSTANTIATE_ASM(ZWilsonImplDF);
-INSTANTIATE_ASM(GparityWilsonImplFH);
-INSTANTIATE_ASM(GparityWilsonImplDF);
-INSTANTIATE_ASM(DomainWallVec5dImplFH);
-INSTANTIATE_ASM(DomainWallVec5dImplDF);
-INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
-INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
-
-}}
-
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
@@ -1,650 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#if defined(AVX512) 
-    ///////////////////////////////////////////////////////////
-    // If we are AVX512 specialise the single precision routine
-    ///////////////////////////////////////////////////////////
-#include <simd/Intel512wilson.h>
-#include <simd/Intel512single.h>
-    
-static Vector<vComplexF> signsF;
-
-  template<typename vtype>    
-  int setupSigns(Vector<vtype>& signs ){
-    Vector<vtype> bother(2);
-    signs = bother;
-    vrsign(signs[0]);
-    visign(signs[1]);
-    return 1;
-  }
-
-  static int signInitF = setupSigns(signsF);
-
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];  
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS
-#undef MAYBEPERM
-#undef MULT_2SPIN
-	
-
-
-///////////////////////////////////////////////////////////
-// If we are AVX512 specialise the double precision routine
-///////////////////////////////////////////////////////////
-
-#include <simd/Intel512double.h>
-    
-static Vector<vComplexD> signsD;
-static int signInitD = setupSigns(signsD);
-    
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];  
-
-
-#define INTERIOR_AND_EXTERIOR    
-#undef  INTERIOR
-#undef  EXTERIOR
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS
-#undef MAYBEPERM
-#undef MULT_2SPIN
-
-#endif //AVX512
--- a/Grid/qcd/action/fermion/WilsonTMFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.cc
@@ -1,99 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-     * BF sequence
-     *
-      void bfmbase<Float>::MooeeInv(Fermion_t psi, 
-			       Fermion_t chi, 
-			      int dag, int cb)
-
-    double m    = this->mass;
-    double tm   = this->twistedmass;
-    double mtil = 4.0+this->mass;
-
-    double sq = mtil*mtil + tm*tm;
-
-    double a = mtil/sq;
-    double b = -tm /sq;
-    if(dag) b=-b;
-    axpibg5x(chi,psi,a,b);
-
-      void bfmbase<Float>::Mooee(Fermion_t psi, 
-			   Fermion_t chi, 
-			   int dag,int cb)
-    double a = 4.0+this->mass;
-    double b = this->twistedmass;
-    if(dag) b=-b;
-    axpibg5x(chi,psi,a,b);
-    */
-
-  template<class Impl>
-  void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-    RealD a = 4.0+this->mass;
-    RealD b = this->mu;
-    out.checkerboard = in.checkerboard;
-    axpibg5x(out,in,a,b);
-  }
-  template<class Impl>
-  void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-    RealD a = 4.0+this->mass;
-    RealD b = -this->mu;
-    out.checkerboard = in.checkerboard;
-    axpibg5x(out,in,a,b);
-  }
-  template<class Impl>
-  void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-    RealD m    = this->mass;
-    RealD tm   = this->mu;
-    RealD mtil = 4.0+this->mass;
-    RealD sq   = mtil*mtil+tm*tm;
-    RealD a    = mtil/sq;
-    RealD b    = -tm /sq;
-    axpibg5x(out,in,a,b);
-  }
-  template<class Impl>
-  void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-    RealD m    = this->mass;
-    RealD tm   = this->mu;
-    RealD mtil = 4.0+this->mass;
-    RealD sq   = mtil*mtil+tm*tm;
-    RealD a    = mtil/sq;
-    RealD b    = tm /sq;
-    axpibg5x(out,in,a,b);
-  }
-
-  FermOpTemplateInstantiate(WilsonTMFermion);
-
-}
-}
--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -23,55 +23,52 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_WILSON_TM_FERMION_H
-#define  GRID_QCD_WILSON_TM_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once 

 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class WilsonTMFermion : public WilsonFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class WilsonTMFermion : public WilsonFermion<Impl>
-    {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-    public:
+  virtual void   Instantiatable(void) {};
+  // Constructors
+  WilsonTMFermion(GaugeField &_Umu,
+		  GridCartesian         &Fgrid,
+		  GridRedBlackCartesian &Hgrid, 
+		  RealD _mass,
+		  RealD _mu,
+		  const ImplParams &p= ImplParams()
+		  ) :
+    WilsonFermion<Impl>(_Umu,
+			Fgrid,
+			Hgrid,
+			_mass,p)

-      virtual void   Instantiatable(void) {};
-      // Constructors
-      WilsonTMFermion(GaugeField &_Umu,
-		    GridCartesian         &Fgrid,
-		    GridRedBlackCartesian &Hgrid, 
-		    RealD _mass,
-		    RealD _mu,
-		    const ImplParams &p= ImplParams()
-		      ) :
-	WilsonFermion<Impl>(_Umu,
-			    Fgrid,
-			    Hgrid,
-			    _mass,p)
-
-      {
-	mu = _mu;
-      }
+  {
+    mu = _mu;
+  }


-    // allow override for twisted mass and clover
-    virtual void Mooee(const FermionField &in, FermionField &out) ;
-    virtual void MooeeDag(const FermionField &in, FermionField &out) ;
-    virtual void MooeeInv(const FermionField &in, FermionField &out) ;
-    virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
+  // allow override for twisted mass and clover
+  virtual void Mooee(const FermionField &in, FermionField &out) ;
+  virtual void MooeeDag(const FermionField &in, FermionField &out) ;
+  virtual void MooeeInv(const FermionField &in, FermionField &out) ;
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;

-  private:
-     RealD mu; // TwistedMass parameter
+private:
+  RealD mu; // TwistedMass parameter

-  };
+};
+
+NAMESPACE_END(Grid);

-}}

-#endif
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -30,126 +30,123 @@ Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>

-
-namespace Grid {
-
-  namespace QCD {
+NAMESPACE_BEGIN(Grid);
    
-    template<class Impl>
-      class WilsonTMFermion5D : public WilsonFermion5D<Impl>
-      {
-      public:
-	INHERIT_IMPL_TYPES(Impl);
-      public:
-
-	virtual void   Instantiatable(void) {};
-
-	// Constructors
-        WilsonTMFermion5D(GaugeField &_Umu,
-			  GridCartesian         &Fgrid,
-			  GridRedBlackCartesian &Frbgrid, 
-			  GridCartesian         &Ugrid,
-			  GridRedBlackCartesian &Urbgrid, 
-			  const std::vector<RealD> _mass,
-			  const std::vector<RealD> _mu,
-			  const ImplParams &p= ImplParams()
-			  ) :
-	WilsonFermion5D<Impl>(_Umu,
-			      Fgrid,
-			      Frbgrid,
-			      Ugrid,
-			      Urbgrid,
-			      4.0,p)
-	
-	  {
-	    update(_mass,_mu);
-	  }
-
-	virtual void Meooe(const FermionField &in, FermionField &out) {
-	  if (in.checkerboard == Odd) {
-	    this->DhopEO(in, out, DaggerNo);
-	  } else {
-	    this->DhopOE(in, out, DaggerNo);
-	  }
-	}
-
-	virtual void MeooeDag(const FermionField &in, FermionField &out) {
-	  if (in.checkerboard == Odd) {
-	    this->DhopEO(in, out, DaggerYes);
-	  } else {
-	    this->DhopOE(in, out, DaggerYes);
-	  }
-	}	
-	
-	// allow override for twisted mass and clover
-	virtual void Mooee(const FermionField &in, FermionField &out) {
-	  out.checkerboard = in.checkerboard;
-	  //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    ComplexD a = 4.0+this->mass[s];
-	    ComplexD b(0.0,this->mu[s]);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-
-	virtual void MooeeDag(const FermionField &in, FermionField &out) {
-	  out.checkerboard = in.checkerboard;
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    ComplexD a = 4.0+this->mass[s];
-	    ComplexD b(0.0,-this->mu[s]);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-	virtual void MooeeInv(const FermionField &in, FermionField &out) {
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    RealD m    = this->mass[s];
-	    RealD tm   = this->mu[s];
-	    RealD mtil = 4.0+this->mass[s];
-	    RealD sq   = mtil*mtil+tm*tm;
-	    ComplexD a    = mtil/sq;
-	    ComplexD b(0.0, -tm /sq);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-	virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    RealD m    = this->mass[s];
-	    RealD tm   = this->mu[s];
-	    RealD mtil = 4.0+this->mass[s];
-	    RealD sq   = mtil*mtil+tm*tm;
-	    ComplexD a    = mtil/sq;
-	    ComplexD b(0.0,tm /sq);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-
-	virtual RealD M(const FermionField &in, FermionField &out) {
-	  out.checkerboard = in.checkerboard;
-	  this->Dhop(in, out, DaggerNo);
-	  FermionField tmp(out._grid);
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    ComplexD a = 4.0+this->mass[s];
-	    ComplexD b(0.0,this->mu[s]);
-	    axpbg5y_ssp(tmp,a,in,b,in,s,s);
-	  }
-	  return axpy_norm(out, 1.0, tmp, out);
-	}
-	
-	// needed for fast PV
-	void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
-	  assert(_mass.size() == _mu.size());
-	  assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
-	  this->mass = _mass;
-	  this->mu = _mu;
-	}
-	
-      private:
-	std::vector<RealD> mu;
-	std::vector<RealD> mass;
-	
-      };
+template<class Impl>
+class WilsonTMFermion5D : public WilsonFermion5D<Impl>
+{
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+ public:
+  // WilsonFermion5D variant that keeps one (mass, mu) pair per 5th-dimension slice s; mu[s] is the twisted-mass term.
+  virtual void   Instantiatable(void) {};
+  // The base class receives the literal 4.0 as its sixth argument (presumably its mass slot - TODO confirm).
+  // Constructor: _mass and _mu are bound by const reference (no vector copies) and validated/stored by update().
+ WilsonTMFermion5D(GaugeField &_Umu,
+		   GridCartesian         &Fgrid,
+		   GridRedBlackCartesian &Frbgrid, 
+		   GridCartesian         &Ugrid,
+		   GridRedBlackCartesian &Urbgrid, 
+		   const std::vector<RealD> &_mass,
+		   const std::vector<RealD> &_mu,
+		   const ImplParams &p= ImplParams()
+		   ) :
+  WilsonFermion5D<Impl>(_Umu,
+			Fgrid,
+			Frbgrid,
+			Ugrid,
+			Urbgrid,
+			4.0,p)
  
-    typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
-    typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 
+    {
+      update(_mass,_mu);
+    }
+  // Off-diagonal (hopping) term: an Odd-checkerboard input goes through DhopEO, any other through DhopOE.
+  virtual void Meooe(const FermionField &in, FermionField &out) {
+    if (in.Checkerboard() == Odd) {
+      this->DhopEO(in, out, DaggerNo);
+    } else {
+      this->DhopOE(in, out, DaggerNo);
+    }
+  }
+  // Same dispatch as Meooe, but with DaggerYes.
+  virtual void MeooeDag(const FermionField &in, FermionField &out) {
+    if (in.Checkerboard() == Odd) {
+      this->DhopEO(in, out, DaggerYes);
+    } else {
+      this->DhopOE(in, out, DaggerYes);
+    }
+  }	
+  
+  // allow override for twisted mass and clover
+  virtual void Mooee(const FermionField &in, FermionField &out) {
+    out.Checkerboard() = in.Checkerboard();
+    //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s];   // real diagonal coefficient for slice s
+      ComplexD b(0.0,this->mu[s]);      // purely imaginary coefficient +i*mu[s]
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  // Same as Mooee with the sign of the imaginary coefficient flipped (b = -i*mu[s]).
+  virtual void MooeeDag(const FermionField &in, FermionField &out) {
+    out.Checkerboard() = in.Checkerboard();
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s];
+      ComplexD b(0.0,-this->mu[s]);
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  virtual void MooeeInv(const FermionField &in, FermionField &out) {
+    for (int s=0;s<(int)this->mass.size();s++) {
+      // Inverse coefficients: a = mtil/sq, b = -i*tm/sq with sq = mtil^2 + tm^2 (unused local `m` removed).
+      RealD tm   = this->mu[s];
+      RealD mtil = 4.0+this->mass[s];
+      RealD sq   = mtil*mtil+tm*tm;
+      ComplexD a    = mtil/sq;
+      ComplexD b(0.0, -tm /sq);
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
+    for (int s=0;s<(int)this->mass.size();s++) {
+      // As MooeeInv but with b = +i*tm/sq (unused local `m` removed).
+      RealD tm   = this->mu[s];
+      RealD mtil = 4.0+this->mass[s];
+      RealD sq   = mtil*mtil+tm*tm;
+      ComplexD a    = mtil/sq;
+      ComplexD b(0.0,tm /sq);
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  // Full operator: Dhop(in) goes into out, the per-slice mass term into tmp; returns axpy_norm(out, 1.0, tmp, out).
+  virtual RealD M(const FermionField &in, FermionField &out) {
+    out.Checkerboard() = in.Checkerboard();
+    this->Dhop(in, out, DaggerNo);
+    FermionField tmp(out.Grid());
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s];
+      ComplexD b(0.0,this->mu[s]);
+      axpbg5y_ssp(tmp,a,in,b,in,s,s);
+    }
+    return axpy_norm(out, 1.0, tmp, out);
+  }
+  
+  // needed for fast PV
+  void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
+    assert(_mass.size() == _mu.size());                              // one mu per mass
+    assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);    // one entry per extent of dimension 0 of the fermion grid
+    this->mass = _mass;
+    this->mu = _mu;
+  }
+  
+ private:
+  std::vector<RealD> mu;    // per-slice twisted mass, same length as mass
+  std::vector<RealD> mass;  // per-slice mass; 4.0 is added wherever it is used
+  
+};
+   
+typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
+typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 

-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ZMobiusFermion.h
+++ b/Grid/qcd/action/fermion/ZMobiusFermion.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,56 +24,50 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_ZMOBIUS_FERMION_H
-#define  GRID_QCD_ZMOBIUS_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ZMobiusFermion : public CayleyFermion5D<Impl> // CayleyFermion5D whose coefficients come from a caller-supplied complex gamma vector
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class ZMobiusFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void) {};
-      // Constructors
-      ZMobiusFermion(GaugeField &_Umu,
-		     GridCartesian         &FiveDimGrid,
-		     GridRedBlackCartesian &FiveDimRedBlackGrid,
-		     GridCartesian         &FourDimGrid,
-		     GridRedBlackCartesian &FourDimRedBlackGrid,
-		     RealD _mass,RealD _M5,
-		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructor: gamma must hold at least Ls entries; gamma[0..Ls-1] are copied and passed with b, c to SetCoefficientsInternal
+  ZMobiusFermion(GaugeField &_Umu,
+		 GridCartesian         &FiveDimGrid,
+		 GridRedBlackCartesian &FiveDimRedBlackGrid,
+		 GridCartesian         &FourDimGrid,
+		 GridRedBlackCartesian &FourDimRedBlackGrid,
+		 RealD _mass,RealD _M5,
+		 std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = 1.0;
-	
-	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
-	std::vector<Coeff_t> zgamma(this->Ls);
-	for(int s=0;s<this->Ls;s++){
-	  zgamma[s] = gamma[s];
-	}
-
-	// Call base setter
-	this->SetCoefficientsInternal(1.0,zgamma,b,c);
-      }
-
-    };
+  {
+    assert(gamma.size() >= static_cast<size_t>(this->Ls)); // gamma[s] is read for every s in [0,Ls) below
+    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+    Vector<Coeff_t> zgamma(this->Ls);
+    for(int s=0;s<this->Ls;s++){
+      zgamma[s] = gamma[s];
+    }

+    // Call base setter
+    this->SetCoefficientsInternal(1.0,zgamma,b,c);
  }
-}

-#endif
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -26,19 +26,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>


-namespace Grid {
-namespace QCD {
-  /*
-   * Dense matrix versions of routines
-   */
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
@@ -54,10 +54,10 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
  int Ls=this->Ls;
-  int LLs = psi._grid->_rdimensions[0];
-  int vol = psi._grid->oSites()/LLs;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs;
  
-  chi.checkerboard=psi.checkerboard;
+  chi.Checkerboard()=psi.Checkerboard();
  
  assert(Ls==LLs);
  
@@ -96,15 +96,14 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  }

  // For the non-vectorised s-direction this is simple
-  
-  for(auto site=0;site<vol;site++){
+  thread_loop( (auto site=0;site<vol;site++), {
    
    SiteSpinor     SiteChi;
    SiteHalfSpinor SitePplus;
    SiteHalfSpinor SitePminus;
    
    for(int s1=0;s1<Ls;s1++){
-      SiteChi =zero;
+      SiteChi =Zero();
      for(int s2=0;s2<Ls;s2++){
 	int lex2 = s2+Ls*site;
 	
@@ -120,7 +119,7 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
      }
      chi[s1+Ls*site] = SiteChi*0.5;
    }
-  }
+  });
 }

 #ifdef CAYLEY_DPERP_DENSE
@@ -153,4 +152,4 @@ template void CayleyFermion5D<ZWilsonImplFH>::MooeeInternal(const FermionField &
 template void CayleyFermion5D<ZWilsonImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 #endif

-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -26,26 +26,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>


-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-  // Pminus fowards
-  // Pplus  backwards
+// Pminus forwards
+// Pplus  backwards
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<Coeff_t> &lower,
-				std::vector<Coeff_t> &diag,
-				std::vector<Coeff_t> &upper)
+				Vector<Coeff_t> &lower,
+				Vector<Coeff_t> &diag,
+				Vector<Coeff_t> &upper)
 {
  Coeff_t one(1.0);
  int Ls=this->Ls;
@@ -66,9 +64,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<Coeff_t> &lower,
-				   std::vector<Coeff_t> &diag,
-				   std::vector<Coeff_t> &upper)
+				   Vector<Coeff_t> &lower,
+				   Vector<Coeff_t> &diag,
+				   Vector<Coeff_t> &upper)
 {
  Coeff_t one(1.0);
  int Ls=this->Ls;
@@ -91,7 +89,7 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &
 {
  Coeff_t one(1.0);
  Coeff_t czero(0.0);
-  chi.checkerboard=psi.checkerboard;
+  chi.Checkerboard()=psi.Checkerboard();
  int Ls=this->Ls;
  // Apply (L^{\prime})^{-1}
  axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0]
@@ -120,7 +118,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
 {
  Coeff_t one(1.0);
  Coeff_t czero(0.0);
-  chi.checkerboard=psi.checkerboard;
+  chi.Checkerboard()=psi.Checkerboard();
  int Ls=this->Ls;
  // Apply (U^{\prime})^{-dagger}
  axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0]
@@ -145,20 +143,19 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &


 #ifdef CAYLEY_DPERP_LINALG
-  INSTANTIATE_DPERP(WilsonImplF);
-  INSTANTIATE_DPERP(WilsonImplD);
-  INSTANTIATE_DPERP(GparityWilsonImplF);
-  INSTANTIATE_DPERP(GparityWilsonImplD);
-  INSTANTIATE_DPERP(ZWilsonImplF);
-  INSTANTIATE_DPERP(ZWilsonImplD);
+INSTANTIATE_DPERP(WilsonImplF);
+INSTANTIATE_DPERP(WilsonImplD);
+INSTANTIATE_DPERP(GparityWilsonImplF);
+INSTANTIATE_DPERP(GparityWilsonImplD);
+INSTANTIATE_DPERP(ZWilsonImplF);
+INSTANTIATE_DPERP(ZWilsonImplD);

-  INSTANTIATE_DPERP(WilsonImplFH);
-  INSTANTIATE_DPERP(WilsonImplDF);
-  INSTANTIATE_DPERP(GparityWilsonImplFH);
-  INSTANTIATE_DPERP(GparityWilsonImplDF);
-  INSTANTIATE_DPERP(ZWilsonImplFH);
-  INSTANTIATE_DPERP(ZWilsonImplDF);
+INSTANTIATE_DPERP(WilsonImplFH);
+INSTANTIATE_DPERP(WilsonImplDF);
+INSTANTIATE_DPERP(GparityWilsonImplFH);
+INSTANTIATE_DPERP(GparityWilsonImplDF);
+INSTANTIATE_DPERP(ZWilsonImplFH);
+INSTANTIATE_DPERP(ZWilsonImplDF);
 #endif

-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
@@ -0,0 +1,158 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // dense path: MooeeInternal inverts the Ls x Ls matrices, then takes their adjoint
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes); // dense path: MooeeInternal inverts the Ls x Ls matrices, no adjoint
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+{ // Builds dense Ls x Ls matrices for the two chiral projections, optionally inverts/adjoints them, and applies them site by site.
+  int Ls = this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs; // outer sites divided by the extent of dimension 0
+
+  chi.Checkerboard() = psi.Checkerboard();
+
+  assert(Ls==LLs); // the indexing s + Ls*site below relies on _rdimensions[0] == Ls
+
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+
+  for(int s=0;s<Ls;s++){ // diagonal: bee[s] in both matrices
+    Pplus(s,s)  = this->bee[s];
+    Pminus(s,s) = this->bee[s];
+  }
+
+  for(int s=0; s<Ls-1; s++){ // Pminus super-diagonal: -cee[s]
+    Pminus(s,s+1) = -this->cee[s];
+  }
+
+  for(int s=0; s<Ls-1; s++){ // Pplus sub-diagonal: -cee[s+1]
+    Pplus(s+1,s) = -this->cee[s+1];
+  }
+
+  Pplus (0,Ls-1) = this->dp; // corner entries: top-right of Pplus, bottom-left of Pminus
+  Pminus(Ls-1,0) = this->dm;
+
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+
+  if(inv) { // a non-zero inv selects the matrix inverses
+    PplusMat  = Pplus.inverse();
+    PminusMat = Pminus.inverse();
+  } else {
+    PplusMat  = Pplus;
+    PminusMat = Pminus;
+  }
+
+  if(dag){ // a non-zero dag takes the adjoint, applied after any inversion
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple
+
+  for(auto site=0; site<vol; site++){ // plain serial loop over sites
+
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+
+    for(int s1=0; s1<Ls; s1++){ // s1: output slice (matrix row)
+      SiteChi = Zero();
+      for(int s2=0; s2<Ls; s2++){ // s2: input slice (matrix column)
+	int lex2 = s2 + Ls*site;
+	if(PplusMat(s1,s2) != 0.0){ // exact zeros are skipped
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
+	}
+	if(PminusMat(s1,s2) != 0.0){
+	  spProj5m(SitePminus, psi[lex2]);
+	  accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      chi[s1+Ls*site] = SiteChi*0.5; // NOTE(review): the 0.5 presumably compensates the projector normalisation - confirm
+    }
+  }
+}
+
+#ifdef DOMAIN_WALL_EOFA_DPERP_DENSE // everything up to the matching #endif is compiled only when this macro is defined
+
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+
+template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+#endif // DOMAIN_WALL_EOFA_DPERP_DENSE
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
@@ -0,0 +1,167 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME -- make a version of these routines with site loop outermost for cache reuse.
+// Pminus forwards
+// Pplus  backwards
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+				      FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  // For every 5th-dimension slice, two slice-wise updates of chi are issued in this order:
+  //   axpby_ssp_pminus with (diag, phi) and (upper, psi at the forward neighbour), then
+  //   axpby_ssp_pplus  with (unity, chi) and (lower, psi at the backward neighbour).
+  // The neighbours are slice+1 / slice-1, with wrap-around at the two ends.
+  const int nSlices = this->Ls;
+  Coeff_t unity(1.0);
+  for(int slice=0; slice<nSlices; ++slice){
+    const bool first = (slice==0);
+    const bool last  = !first && (slice==(nSlices-1));   // the slice==0 case takes precedence
+    const int  fwd   = last  ? 0           : slice+1;    // last slice wraps forward to slice 0
+    const int  bwd   = first ? (nSlices-1) : slice-1;    // first slice wraps backward to slice Ls-1
+    axpby_ssp_pminus(chi, diag[slice], phi, upper[slice], psi, slice, fwd);
+    axpby_ssp_pplus (chi, unity, chi, lower[slice], psi, slice, bwd);
+  }
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+					 FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  // Counterpart of M5D with the two projected updates exchanged. Per slice, in this order:
+  //   axpby_ssp_pplus  with (diag, phi) and (upper, psi at the forward neighbour), then
+  //   axpby_ssp_pminus with (unity, chi) and (lower, psi at the backward neighbour).
+  // The neighbours are slice+1 / slice-1, with wrap-around at the two ends.
+  const int nSlices = this->Ls;
+  Coeff_t unity(1.0);
+  for(int slice=0; slice<nSlices; ++slice){
+    const bool first = (slice==0);
+    const bool last  = !first && (slice==(nSlices-1));   // the slice==0 case takes precedence
+    const int  fwd   = last  ? 0           : slice+1;    // last slice wraps forward to slice 0
+    const int  bwd   = first ? (nSlices-1) : slice-1;    // first slice wraps backward to slice Ls-1
+    axpby_ssp_pplus (chi, diag[slice], phi, upper[slice], psi, slice, fwd);
+    axpby_ssp_pminus(chi, unity, chi, lower[slice], psi, slice, bwd);
+  }
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{ // Slice-by-slice inverse built from the stored factors lee, leem, dee, ueem, uee; chi is updated in place, so order matters.
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  FermionField tmp(psi.Grid()); // scratch field, used only for the last slice in the D^{-1} step
+
+  // Apply (L^{\prime})^{-1}
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+  }
+
+  // L_m^{-1}
+  for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+    axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
+  }
+
+  // U_m^{-1} D^{-1}
+  for(int s=0; s<Ls-1; s++){ // NOTE(review): dee is indexed at Ls here, so it presumably holds Ls+1 entries - confirm
+    axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
+  }
+  axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1); // last slice, pminus call: coefficient 1/dee[Ls-1], result in tmp
+  axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);      // last slice, pplus call: coefficient 1/dee[Ls], combined with tmp
+
+  // Apply U^{-1}
+  for(int s=Ls-2; s>=0; s--){ // runs from slice Ls-2 down to 0
+    axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
+  }
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{ // Daggered counterpart of MooeeInv: same sweep structure with conjugated coefficients and the lee/uee, leem/ueem roles exchanged.
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  FermionField tmp(psi.Grid()); // scratch field, used only for the last slice in the D^{-dagger} step
+
+  // Apply (U^{\prime})^{-dagger}
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+  }
+
+  // U_m^{-\dagger}
+  for(int s=0; s<Ls-1; s++){ // accumulates into the last slice (Ls-1)
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
+  }
+
+  // L_m^{-\dagger} D^{-dagger}
+  for(int s=0; s<Ls-1; s++){ // here leem is paired with dee[Ls-1], whereas MooeeInv pairs ueem with dee[Ls]
+    axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
+  }
+  axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1); // last slice, pminus call, result in tmp
+  axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);      // NOTE(review): dee[Ls] presumably the extra (Ls+1)-th entry - confirm
+
+  // Apply L^{-dagger}
+  for(int s=Ls-2; s>=0; s--){ // runs from slice Ls-2 down to 0
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
+  }
+}
+
+#ifdef DOMAIN_WALL_EOFA_DPERP_LINALG // everything up to the matching #endif is compiled only when this macro is defined
+
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+#endif // DOMAIN_WALL_EOFA_DPERP_LINALG
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
@@ -0,0 +1,183 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes); // dense path: inverse of the Ls x Ls matrices, no adjoint
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes); // same call as MooeeInv; MooeeInternal itself reads this->shift
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // dense path: inverse of the Ls x Ls matrices, then adjoint
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // same call as MooeeInvDag; MooeeInternal itself reads this->shift
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+{ // Builds dense Ls x Ls matrices for the two chiral projections (with an optional shift column), then applies them site by site.
+  int Ls = this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs; // outer sites divided by the extent of dimension 0
+
+  int pm      = this->pm;     // local copies of the members used by the shift term below
+  RealD shift = this->shift;
+  RealD alpha = this->alpha;
+  RealD k     = this->k;
+  RealD mq1   = this->mq1;
+
+  chi.Checkerboard() = psi.Checkerboard();
+
+  assert(Ls==LLs); // the indexing s + Ls*site below relies on _rdimensions[0] == Ls
+
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+
+  for(int s=0;s<Ls;s++){ // diagonal: bee[s] in both matrices
+    Pplus(s,s)  = this->bee[s];
+    Pminus(s,s) = this->bee[s];
+  }
+
+  for(int s=0; s<Ls-1; s++){ // Pminus super-diagonal: -cee[s]
+    Pminus(s,s+1) = -this->cee[s];
+  }
+
+  for(int s=0; s<Ls-1; s++){ // Pplus sub-diagonal: -cee[s+1]
+    Pplus(s+1,s) = -this->cee[s+1];
+  }
+  Pplus (0,Ls-1) = mq1*this->cee[0];    // corner entries scaled by mq1
+  Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
+
+  if(shift != 0.0){ // shift term: added to the last column of Pplus (pm == 1) or subtracted from the last column of Pminus
+    Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
+    for(int s=0; s<Ls; ++s){
+      if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
+      else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
+    }
+  }
+
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+
+  if(inv){ // a non-zero inv selects the matrix inverses
+    PplusMat  = Pplus.inverse();
+    PminusMat = Pminus.inverse();
+  } else {
+    PplusMat  = Pplus;
+    PminusMat = Pminus;
+  }
+
+  if(dag){ // a non-zero dag takes the adjoint, applied after any inversion
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple
+
+  for(auto site=0; site<vol; site++){ // plain serial loop over sites
+
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+
+    for(int s1=0; s1<Ls; s1++){ // s1: output slice (matrix row)
+      SiteChi = Zero();
+      for(int s2=0; s2<Ls; s2++){ // s2: input slice (matrix column)
+	int lex2 = s2 + Ls*site;
+	if(PplusMat(s1,s2) != 0.0){ // exact zeros are skipped
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
+	}
+	if(PminusMat(s1,s2) != 0.0){
+	  spProj5m(SitePminus, psi[lex2]);
+	  accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      chi[s1+Ls*site] = SiteChi*0.5; // NOTE(review): the 0.5 presumably compensates the projector normalisation - confirm
+    }
+  }
+}
+
+#ifdef MOBIUS_EOFA_DPERP_DENSE // everything up to the matching #endif is compiled only when this macro is defined
+
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+
+template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+#endif // MOBIUS_EOFA_DPERP_DENSE
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
@@ -0,0 +1,289 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME -- make a version of these routines with site loop outermost for cache reuse.
+// Pminus forwards
+// Pplus  backwards
+// Fifth-dimension hopping term, built one slice at a time:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_-*psi[s+1] + lower[s]*P_+*psi[s-1]
+// Slice indices wrap periodically (Ls-1 -> 0 forwards, 0 -> Ls-1 backwards); written
+// with a modulus, both neighbour indices stay in range for every Ls >= 1.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+				  FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  Coeff_t one(1.0);
+  const int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    const int s_fwd = (s+1)%Ls;    // forward neighbour,  paired with P_-
+    const int s_bwd = (s+Ls-1)%Ls; // backward neighbour, paired with P_+
+    // The first call overwrites chi[s]; the second reads chi[s] back and adds to it,
+    // so the two calls must stay in this order.
+    axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s_fwd);
+    axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s_bwd);
+  }
+}
+
+// M5D plus the shift term, which adds one extra column to every slice:
+//   pm == 1   : chi[s] += shift_coeffs[s]*P_+*psi[Ls-1]
+//   otherwise : chi[s] += shift_coeffs[s]*P_-*psi[0]
+// Neighbour slice indices wrap periodically and stay in range for every Ls >= 1.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
+					FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+					Vector<Coeff_t>& shift_coeffs)
+{
+  Coeff_t one(1.0);
+  const int Ls = this->Ls;
+  const bool plus_shift = (this->pm == 1); // loop-invariant: which projector the shift term uses
+  for(int s=0; s<Ls; s++){
+    const int s_fwd = (s+1)%Ls;
+    const int s_bwd = (s+Ls-1)%Ls;
+    // chi[s] is overwritten by the first call and accumulated into by the ones after it.
+    axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s_fwd);
+    axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s_bwd);
+    if(plus_shift){ axpby_ssp_pplus (chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
+    else          { axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
+  }
+}
+
+// Daggered form of M5D: same slice structure with the two chiral projectors exchanged,
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_+*psi[s+1] + lower[s]*P_-*psi[s-1]
+// The coefficients are used exactly as passed in; nothing is conjugated here.
+// Slice indices wrap periodically and stay in range for every Ls >= 1.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+				     FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  Coeff_t one(1.0);
+  const int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    const int s_fwd = (s+1)%Ls;    // forward neighbour,  paired with P_+
+    const int s_bwd = (s+Ls-1)%Ls; // backward neighbour, paired with P_-
+    // The first call overwrites chi[s]; the second reads chi[s] back and adds to it,
+    // so the two calls must stay in this order.
+    axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s_fwd);
+    axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s_bwd);
+  }
+}
+
+// Daggered M5D_shift. The tridiagonal part exchanges the projectors relative to M5D_shift, and
+// the shift term becomes a row: every psi[s] feeds a single target slice,
+//   pm == 1   : chi[Ls-1] += shift_coeffs[s]*P_+*psi[s]
+//   otherwise : chi[0]    += shift_coeffs[s]*P_-*psi[s]
+// The row is accumulated in a second loop, once every chi[s] has been written: inside the first
+// loop the s==Ls-1 pass would overwrite whatever pm==1 had already added to chi[Ls-1].
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
+					   FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+					   Vector<Coeff_t>& shift_coeffs)
+{
+  Coeff_t one(1.0);
+  const int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){ // tridiagonal part; neighbour indices wrap periodically
+    axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, (s+1)%Ls);
+    axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, (s+Ls-1)%Ls);
+  }
+  for(int s=0; s<Ls; s++){ // shift row, added onto the finished target slice
+    if(this->pm == 1){ axpby_ssp_pplus (chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
+    else             { axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
+  }
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
+  // Unshifted case: chi is built from psi by four in-place sweeps over the Ls slices (steps labelled below).
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  // Apply (L^{\prime})^{-1}: forward recursion, each slice uses the slice just computed before it
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
+  }
+
+  // L_m^{-1}: only the last slice changes
+  for(int s=0; s<Ls-1; s++){ // chi[Ls-1] -= leem[s] P_- chi[s], for every s < Ls-1
+    axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
+  }
+
+  // U_m^{-1} D^{-1}
+  for(int s=0; s<Ls-1; s++){ // chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]
+    axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
+  }
+  axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1); // last slice scaled only after the loop above read it unscaled
+
+  // Apply U^{-1}: backward recursion from s=Ls-2 down to 0
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[s] -= uee[s] P_- chi[s+1]
+  }
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
+{
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+  // tmp[0] collects sum_s MooeeInv_shift_lc[s]*psi[s]; the last sweep adds it, projected and scaled, to every chi[s].
+  FermionField tmp(psi.Grid()); // scratch field; only its slice 0 is written or read below
+  // tmp has not been written yet, so its seed takes both inputs from psi (0*psi[0] + lc[0]*psi[0]).
+  // Apply (L^{\prime})^{-1}
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  axpby_ssp(tmp, czero, psi, this->MooeeInv_shift_lc[0], psi, 0, 0); // tmp[0] = lc[0]*psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
+    axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s); // tmp[0] += lc[s]*psi[s]
+  }
+
+  // L_m^{-1}: only the last slice changes
+  for(int s=0; s<Ls-1; s++){ // chi[Ls-1] -= leem[s] P_- chi[s], for every s < Ls-1
+    axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
+  }
+
+  // U_m^{-1} D^{-1}
+  for(int s=0; s<Ls-1; s++){ // chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]
+    axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
+  }
+  axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1); // last slice scaled only after the loop above read it unscaled
+
+  // Apply U^{-1} and add shift term (P_+ tmp[0] when pm==1, P_- tmp[0] otherwise, scaled per slice)
+  if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[s] -= uee[s] P_- chi[s+1]
+    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
+    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
+  }
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
+  // Unshifted case: four in-place sweeps over the Ls slices, using conjugated coefficients (steps labelled below).
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  // Apply (U^{\prime})^{-dagger}: forward recursion, each slice uses the slice just computed before it
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){ // chi[s] = psi[s] - conj(uee[s-1]) P_- chi[s-1]
+    axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+  }
+
+  // U_m^{-\dagger}: only the last slice changes
+  for(int s=0; s<Ls-1; s++){ // chi[Ls-1] -= conj(ueem[s]) P_+ chi[s]
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
+  }
+
+  // L_m^{-\dagger} D^{-dagger}
+  for(int s=0; s<Ls-1; s++){ // chi[s] = chi[s]/conj(dee[s]) - conj(leem[s]/dee[Ls-1]) P_- chi[Ls-1]
+    axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
+  }
+  axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1); // last slice scaled only after the loop above read it unscaled
+
+  // Apply L^{-dagger}: backward recursion from s=Ls-2 down to 0
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[s] -= conj(lee[s]) P_+ chi[s+1]
+  }
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
+{
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+  // The last sweep adds tmp[0], projected and scaled by MooeeInvDag_shift_norm[s], to every chi[s].
+  FermionField tmp(psi.Grid()); // scratch field; only its slice 0 is written or read below
+  // tmp has not been written yet, so its seed takes both inputs from psi (0*psi[0] + lc[0]*psi[0]).
+  // Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  axpby_ssp(tmp, czero, psi, this->MooeeInvDag_shift_lc[0], psi, 0, 0); // tmp[0] = lc[0]*psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1); // chi[s] = psi[s] - conj(uee[s-1]) P_- chi[s-1]
+    axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s); // tmp[0] += lc[s]*psi[s]
+  }
+
+  // U_m^{-\dagger}: only the last slice changes
+  for(int s=0; s<Ls-1; s++){ // chi[Ls-1] -= conj(ueem[s]) P_+ chi[s]
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
+  }
+
+  // L_m^{-\dagger} D^{-dagger}
+  for(int s=0; s<Ls-1; s++){ // chi[s] = chi[s]/conj(dee[s]) - conj(leem[s]/dee[Ls-1]) P_- chi[Ls-1]
+    axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
+  }
+  axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1); // last slice scaled only after the loop above read it unscaled
+
+  // Apply L^{-dagger} and add shift (P_+ tmp[0] when pm==1, P_- tmp[0] otherwise, scaled per slice)
+  if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[s] -= conj(lee[s]) P_+ chi[s+1]
+    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
+    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
+  }
+}
+
+#ifdef MOBIUS_EOFA_DPERP_LINALG
+// One INSTANTIATE_DPERP_MOBIUS_EOFA per implementation type; the macro is defined outside this file (presumably explicit instantiations -- TODO confirm).
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+// FH / DF variants of the same three families (presumably the mixed-precision implementations -- TODO confirm).
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+#endif // MOBIUS_EOFA_DPERP_LINALG
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/g5HermitianLinop.h
+++ b/Grid/qcd/action/fermion/g5HermitianLinop.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -23,13 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef G5_HERMITIAN_LINOP
 #define G5_HERMITIAN_LINOP

-namespace Grid {
-  namespace QCD {
+NAMESPACE_BEGIN(Grid);

 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
@@ -46,12 +45,12 @@ public:
    HermOp(in,out);
  }
  void OpDiag (const Field &in, Field &out) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdiag(in,tmp);
    G5R5(out,tmp);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdir(in,tmp,dir,disp);
    G5R5(out,tmp);
  }
@@ -68,7 +67,7 @@ public:
    n2=real(dot);
  }
  void HermOp(const Field &in, Field &out){
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    G5R5(out,tmp);
  }
@@ -80,7 +79,7 @@ class Gamma5HermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Gamma g5;
 public:
-    Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat), g5(Gamma::Algebra::Gamma5) {};
+  Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat), g5(Gamma::Algebra::Gamma5) {};
  void Op     (const Field &in, Field &out){
    HermOp(in,out);
  }
@@ -88,12 +87,12 @@ public:
    HermOp(in,out);
  }
  void OpDiag (const Field &in, Field &out) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdiag(in,tmp);
    out=g5*tmp;
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdir(in,tmp,dir,disp);
    out=g5*tmp;
  }
@@ -110,12 +109,11 @@ public:
    n2=real(dot);
  }
  void HermOp(const Field &in, Field &out){
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    out=g5*tmp;
  }
 };

-
-}}
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -26,31 +26,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

- template<class Impl>
- CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
-					GridCartesian         &FiveDimGrid,
-					GridRedBlackCartesian &FiveDimRedBlackGrid,
-					GridCartesian         &FourDimGrid,
-					GridRedBlackCartesian &FourDimRedBlackGrid,
-					RealD _mass,RealD _M5,const ImplParams &p) :
-   WilsonFermion5D<Impl>(_Umu,
-		   FiveDimGrid,
-		   FiveDimRedBlackGrid,
-		   FourDimGrid,
- 	 	   FourDimRedBlackGrid,_M5,p),
-   mass(_mass)
- { 
- }
+template<class Impl>
+CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
+				       GridCartesian         &FiveDimGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridCartesian         &FourDimGrid,
+				       GridRedBlackCartesian &FourDimRedBlackGrid,
+				       RealD _mass,RealD _M5,const ImplParams &p) :
+  WilsonFermion5D<Impl>(_Umu,
+			FiveDimGrid,
+			FiveDimRedBlackGrid,
+			FourDimGrid,
+			FourDimRedBlackGrid,_M5,p),
+  mass(_mass)
+{ 
+}

 ///////////////////////////////////////////////////////////////
 // Physical surface field utilities
@@ -61,8 +60,8 @@ void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &so
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  tmp = solution5d;
-  conformable(solution5d._grid,this->FermionGrid());
-  conformable(exported4d._grid,this->GaugeGrid());
+  conformable(solution5d.Grid(),this->FermionGrid());
+  conformable(exported4d.Grid(),this->GaugeGrid());
  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
  ExtractSlice(exported4d, tmp, 0, 0);
@@ -71,7 +70,7 @@ template<class Impl>
 void CayleyFermion5D<Impl>::P(const FermionField &psi, FermionField &chi)
 {
  int Ls= this->Ls;
-  chi=zero;
+  chi=Zero();
  for(int s=0;s<Ls;s++){
    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s+1)%Ls);
@@ -81,7 +80,7 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Pdag(const FermionField &psi, FermionField &chi)
 {
  int Ls= this->Ls;
-  chi=zero;
+  chi=Zero();
  for(int s=0;s<Ls;s++){
    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s-1+Ls)%Ls);
@@ -93,8 +92,8 @@ void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solu
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  tmp = solution5d;
-  conformable(solution5d._grid,this->FermionGrid());
-  conformable(exported4d._grid,this->GaugeGrid());
+  conformable(solution5d.Grid(),this->FermionGrid());
+  conformable(exported4d.Grid(),this->GaugeGrid());
  axpby_ssp_pplus (tmp, 0., solution5d, 1., solution5d, 0, 0);
  axpby_ssp_pminus(tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
  ExtractSlice(exported4d, tmp, 0, 0);
@@ -104,9 +103,9 @@ void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
-  conformable(imported5d._grid,this->FermionGrid());
-  conformable(input4d._grid   ,this->GaugeGrid());
-  tmp = zero;
+  conformable(imported5d.Grid(),this->FermionGrid());
+  conformable(input4d.Grid()   ,this->GaugeGrid());
+  tmp = Zero();
  InsertSlice(input4d, tmp, 0   , 0);
  InsertSlice(input4d, tmp, Ls-1, 0);
  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
@@ -119,9 +118,9 @@ void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &inpu
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
-  conformable(imported5d._grid,this->FermionGrid());
-  conformable(input4d._grid   ,this->GaugeGrid());
-  tmp = zero;
+  conformable(imported5d.Grid(),this->FermionGrid());
+  conformable(input4d.Grid()   ,this->GaugeGrid());
+  tmp = Zero();
  InsertSlice(input4d, tmp, 0   , 0);
  InsertSlice(input4d, tmp, Ls-1, 0);
  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
@@ -156,7 +155,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
 template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 {
  this->Report();
-  std::vector<int> latt = GridDefaultLatt();          
+  Coordinate latt = GridDefaultLatt();          
  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP     = this->_FourDimGrid->_Nprocessors;
  if ( M5Dcalls > 0 ) {
@@ -164,10 +163,16 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;

-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
+    // Flops = 10.0*(Nc*Ns) *Ls*vol
+    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+
+    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
+    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
+    // write = 1
+    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
+    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
  }

  if ( MooeeInvCalls > 0 ) {
@@ -175,11 +180,16 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-
+#ifdef GRID_NVCC
+    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+#else
    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+#endif
  }

 }
@@ -198,18 +208,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag (Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
+  Vector<Coeff_t> diag (Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bs;
-  std::vector<Coeff_t> upper= cs;
-  std::vector<Coeff_t> lower= cs; 
+  Vector<Coeff_t> diag = bs;
+  Vector<Coeff_t> upper= cs;
+  Vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@@ -218,9 +228,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = beo;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = beo;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@@ -233,9 +243,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@@ -248,9 +258,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);

  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@@ -278,9 +288,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0);
-  std::vector<Coeff_t> lower(Ls,-1.0);
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@@ -290,9 +300,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag =bs;
-  std::vector<Coeff_t> upper=cs;
-  std::vector<Coeff_t> lower=cs; 
+  Vector<Coeff_t> diag =bs;
+  Vector<Coeff_t> upper=cs;
+  Vector<Coeff_t> lower=cs; 

  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
@@ -315,9 +325,7 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
 template<class Impl>
 RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
-  int Ls=this->Ls;
-  
-  FermionField Din(psi._grid);
+  FermionField Din(psi.Grid());
  
  // Assemble Din
  Meooe5D(psi,Din);
@@ -337,7 +345,7 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
  //D2- P+     D2+            P-D1-^dag D2+dag
  
-  FermionField Din(psi._grid);
+  FermionField Din(psi.Grid());
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
  
@@ -353,11 +361,9 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
-  int Ls=this->Ls;
-
  Meooe5D(psi,this->tmp()); 

-  if ( psi.checkerboard == Odd ) {
+  if ( psi.Checkerboard() == Odd ) {
    this->DhopEO(this->tmp(),chi,DaggerNo);
  } else {
    this->DhopOE(this->tmp(),chi,DaggerNo);
@@ -368,7 +374,7 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
 {
  // Apply 4d dslash
-  if ( psi.checkerboard == Odd ) {
+  if ( psi.Checkerboard() == Odd ) {
    this->DhopEO(psi,this->tmp(),DaggerYes);
  } else {
    this->DhopOE(psi,this->tmp(),DaggerYes);
@@ -386,7 +392,7 @@ void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,in
 template<class Impl>
 void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
 {
-  FermionField Din(V._grid);
+  FermionField Din(V.Grid());
  
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
@@ -401,7 +407,7 @@ void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
 {
-  FermionField Din(V._grid);
+  FermionField Din(V.Grid());
  
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
@@ -416,7 +422,7 @@ void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
 {
-  FermionField Din(V._grid);
+  FermionField Din(V.Grid());
  
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
@@ -433,7 +439,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  std::vector<Coeff_t> gamma(this->Ls);
+  Vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(1.0,gamma,b,c);
 }
@@ -441,13 +447,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  std::vector<Coeff_t> gamma(this->Ls);
+  Vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 //Zolo
 template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;

@@ -568,12 +574,12 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    dee[Ls-1] += delta_d;
  }  

-  int inv=1;
-  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
-  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
+  //  int inv=1;
+  //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
+  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
 }

-
+#if 0
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
 						 Vector<iSinglet<Simd> > & Matp,
@@ -628,35 +634,32 @@ void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
  Matm.resize(Ls*LLs);

  for(int s2=0;s2<Ls;s2++){
-  for(int s1=0;s1<LLs;s1++){
-    int istride = LLs;
-    int ostride = 1;
-    Simd Vp;
-    Simd Vm;
-    scalar_type *sp = (scalar_type *)&Vp;
-    scalar_type *sm = (scalar_type *)&Vm;
-    for(int l=0;l<Nsimd;l++){
-      if ( switcheroo<Coeff_t>::iscomplex() ) {
-	sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	sm[l] = PminusMat(l*istride+s1*ostride,s2);
-      } else { 
-      // if real
-	scalar_type tmp;
-	tmp = PplusMat (l*istride+s1*ostride,s2);
-	sp[l] = scalar_type(tmp.real(),tmp.real());
-	tmp = PminusMat(l*istride+s1*ostride,s2);
-	sm[l] = scalar_type(tmp.real(),tmp.real());
+    for(int s1=0;s1<LLs;s1++){
+      int istride = LLs;
+      int ostride = 1;
+      Simd Vp;
+      Simd Vm;
+      scalar_type *sp = (scalar_type *)&Vp;
+      scalar_type *sm = (scalar_type *)&Vm;
+      for(int l=0;l<Nsimd;l++){
+	if ( switcheroo<Coeff_t>::iscomplex() ) {
+	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
+	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
+	} else { 
+	  // if real
+	  scalar_type tmp;
+	  tmp = PplusMat (l*istride+s1*ostride,s2);
+	  sp[l] = scalar_type(tmp.real(),tmp.real());
+	  tmp = PminusMat(l*istride+s1*ostride,s2);
+	  sm[l] = scalar_type(tmp.real(),tmp.real());
+	}
      }
-    }
-    Matp[LLs*s2+s1] = Vp;
-    Matm[LLs*s2+s1] = Vm;
-  }}
+      Matp[LLs*s2+s1] = Vp;
+      Matm[LLs*s2+s1] = Vm;
+    }}
 }
-
-
-  FermOpTemplateInstantiate(CayleyFermion5D);
-  GparityFermOpTemplateInstantiate(CayleyFermion5D);
-
-}}
+#endif
+
+NAMESPACE_END(Grid);


--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -0,0 +1,235 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+NAMESPACE_BEGIN(Grid);
+
+// M5D: fifth-dimension hopping term, applied to every block of Ls consecutive
+// outer sites.  For each s in [0,Ls):
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_-(psi[(s+1)%Ls]) + lower[s]*P_+(psi[(s+Ls-1)%Ls])
+// i.e. Pminus acts on the forward neighbour in s and Pplus on the backward
+// neighbour, with cyclic wrap-around in s.
+//
+//   psi_i : field whose s-neighbours are projected
+//   phi_i : field scaled by diag[s]; asserted to share psi_i's checkerboard
+//   chi_i : output field; its checkerboard is set to psi_i's
+//   lower, diag, upper : per-s coefficients, read at indices 0..Ls-1
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
+			       const FermionField &phi_i, 
+			       FermionField &chi_i,
+			       Vector<Coeff_t> &lower,
+			       Vector<Coeff_t> &diag,
+			       Vector<Coeff_t> &upper)
+{
+  
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  int Ls =this->Ls;
+
+  // Address the coefficients through raw element pointers so the loop body
+  // indexes plain arrays rather than the Vector containers, matching how
+  // MooeeInv/MooeeInvDag in this file address lee/dee/uee.
+  auto plower = & lower[0];
+  auto pdiag  = & diag [0];
+  auto pupper = & upper[0];
+
+  // 10 = 3 complex mult + 2 complex add
+  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  // One loop iteration per block of Ls consecutive outer sites.
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss= sss*Ls; // first outer site of this s-block
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1, tmp2;
+    for(int s=0;s<Ls;s++){
+      uint64_t idx_u = ss+((s+1)%Ls);    // forward neighbour in s (cyclic)
+      uint64_t idx_l = ss+((s+Ls-1)%Ls); // backward neighbour in s (cyclic)
+      spProj5m(tmp1,psi(idx_u));
+      spProj5p(tmp2,psi(idx_l));
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// M5Ddag: same structure as M5D with the two chiral projectors exchanged.
+// For each s in [0,Ls) of every block of Ls consecutive outer sites:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_+(psi[(s+1)%Ls]) + lower[s]*P_-(psi[(s+Ls-1)%Ls])
+//
+//   psi_i : field whose s-neighbours are projected
+//   phi_i : field scaled by diag[s]; asserted to share psi_i's checkerboard
+//   chi_i : output field; its checkerboard is set to psi_i's
+//   lower, diag, upper : per-s coefficients, read at indices 0..Ls-1
+//                        (used as passed; no conjugation is applied here)
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
+			      const FermionField &phi_i, 
+			      FermionField &chi_i,
+			      Vector<Coeff_t> &lower,
+			      Vector<Coeff_t> &diag,
+			      Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  int Ls=this->Ls;
+
+  // Address the coefficients through raw element pointers so the loop body
+  // indexes plain arrays rather than the Vector containers, matching how
+  // MooeeInv/MooeeInvDag in this file address lee/dee/uee.
+  auto plower = & lower[0];
+  auto pdiag  = & diag [0];
+  auto pupper = & upper[0];
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  // Timing and call count are accumulated into the same counters as M5D.
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  // One loop iteration per block of Ls consecutive outer sites.
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss=sss*Ls; // first outer site of this s-block
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2;
+    for(int s=0;s<Ls;s++){
+      uint64_t idx_u = ss+((s+1)%Ls);    // forward neighbour in s (cyclic)
+      uint64_t idx_l = ss+((s+Ls-1)%Ls); // backward neighbour in s (cyclic)
+      spProj5p(tmp1,psi(idx_u));
+      spProj5m(tmp2,psi(idx_l));
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// MooeeInv: writes into chi_i the result of four in-place sweeps over each
+// block of Ls consecutive outer sites of psi_i, using the member coefficient
+// arrays lee, leem, dee, ueem and uee.  The sweeps are order dependent: each
+// one reads chi values produced by the previous one.
+//   psi_i : input field
+//   chi_i : output field; its checkerboard is set to psi_i's
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  int Ls=this->Ls;
+
+  // Raw element pointers to the coefficient arrays, indexed inside the loop.
+  auto plee  = & lee [0];
+  auto pdee  = & dee [0];
+  auto puee  = & uee [0];
+  auto pleem = & leem[0];
+  auto pueem = & ueem[0];
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(blk,nloop,Simd::Nsimd(),{
+    const uint64_t base = blk*Ls;      // first outer site of this s-block
+    const uint64_t last = base+Ls-1;   // outer site holding s = Ls-1
+    using spinor = decltype(coalescedRead(psi[0]));
+    spinor proj;
+
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+
+    // Apply (L^{\prime})^{-1}: forward sweep, chi[0]=psi[0] then
+    // chi[s+1] = psi[s+1] - lee[s] * P_+ chi[s]
+    coalescedWrite(chi[base],psi(base));
+    for(int s=0;s<Ls-1;s++){
+      spProj5p(proj,chi(base+s));
+      coalescedWrite(chi[base+s+1], psi(base+s+1)-plee[s]*proj);
+    }
+
+    // L_m^{-1}: fold every s<Ls-1 into the last slice,
+    // chi[Ls-1] -= leem[s] * P_- chi[s]
+    for(int s=0;s<Ls-1;s++){
+      spProj5m(proj,chi(base+s));
+      coalescedWrite(chi[last], chi(last) - pleem[s]*proj);
+    }
+
+    // U_m^{-1} D^{-1}: scale each slice by 1/dee[s] and subtract the
+    // contribution of the (not yet rescaled) last slice
+    for(int s=0;s<Ls-1;s++){
+      spProj5p(proj,chi(last));
+      coalescedWrite(chi[base+s], (1.0/pdee[s])*chi(base+s)-(pueem[s]/pdee[Ls-1])*proj);
+    }
+    coalescedWrite(chi[last], (1.0/pdee[Ls-1])*chi(last));
+
+    // Apply U^{-1}: backward sweep, chi[s-1] -= uee[s-1] * P_- chi[s]
+    for(int s=Ls-1;s>0;s--){
+      spProj5m(proj,chi(base+s));
+      coalescedWrite(chi[base+s-1], chi(base+s-1) - puee[s-1]*proj);
+    }
+  });
+
+  MooeeInvTime+=usecond();
+
+}
+
+// MooeeInvDag: daggered counterpart of MooeeInv.  Runs four in-place sweeps
+// over each block of Ls consecutive outer sites, using the complex conjugates
+// of the member coefficient arrays uee, ueem, dee, leem and lee, with the
+// roles of the upper/lower arrays and of the two projectors exchanged
+// relative to MooeeInv.
+//   psi_i : input field
+//   chi_i : output field; its checkerboard is set to psi_i's
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  int Ls=this->Ls;
+
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  // Raw element pointers to the coefficient arrays, indexed inside the loop.
+  auto plee  = & lee [0];
+  auto pdee  = & dee [0];
+  auto puee  = & uee [0];
+  auto pleem = & leem[0];
+  auto pueem = & ueem[0];
+
+  // Input and output views must be on the same checkerboard.  (This used to
+  // compare psi with itself, which can never fail.)
+  assert(psi.Checkerboard() == chi.Checkerboard());
+
+  // Timing and call count share the counters used by MooeeInv.
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss=sss*Ls; // first outer site of this s-block
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp;
+
+    // Apply (U^{\prime})^{-dagger}: forward sweep,
+    // chi[s] = psi[s] - conj(uee[s-1]) * P_- chi[s-1]
+    coalescedWrite(chi[ss],psi(ss));
+    for (int s=1;s<Ls;s++){
+      spProj5m(tmp,chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s)-conjugate(puee[s-1])*tmp);
+    }
+    // U_m^{-\dagger}: fold every s<Ls-1 into the last slice
+    for (int s=0;s<Ls-1;s++){
+      spProj5p(tmp,chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp);
+    }
+
+    // L_m^{-\dagger} D^{-dagger}: rescale each slice and subtract the
+    // contribution of the (not yet rescaled) last slice
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(tmp,chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], conjugate(1.0/pdee[s])*chi(ss+s)-conjugate(pleem[s]/pdee[Ls-1])*tmp);
+    }	
+    coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*chi(ss+Ls-1));
+  
+    // Apply L^{-dagger}: backward sweep,
+    // chi[s] -= conj(lee[s]) * P_+ chi[s+1]
+    for (int s=Ls-2;s>=0;s--){
+      spProj5p(tmp,chi(ss+s+1));
+      coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp);
+    }
+  });
+  MooeeInvTime+=usecond();
+
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
@@ -0,0 +1,831 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+// Daggered inverse, dense-matrix path: forwards to MooeeInternal with the
+// dagger and inverse flags both set.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
+{
+  // NOTE(review): this local appears to act as a compile-time guard (the type
+  // must exist for Impl::LsVectorised&&EnableBool) - confirm against EnableIf.
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  const int dag = DaggerYes;
+  const int inv = InverseYes;
+  this->MooeeInternal(psi,chi,dag,inv);
+}
+  
+// Inverse, dense-matrix path: forwards to MooeeInternal with the inverse flag
+// set and the dagger flag clear.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  // NOTE(review): this local appears to act as a compile-time guard (the type
+  // must exist for Impl::LsVectorised&&EnableBool) - confirm against EnableIf.
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  const int dag = DaggerNo;
+  const int inv = InverseYes;
+  this->MooeeInternal(psi,chi,dag,inv);
+}
+// M5D for the Ls-vectorised layout.  The s index is split as s = o + i*LLs,
+// with o in [0,LLs) an outer-site offset and i a SIMD lane; the assert below
+// requires Ls/LLs == Simd::Nsimd().  For every block of LLs outer sites the
+// active branch computes, per outer offset v,
+//   chi[v](spin 0,1) = d[v]*phi[v](spin 0,1) + l[v]*psi[vm](spin 0,1)
+//   chi[v](spin 2,3) = d[v]*phi[v](spin 2,3) + u[v]*psi[vp](spin 2,3)
+// where vp / vm are the cyclic next / previous offsets and d, l, u are the
+// diag / lower / upper coefficients repacked into SIMD words.
+//   psi_i : field supplying the s-neighbours
+//   phi_i : field scaled by diag; asserted to share psi_i's checkerboard
+//   chi_i : output field; its checkerboard is set to psi_i's
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
+			   const FermionField &phi_i, 
+			   FermionField &chi_i,
+			   Vector<Coeff_t> &lower,
+			   Vector<Coeff_t> &diag,
+			   Vector<Coeff_t> &upper)
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  const int nsimd= Simd::Nsimd();
+
+  // One SIMD word of coefficients per outer offset.
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  // Lane i of word o receives the coefficient for s = o + i*LLs.
+  for(int o=0;o<LLs;o++){ // outer
+    for(int i=0;i<nsimd;i++){ //inner
+      int s  = o+i*LLs;
+      int ss = o*nsimd+i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  // The unrolled branch below writes out exactly three colour components.
+  assert(Nc==3);
+
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
+#if 0
+    // Disabled reference version written with spProj5/spRecon5 and rotate().
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5m(hp,psi[ss+vp]);
+      spProj5p(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	
+      hp=0.5*hp;
+      hm=0.5*hm;
+
+      spRecon5m(fp,hp);
+      spRecon5p(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v];
+      chi[ss+v] = chi[ss+v]     +u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+#else
+    for(int v=0;v<LLs;v++){
+      
+      // NOTE(review): for the last block (ss+LLs >= oSites()) this index is
+      // past the final site; confirm psi[] / vprefetch tolerate that.
+      vprefetch(psi[ss+v+LLs]);
+
+      // Cyclic next / previous outer offset within the block.
+      int vp= (v==LLs-1) ? 0     : v+1;
+      int vm= (v==0    ) ? LLs-1 : v-1;
+	
+      // Spin components 2,3 of the forward neighbour.
+      Simd hp_00 = psi[ss+vp]()(2)(0); 
+      Simd hp_01 = psi[ss+vp]()(2)(1); 
+      Simd hp_02 = psi[ss+vp]()(2)(2); 
+      Simd hp_10 = psi[ss+vp]()(3)(0); 
+      Simd hp_11 = psi[ss+vp]()(3)(1); 
+      Simd hp_12 = psi[ss+vp]()(3)(2); 
+	
+      // Spin components 0,1 of the backward neighbour.
+      Simd hm_00 = psi[ss+vm]()(0)(0); 
+      Simd hm_01 = psi[ss+vm]()(0)(1); 
+      Simd hm_02 = psi[ss+vm]()(0)(2); 
+      Simd hm_10 = psi[ss+vm]()(1)(0); 
+      Simd hm_11 = psi[ss+vm]()(1)(1); 
+      Simd hm_12 = psi[ss+vm]()(1)(2); 
+
+      // Forward neighbour wrapped round the block: rotate the SIMD words
+      // (the disabled branch above uses rotate(hp,hp,1) at this point).
+      if ( vp<=v ) {
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+      // Backward neighbour wrapped round the block: rotate the other way
+      // (the disabled branch above uses rotate(hm,hm,nsimd-1) at this point).
+      if ( vm>=v ) {
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Can force these to real arithmetic and save 2x.
+      // Spin 0,1 combine diag*phi with lower*hm; spin 2,3 with upper*hp.
+      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
+      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
+      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+      // Store the twelve spin-colour components of the result.
+      vstream(chi[ss+v]()(0)(0),p_00);
+      vstream(chi[ss+v]()(0)(1),p_01);
+      vstream(chi[ss+v]()(0)(2),p_02);
+      vstream(chi[ss+v]()(1)(0),p_10);
+      vstream(chi[ss+v]()(1)(1),p_11);
+      vstream(chi[ss+v]()(1)(2),p_12);
+      vstream(chi[ss+v]()(2)(0),p_20);
+      vstream(chi[ss+v]()(2)(1),p_21);
+      vstream(chi[ss+v]()(2)(2),p_22);
+      vstream(chi[ss+v]()(3)(0),p_30);
+      vstream(chi[ss+v]()(3)(1),p_31);
+      vstream(chi[ss+v]()(3)(2),p_32);
+
+    }
+#endif
+  });
+  M5Dtime+=usecond();
+}
+
+// M5Ddag for the Ls-vectorised layout.  The s index is split as
+// s = o + i*LLs, with o in [0,LLs) an outer-site offset and i a SIMD lane; the
+// assert below requires Ls/LLs == Simd::Nsimd().  For every block of LLs outer
+// sites the active branch computes, per outer offset v,
+//   chi[v](spin 0,1) = d[v]*phi[v](spin 0,1) + u[v]*psi[vp](spin 0,1)
+//   chi[v](spin 2,3) = d[v]*phi[v](spin 2,3) + l[v]*psi[vm](spin 2,3)
+// where vp / vm are the cyclic next / previous offsets and d, l, u are the
+// diag / lower / upper coefficients repacked into SIMD words.
+//   psi_i : field supplying the s-neighbours
+//   phi_i : field scaled by diag; asserted to share psi_i's checkerboard
+//   chi_i : output field; its checkerboard is set to psi_i's
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
+			      const FermionField &phi_i, 
+			      FermionField &chi_i,
+			      Vector<Coeff_t> &lower,
+			      Vector<Coeff_t> &diag,
+			      Vector<Coeff_t> &upper)
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi=psi_i.View();
+  auto phi=phi_i.View();
+  auto chi=chi_i.View();
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  int nsimd= Simd::Nsimd();
+
+  // One SIMD word of coefficients per outer offset.
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  // Lane i of word o receives the coefficient for s = o + i*LLs.
+  for(int o=0;o<LLs;o++){ // outer
+    for(int i=0;i<nsimd;i++){ //inner
+      int s  = o+i*LLs;
+      int ss = o*nsimd+i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+  // Timing and call count share the counters used by M5D.
+  M5Dcalls++;
+  M5Dtime-=usecond();
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
+#if 0
+    // Disabled reference version written with spProj5/spRecon5 and rotate().
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5p(hp,psi[ss+vp]);
+      spProj5m(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+      
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5p(fp,hp);
+      spRecon5m(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+#else
+    for(int v=0;v<LLs;v++){
+
+      // NOTE(review): for the last block (ss+LLs >= oSites()) this index is
+      // past the final site; confirm psi[] / vprefetch tolerate that.
+      vprefetch(psi[ss+v+LLs]);
+
+      // Cyclic next / previous outer offset within the block.
+      int vp= (v==LLs-1) ? 0     : v+1;
+      int vm= (v==0    ) ? LLs-1 : v-1;
+	
+      // Spin components 0,1 of the forward neighbour.
+      Simd hp_00 = psi[ss+vp]()(0)(0); 
+      Simd hp_01 = psi[ss+vp]()(0)(1); 
+      Simd hp_02 = psi[ss+vp]()(0)(2); 
+      Simd hp_10 = psi[ss+vp]()(1)(0); 
+      Simd hp_11 = psi[ss+vp]()(1)(1); 
+      Simd hp_12 = psi[ss+vp]()(1)(2); 
+	
+      // Spin components 2,3 of the backward neighbour.
+      Simd hm_00 = psi[ss+vm]()(2)(0); 
+      Simd hm_01 = psi[ss+vm]()(2)(1); 
+      Simd hm_02 = psi[ss+vm]()(2)(2); 
+      Simd hm_10 = psi[ss+vm]()(3)(0); 
+      Simd hm_11 = psi[ss+vm]()(3)(1); 
+      Simd hm_12 = psi[ss+vm]()(3)(2); 
+
+      // Forward neighbour wrapped round the block: rotate the SIMD words
+      // (the disabled branch above uses rotate(hp,hp,1) at this point).
+      if ( vp<=v ) {
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+      // Backward neighbour wrapped round the block: rotate the other way
+      // (the disabled branch above uses rotate(hm,hm,nsimd-1) at this point).
+      if ( vm>=v ) {
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Spin 0,1 combine diag*phi with upper*hp.
+      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
+      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+      // Spin 2,3 combine diag*phi with lower*hm.
+      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
+      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+
+      // Store the twelve spin-colour components of the result.
+      vstream(chi[ss+v]()(0)(0),p_00);
+      vstream(chi[ss+v]()(0)(1),p_01);
+      vstream(chi[ss+v]()(0)(2),p_02);
+      vstream(chi[ss+v]()(1)(0),p_10);
+      vstream(chi[ss+v]()(1)(1),p_11);
+      vstream(chi[ss+v]()(1)(2),p_12);
+      vstream(chi[ss+v]()(2)(0),p_20);
+      vstream(chi[ss+v]()(2)(1),p_21);
+      vstream(chi[ss+v]()(2)(2),p_22);
+      vstream(chi[ss+v]()(3)(0),p_30);
+      vstream(chi[ss+v]()(3)(1),p_31);
+      vstream(chi[ss+v]()(3)(2),p_32);
+    }
+#endif
+  });
+  M5Dtime+=usecond();
+}
+
+
+#ifdef AVX512 
+#include <simd/Intel512common.h>
+#include <simd/Intel512avx.h>
+#include <simd/Intel512single.h>
+#endif 
+
+// MooeeInternalAsm: applies the dense s-space matrices Matp / Matm to the
+// block of LLs outer sites belonging to one 4D site.  Spin components 0,1 of
+// chi are accumulated with Matp and spin components 2,3 with Matm.
+// MooeeInternal selects this routine when switcheroo<Coeff_t>::iscomplex() is
+// false; the generic branch multiplies with real_madd.
+//
+//   psi_i : input field
+//   chi_i : output field
+//   LLs   : number of outer sites per block
+//   site  : block index; outer site = s + LLs*site
+//   Matp, Matm : matrices stored so that, in the generic branch, element
+//                [LLs*s + s1] with s = s2 + l*LLs couples input (s2, lane l)
+//                to output s1
+//
+// With AVX512 defined an inline-assembly version is used instead; it keeps
+// the twelve accumulators in zmm1..zmm12 across separate asm statements.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField &chi_i,
+					int LLs, int site,
+					Vector<iSinglet<Simd> > &Matp,
+					Vector<iSinglet<Simd> > &Matm)
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+#ifndef AVX512
+  {
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+	  int s=s2+l*LLs;
+	  int lex=s2+LLs*site;
+	
+	  // Start of a new output row s1: clear both accumulators.
+	  if ( s2==0 && l==0) {
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+	
+	  // Broadcast lane l of the upper (spin 0,1) components ...
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	    }}
+	  // ... and of the lower (spin 2,3) components.
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	    }}
+
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
+	      SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
+	    }}
+
+	}}
+      {
+	// Row s1 is complete: write it out.
+	int lex = s1+LLs*site;
+	for(int sp=0;sp<2;sp++){
+	  for(int co=0;co<Nc;co++){
+	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+
+  }
+#else
+  {
+    // pointers
+    //  MASK_REGS;
+    // Register names for the accumulators (Chi_*) and the broadcast inputs
+    // (BCAST*); '%%' because they are used inside extended asm with operands.
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+#define BCAST0   %%zmm13
+#define BCAST1   %%zmm14
+#define BCAST2   %%zmm15
+#define BCAST3   %%zmm16
+#define BCAST4   %%zmm17
+#define BCAST5   %%zmm18
+#define BCAST6   %%zmm19
+#define BCAST7   %%zmm20
+#define BCAST8   %%zmm21
+#define BCAST9   %%zmm22
+#define BCAST10  %%zmm23
+#define BCAST11  %%zmm24
+
+    // Byte stride between the matrix elements of consecutive SIMD lanes:
+    // s advances by LLs per lane and the matrix index is LLs*s+s1.
+    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	int lex=s2+LLs*site;
+	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t)&psi[lex];
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	  // NOTE(review): neither asm statement lists the zmm registers as
+	  // clobbers, and the accumulators must survive between statements;
+	  // confirm the compiler cannot use zmm1..zmm24 in between.
+	  if ( (s2+l)==0 ) {
+	    // First term of row s1: the VMULMEM forms (re)initialise Chi_*.
+	    asm (
+		 VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
+		 VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
+		 VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
+		 VBCASTCDUP(0,%2,BCAST0)   
+		 VBCASTCDUP(1,%2,BCAST1)   
+		 VBCASTCDUP(2,%2,BCAST2)   
+		 VBCASTCDUP(3,%2,BCAST3)   
+		 VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
+		 VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
+		 VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
+		 VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
+		 VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
+		 VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
+		 VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
+		 VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
+		 VMULMEM (0,%1,BCAST8,Chi_22)         
+		 VMULMEM (0,%1,BCAST9,Chi_30)
+		 VMULMEM (0,%1,BCAST10,Chi_31)       
+		 VMULMEM (0,%1,BCAST11,Chi_32)
+		 : : "r" (a0), "r" (a1), "r" (a2)  );
+	  } else { 
+	    // Remaining terms: the VMADDMEM forms accumulate into Chi_*.
+	    asm (
+		 VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
+		 VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
+		 VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
+		 VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
+		 VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
+		 VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
+		 VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
+		 VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
+		 VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
+		 VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
+		 VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
+		 VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
+		 : : "r" (a0), "r" (a1), "r" (a2)  );
+	  }
+	  // Advance to the next lane: next matrix element, next scalar of psi.
+	  a0 = a0+incr;
+	  a1 = a1+incr;
+	  a2 = a2+sizeof(typename Simd::scalar_type);
+	}}
+      {
+	// Row s1 is complete: store the twelve accumulators to chi.
+	int lexa = s1+LLs*site;
+	asm (
+	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
+	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
+	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
+	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+      }
+    }
+  }
+  // Drop the register-name macros so they do not leak past this function.
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+#endif
+}
+
+// Z-mobius version
+// MooeeInternalZAsm: applies the dense s-space matrices Matp / Matm to the
+// block of LLs outer sites belonging to one 4D site.  Spin components 0,1 of
+// chi are accumulated with Matp and spin components 2,3 with Matm.
+// MooeeInternal selects this routine when switcheroo<Coeff_t>::iscomplex() is
+// true; the generic branch uses the ordinary operator* / operator+.
+//
+//   psi_i : input field
+//   chi_i : output field
+//   LLs   : number of outer sites per block
+//   site  : block index; outer site = s + LLs*site
+//   Matp, Matm : matrices stored so that, in the generic branch, element
+//                [LLs*s + s1] with s = s2 + l*LLs couples input (s2, lane l)
+//                to output s1
+//
+// With AVX512 defined an inline-assembly version is used instead; it keeps
+// the twelve accumulators in zmm0..zmm11 across separate asm statements.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField &chi_i,
+					 int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
+{
+  // NOTE(review): unlike MooeeInternalAsm this guard omits "&&EnableBool";
+  // confirm whether that difference is intended.
+  EnableIf<Impl::LsVectorised,int> sfinae=0;
+#ifndef AVX512
+  {
+    auto psi = psi_i.View();
+    auto chi = chi_i.View();
+
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+	  int s=s2+l*LLs;
+	  int lex=s2+LLs*site;
+	
+	  // Start of a new output row s1: clear both accumulators.
+	  if ( s2==0 && l==0) {
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+	
+	  // Broadcast lane l of the upper (spin 0,1) components ...
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	    }}
+	  // ... and of the lower (spin 2,3) components.
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	    }}
+
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
+	      SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
+	    }}
+
+
+	}}
+      {
+	// Row s1 is complete: write it out.
+	int lex = s1+LLs*site;
+	for(int sp=0;sp<2;sp++){
+	  for(int co=0;co<Nc;co++){
+	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+
+  }
+#else
+  {
+    auto psi = psi_i.View();
+    auto chi = chi_i.View();
+    // pointers
+    //  MASK_REGS;
+    // Accumulator register names.  Chi_* (single '%') are used in the asm
+    // statements that have no operand list; pChi_* ('%%') name the same
+    // registers for the extended asm store, which does have operands.
+#define Chi_00 %zmm0
+#define Chi_01 %zmm1
+#define Chi_02 %zmm2
+#define Chi_10 %zmm3
+#define Chi_11 %zmm4
+#define Chi_12 %zmm5
+#define Chi_20 %zmm6
+#define Chi_21 %zmm7
+#define Chi_22 %zmm8
+#define Chi_30 %zmm9
+#define Chi_31 %zmm10
+#define Chi_32 %zmm11
+#define pChi_00 %%zmm0
+#define pChi_01 %%zmm1
+#define pChi_02 %%zmm2
+#define pChi_10 %%zmm3
+#define pChi_11 %%zmm4
+#define pChi_12 %%zmm5
+#define pChi_20 %%zmm6
+#define pChi_21 %%zmm7
+#define pChi_22 %%zmm8
+#define pChi_30 %%zmm9
+#define pChi_31 %%zmm10
+#define pChi_32 %%zmm11
+
+#define BCAST_00   %zmm12
+#define  SHUF_00   %zmm13
+#define BCAST_01   %zmm14
+#define  SHUF_01   %zmm15
+#define BCAST_02   %zmm16
+#define  SHUF_02   %zmm17
+#define BCAST_10   %zmm18
+#define  SHUF_10   %zmm19
+#define BCAST_11   %zmm20
+#define  SHUF_11   %zmm21
+#define BCAST_12   %zmm22
+#define  SHUF_12   %zmm23
+
+    // Matrix element registers (Mp/Mm) and their VSHUF'ed copies (Mps/Mms).
+#define Mp  %zmm24
+#define Mps %zmm25
+#define Mm  %zmm26
+#define Mms %zmm27
+    // Multiplier applied to the component index in the *DUP offsets below.
+#define N 8
+    // Byte stride between the matrix elements of consecutive SIMD lanes:
+    // s advances by LLs per lane and the matrix index is LLs*s+s1.
+    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	int lex=s2+LLs*site;
+	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t)&psi[lex];
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	  // NOTE(review): the asm statements below name r8/r9/r10 and the zmm
+	  // registers directly without clobber lists, and the accumulators
+	  // must survive between statements; confirm this is safe with the
+	  // compilers in use.
+	  if ( (s2+l)==0 ) {
+	    // First term of row s1: the VMULIDUP forms (re)initialise Chi_*.
+	    LOAD64(%r8,a0);
+	    LOAD64(%r9,a1);
+	    LOAD64(%r10,a2);
+	    asm (
+		 VLOAD(0,%r8,Mp)// i r
+		 VLOAD(0,%r9,Mm)
+		 VSHUF(Mp,Mps)  // r i 
+		 VSHUF(Mm,Mms)
+		 VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
+		 VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
+
+		 VMULIDUP(0*N,%r10,Mps,Chi_00)
+		 VMULIDUP(1*N,%r10,Mps,Chi_01)
+		 VMULIDUP(2*N,%r10,Mps,Chi_02)
+		 VMULIDUP(3*N,%r10,Mps,Chi_10)
+		 VMULIDUP(4*N,%r10,Mps,Chi_11)
+		 VMULIDUP(5*N,%r10,Mps,Chi_12)
+
+		 VMULIDUP(6*N ,%r10,Mms,Chi_20)
+		 VMULIDUP(7*N ,%r10,Mms,Chi_21)
+		 VMULIDUP(8*N ,%r10,Mms,Chi_22)
+		 VMULIDUP(9*N ,%r10,Mms,Chi_30)
+		 VMULIDUP(10*N,%r10,Mms,Chi_31)
+		 VMULIDUP(11*N,%r10,Mms,Chi_32)
+
+		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
+		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
+		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+		 VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
+		 VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
+		 VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
+		 VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
+		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+		 );
+	  } else { 
+	    // Remaining terms: accumulate into Chi_*.
+	    LOAD64(%r8,a0);
+	    LOAD64(%r9,a1);
+	    LOAD64(%r10,a2);
+	    asm (
+		 VLOAD(0,%r8,Mp)
+		 VSHUF(Mp,Mps)
+
+		 VLOAD(0,%r9,Mm)
+		 VSHUF(Mm,Mms)
+
+		 VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
+		 VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
+		 VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
+		 VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
+		 VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
+		 VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
+
+		 VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
+		 VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
+		 VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
+		 VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
+		 VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
+		 VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
+
+		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
+		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
+		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+		 VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
+		 VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
+		 VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
+		 VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
+		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+		 );
+	  }
+	  // Advance to the next lane: next matrix element, next scalar of psi.
+	  a0 = a0+incr;
+	  a1 = a1+incr;
+	  a2 = a2+sizeof(typename Simd::scalar_type);
+	}}
+      {
+	// Row s1 is complete: store the twelve accumulators to chi.
+	int lexa = s1+LLs*site;
+	/*
+	  SiteSpinor tmp;
+	  asm (
+	  VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	  VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	  VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	  VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	  : : "r" ((uint64_t)&tmp) : "memory" );
+	*/
+
+	asm (
+	     VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	     VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	     VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	     VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+	//      if ( 1 || (site==0) ) { 
+	//	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
+	//      }
+      }
+    }
+  }
+  // Undefine every macro defined above so that none of them (in particular
+  // the very generic names N, Mp and Mm) leaks into code that follows.
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef pChi_00
+#undef pChi_01
+#undef pChi_02
+#undef pChi_10
+#undef pChi_11
+#undef pChi_12
+#undef pChi_20
+#undef pChi_21
+#undef pChi_22
+#undef pChi_30
+#undef pChi_31
+#undef pChi_32
+
+#undef BCAST_00
+#undef SHUF_00
+#undef BCAST_01
+#undef SHUF_01
+#undef BCAST_02
+#undef SHUF_02
+#undef BCAST_10
+#undef SHUF_10
+#undef BCAST_11
+#undef SHUF_11
+#undef BCAST_12
+#undef SHUF_12
+
+#undef Mp
+#undef Mps
+#undef Mm
+#undef Mms
+#undef N
+
+#endif
+}
+
+
+// MooeeInternal: dense-matrix driver.  Picks the pair of s-space matrices
+// selected by (dag, inv) and applies it to every block of LLs outer sites,
+// through MooeeInternalZAsm when switcheroo<Coeff_t>::iscomplex() is true and
+// through MooeeInternalAsm otherwise.
+//   psi : input field
+//   chi : output field; its checkerboard is set to psi's
+//   dag : non-zero selects the daggered matrices
+//   inv : non-zero selects the precomputed inverse matrices
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  typedef Vector<iSinglet<Simd> > MatVec;
+
+  EnableIf<Impl::LsVectorised,int> sfinae=0;
+  chi.Checkerboard()=psi.Checkerboard();
+
+  const int Ls  = this->Ls;
+  const int LLs = psi.Grid()->_rdimensions[0];
+  const int vol = psi.Grid()->oSites()/LLs;  // number of LLs-sized blocks
+
+  // Scratch storage, filled only on the non-inverse path.
+  MatVec Matp;
+  MatVec Matm;
+  MatVec *pMatp;
+  MatVec *pMatm;
+
+  if ( !inv ) {
+    // NOTE(review): elsewhere in this change set the definition of
+    // MooeeInternalCompute in CayleyFermion5DImplementation.h is wrapped in
+    // "#if 0"; confirm this path still builds and is still needed.
+    MooeeInternalCompute(dag,inv,Matp,Matm);
+    pMatp = &Matp;
+    pMatm = &Matm;
+  } else if ( dag ) {
+    pMatp = &MatpInvDag;
+    pMatm = &MatmInvDag;
+  } else {
+    pMatp = &MatpInv;
+    pMatm = &MatmInv;
+  }
+  assert(pMatp->size()==Ls*LLs);
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+  if ( switcheroo<Coeff_t>::iscomplex() ) {
+    thread_loop( (auto site=0;site<vol;site++),{
+      MooeeInternalZAsm(psi,chi,LLs,site,*pMatp,*pMatm);
+    });
+  } else {
+    thread_loop( (auto site=0;site<vol;site++),{
+      MooeeInternalAsm(psi,chi,LLs,site,*pMatp,*pMatm);
+    });
+  }
+  MooeeInvTime+=usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -0,0 +1,321 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Thin wrapper: forwards to SetCoefficientsZolotarev with upper bound 1/scale.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
+{
+  const RealD zolo_hi = 1.0/scale;
+  this->SetCoefficientsZolotarev(zolo_hi,zdata);
+}
+// Fill the continued-fraction coefficient tables from a zolotarev_data record.
+//   zolo_hi : its reciprocal is stored in ZoloHiInv and scales dw_diag
+//   zdata   : supplies beta[0..Ls-1]; zdata->db must equal Ls
+// Sets R, Beta, cc, cc_d, sqrt_cc, ZoloHiInv, dw_diag, Aee and See, then logs
+// Beta/Aee/See for every s.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
+{
+  // How to check Ls matches?? (only zdata->db is compared against Ls here)
+  int Ls = this->Ls;
+  assert(zdata->db==Ls);// Beta has Ls coeffs
+
+  R=(1+this->mass)/(1-this->mass);
+
+  Beta.resize(Ls);
+  cc.resize(Ls);
+  cc_d.resize(Ls);
+  sqrt_cc.resize(Ls);
+
+  for(int s=0; s<Ls; ++s){
+    Beta[s] = zdata->beta[s];
+    cc[s]   = 1.0/Beta[s];
+    cc_d[s] = std::sqrt(cc[s]);
+  }
+  cc_d[Ls-1]=1.0; // last entry is overridden with unity
+
+  // Off-diagonal factors; entry Ls-2 uses cc[Ls-2] alone.
+  for(int s=0; s+2<Ls; ++s){
+    sqrt_cc[s] = std::sqrt(cc[s]*cc[s+1]);
+  }
+  sqrt_cc[Ls-2]=std::sqrt(cc[Ls-2]);
+
+  ZoloHiInv =1.0/zolo_hi;
+  dw_diag = (4.0-this->M5)*ZoloHiInv;
+
+  // Diagonal Aee with alternating sign, plus R on the last slice.
+  See.resize(Ls);
+  Aee.resize(Ls);
+  for(int s=0, sign=1; s<Ls; ++s, sign=-sign){
+    Aee[s] = sign * Beta[s] * dw_diag;
+  }
+  Aee[Ls-1] += R;
+
+  // Recurrence See[s] = Aee[s] - 1/See[s-1], seeded with See[0] = Aee[0].
+  See[0] = Aee[0];
+  for(int s=1; s<Ls; ++s){
+    See[s] = Aee[s] - 1.0/See[s-1];
+  }
+
+  for(int s=0; s<Ls; ++s){
+    std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
+  }
+}
+
+
+
+// Apply the full operator: chi = M psi. Returns norm2(chi).
+// DW(psi) is computed once into a temporary, then each s-slice of chi is
+// assembled from the gamma5-multiplied DW term plus neighbouring-slice terms.
+template<class Impl>
+RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
+{
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+
+  FermionField D(psi.Grid());
+  this->DW(psi,D,DaggerNo);
+
+  for(int s=0, sign=1; s<Ls; s++, sign=-sign){
+    const bool first = (s==0);
+    if ( first || s!=last ) {
+      // Multiplies Dw by G5 so Hw; couples to slice s+1.
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
+      // Every slice except the first also couples to slice s-1.
+      if ( !first ) {
+        axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+      }
+    } else {
+      // Last slice: no cc factor, and an extra R*gamma5*psi contribution.
+      RealD massRatio=(1.0+mass)/(1.0-mass);
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
+      ag5xpby_ssp(chi,massRatio,psi,1.0,chi,s,s);
+    }
+  }
+  return norm2(chi);
+}
+// Adjoint operator. Per the original author's note the matrix is already
+// hermitian ((g5 Dw) = Dw^dag g5 = (g5 Dw)^dag, the rest is symmetric), so this
+// simply forwards to M and returns its result.
+template<class Impl>
+RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
+{
+  return this->M(psi,chi);
+}
+// Single-direction hopping term: chi = DhopDir(psi,dir,disp), then each s-slice
+// of chi is multiplied in place by gamma5 and a slice-dependent coefficient.
+template<class Impl>
+void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  const int Ls = this->Ls;
+
+  this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+
+  for(int s=0, sign=1; s<Ls; s++, sign=-sign){
+    if ( s!=(Ls-1) ){
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the alternating sign.
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
+    }
+  }
+}
+// Checkerboard-changing part: applies the 4d Dslash matching the parity of psi,
+// then rescales every s-slice of chi in place with gamma5 and its coefficient.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+{
+  const int Ls = this->Ls;
+
+  // Apply 4d dslash. Dslash on diagonal. g5 Dslash is hermitian
+  if ( psi.Checkerboard() != Odd ) {
+    this->DhopOE(psi,chi,DaggerNo);
+  } else {
+    this->DhopEO(psi,chi,DaggerNo);
+  }
+
+  for(int s=0, sign=1; s<Ls; s++, sign=-sign){
+    if ( s!=(Ls-1) ){
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the alternating sign.
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
+    }
+  }
+}
+// Adjoint of Meooe; implemented by calling Meooe unchanged.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+{
+  Meooe(psi,chi);
+}
+// Checkerboard-diagonal part. Same slice structure as M, but the gamma5 term
+// acts on psi itself with coefficient dw_diag in place of ZoloHiInv.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+{
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+
+  for(int s=0, sign=1; s<Ls; s++, sign=-sign){
+    const bool first = (s==0);
+    if ( first || s!=last ) {
+      // Multiplies Dw by G5 so Hw; couples to slice s+1.
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
+      if ( !first ) {
+        axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+      }
+    } else {
+      // Drop the CC here; add the R*gamma5*psi term on the last slice.
+      double massRatio=(1+mass)/(1-mass);
+      ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
+      ag5xpby_ssp(chi,massRatio,psi,1.0,chi,s,s);
+    }
+  }
+}
+
+// Adjoint of Mooee; implemented by calling Mooee unchanged.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+{
+  Mooee(psi,chi);
+}
+// Inverse of the checkerboard-diagonal part, applied as three sweeps over the
+// fifth dimension (Linv, Dinv, Uinv) using the See and cc_d tables.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+
+  // Apply Linv: forward sweep, slice s picks up slice s-1 of chi.
+  axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0);
+  for(int s=1; s<=last; s++){
+    axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
+  }
+
+  // Apply Dinv: in-place gamma5 scaling by 1/See[s].
+  for(int s=0; s<=last; s++){
+    ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
+  }
+
+  // Apply Uinv = (Linv)^T: backward sweep, slice s picks up slice s+1 of chi.
+  axpby_ssp(chi,1.0/cc_d[last],chi,0.0,chi,last,last);
+  for(int s=last-1; s>=0; s--){
+    axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
+  }
+}
+// Adjoint of MooeeInv; implemented by calling MooeeInv unchanged.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  MooeeInv(psi,chi);
+}
+
+// force terms; five routines; default to Dhop on diagonal
+// Force term for the full operator: builds D from U by the per-slice gamma5
+// scaling, then calls DhopDeriv(mat,D,V,DaggerNo). The dag argument is not read.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls = this->Ls;
+
+  FermionField D(V.Grid());
+
+  for(int s=0, sign=1; s<Ls; s++, sign=-sign){
+    if ( s!=(Ls-1) ){
+      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the alternating sign.
+      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDeriv(mat,D,V,DaggerNo);
+}
+// Force term, odd-even variant: builds D from U by the per-slice gamma5 scaling,
+// then calls DhopDerivOE(mat,D,V,DaggerNo). The dag argument is not read.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls = this->Ls;
+
+  FermionField D(V.Grid());
+
+  for(int s=0, sign=1; s<Ls; s++, sign=-sign){
+    if ( s!=(Ls-1) ){
+      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the alternating sign.
+      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDerivOE(mat,D,V,DaggerNo);
+}
+// Force term, even-odd variant: builds D from U by the per-slice gamma5 scaling,
+// then calls DhopDerivEO(mat,D,V,DaggerNo). The dag argument is not read.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls = this->Ls;
+
+  FermionField D(V.Grid());
+
+  for(int s=0, sign=1; s<Ls; s++, sign=-sign){
+    if ( s!=(Ls-1) ){
+      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the alternating sign.
+      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDerivEO(mat,D,V,DaggerNo);
+}
+    
+// Constructors
+// Forwards the gauge field, the four grids, M5 and the implementation parameters
+// to WilsonFermion5D, stores the mass, and asserts that Ls is odd.
+template<class Impl>
+ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(GaugeField &_Umu,
+                                                             GridCartesian         &FiveDimGrid,
+                                                             GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                             GridCartesian         &FourDimGrid,
+                                                             GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                             RealD _mass,RealD M5,const ImplParams &p)
+  : WilsonFermion5D<Impl>(_Umu,
+                          FiveDimGrid, FiveDimRedBlackGrid,
+                          FourDimGrid, FourDimRedBlackGrid,M5,p),
+    mass(_mass)
+{
+  assert((this->Ls&0x1)==1); // Odd Ls required
+}
+
+    // Copy one slice of the 5d solution into a 4d field.
+    // Checks both fields live on the expected grids, then calls ExtractSlice
+    // with Ls-1 for both trailing integer arguments.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      conformable(solution5d.Grid(),this->FermionGrid());
+      conformable(exported4d.Grid(),this->GaugeGrid());
+
+      const int lastSlice = this->Ls-1;
+      ExtractSlice(exported4d, solution5d, lastSlice, lastSlice);
+    }
+    // Build a 5d source from a 4d one: insert the 4d field into an otherwise
+    // zero 5d temporary (InsertSlice with Ls-1 for both trailing integer
+    // arguments), multiply by gamma5, and apply Dminus into imported5d.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      conformable(imported5d.Grid(),this->FermionGrid());
+      conformable(input4d.Grid()   ,this->GaugeGrid());
+
+      const int lastSlice = this->Ls-1;
+      FermionField tmp(this->FermionGrid());
+      tmp=Zero();
+      InsertSlice(input4d, tmp, lastSlice, lastSlice);
+      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
+      this->Dminus(tmp,imported5d);
+    }
+
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@@ -0,0 +1,227 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME -- make a version of these routines with site loop outermost for cache reuse.
+// Pminus forwards
+// Pplus  backwards..
+// Fifth-dimension stencil:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_-(psi[s+1]) + lower[s]*P_+(psi[s-1])
+// with the s index wrapping modulo Ls. phi and psi must share a checkerboard;
+// chi inherits it. Updates the M5D call counter and timer.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
+                                      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  int Ls = this->Ls;
+  GridBase* grid = psi_i.Grid();
+  auto phi = phi_i.View();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // One work item per group of Ls consecutive outer sites.
+  auto nloop=grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    auto base=sss*Ls; // first outer-site index of this group
+    for(int s=0; s<Ls; s++){
+      uint64_t above = base+((s+1)%Ls);
+      uint64_t below = base+((s+Ls-1)%Ls);
+      spinor hopUp;
+      spinor hopDn;
+      spProj5m(hopUp, psi(above));
+      spProj5p(hopDn, psi(below));
+      coalescedWrite(chi[base+s], diag[s]*phi(base+s) + upper[s]*hopUp + lower[s]*hopDn);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Daggered fifth-dimension stencil:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_+(psi[s+1]) + lower[s]*P_-(psi[s-1])
+// with the s index wrapping modulo Ls (projectors swapped relative to M5D).
+// phi and psi must share a checkerboard; chi inherits it. Updates the M5D call
+// counter and timer.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
+                                         Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  int Ls = this->Ls;
+
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // One work item per group of Ls consecutive outer sites.
+  auto nloop=grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    auto base=sss*Ls; // first outer-site index of this group
+    for(int s=0; s<Ls; s++){
+      uint64_t above = base+((s+1)%Ls);
+      uint64_t below = base+((s+Ls-1)%Ls);
+      spinor hopUp;
+      spinor hopDn;
+      spProj5p(hopUp, psi(above));
+      spProj5m(hopDn, psi(below));
+      coalescedWrite(chi[base+s], diag[s]*phi(base+s) + upper[s]*hopUp + lower[s]*hopDn);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Apply the inverse of the checkerboard-diagonal operator: chi = Mooee^{-1} psi.
+// The inverse is applied in place on chi as a sequence of sweeps over the fifth
+// dimension, using the precomputed factor tables lee, leem, dee, ueem and uee.
+// chi takes the checkerboard of psi. Updates the MooeeInv call counter and timer.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi=psi_i.View();
+  auto chi=chi_i.View();
+  int Ls = this->Ls;
+
+  // Raw pointers to the coefficient tables, taken outside the loop body so the
+  // body indexes plain pointers rather than the member containers.
+  auto plee  = & this->lee[0];
+  auto pdee  = & this->dee[0];
+  auto puee  = & this->uee[0];
+
+  auto pleem = & this->leem[0];
+  auto pueem = & this->ueem[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+  // One work item per group of Ls consecutive outer sites.
+  uint64_t nloop=grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    auto ss=sss*Ls; // first outer-site index of this group
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2;
+
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+    // Apply (L^{\prime})^{-1}: forward sweep, slice s subtracts lee[s-1]*P_+ of
+    // the already-updated slice s-1.
+    coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0]
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp1, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
+    }
+
+    // L_m^{-1}: accumulate -leem[s]*P_- chi[s] from every earlier slice into the
+    // last slice.
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
+    }
+
+    // U_m^{-1} D^{-1}: scale slices 0..Ls-2 by 1/dee[s] and subtract the P_+
+    // part of the last slice weighted by ueem[s]/dee[Ls].
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp1, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls])*tmp1);
+    }
+    // Last slice: tmp1 still holds P_+ chi[Ls-1] from the loop above; the P_+
+    // part is divided by dee[Ls] and the P_- part by dee[Ls-1].
+    spProj5m(tmp2, chi(ss+Ls-1));
+    coalescedWrite(chi[ss+Ls-1],(1.0/pdee[Ls])*tmp1 + (1.0/pdee[Ls-1])*tmp2);
+
+    // Apply U^{-1}: backward sweep, slice s subtracts uee[s]*P_- of slice s+1.
+    for(int s=Ls-2; s>=0; s--){
+      spProj5m(tmp1, chi(ss+s+1));
+      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp1);
+    }
+  });
+  this->MooeeInvTime += usecond();
+}
+
+// Apply the adjoint inverse of the checkerboard-diagonal operator:
+// chi = (Mooee^{-1})^dagger psi.
+// Works in place on chi as a sequence of sweeps over the fifth dimension, using
+// complex-conjugated copies of the factor tables uee, dee, lee, ueem and leem.
+// chi takes the checkerboard of psi. Updates the MooeeInv call counter and timer.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  int Ls = this->Ls;
+
+  // Input and output views must agree on checkerboard (the previous check
+  // compared psi with itself and could never fail).
+  assert(psi.Checkerboard() == chi.Checkerboard());
+
+  // Conjugated coefficient tables; dee has one extra entry at index Ls.
+  Vector<Coeff_t> ueec(Ls);
+  Vector<Coeff_t> deec(Ls+1);
+  Vector<Coeff_t> leec(Ls);
+  Vector<Coeff_t> ueemc(Ls);
+  Vector<Coeff_t> leemc(Ls);
+
+  for(int s=0; s<Ls; s++){
+    ueec[s]  = conjugate(this->uee[s]);
+    deec[s]  = conjugate(this->dee[s]);
+    leec[s]  = conjugate(this->lee[s]);
+    ueemc[s] = conjugate(this->ueem[s]);
+    leemc[s] = conjugate(this->leem[s]);
+  }
+  deec[Ls] = conjugate(this->dee[Ls]);
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+  // One work item per group of Ls consecutive outer sites.
+  auto nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2;
+    auto ss=sss*Ls; // first outer-site index of this group
+
+    // Apply (U^{\prime})^{-dagger}: forward sweep, slice s subtracts
+    // conj(uee[s-1])*P_- of the already-updated slice s-1.
+    coalescedWrite(chi[ss], psi(ss));
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp1, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - ueec[s-1]*tmp1);
+    }
+
+    // U_m^{-\dagger}: accumulate -conj(ueem[s])*P_+ chi[s] into the last slice.
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - ueemc[s]*tmp1);
+    }
+
+    // L_m^{-\dagger} D^{-dagger}: scale slices 0..Ls-2 and subtract the P_- part
+    // of the last slice weighted by conj(leem[s])/conj(dee[Ls-1]).
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp1, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s] ,(1.0/deec[s])*chi(ss+s) - (leemc[s]/deec[Ls-1])*tmp1);
+    }
+    // Last slice: tmp1 still holds P_- chi[Ls-1] from the loop above.
+    spProj5p(tmp2, chi(ss+Ls-1));
+    coalescedWrite(chi[ss+Ls-1], (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2);
+
+    // Apply L^{-dagger}: backward sweep, slice s subtracts conj(lee[s])*P_+ of
+    // slice s+1.
+    for(int s=Ls-2; s>=0; s--){
+      spProj5p(tmp1, chi(ss+s+1));
+      coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
@@ -0,0 +1,321 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#pragma once
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Forwards all arguments to AbstractEOFAFermion (with the trailing constants
+// 1.0 and 0.0), then installs the coefficients via SetCoefficientsTanh using a
+// temporary Approx::higham record, which is released before returning.
+template<class Impl>
+DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(GaugeField            &_Umu,
+                                                   GridCartesian         &FiveDimGrid,
+                                                   GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                   GridCartesian         &FourDimGrid,
+                                                   GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                   RealD _mq1, RealD _mq2, RealD _mq3,
+                                                   RealD _shift, int _pm, RealD _M5, const ImplParams &p)
+  : AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+                              FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
+                              _shift, _pm, _M5, 1.0, 0.0, p)
+{
+  RealD eps = 1.0;
+  Approx::zolotarev_data *approx = Approx::higham(eps,this->Ls);
+  assert(approx->n == this->Ls);
+
+  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
+
+  this->SetCoefficientsTanh(approx, 1.0, 0.0);
+  Approx::zolotarev_free(approx);
+}
+
+/***************************************************************
+ * Additional EOFA operators only called outside the inverter.
+ * Since speed is not essential, simple axpby-style
+ * implementations should be fine.
+ ***************************************************************/
+// Din is zeroed, then a single axpby_ssp call (coefficients 0.0 and 1.0 on psi)
+// is issued whose two trailing slice indices depend on (sign, dag):
+//   sign=+1, dag=0 : (Ls-1, 0)
+//   sign=+1, dag=1 : (0, Ls-1)
+//   sign=-1, dag=0 or 1 : (0, 0)
+// Any other combination leaves Din zero.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+{
+  const int last = this->Ls-1;
+
+  Din = Zero();
+  switch(sign){
+  case 1:
+    if(dag == 0)     { axpby_ssp(Din, 0.0, psi, 1.0, psi, last, 0); }
+    else if(dag == 1){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, last); }
+    break;
+  case -1:
+    if((dag == 0) || (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
+    break;
+  default:
+    break;
+  }
+}
+
+// Dtilde is the identity for DWF: the output is a plain copy of the input.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
+{
+  chi = psi;
+}
+
+// The inverse of Dtilde is likewise the identity for DWF: chi is a copy of psi.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
+{
+  chi = psi;
+}
+
+/*****************************************************************************************************/
+
+// Full operator: chi = M psi, returns norm2(chi).
+// Sequence: Meooe5D on psi, DW on the result, add psi, then the two-argument
+// M5D(psi,chi) adds the fifth-dimension hopping terms onto chi.
+template<class Impl>
+RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+{
+  FermionField hop5d(psi.Grid());
+
+  this->Meooe5D(psi, hop5d);
+  this->DW(hop5d, chi, DaggerNo);
+  axpby(chi, 1.0, 1.0, chi, psi);
+  this->M5D(psi, chi);
+
+  return norm2(chi);
+}
+
+// Adjoint operator: chi = M^dagger psi, returns norm2(chi).
+// Sequence: daggered DW on psi, MeooeDag5D on the result, the two-argument
+// M5Ddag(psi,chi) adds the fifth-dimension hopping terms, then psi is added.
+template<class Impl>
+RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+{
+  FermionField dwDag(psi.Grid());
+
+  this->DW(psi, dwDag, DaggerYes);
+  this->MeooeDag5D(dwDag, chi);
+  this->M5Ddag(psi, chi);
+  axpby(chi, 1.0, 1.0, chi, psi);
+
+  return norm2(chi);
+}
+
+/********************************************************************
+ * Performance critical fermion operators called inside the inverter
+ ********************************************************************/
+
+// Two-argument M5D: builds the diag/upper/lower coefficient vectors and calls
+// the six-argument M5D with chi as both the diagonal input and the output.
+// diag is all 1.0; upper and lower are -1.0 except for the wrap-around entries
+// upper[Ls-1] = mq1 + shiftm and lower[0] = mq1 + shiftp.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+{
+  const int   Ls    = this->Ls;
+  const int   pm    = this->pm;
+  const RealD shift = this->shift;
+  const RealD mq1   = this->mq1;
+  const RealD mq2   = this->mq2;
+  const RealD mq3   = this->mq3;
+
+  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+  // Only one of the two is non-zero, chosen by pm; both stay zero when shift==0.
+  Coeff_t shiftp(0.0);
+  Coeff_t shiftm(0.0);
+  if(shift != 0.0){
+    if(pm == 1){
+      shiftp = shift*(mq3-mq2);
+    } else {
+      shiftm = -shift*(mq3-mq2);
+    }
+  }
+
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = mq1 + shiftm;
+  lower[0]    = mq1 + shiftp;
+
+#if(0)
+  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
+  for(int i=0; i<diag.size(); ++i){
+    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
+  }
+  for(int i=0; i<upper.size(); ++i){
+    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
+  }
+  for(int i=0; i<lower.size(); ++i){
+    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
+  }
+#endif
+
+  this->M5D(psi, chi, chi, lower, diag, upper);
+}
+
+// Two-argument M5Ddag: builds the diag/upper/lower coefficient vectors and calls
+// the six-argument M5Ddag with chi as both the diagonal input and the output.
+// Relative to M5D the shift terms are swapped between the wrap-around entries:
+// upper[Ls-1] = mq1 + shiftp and lower[0] = mq1 + shiftm.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+{
+  const int   Ls    = this->Ls;
+  const int   pm    = this->pm;
+  const RealD shift = this->shift;
+  const RealD mq1   = this->mq1;
+  const RealD mq2   = this->mq2;
+  const RealD mq3   = this->mq3;
+
+  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+  // Only one of the two is non-zero, chosen by pm; both stay zero when shift==0.
+  Coeff_t shiftp(0.0);
+  Coeff_t shiftm(0.0);
+  if(shift != 0.0){
+    if(pm == 1){
+      shiftp = shift*(mq3-mq2);
+    } else {
+      shiftm = -shift*(mq3-mq2);
+    }
+  }
+
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = mq1 + shiftp;
+  lower[0]    = mq1 + shiftm;
+
+  this->M5Ddag(psi, chi, chi, lower, diag, upper);
+}
+
+// half checkerboard operations
+// Checkerboard-diagonal operator, expressed through the six-argument M5D with
+// psi as both inputs: diag = bee, off-diagonals = -cee[s], and wrap-around
+// entries upper[Ls-1] = dm, lower[0] = dp.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+{
+  const int Ls = this->Ls;
+
+  Vector<Coeff_t> diag = this->bee;
+
+  // Both off-diagonals start out as -cee; build one and copy it.
+  Vector<Coeff_t> upper(Ls);
+  for(int s=0; s<Ls; s++){
+    upper[s] = -this->cee[s];
+  }
+  Vector<Coeff_t> lower(upper);
+
+  upper[Ls-1] = this->dm;
+  lower[0]    = this->dp;
+
+  this->M5D(psi, psi, chi, lower, diag, upper);
+}
+
+// Adjoint checkerboard-diagonal operator, expressed through the six-argument
+// M5Ddag with psi as both inputs: diag = bee, off-diagonals = -cee[s], and
+// wrap-around entries upper[Ls-1] = dp, lower[0] = dm (swapped relative to Mooee).
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+{
+  const int Ls = this->Ls;
+
+  Vector<Coeff_t> diag = this->bee;
+
+  // Both off-diagonals start out as -cee; build one and copy it.
+  Vector<Coeff_t> upper(Ls);
+  for(int s=0; s<Ls; s++){
+    upper[s] = -this->cee[s];
+  }
+  Vector<Coeff_t> lower(upper);
+
+  upper[Ls-1] = this->dp;
+  lower[0]    = this->dm;
+
+  this->M5Ddag(psi, psi, chi, lower, diag, upper);
+}
+
+/****************************************************************************************/
+
+//Zolo
+// (Re)compute every coefficient table used by this operator from the current
+// Ls, M5, mq1, mq2, mq3, shift and pm members:
+//   - bs/cs, aee/bee/cee, aeo/beo/ceo are resized to Ls; bee, cee, aee, bs,
+//     beo, cs and ceo are filled with constants (aeo is resized but not
+//     assigned here),
+//   - dp/dm: the wrap-around (boundary) couplings including the EOFA shift,
+//   - lee/leem/dee/uee/ueem: factors consumed by MooeeInv / MooeeInvDag.
+// NOTE(review): the parameters zolo_hi, gamma, b and c are not referenced
+// anywhere in this body.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
+{
+  int   Ls    = this->Ls;
+  int   pm    = this->pm;
+  RealD mq1   = this->mq1;
+  RealD mq2   = this->mq2;
+  RealD mq3   = this->mq3;
+  RealD shift = this->shift;
+
+  ////////////////////////////////////////////////////////
+  // Constants for the preconditioned matrix Cayley form
+  ////////////////////////////////////////////////////////
+  this->bs.resize(Ls);
+  this->cs.resize(Ls);
+  this->aee.resize(Ls);
+  this->aeo.resize(Ls);
+  this->bee.resize(Ls);
+  this->beo.resize(Ls);
+  this->cee.resize(Ls);
+  this->ceo.resize(Ls);
+
+  // Diagonal bee = 5 - M5 and off-diagonal cee = 1 on every slice.
+  for(int i=0; i<Ls; ++i){
+    this->bee[i] = 4.0 - this->M5 + 1.0;
+    this->cee[i] = 1.0;
+  }
+
+  for(int i=0; i<Ls; ++i){
+    this->aee[i] = this->cee[i];
+    this->bs[i] = this->beo[i] = 1.0;
+    this->cs[i] = this->ceo[i] = 0.0;
+  }
+
+  //////////////////////////////////////////
+  // EOFA shift terms
+  //////////////////////////////////////////
+  // The shift*(mq3-mq2) term enters dp when pm==+1, dm (with a minus sign) when
+  // pm==-1, and neither for any other pm.
+  if(pm == 1){
+    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
+    this->dm = mq1*this->cee[Ls-1];
+  } else if(this->pm == -1) {
+    this->dp = mq1*this->cee[0];
+    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
+  } else {
+    this->dp = mq1*this->cee[0];
+    this->dm = mq1*this->cee[Ls-1];
+  }
+
+  //////////////////////////////////////////
+  // LDU decomposition of eeoo
+  //////////////////////////////////////////
+  // dee has Ls+1 entries: the last two (Ls-1 and Ls) are filled after the loop.
+  this->dee.resize(Ls+1);
+  this->lee.resize(Ls);
+  this->leem.resize(Ls);
+  this->uee.resize(Ls);
+  this->ueem.resize(Ls);
+
+  for(int i=0; i<Ls; ++i){
+
+    if(i < Ls-1){
+
+      this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
+
+      // leem[i] = dm/bee[i] * prod_{j<i} aee[j]/bee[j]
+      this->leem[i] = this->dm/this->bee[i];
+      for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
+
+      this->dee[i] = this->bee[i];
+
+      this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
+
+      // ueem[i] = dp/bee[0] * prod_{1<=j<=i} cee[j]/bee[j]
+      this->ueem[i] = this->dp / this->bee[0];
+      for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
+
+    } else {
+
+      // Last slice: the off-diagonal factors are zero.
+      this->lee[i]  = 0.0;
+      this->leem[i] = 0.0;
+      this->uee[i]  = 0.0;
+      this->ueem[i] = 0.0;
+
+    }
+  }
+
+  {
+    // delta_d = 1/bee[0] * prod_{1<=j<=Ls-2} cee[j]/bee[j]; it feeds the two
+    // last-slice diagonal entries, dee[Ls-1] (with dm) and dee[Ls] (with dp).
+    Coeff_t delta_d = 1.0 / this->bee[0];
+    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
+    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
+    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
+  }
+}
+
+// Recompute Cayley-form coefficients for different shift
+// Store a new shift value and recompute all coefficients for it.
+//   new_shift : replaces this->shift before the coefficients are rebuilt
+// The temporary Approx::higham record is released once SetCoefficientsTanh has
+// consumed it, mirroring the constructor; previously it was leaked on every call.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+{
+  this->shift = new_shift;
+  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
+  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
+  Approx::zolotarev_free(zdata);
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -24,22 +24,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
 #include <Grid/perfmon/PerfCount.h>

-namespace Grid {
-namespace QCD {
-  
-// S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> 
-ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
-const std::vector<int> 
-ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+#pragma once

-  // 5d lattice for DWF.
+NAMESPACE_BEGIN(Grid);
+
+// 5d lattice for DWF.
 template<class Impl>
 ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
 							     GridRedBlackCartesian &FiveDimRedBlackGrid,
@@ -53,9 +48,9 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements),
-  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements,p),
+  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
  mass(_mass),
  c1(_c1),
  c2(_c2),
@@ -108,8 +103,8 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

    for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]=1);
-      assert(FourDimRedBlackGrid._simd_layout[d]=1);
+      assert(FourDimGrid._simd_layout[d]==1);
+      assert(FourDimRedBlackGrid._simd_layout[d]==1);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
    }

@@ -226,24 +221,27 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi

  Compressor compressor;
  Stencil.HaloExchange(in,compressor);
-
-  parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
+  auto Umu_v   = Umu.View();
+  auto UUUmu_v = UUUmu.View();
+  auto in_v    = in.View();
+  auto out_v   = out.View();
+  thread_for( ss,Umu.Grid()->oSites(),{
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
-      Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sF, sU, in, out, dir, disp);
+      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
    }
-  }
+  });
 };

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
-            DoubledGaugeField & U,
-            DoubledGaugeField & UUU,
-            GaugeField &mat,
-            const FermionField &A,
-            const FermionField &B,
-            int dag)
+						     DoubledGaugeField & U,
+						     DoubledGaugeField & UUU,
+						     GaugeField &mat,
+						     const FermionField &A,
+						     const FermionField &B,
+						     int dag)
 {
  // No force terms in multi-rhs solver staggered
  assert(0);
@@ -251,18 +249,18 @@ void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
-				      const FermionField &A,
-				      const FermionField &B,
-				      int dag)
+						 const FermionField &A,
+						 const FermionField &B,
+						 int dag)
 {
  assert(0);
 }

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
-					const FermionField &A,
-					const FermionField &B,
-					int dag)
+						   const FermionField &A,
+						   const FermionField &B,
+						   int dag)
 {
  assert(0);
 }
@@ -270,9 +268,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
-					const FermionField &A,
-					const FermionField &B,
-					int dag)
+						   const FermionField &A,
+						   const FermionField &B,
+						   int dag)
 {
  assert(0);
 }
@@ -301,8 +299,8 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &

  Compressor compressor; 

-  int LLs = in._grid->_rdimensions[0];
-  int len =  U._grid->oSites();
+  int LLs = in.Grid()->_rdimensions[0];
+  int len =  U.Grid()->oSites();

  DhopFaceTime-=usecond();
  st.Prepare();
@@ -328,7 +326,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
      double start = usecond();
      nthreads -= ncomms;
      int ttid  = tid - ncomms;
-      int n     = U._grid->oSites(); // 4d vol
+      int n     = U.Grid()->oSites(); // 4d vol
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int myblock, myn;
@@ -341,17 +339,22 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
      }

      // do the compute
+      auto   U_v  =   U.View();
+      auto UUU_v  = UUU.View();
+      auto  in_v  =  in.View();
+      auto out_v  = out.View();
+
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
-          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
        }
      }
        ptime = usecond() - start;
@@ -372,18 +375,23 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
+
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
+    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
-    }
+      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
+    });
  } else {
    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
+    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
-    }
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
+    });
  }
  DhopComputeTime2+=usecond();
 #else
@@ -398,7 +406,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 						    const FermionField &in, FermionField &out,int dag)
 {
  Compressor compressor;
-  int LLs = in._grid->_rdimensions[0];
+  int LLs = in.Grid()->_rdimensions[0];



@@ -410,16 +418,20 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
  if (dag == DaggerYes) {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
-    }
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
+    });
  } else {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
-    }
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
+    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
@@ -432,50 +444,17 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 }
 /*CHANGE END*/

-/* ORG
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-						    DoubledGaugeField & U,DoubledGaugeField & UUU,
-						    const FermionField &in, FermionField &out,int dag)
-{
-  Compressor compressor;
-  int LLs = in._grid->_rdimensions[0];
-
-
-
-  DhopTotalTime -= usecond();
-  DhopCommTime -= usecond();
-  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
-  
-  DhopComputeTime -= usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  if (dag == DaggerYes) {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
-    }
-  } else {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU=ss;
-	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
-    }
-  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-}
-*/


 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check

-  assert(in.checkerboard==Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard()==Even);
+  out.Checkerboard() = Odd;

  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
 }
@@ -483,11 +462,11 @@ template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check

-  assert(in.checkerboard==Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard()==Odd);
+  out.Checkerboard() = Even;

  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
 }
@@ -495,10 +474,10 @@ template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
-  conformable(in._grid,FermionGrid()); // verifies full grid
-  conformable(in._grid,out._grid);
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());

-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }
@@ -506,7 +485,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Report(void) 
 {
-  std::vector<int> latt = GridDefaultLatt();          
+  Coordinate latt = GridDefaultLatt();          
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _FourDimGrid->_Nprocessors;
  RealD NN = _FourDimGrid->NodeCount();
@@ -564,21 +543,21 @@ void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField
 }
 template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
@@ -586,7 +565,7 @@ void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionFiel
 }
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
@@ -595,27 +574,27 @@ void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionF

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(mass);
  out = scal * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  out = (1.0 / (mass)) * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
-                                      FermionField &out) {
-  out.checkerboard = in.checkerboard;
+						   FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in, out);
 }

@@ -624,31 +603,28 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
 ////////////////////////////////////////////////////////
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-                                                         PropagatorField &q_in_2,
-                                                         PropagatorField &q_out,
-                                                         Current curr_type,
-                                                         unsigned int mu)
+								PropagatorField &q_in_2,
+								PropagatorField &q_out,
+								Current curr_type,
+								unsigned int mu)
 {
-    assert(0);
+  assert(0);
 }

 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
-                                              PropagatorField &q_out,
-                                              Current curr_type,
-                                              unsigned int mu,
-                                              unsigned int tmin, 
+void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+							   PropagatorField &q_out,
+							   Current curr_type,
+							   unsigned int mu, 
+							   unsigned int tmin,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-    assert(0);
+  assert(0);

 }
-
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
-FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
  
-}}
+NAMESPACE_END(Grid);



--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -28,40 +28,35 @@ directory
 /*  END LEGAL */
 #include <Grid/Grid.h>

-namespace Grid {
-namespace QCD {
+#pragma once 

-const std::vector<int> 
-ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> 
-ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+NAMESPACE_BEGIN(Grid);

 /////////////////////////////////
 // Constructor and gauge import
 /////////////////////////////////

-
 template <class Impl>
 ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
 							 RealD _mass,
 							 RealD _c1, RealD _c2,RealD _u0,
 							 const ImplParams &p)
-    : Kernels(p),
-      _grid(&Fgrid),
-      _cbgrid(&Hgrid),
-      Stencil(&Fgrid, npoint, Even, directions, displacements),
-      StencilEven(&Hgrid, npoint, Even, directions, displacements),  // source is Even
-      StencilOdd(&Hgrid, npoint, Odd, directions, displacements),  // source is Odd
-      mass(_mass),
-      Lebesgue(_grid),
-      LebesgueEvenOdd(_cbgrid),
-      Umu(&Fgrid),
-      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid),
-      UUUmu(&Fgrid),
-      UUUmuEven(&Hgrid),
-      UUUmuOdd(&Hgrid) ,
-      _tmp(&Hgrid)
+  : Kernels(p),
+    _grid(&Fgrid),
+    _cbgrid(&Hgrid),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
+    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
+    mass(_mass),
+    Lebesgue(_grid),
+    LebesgueEvenOdd(_cbgrid),
+    Umu(&Fgrid),
+    UmuEven(&Hgrid),
+    UmuOdd(&Hgrid),
+    UUUmu(&Fgrid),
+    UUUmuEven(&Hgrid),
+    UUUmuOdd(&Hgrid) ,
+    _tmp(&Hgrid)
 {
  int vol4;
  int LLs=1;
@@ -85,17 +80,17 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
  ImportGauge(_Uthin,_Ufat);
 }

-  ////////////////////////////////////////////////////////////
-  // Momentum space propagator should be 
-  // https://arxiv.org/pdf/hep-lat/9712010.pdf
-  //
-  // mom space action.
-  //   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
-  //
-  // must track through staggered flavour/spin reduction in literature to 
-  // turn to free propagator for the one component chi field, a la page 4/5
-  // of above link to implmement fourier based solver.
-  ////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+// Momentum space propagator should be 
+// https://arxiv.org/pdf/hep-lat/9712010.pdf
+//
+// mom space action.
+//   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
+//
+// must track through staggered flavour/spin reduction in literature to 
+// turn to free propagator for the one component chi field, a la page 4/5
+// of above link to implement a Fourier-based solver.
+////////////////////////////////////////////////////////////
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
 {
@@ -177,21 +172,21 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const

 template <class Impl>
 RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
@@ -199,7 +194,7 @@ void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField
 }
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
@@ -208,27 +203,27 @@ void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionFie

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(mass);
  out = scal * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  out = (1.0 / (mass)) * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
-                                      FermionField &out) {
-  out.checkerboard = in.checkerboard;
+						 FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in, out);
 }

@@ -244,8 +239,8 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge

  Compressor compressor;

-  FermionField Btilde(B._grid);
-  FermionField Atilde(B._grid);
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());
  Atilde = A;

  st.HaloExchange(B, compressor);
@@ -255,10 +250,13 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
    ////////////////////////
    // Call the single hop
    ////////////////////////
-    PARALLEL_FOR_LOOP
-    for (int sss = 0; sss < B._grid->oSites(); sss++) {
-      Kernels::DhopDir(st, U, UUU, st.CommBuf(), sss, sss, B, Btilde, mu,1);
-    }
+    auto U_v   = U.View();
+    auto UUU_v = UUU.View();
+    auto B_v   = B.View();
+    auto Btilde_v   = Btilde.View();
+    thread_for(sss,B.Grid()->oSites(),{
+      Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
+    });

    // Force in three link terms
    //
@@ -288,11 +286,11 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {

-  conformable(U._grid, _grid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _grid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  mat.checkerboard = U.checkerboard;
+  mat.Checkerboard() = U.Checkerboard();

  DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
 }
@@ -300,13 +298,13 @@ void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionFie
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {

-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  assert(V.checkerboard == Even);
-  assert(U.checkerboard == Odd);
-  mat.checkerboard = Odd;
+  assert(V.Checkerboard() == Even);
+  assert(U.Checkerboard() == Odd);
+  mat.Checkerboard() = Odd;

  DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
 }
@@ -314,48 +312,51 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {

-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  assert(V.checkerboard == Odd);
-  assert(U.checkerboard == Even);
-  mat.checkerboard = Even;
+  assert(V.Checkerboard() == Odd);
+  assert(U.Checkerboard() == Even);
+  mat.Checkerboard() = Even;

  DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
+{
  DhopCalls+=2;
-  conformable(in._grid, _grid);  // verifies full grid
-  conformable(in._grid, out._grid);
+  conformable(in.Grid(), _grid);  // verifies full grid
+  conformable(in.Grid(), out.Grid());

-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
+{
  DhopCalls+=1;
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard() == Even);
+  out.Checkerboard() = Odd;

  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
+{
  DhopCalls+=1;
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard() == Odd);
+  out.Checkerboard() = Even;

  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
 }
@@ -370,11 +371,13 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel

  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
-
-  PARALLEL_FOR_LOOP
-  for (int sss = 0; sss < in._grid->oSites(); sss++) {
-    Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sss, sss, in, out, dir, disp);
-  }
+  auto Umu_v   =   Umu.View();
+  auto UUUmu_v = UUUmu.View();
+  auto in_v    =  in.View();
+  auto out_v   = out.View();
+  thread_for( sss, in.Grid()->oSites(),{
+    Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
+  });
 };

 template <class Impl>
@@ -400,7 +403,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 {
 #ifdef GRID_OMP
  Compressor compressor; 
-  int len =  U._grid->oSites();
+  int len =  U.Grid()->oSites();
  const int LLs =  1;

  DhopTotalTime   -= usecond();
@@ -439,17 +442,21 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
      }

      // do the compute
+      auto U_v   = U.View();
+      auto UUU_v = UUU.View();
+      auto in_v  = in.View();
+      auto out_v = out.View();
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); 
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
-          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
        }
      }
    } else {
@@ -464,17 +471,23 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  DhopFaceTime    -= usecond();

  DhopComputeTime2    -= usecond();
-  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
-    }
-  } else {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
+  {
+    auto U_v   = U.View();
+    auto UUU_v = UUU.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    if (dag == DaggerYes) {
+      int sz=st.surface_list.size();
+      thread_for(ss,sz,{
+	int sU = st.surface_list[ss];
+	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      });
+    } else {
+      int sz=st.surface_list.size();
+      thread_for(ss,sz,{
+	int sU = st.surface_list[ss];
+	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      });
    }
  }
  DhopComputeTime2    += usecond();
@@ -500,15 +513,19 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();

+  auto U_v   =   U.View();
+  auto UUU_v = UUU.View();
+  auto in_v  =  in.View();
+  auto out_v = out.View();
  DhopComputeTime -= usecond();
  if (dag == DaggerYes) {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
-    }
+    thread_for(sss, in.Grid()->oSites(),{
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    });
  } else {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
-    }
+    thread_for(sss, in.Grid()->oSites(),{
+      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
@@ -520,7 +537,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 template<class Impl>
 void ImprovedStaggeredFermion<Impl>::Report(void) 
 {
-  std::vector<int> latt = GridDefaultLatt();          
+  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
@@ -574,31 +591,25 @@ void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
 ////////////////////////////////////////////////////////
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-                                                        PropagatorField &q_in_2,
-                                                        PropagatorField &q_out,
-                                                        Current curr_type,
-                                                        unsigned int mu)
+							      PropagatorField &q_in_2,
+							      PropagatorField &q_out,
+							      Current curr_type,
+							      unsigned int mu)
 {
-    assert(0);
+  assert(0);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
-                                              PropagatorField &q_out,
-                                              Current curr_type,
-                                              unsigned int mu,
-                                              unsigned int tmin, 
+void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu, 
+                                                         unsigned int tmin,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-    assert(0);
+  assert(0);

 }

-
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
-
-  //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-  //TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-
-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@@ -0,0 +1,453 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// M5D (explicit coefficients): within each group of Ls consecutive sites, chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[(s+1)%Ls]) + lower[s]*spProj5p(psi[(s+Ls-1)%Ls]).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  // phi must carry the same checkerboard as psi (chi was given psi's checkerboard above)
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+  // One work item per group of Ls consecutive outer sites; ss is the first index of the group.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1;
+    spinor tmp2;
+    for(int s=0; s<Ls; s++){
+      uint64_t idx_u = ss+((s+1)%Ls);     // neighbour s+1, wrapping Ls-1 -> 0
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);  // neighbour s-1, wrapping 0 -> Ls-1
+      spProj5m(tmp1, psi(idx_u));
+      spProj5p(tmp2, psi(idx_l));
+      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+// M5D_shift: the M5D update plus shift_coeffs[s] times a projection of the single slice psi[shift_s], added to every slice s.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					Vector<Coeff_t> &shift_coeffs)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  // local copy of pm so the kernel body below does not go through this->
+  auto pm  = this->pm;
+  int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+  // One work item per group of Ls consecutive outer sites; ss is the first index of the group.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1;
+    spinor tmp2;
+    spinor tmp;
+    for(int s=0; s<Ls; s++){
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      spProj5m(tmp1, psi(idx_u));
+      spProj5p(tmp2, psi(idx_l));
+      // shift source: spProj5p of psi[Ls-1] when pm == 1, spProj5m of psi[0] otherwise (independent of s)
+      if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); }
+      else       { spProj5m(tmp, psi(ss+shift_s)); }
+
+      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+// M5Ddag (explicit coefficients): same stencil as M5D with the projectors exchanged, i.e. spProj5p on psi[(s+1)%Ls] and spProj5m on psi[(s+Ls-1)%Ls].
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+  // One work item per group of Ls consecutive outer sites; ss is the first index of the group.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(), {
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1, tmp2;
+    // chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1]), indices taken modulo Ls
+    for(int s=0; s<Ls; s++){
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      spProj5p(tmp1, psi(idx_u));
+      spProj5m(tmp2, psi(idx_l));
+      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+// M5Ddag_shift: the M5Ddag update plus shift_coeffs[s] times a projection of psi[s], summed over s into the single slice shift_s.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					   Vector<Coeff_t> &shift_coeffs)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  auto pm = this->pm;
+
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1, tmp2, tmp;
+    // phi_i and chi_i may be the same field: keep phi[Ls-1] before chi[Ls-1] is cleared below.
+    spinor phi_last = phi(ss+Ls-1);
+    tmp1=Zero();
+    coalescedWrite(chi[ss+Ls-1],tmp1);
+
+    for(int s=0; s<Ls; s++){
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+
+      spProj5p(tmp1, psi(idx_u));
+      spProj5m(tmp2, psi(idx_l));
+      // The last slice adds onto whatever shift terms chi[Ls-1] has already collected; the others are plain writes.
+      if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi_last + upper[s]*tmp1 + lower[s]*tmp2);
+      else          coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      if(pm == 1){ spProj5p(tmp, psi(ss+s)); }
+      else       { spProj5m(tmp, psi(ss+s)); }
+
+      coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+// MooeeInv: applies the factorised inverse to each group of Ls sites in four in-place sweeps over chi (see the step comments below).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &chi_i)
+{
+  // Hand over to the shifted variant before any view or pointer is taken, as MooeeInvDag does.
+  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
+
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp;
+
+    // Apply (L^{\prime})^{-1}
+    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp);
+    }
+
+    // L_m^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp);
+    }
+
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp);
+    }
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+
+    // Apply U^{-1}
+    for(int s=Ls-2; s>=0; s--){
+      spProj5m(tmp, chi(ss+s+1));
+      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+// MooeeInv_shift: the MooeeInv sweeps plus MooeeInv_shift_norm[s] * P( sum_j MooeeInv_shift_lc[j]*psi[j] ) added to every slice s (P = spProj5p if pm == 1, else spProj5m).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  // Raw pointers to the coefficient tables and a local copy of pm, for use inside the kernel body.
+  auto pm = this->pm;
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
+  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2,tmp2_spProj;
+
+    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
+    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
+    tmp2 = pMooeeInv_shift_lc[0]*psi(ss);
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp1, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
+      tmp2 = tmp2 + pMooeeInv_shift_lc[s]*psi(ss+s);
+    }
+    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
+    else       { spProj5m(tmp2_spProj, tmp2); }
+
+    // L_m^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
+    }
+
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp1, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp1);
+    }
+    // chi[ss+Ls-1] = (1.0/pdee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+    spProj5m(tmp1, chi(ss+Ls-1)); // projection of slice Ls-1 taken before its shift term is added
+    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInv_shift_norm[Ls-1]*tmp2_spProj);
+
+    // Apply U^{-1} and add shift term
+    for(int s=Ls-2; s>=0; s--){
+      coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1); // tmp1 holds spProj5m of slice s+1 without its shift term
+      spProj5m(tmp1, chi(ss+s));                            // refreshed for the next (lower) slice, again before the shift term
+      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+// MooeeInvDag: daggered counterpart of MooeeInv; same four in-place sweeps with lee/uee, leem/ueem and the two projectors exchanged.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionField &chi_i)
+{
+  if(this->shift != 0.0){ MooeeInvDag_shift(psi_i,chi_i); return; }
+  // shift == 0.0 from here on
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  // Raw pointers to the coefficient tables, for use inside the kernel body.
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp;
+
+    // Apply (U^{\prime})^{-dag}
+    coalescedWrite(chi[ss], psi(ss));
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - puee[s-1]*tmp);
+    }
+    
+    // U_m^{-\dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp);
+    }
+
+    // L_m^{-\dag} D^{-dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp);
+    }
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+
+    // Apply L^{-dag}
+    for(int s=Ls-2; s>=0; s--){
+      spProj5p(tmp, chi(ss+s+1));
+      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+// MooeeInvDag_shift: the MooeeInvDag sweeps plus MooeeInvDag_shift_norm[s] * P( sum_j MooeeInvDag_shift_lc[j]*psi[j] ) added to every slice s (P = spProj5p if pm == 1, else spProj5m).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  int Ls = this->Ls;
+  // Raw pointers to the coefficient tables and a local copy of pm, for use inside the kernel body.
+  auto pm = this->pm;
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
+  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2,tmp2_spProj;
+
+    // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
+    coalescedWrite(chi[ss], psi(ss));
+    tmp2 = pMooeeInvDag_shift_lc[0]*psi(ss);
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp1, chi(ss+s-1));
+      coalescedWrite(chi[ss+s],psi(ss+s) - puee[s-1]*tmp1);
+      tmp2 = tmp2 + pMooeeInvDag_shift_lc[s]*psi(ss+s);
+    }
+    // Project the accumulated sum once; the result is reused for every slice below.
+    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
+    else       { spProj5m(tmp2_spProj, tmp2);}
+
+    // U_m^{-\dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp1);
+    }
+
+    // L_m^{-\dag} D^{-dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp1, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp1);
+    }
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+    spProj5p(tmp1, chi(ss+Ls-1)); // projection of slice Ls-1 taken before its shift term is added
+    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInvDag_shift_norm[Ls-1]*tmp2_spProj);
+
+    // Apply L^{-dag} and add shift term
+    for(int s=Ls-2; s>=0; s--){
+      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1); // tmp1 holds spProj5p of slice s+1 without its shift term
+      spProj5p(tmp1, chi(ss+s));                           // refreshed for the next (lower) slice, again before the shift term
+      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
@@ -0,0 +1,407 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+// Constructor: forwards every argument to AbstractEOFAFermion, sets the tanh coefficients, then fills the shift-coefficient tables.
+template<class Impl>
+MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
+					   GaugeField            &_Umu,
+					   GridCartesian         &FiveDimGrid,
+					   GridRedBlackCartesian &FiveDimRedBlackGrid,
+					   GridCartesian         &FourDimGrid,
+					   GridRedBlackCartesian &FourDimRedBlackGrid,
+					   RealD _mq1, RealD _mq2, RealD _mq3,
+					   RealD _shift, int _pm, RealD _M5,
+					   RealD _b, RealD _c, const ImplParams &p) :
+  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
+			    _shift, _pm, _M5, _b, _c, p)
+{
+  const int Ls = this->Ls;
+  // Data from Approx::higham is only needed by SetCoefficientsTanh and is released right after the log lines.
+  const RealD eps = 1.0;
+  Approx::zolotarev_data *zdata = Approx::higham(eps, Ls);
+  assert(zdata->n == Ls);
+
+  std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b << ",c=" << _c
+	    << ") with Ls=" << Ls << std::endl;
+  this->SetCoefficientsTanh(zdata, _b, _c);
+  std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 << ",mq2=" << _mq2
+	    << ",mq3=" << _mq3 << ",shift=" << _shift
+	    << ",pm=" << _pm << ")" << std::endl;
+  Approx::zolotarev_free(zdata);
+
+  // Shift tables: Ls zeros when there is no shift, otherwise computed by SetCoefficientsPrecondShiftOps.
+  if(_shift == 0.0){
+    Mooee_shift.resize(Ls, 0.0);
+    MooeeInv_shift_lc.resize(Ls, 0.0);
+    MooeeInv_shift_norm.resize(Ls, 0.0);
+    MooeeInvDag_shift_lc.resize(Ls, 0.0);
+    MooeeInvDag_shift_norm.resize(Ls, 0.0);
+  } else {
+    SetCoefficientsPrecondShiftOps();
+  }
+}
+
+/****************************************************************
+ * Additional EOFA operators only called outside the inverter.  
+ * Since speed is not essential, simple axpby-style
+ * implementations should be fine.
+ ***************************************************************/
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+{
+  // Builds Din from psi through axpby_ssp for the four recognised (sign,dag) pairs:
+  //   sign=+1 -> \Omega_{+},  sign=-1 -> \Omega_{-},  dag=1 -> the daggered form.
+  // Any other combination leaves Din equal to zero.
+  const int   Ls    = this->Ls;
+  const RealD alpha = this->alpha;
+  const bool  plus  = (sign == 1);
+  const bool  known_sign = plus || (sign == -1);
+  const bool  known_dag  = (dag == 0) || (dag == 1);
+
+  Din = Zero();
+  if(!(known_sign && known_dag)) { return; }
+
+  for(int s=0; s<Ls; ++s){
+    // Per-slice weight: exponents run down from Ls for \Omega_{+} and up from 0 for \Omega_{-}.
+    const RealD weight = plus ? 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s)
+                              : 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1);
+
+    // Non-dagger: first coefficient 0.0, slice indices (s,0). Dagger: first coefficient 1.0 on Din, slice indices (0,s).
+    if(dag == 0) { axpby_ssp(Din, 0.0, psi, weight, psi, s, 0); }
+    else         { axpby_ssp(Din, 1.0, Din, weight, psi, 0, s); }
+  }
+}
+
+// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
+// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
+{
+  const int   Ls   = this->Ls;
+  const int   last = Ls-1;
+  const RealD b    = 0.5 * ( 1.0 + this->alpha );
+  const RealD c    = 0.5 * ( 1.0 - this->alpha );
+  const RealD mq1  = this->mq1;
+  const RealD wrap = mq1*c;   // coefficient used where the neighbour index wraps around
+
+  for(int s=0; s<Ls; ++s){
+    const bool at_first = (s == 0);
+    const bool at_last  = !at_first && (s == last);
+
+    // pminus call: neighbour s+1 with -c, except on the last slice where it wraps to slice 0 with mq1*c.
+    axpby_ssp_pminus(chi, b, psi, at_last ? wrap : -c, psi, s, at_last ? 0 : s+1);
+
+    // pplus call, applied on top of chi: neighbour s-1 with -c, except on slice 0 where it wraps to Ls-1 with mq1*c.
+    axpby_ssp_pplus (chi, 1.0, chi, at_first ? wrap : -c, psi, s, at_first ? last : s-1);
+  }
+}
+// DtildeInv: chi = sum over sp of ( DtInv_p * P_+ + DtInv_m * P_- ) psi[sp] for every slice s, using the closed-form coefficients below.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+  RealD m = this->mq1;
+  RealD c = 0.5 * this->alpha;
+  RealD d = 0.5;
+
+  RealD DtInv_p(0.0), DtInv_m(0.0);
+  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
+
+  // Accumulate in a scratch field so psi stays intact even if chi refers to the same field.
+  // Zero it explicitly rather than relying on a 0.0 coefficient applied to whatever the new field holds.
+  FermionField tmp(this->FermionGrid());
+  tmp = Zero();
+
+  for(int s=0; s<Ls; ++s){
+    for(int sp=0; sp<Ls; ++sp){
+      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
+      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
+      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
+      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
+
+      // Both chiral pieces add onto tmp[s]; a 0.0 coefficient on the second call would discard the first piece.
+      axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
+      axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
+    }}
+
+  chi = tmp; // hand the accumulated result to the output argument
+}
+
+/*****************************************************************************************************/
+// M: runs Meooe5D, DW (DaggerNo), axpby with psi, then the two-argument M5D; returns norm2 of the result left in chi.
+template<class Impl>
+RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+{
+  FermionField hop_src(psi.Grid());   // scratch field: output of Meooe5D, input of DW
+  this->Meooe5D(psi, hop_src);
+  this->DW(hop_src, chi, DaggerNo);
+  axpby(chi, 1.0, 1.0, chi, psi);     // presumably chi <- 1.0*chi + 1.0*psi
+  this->M5D(psi, chi);                // chi acts as both input and output here
+  const RealD chi_norm2 = norm2(chi);
+  return chi_norm2;
+}
+// Mdag: runs DW (DaggerYes), MeooeDag5D, the two-argument M5Ddag, then axpby with psi; returns norm2 of the result left in chi.
+template<class Impl>
+RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+{
+  FermionField hop_out(psi.Grid());   // scratch field: output of DW, input of MeooeDag5D
+  this->DW(psi, hop_out, DaggerYes);
+  this->MeooeDag5D(hop_out, chi);
+  this->M5Ddag(psi, chi);             // chi acts as both input and output here
+  axpby(chi, 1.0, 1.0, chi, psi);     // presumably chi <- 1.0*chi + 1.0*psi
+  const RealD chi_norm2 = norm2(chi);
+  return chi_norm2;
+}
+
+/********************************************************************
+ * Performance critical fermion operators called inside the inverter
+ ********************************************************************/
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+{
+  // Two-argument form: unit diagonal, -1 off-diagonals, mq1 in upper[Ls-1] and lower[0].
+  const int Ls = this->Ls;
+  Vector<Coeff_t> diag (Ls, 1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = this->mq1;
+  lower[0]    = this->mq1;
+
+  // chi is handed over both as phi and as the output field.
+  if(this->shift != 0.0){ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); } // fused M + shift operation
+  else                  { this->M5D(psi, chi, chi, lower, diag, upper); }                    // no shift term
+}
+// M5Ddag (two-argument form): builds the same coefficient vectors as the two-argument M5D and calls the daggered kernels.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+{
+  // Unit diagonal, -1 off-diagonals, mq1 in upper[Ls-1] and lower[0].
+  const int Ls = this->Ls;
+  Vector<Coeff_t> diag (Ls, 1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = this->mq1;
+  lower[0]    = this->mq1;
+
+  // chi is handed over both as phi and as the output field.
+  if(this->shift != 0.0){ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); } // fused M + shift operation
+  else                  { this->M5Ddag(psi, chi, chi, lower, diag, upper); }                    // no shift term
+}
+
+// half checkerboard operations
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+{
+  const int Ls = this->Ls;
+
+  // Coefficients of Mooee: bee on the diagonal and -cee on both off-diagonals,
+  // with upper[Ls-1] and lower[0] multiplied by an extra factor -mq1.
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  for(int s=0; s<Ls; ++s){
+    upper[s] = -this->cee[s];
+  }
+  Vector<Coeff_t> lower(upper);
+  upper[Ls-1] *= -this->mq1;
+  lower[0]    *= -this->mq1;
+
+  // psi is passed as both the psi and the phi argument of the kernel.
+  const bool shifted = (this->shift != 0.0);
+
+  if(shifted){ this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); } // fused M + shift operation
+  else       { this->M5D(psi, psi, chi, lower, diag, upper); }                    // no shift term
+}
+// MooeeDag: same kernel call pattern as Mooee, with off-diagonal coefficients taken from the neighbouring slice's cee.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+{
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+
+  // coefficients of MooeeDag
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
+  for(int s=0; s<Ls; ++s){
+    const bool at_first = (s == 0);
+    const bool at_last  = !at_first && (s == last);
+
+    // upper[s] is -cee[s+1], except on the last slice where it is mq1*cee[0]
+    if(at_last){ upper[s] = this->mq1*this->cee[0]; }
+    else       { upper[s] = -this->cee[s+1]; }
+
+    // lower[s] is -cee[s-1], except on slice 0 where it is mq1*cee[Ls-1]
+    if(at_first){ lower[s] = this->mq1*this->cee[last]; }
+    else        { lower[s] = -this->cee[s-1]; }
+  }
+
+  // psi is passed as both the psi and the phi argument of the kernel.
+  const bool shifted = (this->shift != 0.0);
+  if(shifted){ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); } // fused M + shift operation
+  else       { this->M5Ddag(psi, psi, chi, lower, diag, upper); }                    // no shift term
+}
+
+/****************************************************************************************/
+
+// Computes coefficients for applying Cayley preconditioned shift operators
+//  (Mooee + \Delta) --> Mooee_shift
+//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
+//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
+// For the latter two cases, the operation takes the form
+//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
+//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
+template<class Impl>
+void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
+{
+  int   Ls    = this->Ls;
+  int   pm    = this->pm;
+  RealD alpha = this->alpha;
+  RealD k     = this->k;
+  RealD mq1   = this->mq1;
+  RealD shift = this->shift;
+
+  // Initialize
+  Mooee_shift.resize(Ls);
+  MooeeInv_shift_lc.resize(Ls);
+  MooeeInv_shift_norm.resize(Ls);
+  MooeeInvDag_shift_lc.resize(Ls);
+  MooeeInvDag_shift_norm.resize(Ls);
+
+  // Construct Mooee_shift
+  int idx(0); // pm == 1 fills Mooee_shift front to back (idx = s), otherwise back to front (idx = Ls-1-s)
+  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
+    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
+  for(int s=0; s<Ls; ++s){
+    idx = (pm == 1) ? (s) : (Ls-1-s);
+    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
+  }
+
+  // Tridiagonal solve for MooeeInvDag_shift_lc
+  {
+    Coeff_t m(0.0);
+    Vector<Coeff_t> d = Mooee_shift;
+    Vector<Coeff_t> u(Ls,0.0);
+    Vector<Coeff_t> y(Ls,0.0);
+    Vector<Coeff_t> q(Ls,0.0);
+    if(pm == 1){ u[0] = 1.0; }
+    else{ u[Ls-1] = 1.0; }
+    // d starts as a copy of Mooee_shift and u as a unit vector (entry 0 for pm == 1, entry Ls-1 otherwise).
+    // Tridiagonal matrix algorithm + Sherman-Morrison formula
+    //
+    // We solve
+    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
+    // where Mooee' is the tridiagonal part of Mooee_{+}, and
+    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
+    // so that the outer-product u \otimes v gives the (0,Ls-1)
+    // entry of Mooee_{+}.
+    //
+    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
+    // and then construct the solution to the original system
+    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
+    if(pm == 1){ // pm == 1 only: sweep s upward, folding row s-1 into row s of both right-hand sides
+      for(int s=1; s<Ls; ++s){
+	m = -this->cee[s] / this->bee[s-1];
+	d[s] -= m*d[s-1];
+	u[s] -= m*u[s-1];
+      }
+    }
+    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
+    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
+    for(int s=Ls-2; s>=0; --s){ // pm == 1: plain division by bee[s]; otherwise each entry also picks up cee[s] times entry s+1
+      if(pm == 1){
+	y[s] = d[s] / this->bee[s];
+	q[s] = u[s] / this->bee[s];
+      } else {
+	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
+	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
+      }
+    }
+
+    // Construct MooeeInvDag_shift_lc
+    for(int s=0; s<Ls; ++s){
+      if(pm == 1){
+	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
+	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
+      } else {
+	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
+	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
+      }
+    }
+    // N is reused from here on as the common divisor: 1 + MooeeInvDag_shift_lc[Ls-1] (pm == 1) or 1 + MooeeInvDag_shift_lc[0].
+    // Compute remaining coefficients
+    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
+    for(int s=0; s<Ls; ++s){
+
+      // MooeeInv_shift_lc
+      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
+      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
+
+      // MooeeInv_shift_norm
+      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
+	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
+
+      // MooeeInvDag_shift_norm
+      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
+     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
+      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
+	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
+    }
+  }
+}
+
+// Recompute coefficients for a different value of shift constant; new_shift == 0.0 resets all five tables to Ls zeros.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+{
+  this->shift = new_shift;
+  if(new_shift != 0.0){
+    SetCoefficientsPrecondShiftOps();
+  } else {
+    int Ls = this->Ls; // assign (not resize): resize keeps existing entries, so values from an earlier non-zero shift would survive
+    Mooee_shift.assign(Ls,0.0);
+    MooeeInv_shift_lc.assign(Ls,0.0);
+    MooeeInv_shift_norm.assign(Ls,0.0);
+    MooeeInvDag_shift_lc.assign(Ls,0.0);
+    MooeeInvDag_shift_norm.assign(Ls,0.0);
+  }
+}
+
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -0,0 +1,450 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  // Serves both dag and undag (they are trivially related); TODO: factor the slice rescaling into a common helper routine
+  int Ls = this->Ls;
+
+  this->DhopDir(psi,chi,dir,disp); // single-direction hop of psi into chi
+
+  int nblock=(Ls-1)/2; // number of two-slice blocks; slice Ls-1 is handled after the loop
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); // first slice of the pair: coefficient -scale, in place
+    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); // second slice of the pair: coefficient +scale, in place
+  }
+  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1); // last slice: coefficient p[nblock]*scale/amax
+
+}
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
+{ // Checkerboard-changing hop plus per-slice rescaling. NOTE: dag is not referenced; both branches hop with DaggerNo
+  int Ls = this->Ls;
+  if ( psi.Checkerboard() == Odd ) { // choose the hop from the checkerboard of the input
+    this->DhopEO(psi,chi,DaggerNo);
+  } else {
+    this->DhopOE(psi,chi,DaggerNo);
+  }
+
+  int nblock=(Ls-1)/2; // number of two-slice blocks; slice Ls-1 is handled after the loop
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); // first slice of the pair: coefficient -scale, in place
+    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); // second slice of the pair: coefficient +scale, in place
+  }
+  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1); // last slice: coefficient p[nblock]*scale/amax
+}
+
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  // dag and undag are trivially related: only the sign of the couplings to slice Ls-1 flips
+  int sign = dag ? (-1) : 1;
+  int Ls = this->Ls;
+      
+  int nblock=(Ls-1)/2; // number of two-slice blocks; slice Ls-1 is handled separately below
+  for(int b=0;b<nblock;b++){
+	
+    int s = 2*b;
+    RealD pp = p[nblock-1-b]; // coefficients are consumed in reverse order relative to the block index
+    RealD qq = q[nblock-1-b];
+	
+    // Each 2x2 block aligned at s; the Dw site-diagonal term is multiplied by G5 (giving Hw)
+    ag5xpby_ssp(chi,-dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s  ,s+1); 
+    ag5xpby_ssp(chi, dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s+1,s);
+    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1); // accumulate the coupling of slice s+1 to slice Ls-1
+  }
+      
+  {
+    RealD R=(1+mass)/(1-mass); // local R; same formula as the R set in SetCoefficientsZolotarev
+    //R g5 psi[Ls-1] + p[0] H
+    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
+	
+    for(int b=0;b<nblock;b++){
+      int s = 2*b+1; // odd slice of block b
+      RealD pp = p[nblock-1-b];
+      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s); // opposite sign to the column term in the loop above
+    }
+  }
+}
+
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
+{ // Three stages applied in order, labelled Linv, Dinv and Uinv below; the result is written to chi
+  int sign = dag ? (-1) : 1; // dag flips the sign of the Linv and Uinv coefficients
+  int Ls = this->Ls;
+
+  FermionField tmp(psi.Grid()); // scratch field: written by the Dinv stage, read by the Uinv stage
+      
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //Linv
+  ///////////////////////////////////////////////////////////////////////////////////////
+  int nblock=(Ls-1)/2;
+
+  axpy(chi,0.0,psi,psi); // Identity piece
+      
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    RealD pp = p[nblock-1-b];
+    RealD qq = q[nblock-1-b];
+    RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
+    RealD coeff2=sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
+    axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s); // accumulate into slice Ls-1 from slice s
+    axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1); // accumulate into slice Ls-1 from slice s+1 (g5 variant)
+  }
+      
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
+  // Compute Seeinv (coeff of gamma5)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  RealD R=(1+mass)/(1-mass);
+  RealD Seeinv = R + p[nblock]*dw_diag/amax; // accumulated as a sum first, inverted after the loop
+  for(int b=0;b<nblock;b++){
+    Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
+  }    
+  Seeinv = 1.0/Seeinv;
+      
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    RealD pp = p[nblock-1-b]; // NOTE: pp is not used in this loop
+    RealD qq = q[nblock-1-b];
+    RealD coeff1=dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
+    RealD coeff2=amax*sqrt(qq) / ( dw_diag*dw_diag + amax*amax* qq);
+    ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
+    ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
+  }
+  ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1); // last slice uses the coefficient Seeinv
+      
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Uinv
+  ///////////////////////////////////////////////////////////////////////////////////////
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    RealD pp = p[nblock-1-b];
+    RealD qq = q[nblock-1-b];
+    RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq); // negated Linv coefficient
+    RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
+    axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1); // every output coefficient is divided by scale
+    axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
+  }
+  axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
+}
+
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
+{ // Builds chi from psi using D = DW(psi) plus the per-slice couplings assembled below
+  FermionField D(psi.Grid()); // holds the result of DW applied to psi
+  
+  int Ls = this->Ls;
+  int sign = dag ? (-1) : 1; // dag only flips the sign of the couplings to slice Ls-1
+
+  // For partial frac Hw case (b5=c5=1) chroma quirkily computes
+  //
+  // Conventions for partfrac appear to be a mess.
+  // Tony's Nara lectures have
+  //
+  // BlockDiag(  H/p_i  1             | 1       )    
+  //          (  1      p_i H / q_i^2 | 0       )  
+  //           ---------------------------------
+  //           ( -1      0                | R  +p0 H  )
+  //
+  //Chroma     ( -2H    2sqrt(q_i)    |   0         )
+  //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
+  //           ---------------------------------
+  //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
+  //
+  // Edwards/Joo/Kennedy/Wenger
+  //
+  // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
+  // incorporate the approx scale factor. This is obtained by propagating the
+  // scale on "H" out to the off diagonal elements as follows:
+  //
+  // BlockDiag(  H/p_i  1             | 1       ) 
+  //          (  1      p_i H / q_i^2 | 0       )  
+  //           ---------------------------------
+  //          ( -1      0                | R  + p_0 H  )
+  //
+  // becomes:
+  // BlockDiag(  H/ sp_i  1               | 1             ) 
+  //          (  1      sp_i H / s^2q_i^2 | 0             )  
+  //           ---------------------------------
+  //           ( -1      0                | R + p_0/s H   )
+  //
+  //
+  // This is implemented in Chroma by
+  //           p0' = p0/approxMax
+  //           p_i' = p_i*approxMax
+  //           q_i' = q_i*approxMax*approxMax
+  //
+  // After the equivalence transform is applied the matrix becomes
+  // 
+  //Chroma     ( -2H    sqrt(q'_i)    |   0         )
+  //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
+  //           ---------------------------------
+  //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
+  //
+  //     =     ( -2H    sqrt(q_i)amax    |   0              )
+  //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
+  //           ---------------------------------
+  //           ( 0     -sqrt(p_i*amax)   |  2 R gamma_5 + p0/amax 2H
+  //
+
+  this->DW(psi,D,DaggerNo); // always DaggerNo, independent of dag
+
+  int nblock=(Ls-1)/2; // number of two-slice blocks; slice Ls-1 is handled separately below
+  for(int b=0;b<nblock;b++){
+	
+    int s = 2*b;
+    double pp = p[nblock-1-b]; // coefficients are consumed in reverse order relative to the block index
+    double qq = q[nblock-1-b];
+	
+    // Each 2x2 block aligned at s
+    ag5xpby_ssp(chi,-1.0*scale,D,amax*sqrt(qq)*scale,psi, s  ,s+1); // Multiplies Dw by G5 so Hw
+    ag5xpby_ssp(chi, 1.0*scale,D,amax*sqrt(qq)*scale,psi, s+1,s);
+	
+    // Pick up last column
+    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
+  }
+	
+  {
+    double R=(1+this->mass)/(1-this->mass);
+    //R g5 psi[Ls] + p[0] H
+    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
+	
+    for(int b=0;b<nblock;b++){
+      int s = 2*b+1; // odd slice of block b
+      double pp = p[nblock-1-b];
+      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s); // last row: opposite sign to the last-column term above
+    }
+  }
+
+}
+
+template<class Impl>
+RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
+{ // Undaggered operator: forwards to M_internal with DaggerNo
+  M_internal(in,out,DaggerNo);
+  return norm2(out); // the return value is norm2 of the result field
+}
+template<class Impl>
+RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
+{ // Daggered operator: forwards to M_internal with DaggerYes
+  M_internal(in,out,DaggerYes);
+  return norm2(out); // the return value is norm2 of the result field
+}
+
+template<class Impl>
+void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
+{ // Forwards to Meooe_internal with DaggerNo
+  Meooe_internal(in,out,DaggerNo);
+}
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
+{ // Forwards to Meooe_internal with DaggerYes (Meooe_internal does not reference its dag argument)
+  Meooe_internal(in,out,DaggerYes);
+}
+template<class Impl>
+void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
+{ // Forwards to Mooee_internal with DaggerNo
+  Mooee_internal(in,out,DaggerNo);
+}
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
+{ // Forwards to Mooee_internal with DaggerYes
+  Mooee_internal(in,out,DaggerYes);
+}
+
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
+{ // Forwards to MooeeInv_internal with DaggerNo
+  MooeeInv_internal(in,out,DaggerNo);
+}
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
+{ // Forwards to MooeeInv_internal with DaggerYes
+  MooeeInv_internal(in,out,DaggerYes);
+}
+
+
+// Force terms: three routines in this file (MDeriv, MoeDeriv, MeoDeriv); each rescales U per slice and defers to a Dhop derivative
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{ // NOTE: dag is not referenced; the derivative is always taken with DaggerNo
+  int Ls = this->Ls;
+
+  FermionField D(V.Grid()); // per-slice rescaled copy of U
+
+  int nblock=(Ls-1)/2; // number of two-slice blocks; slice Ls-1 is handled after the loop
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); // first slice of the pair: coefficient -scale
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); // second slice of the pair: coefficient +scale
+  }
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1); // last slice: coefficient p[nblock]*scale/amax
+
+  this->DhopDeriv(mat,D,V,DaggerNo); 
+};
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{ // Same rescaling as MDeriv, finished with DhopDerivOE. NOTE: dag is not referenced; always DaggerNo
+  int Ls = this->Ls;
+
+  FermionField D(V.Grid()); // per-slice rescaled copy of U
+
+  int nblock=(Ls-1)/2; // number of two-slice blocks; slice Ls-1 is handled after the loop
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); // first slice of the pair: coefficient -scale
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); // second slice of the pair: coefficient +scale
+  }
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1); // last slice: coefficient p[nblock]*scale/amax
+
+  this->DhopDerivOE(mat,D,V,DaggerNo); 
+};
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{ // Same rescaling as MDeriv, finished with DhopDerivEO. NOTE: dag is not referenced; always DaggerNo
+  int Ls = this->Ls;
+
+  FermionField D(V.Grid()); // per-slice rescaled copy of U
+
+  int nblock=(Ls-1)/2; // number of two-slice blocks; slice Ls-1 is handled after the loop
+  for(int b=0;b<nblock;b++){
+    int s = 2*b;
+    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); // first slice of the pair: coefficient -scale
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); // second slice of the pair: coefficient +scale
+  }
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1); // last slice: coefficient p[nblock]*scale/amax
+
+  this->DhopDerivEO(mat,D,V,DaggerNo); 
+};
+
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){ // here scale is the parameter, not the scale assigned in SetCoefficientsZolotarev
+  SetCoefficientsZolotarev(1.0/scale,zdata); // the reciprocal of the parameter is passed as zolo_hi
+}
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){ // fills R, dw_diag, p, q, scale and amax from zdata and zolo_hi
+
+  // check on degree matching
+  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  int Ls = this->Ls;
+
+  assert(Ls == (2*zdata->da -1) ); // the approximation data must match the fifth-dimension extent
+
+  // Part frac
+  //      RealD R;
+  R=(1+mass)/(1-mass);
+  dw_diag = (4.0-this->M5);
+
+  //      std::vector<RealD> p; 
+  //      std::vector<RealD> q;
+  p.resize(zdata->da);
+  q.resize(zdata->dd);
+	
+  for(int n=0;n<zdata->da;n++){
+    p[n] = zdata -> alpha[n]; // p is a direct copy of alpha
+  }
+  for(int n=0;n<zdata->dd;n++){
+    q[n] = -zdata -> ap[n]; // q stores ap with the sign flipped
+  }
+      
+  scale= part_frac_chroma_convention ? 2.0 : 1.0; // Chroma conventions annoy me
+
+  amax=zolo_hi; // amax is the upper bound handed in by the caller
+}
+
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    { // Extracts slice Ls-1 of the 5d solution into the 4d field
+      int Ls = this->Ls;
+      conformable(solution5d.Grid(),this->FermionGrid()); // 5d field must live on the fermion grid
+      conformable(exported4d.Grid(),this->GaugeGrid()); // 4d field must live on the gauge grid
+      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); // NOTE(review): last argument is presumably the orthogonal-direction index; confirm Ls-1 is intended
+    }
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    { // Places the 4d source in slice Ls-1 of a zeroed 5d field, multiplies by Gamma5, then applies Dminus
+      int Ls = this->Ls;
+      conformable(imported5d.Grid(),this->FermionGrid()); // 5d field must live on the fermion grid
+      conformable(input4d.Grid()   ,this->GaugeGrid()); // 4d field must live on the gauge grid
+      FermionField tmp(this->FermionGrid());
+      tmp=Zero(); // all other slices stay zero
+      InsertSlice(input4d, tmp, Ls-1, Ls-1); // NOTE(review): last argument is presumably the orthogonal-direction index; confirm Ls-1 is intended
+      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
+      this->Dminus(tmp,imported5d);
+    }
+
+// Constructor: forwards the grids, M5 and p to WilsonFermion5D, stores the mass, then sets the coefficients
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							 GridCartesian         &FiveDimGrid,
+							 GridRedBlackCartesian &FiveDimRedBlackGrid,
+							 GridCartesian         &FourDimGrid,
+							 GridRedBlackCartesian &FourDimRedBlackGrid,
+							 RealD _mass,RealD M5,
+							 const ImplParams &p) :
+  WilsonFermion5D<Impl>(_Umu,
+			FiveDimGrid, FiveDimRedBlackGrid,
+			FourDimGrid, FourDimRedBlackGrid,M5,p),
+  mass(_mass)
+
+{
+  int Ls = this->Ls;
+
+  assert((Ls&0x1)==1); // Odd Ls required
+  int nrational=Ls-1; // order handed to the approximation generator below
+
+
+  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational); // released by zolotarev_free at the end of this constructor
+
+  // NB: chroma uses a cast to "float" for the zolotarev range(!?).
+  // This creates a real difference in the operator, which I do not like, but we can replicate it here
+  // to demonstrate compatibility
+  //      RealD eps = (zolo_lo / zolo_hi);
+  //      zdata = bfm_zolotarev(eps,nrational,0);
+      
+  SetCoefficientsTanh(zdata,1.0); // scale argument 1.0, so zolo_hi = 1.0/1.0
+
+  Approx::zolotarev_free(zdata);
+
+}
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@@ -26,6 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+#pragma once
+
 #include <Grid/Grid.h>

 #ifdef AVX512
@@ -586,11 +588,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VADD(UChi_00,UChi_10,UChi_00)				\
  VADD(UChi_01,UChi_11,UChi_01)				\
  VADD(UChi_02,UChi_12,UChi_02)	);			\
-  asm (									\
-       VSTORE(0,%0,pUChi_00)						\
-       VSTORE(1,%0,pUChi_01)						\
-       VSTORE(2,%0,pUChi_02)						\
-       : : "r" (out) : "memory" );
+  asm (							\
+  VSTORE(0,%0,pUChi_00)					\
+  VSTORE(1,%0,pUChi_01)					\
+  VSTORE(2,%0,pUChi_02)					\
+  : : "r" (out) : "memory" );

 // FIXME is sign right in the VSUB ?
 #define nREDUCEa(out)					\
@@ -613,20 +615,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
      permute##dir(Chi_1,Chi_1);\
      permute##dir(Chi_2,Chi_2);

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-					 DoubledGaugeField &U, DoubledGaugeField &UUU,
-					 SiteSpinor *buf, int LLs, int sU, 
-					 const FermionField &in, FermionField &out,int dag) 
+					 DoubledGaugeFieldView &U,
+					 DoubledGaugeFieldView &UUU,
+					 SiteSpinor *buf, int LLs,
+					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  assert(0);
 };


-//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in._odata[o] ; } else { out =(uint64_t) &buf[o]; }
+//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in[o] ; } else { out =(uint64_t) &buf[o]; }

 #define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }

@@ -673,22 +675,23 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  CONDITIONAL_MOVE(l3,o3,addr3);					\
  PF_CHI(addr3);							\
  									\
-  gauge0 =(uint64_t)&UU._odata[sU]( X );				\
-  gauge1 =(uint64_t)&UU._odata[sU]( Y );				\
-  gauge2 =(uint64_t)&UU._odata[sU]( Z );				\
-  gauge3 =(uint64_t)&UU._odata[sU]( T ); 
+  gauge0 =(uint64_t)&UU[sU]( X );				\
+  gauge1 =(uint64_t)&UU[sU]( Y );				\
+  gauge2 =(uint64_t)&UU[sU]( Z );				\
+  gauge3 =(uint64_t)&UU[sU]( T ); 
  
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeField &U, DoubledGaugeField &UUU,
-								    SiteSpinor *buf, int LLs, int sU, 
-								    const FermionField &in, FermionField &out,int dag) 
+								    DoubledGaugeFieldView &U,
+								    DoubledGaugeFieldView &UUU,
+								    SiteSpinor *buf, int LLs,
+								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@@ -719,7 +722,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
    LOAD_CHI(addr0,addr1,addr2,addr3);
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);

-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) {
      nREDUCE(addr0);
    } else { 
@@ -734,14 +737,15 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl

 #include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeField &U, DoubledGaugeField &UUU,
-								    SiteSpinor *buf, int LLs, int sU, 
-								    const FermionField &in, FermionField &out,int dag) 
+								    DoubledGaugeFieldView &U,
+								    DoubledGaugeFieldView &UUU,
+								    SiteSpinor *buf, int LLs,
+								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@@ -771,7 +775,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
    LOAD_CHI(addr0,addr1,addr2,addr3);
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);

-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) {
      nREDUCE(addr0);
    } else { 
@@ -818,14 +822,15 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl

 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeField &U, DoubledGaugeField &UUU,
-							       SiteSpinor *buf, int LLs, int sU, 
-							       const FermionField &in, FermionField &out,int dag) 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
+							       SiteSpinor *buf, int LLs,
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@@ -872,7 +877,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
    PERMUTE23;
    MULT_ADD_XYZT(gauge2,gauge3);  

-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) { 
      nREDUCEa(addr0);
    } else { 
@@ -886,14 +891,15 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,

 #include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeField &U, DoubledGaugeField &UUU,
-							       SiteSpinor *buf, int LLs, int sU, 
-							       const FermionField &in, FermionField &out,int dag) 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
+							       SiteSpinor *buf, int LLs,
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@@ -940,7 +946,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
    PERMUTE23;
    MULT_ADD_XYZT(gauge2,gauge3);  
    
-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) {
      nREDUCEa(addr0);
    } else { 
@@ -952,17 +958,5 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
 #endif
 }

-#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL)			    \
-  template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo,	\
-				  DoubledGaugeField &U,			\
-				  DoubledGaugeField &UUU,		\
-				  SiteSpinor *buf, int LLs,		\
-				  int sU, const FermionField &in, FermionField &out,int dag);
-
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
-
-}}
+NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -28,6 +28,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #include <Grid/Grid.h>

+#pragma once
+
+NAMESPACE_BEGIN(Grid);

 #define LOAD_CHI(b)		\
  const SiteSpinor & ref (b[offset]);	\
@@ -38,7 +41,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
-  auto & ref(U._odata[sU](A));			\
+  auto & ref(U[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));      \
   Impl::loadLinkElement(U_10,ref()(1,0));      \
   Impl::loadLinkElement(U_20,ref()(2,0));      \
@@ -59,7 +62,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi ## _2 += U_22*Chi_2;

 #define MULT_ADD(U,A,UChi)			\
-  auto & ref(U._odata[sU](A));			\
+  auto & ref(U[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));      \
   Impl::loadLinkElement(U_10,ref()(1,0));      \
   Impl::loadLinkElement(U_20,ref()(2,0));      \
@@ -92,7 +95,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(in._odata);					\
+    LOAD_CHI(in);					\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
@@ -120,14 +123,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(in._odata);				\
+    LOAD_CHI(in);				\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
    LOAD_CHI(buf);					\
  }							\
-  if (SE->_is_local || st.same_node[Dir] ) {		\
+  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
  }

@@ -135,22 +138,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
    { LOAD_CHI(buf);	  }					\
    { MULT_ADD(U,Dir,even); }					\
  }								

-namespace Grid {
-namespace QCD {
-

 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
-					  DoubledGaugeField &U,DoubledGaugeField &UUU,
+					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 					  SiteSpinor *buf, int LLs, int sU, 
-					  const FermionField &in, FermionField &out,int dag) 
+					  const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -213,16 +212,16 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out._odata[sF],result);
+    vstream(out[sF],result);
  }
 }


 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out,int dag) 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -249,7 +248,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
  Simd U_22; 

  SiteSpinor result;
-  int offset,local,perm, ptype;
+  int offset, ptype, local, perm;

  StencilEntry *SE;
  int skew;
@@ -257,8 +256,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
  for(int s=0;s<LLs;s++){
    int sF=s+LLs*sU;

-    even_0 = zero;    even_1 = zero;    even_2 = zero;
-     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();

    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@@ -289,16 +288,16 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out._odata[sF],result);
+    vstream(out[sF],result);
  }
 }


 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out,int dag) 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -325,7 +324,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  Simd U_22; 

  SiteSpinor result;
-  int offset,local,perm, ptype;
+  int offset, ptype, local;

  StencilEntry *SE;
  int skew;
@@ -333,8 +332,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  for(int s=0;s<LLs;s++){
    int sF=s+LLs*sU;

-    even_0 = zero;    even_1 = zero;    even_2 = zero;
-     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@@ -366,34 +365,29 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      out._odata[sF] = out._odata[sF] + result;
+      out[sF] = out[sF] + result;
    }
  }
 }

-
 #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionField &in, FermionField &out, int dag); \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionField &in, FermionField &out, int dag); \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionField &in, FermionField &out, int dag); \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \

-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
+#undef LOAD_CHI
+
+NAMESPACE_END(Grid);


-}
-}
-
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -28,40 +28,38 @@ directory
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
-namespace QCD {
+#pragma once

-int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
+NAMESPACE_BEGIN(Grid);

 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
-      permute(chi,  in._odata[SE->_offset], ptype);		\
+      permute(chi,  in[SE->_offset], ptype);		\
    } else {							\
-      chi_p = &in._odata[SE->_offset];				\
+      chi_p = &in[SE->_offset];				\
    }								\
  } else {							\
    chi_p = &buf[SE->_offset];					\
  }								\
-  multLink(Uchi, U._odata[sU], *chi_p, Dir);			
+  multLink(Uchi, U[sU], *chi_p, Dir);			

 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
-      permute(chi,  in._odata[SE->_offset], ptype);		\
+      permute(chi,  in[SE->_offset], ptype);		\
    } else {							\
-      chi_p = &in._odata[SE->_offset];				\
+      chi_p = &in[SE->_offset];				\
    }								\
  } else if ( st.same_node[Dir] ) {				\
    chi_p = &buf[SE->_offset];					\
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
+    multLink(Uchi, U[sU], *chi_p, Dir);			\
  }

 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
@@ -69,7 +67,7 @@ int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
+    multLink(Uchi, U[sU], *chi_p, Dir);			\
  }

 template <class Impl>
@@ -81,9 +79,9 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out, int dag) {
+					     const FermionFieldView &in, FermionFieldView &out, int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
@@ -114,7 +112,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    if ( dag ) { 
      Uchi = - Uchi;
    } 
-    vstream(out._odata[sF], Uchi);
+    vstream(out[sF], Uchi);
  }
 };

@@ -123,9 +121,9 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  ///////////////////////////////////////////////////
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeField &U, DoubledGaugeField &UUU,
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int LLs, int sU, 
-						const FermionField &in, FermionField &out,int dag) {
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
@@ -136,7 +134,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  for(int s=0;s<LLs;s++){
    int sF=LLs*sU+s;
    skew = 0;
-    Uchi=zero;
+    Uchi=Zero();
    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
@@ -157,7 +155,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    if ( dag ) {
      Uchi = - Uchi;
    }
-    vstream(out._odata[sF], Uchi);
+    vstream(out[sF], Uchi);
  }
 };

@@ -167,11 +165,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  ///////////////////////////////////////////////////
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeField &U, DoubledGaugeField &UUU,
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int LLs, int sU,
-						const FermionField &in, FermionField &out,int dag) {
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
-  SiteSpinor chi;
+  //  SiteSpinor chi;
  SiteSpinor Uchi;
  StencilEntry *SE;
  int ptype;
@@ -181,7 +179,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
  for(int s=0;s<LLs;s++){
    int sF=LLs*sU+s;
    skew = 0;
-    Uchi=zero;
+    Uchi=Zero();
    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
@@ -202,9 +200,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &

    if ( nmu ) { 
      if ( dag ) { 
-	out._odata[sF] = out._odata[sF] - Uchi;
+	out[sF] = out[sF] - Uchi;
      } else { 
-	out._odata[sF] = out._odata[sF] + Uchi;
+	out[sF] = out[sF] + Uchi;
      }
    }
  }
@@ -215,9 +213,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 ////////////////////////////////////////////////////////////////////////////////////

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
+void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					 SiteSpinor *buf, int LLs, int sU,
-					 const FermionField &in, FermionField &out,
+					 const FermionFieldView &in, FermionFieldView &out,
 					 int interior,int exterior)
 {
  int dag=1;
@@ -225,9 +223,9 @@ void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, Dou
 };

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs, int sU,
-				      const FermionField &in, FermionField &out,
+				      const FermionFieldView &in, FermionFieldView &out,
 				      int interior,int exterior)
 {
  int dag=0;
@@ -235,9 +233,9 @@ void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double
 };

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs,
-				      int sU, const FermionField &in, FermionField &out,
+				      int sU, const FermionFieldView &in, FermionFieldView &out,
 				      int dag,int interior,int exterior) 
 {
  switch(Opt) {
@@ -277,8 +275,8 @@ void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double
 };

 template <class Impl>
-void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,  DoubledGaugeField &UUU, SiteSpinor *buf, int sF,
-				      int sU, const FermionField &in, FermionField &out, int dir, int disp) 
+void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
+					    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
 {
  // Disp should be either +1,-1,+3,-3
  // What about "dag" ?
@@ -287,8 +285,6 @@ void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,  Do
  assert(0);
 }

-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+NAMESPACE_END(Grid);

-}}

--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -26,23 +26,21 @@
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
-//#include <Grid/Eigen/Dense>
-#include <Grid/qcd/spin/Dirac.h>

-namespace Grid
-{
-namespace QCD
-{
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
+
+NAMESPACE_BEGIN(Grid);

 // *NOT* EO
 template <class Impl>
 RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
-  FermionField temp(out._grid);
+  FermionField temp(out.Grid());

  // Wilson term
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  this->Dhop(in, out, DaggerNo);

  // Clover term
@@ -55,10 +53,10 @@ RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 template <class Impl>
 RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
-  FermionField temp(out._grid);
+  FermionField temp(out.Grid());

  // Wilson term
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  this->Dhop(in, out, DaggerYes);

  // Clover term
@@ -72,7 +70,7 @@ template <class Impl>
 void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu._grid;
+  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);

  // Compute the field strength terms mu>nu
@@ -93,27 +91,29 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  CloverTerm += fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;

-  int lvol = _Umu._grid->lSites();
+  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;

  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);

-  std::vector<int> lcoor;
-  typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
+  Coordinate lcoor;
+  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();

  for (int site = 0; site < lvol; site++)
  {
    grid->LocalIndexToLocalCoor(site, lcoor);
    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = zero;
+    Qxinv = Zero();
    //if (csw!=0){
    for (int j = 0; j < Ns; j++)
      for (int k = 0; k < Ns; k++)
        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
+          for (int b = 0; b < DimRep; b++){
+	    auto zz =  Qx()(j, k)(a, b);
+            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
+	  }
    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;

    EigenInvCloverOp = EigenCloverOp.inverse();
@@ -169,15 +169,15 @@ void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField
 template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  CloverFieldType *Clover;
-  assert(in.checkerboard == Odd || in.checkerboard == Even);
+  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);

  if (dag)
  {
-    if (in._grid->_isCheckerBoarded)
+    if (in.Grid()->_isCheckerBoarded)
    {
-      if (in.checkerboard == Odd)
+      if (in.Checkerboard() == Odd)
      {
        Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
      }
@@ -195,10 +195,10 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
  }
  else
  {
-    if (in._grid->_isCheckerBoarded)
+    if (in.Grid()->_isCheckerBoarded)
    {

-      if (in.checkerboard == Odd)
+      if (in.Checkerboard() == Odd)
      {
        //  std::cout << "Calling clover term Odd" << std::endl;
        Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
@@ -209,7 +209,7 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
      out = *Clover * in;
-      //  std::cout << GridLogMessage << "*Clover.checkerboard "  << (*Clover).checkerboard << std::endl;
+      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
@@ -235,9 +235,4 @@ void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U,
  assert(0); // not implemented yet
 }

-FermOpTemplateInstantiate(WilsonCloverFermion);
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -36,13 +36,8 @@ Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>
 #include <Grid/perfmon/PerfCount.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);
  
-// S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
-const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
-
  // 5d lattice for DWF.
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
@@ -56,9 +51,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p),
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
@@ -105,8 +100,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

    for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]=1);
-      assert(FourDimRedBlackGrid._simd_layout[d]=1);
+      assert(FourDimGrid._simd_layout[d]==1);
+      assert(FourDimRedBlackGrid._simd_layout[d]==1);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
    }

@@ -141,7 +136,7 @@ void WilsonFermion5D<Impl>::Report(void)
  RealD NP     = _FourDimGrid->_Nprocessors;
  RealD NN     = _FourDimGrid->NodeCount();
  RealD volume = Ls;  
-  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+  Coordinate latt = _FourDimGrid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];

  if ( DhopCalls > 0 ) {
@@ -221,7 +216,7 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
-  GaugeField HUmu(_Umu._grid);
+  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
@@ -235,51 +230,43 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  //  assert( (disp==1)||(disp==-1) );
  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;

-  Compressor compressor(DaggerNo);
-  Stencil.HaloExchange(in,compressor);
-  
  int skip = (disp==1) ? 0 : 1;
-
  int dirdisp = dir+skip*4;
  int gamma   = dir+(1-skip)*4;

-  assert(dirdisp<=7);
-  assert(dirdisp>=0);
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+  
+  uint64_t Nsite = Umu.Grid()->oSites();
+  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);

-  parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
-    for(int s=0;s<Ls;s++){
-      int sU=ss;
-      int sF = s+Ls*sU; 
-      Kernels::DhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
-    }
-  }
 };

 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
-            DoubledGaugeField & U,
-            GaugeField &mat,
-            const FermionField &A,
-            const FermionField &B,
-            int dag)
+					  DoubledGaugeField & U,
+					  GaugeField &mat,
+					  const FermionField &A,
+					  const FermionField &B,
+					  int dag)
 {
  DerivCalls++;
  assert((dag==DaggerNo) ||(dag==DaggerYes));

-  conformable(st._grid,A._grid);
-  conformable(st._grid,B._grid);
+  conformable(st.Grid(),A.Grid());
+  conformable(st.Grid(),B.Grid());

  Compressor compressor(dag);
  
-  FermionField Btilde(B._grid);
-  FermionField Atilde(B._grid);
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());

  DerivCommTime-=usecond();
  st.HaloExchange(B,compressor);
  DerivCommTime+=usecond();

  Atilde=A;
-  int LLs = B._grid->_rdimensions[0];
+  int LLs = B.Grid()->_rdimensions[0];


  DerivComputeTime-=usecond();
@@ -295,21 +282,11 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    ////////////////////////

    DerivDhopComputeTime -= usecond();
-    parallel_for (int sss = 0; sss < U._grid->oSites(); sss++) {
-      for (int s = 0; s < Ls; s++) {
-        int sU = sss;
-        int sF = s + Ls * sU;

-        assert(sF < B._grid->oSites());
-        assert(sU < U._grid->oSites());
+    int Usites = U.Grid()->oSites();

-        Kernels::DhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
+    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);

-        ////////////////////////////
-        // spin trace outer product
-        ////////////////////////////
-      }
-    }
    ////////////////////////////
    // spin trace outer product
    ////////////////////////////
@@ -325,12 +302,13 @@ void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
                                      const FermionField &B,
                                      int dag)
 {
-  conformable(A._grid,FermionGrid());  
-  conformable(A._grid,B._grid);
+  conformable(A.Grid(),FermionGrid());  
+  conformable(A.Grid(),B.Grid());

-  //conformable(GaugeGrid(),mat._grid);// this is not general! leaving as a comment
+  //conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment

-  mat.checkerboard = A.checkerboard;
+  mat.Checkerboard() = A.Checkerboard();
+  //  mat.checkerboard = A.checkerboard;

  DerivInternal(Stencil,Umu,mat,A,B,dag);
 }
@@ -341,12 +319,12 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
                                        const FermionField &B,
                                        int dag)
 {
-  conformable(A._grid,FermionRedBlackGrid());
-  conformable(A._grid,B._grid);
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());

-  assert(B.checkerboard==Odd);
-  assert(A.checkerboard==Even);
-  mat.checkerboard = Even;
+  assert(B.Checkerboard()==Odd);
+  assert(A.Checkerboard()==Even);
+  mat.Checkerboard() = Even;

  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
 }
@@ -358,12 +336,12 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
                                        const FermionField &B,
                                        int dag)
 {
-  conformable(A._grid,FermionRedBlackGrid());
-  conformable(A._grid,B._grid);
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());

-  assert(B.checkerboard==Even);
-  assert(A.checkerboard==Odd);
-  mat.checkerboard = Odd;
+  assert(B.Checkerboard()==Even);
+  assert(A.Checkerboard()==Odd);
+  mat.Checkerboard() = Odd;

  DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
 }
@@ -374,11 +352,9 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
-#endif
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
@@ -389,131 +365,84 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
-#ifdef GRID_OMP
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-
  Compressor compressor(dag);

-  int LLs = in._grid->_rdimensions[0];
-  int len =  U._grid->oSites();
+  int LLs = in.Grid()->_rdimensions[0];
+  int len =  U.Grid()->oSites();

+  /////////////////////////////
+  // Start comms. Open question: should intranode and extra-node gathers be differentiated?
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  double ctime=0;
-  double ptime=0;
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);

-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  { 
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid = tid - ncomms;
-      int n = U._grid->oSites();
-      int chunk = n / nthreads;
-      int rem = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-	myblock = ttid * chunk + ttid;
-	myn = chunk+1;
-      } else {
-	myblock = ttid*chunk + rem;
-	myn = chunk;
-      }
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
      
-      // do the compute
-      if (dag == DaggerYes) {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
-	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
-	}
-      } else {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
-	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
-	}
-      }
-	ptime = usecond() - start;
-    } else {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
+  DhopComputeTime-=usecond();
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
+  DhopComputeTime+=usecond();

-  // First to enter, last to leave timing
-  st.CollateThreads();
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  DhopCommTime   +=usecond();

+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
-    }
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
-#else 
-  assert(0);
-#endif
 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-					 DoubledGaugeField & U,
-					 const FermionField &in, FermionField &out,int dag)
+						    DoubledGaugeField & U, // gauge field; U.oSites() is the site count given to the kernel
+						    const FermionField &in, // source field: halo-exchanged first, then read by the kernel
+						    FermionField &out,int dag) // dag==DaggerYes selects DhopDagKernel, otherwise DhopKernel
 {
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
   Compressor compressor(dag);
 
-  int LLs = in._grid->_rdimensions[0];
+  int LLs = in.Grid()->_rdimensions[0]; // extent of dimension 0 of in's grid, forwarded to the kernel
   
   DhopCommTime-=usecond();
   st.HaloExchangeOpt(in,compressor);
   DhopCommTime+=usecond();
   
   DhopComputeTime-=usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-
+  int Opt = WilsonKernelsStatic::Opt; // static option value, passed as the kernel's first argument
   if (dag == DaggerYes) {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU = ss;
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); // timed under DhopComputeTime; the exchange above is timed under DhopCommTime
   } else {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU = ss;
-      int sF = LLs * sU;
-      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); // same arguments as the dagger branch, non-dagger kernel
   }
   DhopComputeTime+=usecond();
 }
@@ -523,11 +452,11 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
   DhopCalls++;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies in is on the red-black (half) fermion grid
+  conformable(in.Grid(),out.Grid()); // compares grids only, so no checkerboard check here
 
-  assert(in.checkerboard==Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard()==Even); // the source must be tagged Even
+  out.Checkerboard() = Odd; // the result is tagged Odd before DhopInternal runs
 
   DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 }
@@ -535,11 +464,11 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
   DhopCalls++;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies in is on the red-black (half) fermion grid
+  conformable(in.Grid(),out.Grid()); // compares grids only, so no checkerboard check here
 
-  assert(in.checkerboard==Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard()==Odd); // the source must be tagged Odd
+  out.Checkerboard() = Even; // the result is tagged Even before DhopInternal runs
 
   DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 }
@@ -547,17 +476,17 @@ template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
   DhopCalls+=2;
-  conformable(in._grid,FermionGrid()); // verifies full grid
-  conformable(in._grid,out._grid);
+  conformable(in.Grid(),FermionGrid()); // verifies in is on the full fermion grid
+  conformable(in.Grid(),out.Grid()); // in and out must share a grid
 
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard(); // the result carries the same checkerboard tag as the source
 
   DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
 {
-  out.checkerboard=in.checkerboard;
+  out.Checkerboard()=in.Checkerboard(); // tag out like in, then apply Dhop and axpy with coefficient 4.0-M5
   Dhop(in,out,dag); // -0.5 is included
   axpy(out,4.0-M5,in,out);
 }
@@ -569,7 +498,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  GridBase *_grid = _FourDimGrid;
  GridBase *_5dgrid = _FiveDimGrid;

-  conformable(_5dgrid,out._grid);
+  conformable(_5dgrid,out.Grid());

  FermionField   PRsource(_5dgrid);
  FermionField   PLsource(_5dgrid);
@@ -580,7 +509,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  FermionField   bufL_4d(_grid);
  FermionField   bufR_4d(_grid);

-  unsigned int Ls = in._grid->_rdimensions[0];
+  unsigned int Ls = in.Grid()->_rdimensions[0];
  
  typedef typename FermionField::vector_type vector_type;
  typedef typename FermionField::scalar_type ScalComplex;
@@ -596,12 +525,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const

  Gamma g5(Gamma::Algebra::Gamma5);

-  std::vector<int> latt_size   = _grid->_fdimensions;
+  Coordinate latt_size   = _grid->_fdimensions;

-  LatComplex    sk(_grid);  sk = zero;
-  LatComplex    sk2(_grid); sk2= zero;
-  LatComplex    W(_grid); W= zero;
-  LatComplex    a(_grid); a= zero;
+  LatComplex    sk(_grid);  sk = Zero();
+  LatComplex    sk2(_grid); sk2= Zero();
+  LatComplex    W(_grid); W= Zero();
+  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
  LatComplex 	cosha(_grid);
  LatComplex 	kmu(_grid);
@@ -643,9 +572,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const

  // FIXME Need a Lattice acosh
  for(int idx=0;idx<_grid->lSites();idx++){
-    std::vector<int> lcoor(Nd);
+    Coordinate lcoor(Nd);
    Tcomplex cc;
-    RealD sgn;
+    //    RealD sgn;
    _grid->LocalIndexToLocalCoor(idx,lcoor);
    peekLocalSite(cc,cosha,lcoor);
    assert((double)real(cc)>=1.0);
@@ -678,8 +607,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  //calculate GR, GL
  for(unsigned int ss=1;ss<=Ls;ss++)
  {
-    bufR_4d = zero;
-    bufL_4d = zero;
+    bufR_4d = Zero();
+    bufL_4d = Zero();
    for(unsigned int tt=1;tt<=Ls;tt++)
    {
      //possible sign if W<0
@@ -688,7 +617,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const

      unsigned int f = (ss > tt) ? ss-tt : tt-ss; //f = abs(ss-tt)
      //GR
-      buf1_4d = zero;
+      buf1_4d = Zero();
      ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
      //G(s,t)
      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
@@ -702,7 +631,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;

      //GL
-      buf2_4d = zero;
+      buf2_4d = Zero();
      ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
      //G(s,t)
      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
@@ -722,13 +651,13 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
 //calculate propagator
  for(unsigned int ss=1;ss<=Ls;ss++)
  {
-    bufR_4d = zero;
-    bufL_4d = zero;
+    bufR_4d = Zero();
+    bufL_4d = Zero();

    //(i*gamma_mu*sin(p_mu) - W)*(GL*P- source)
-    buf1_4d = zero;
+    buf1_4d = Zero();
    ExtractSlice(buf1_4d, GL, (ss-1), 0);
-    buf2_4d = zero;
+    buf2_4d = Zero();
    for(int mu=0;mu<Nd;mu++) {
      LatticeCoordinate(kmu,mu);
      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
@@ -738,9 +667,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
    bufL_4d = buf2_4d - W * buf1_4d;

    //(i*gamma_mu*sin(p_mu) - W)*(GR*P+ source)
-    buf1_4d = zero;
+    buf1_4d = Zero();
    ExtractSlice(buf1_4d, GR, (ss-1), 0);
-    buf2_4d = zero;
+    buf2_4d = Zero();
    for(int mu=0;mu<Nd;mu++) {
      LatticeCoordinate(kmu,mu);
      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
@@ -781,7 +710,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
 {
  // what type LatticeComplex 
  GridBase *_grid = _FourDimGrid;
-  conformable(_grid,out._grid);
+  conformable(_grid,out.Grid());
  
  typedef typename FermionField::vector_type vector_type;
  typedef typename FermionField::scalar_type ScalComplex;
@@ -795,17 +724,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
    Gamma::Algebra::GammaT
  };

-  std::vector<int> latt_size   = _grid->_fdimensions;
+  Coordinate latt_size   = _grid->_fdimensions;

  
-  FermionField   num  (_grid); num  = zero;
+  FermionField   num  (_grid); num  = Zero();

-  LatComplex    sk(_grid);  sk = zero;
-  LatComplex    sk2(_grid); sk2= zero;
-  LatComplex    W(_grid); W= zero;
-  LatComplex    a(_grid); a= zero;
+  LatComplex    sk(_grid);  sk = Zero();
+  LatComplex    sk2(_grid); sk2= Zero();
+  LatComplex    W(_grid); W= Zero();
+  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
-  LatComplex denom(_grid); denom= zero;
+  LatComplex denom(_grid); denom= Zero();
  LatComplex cosha(_grid); 
  LatComplex kmu(_grid); 
  LatComplex Wea(_grid); 
@@ -838,9 +767,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe

  // FIXME Need a Lattice acosh
  for(int idx=0;idx<_grid->lSites();idx++){
-    std::vector<int> lcoor(Nd);
+    Coordinate lcoor(Nd);
    Tcomplex cc;
-    RealD sgn;
+    //    RealD sgn;
    _grid->LocalIndexToLocalCoor(idx,lcoor);
    peekLocalSite(cc,cosha,lcoor);
    assert((double)real(cc)>=1.0);
@@ -868,7 +797,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    };

    GridBase *_grid = _FourDimGrid;
-    conformable(_grid,out._grid);
+    conformable(_grid,out.Grid());

    typedef typename FermionField::vector_type vector_type;
    typedef typename FermionField::scalar_type ScalComplex;
@@ -876,18 +805,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    typedef Lattice<iSinglet<vector_type> > LatComplex;


-    std::vector<int> latt_size   = _grid->_fdimensions;
+    Coordinate latt_size   = _grid->_fdimensions;

-    LatComplex    sk(_grid);  sk = zero;
-    LatComplex    sk2(_grid); sk2= zero;
+    LatComplex    sk(_grid);  sk = Zero();
+    LatComplex    sk2(_grid); sk2= Zero();

-    LatComplex    w_k(_grid); w_k= zero;
-    LatComplex    b_k(_grid); b_k= zero;
+    LatComplex    w_k(_grid); w_k= Zero();
+    LatComplex    b_k(_grid); b_k= Zero();

    LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);

-    FermionField   num  (_grid); num  = zero;
-    LatComplex denom(_grid); denom= zero;
+    FermionField   num  (_grid); num  = Zero();
+    LatComplex denom(_grid); denom= Zero();
    LatComplex kmu(_grid); 
    ScalComplex ci(0.0,1.0);

@@ -928,7 +857,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
 // Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
 #define REVERSE_LS(qSite, qSiteRev, Nsimd) \
 { \
-    std::vector<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
+    ExtractBuffer<typename SitePropagator::scalar_object> qSiteVec(Nsimd);	\
    extract(qSite, qSiteVec); \
    for (int i = 0; i < Nsimd / 2; ++i) \
    { \
@@ -946,31 +875,35 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
 template<class vobj> 
 Lattice<vobj> spProj5p(const Lattice<vobj> & in)
 {
-  GridBase *grid=in._grid;
+  GridBase *grid=in.Grid();
  Gamma G5(Gamma::Algebra::Gamma5);
  Lattice<vobj> ret(grid);
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    ret._odata[ss] = in._odata[ss] + G5*in._odata[ss];
-  }
+  auto ret_v = ret.View();
+  auto in_v  =  in.View();
+  thread_for(ss,grid->oSites(),{
+    ret_v[ss] = in_v[ss] + G5*in_v[ss];
+  });
  return ret;
 }
 template<class vobj> 
 Lattice<vobj> spProj5m(const Lattice<vobj> & in)
 {
  Gamma G5(Gamma::Algebra::Gamma5);
-  GridBase *grid=in._grid;
+  GridBase *grid=in.Grid();
  Lattice<vobj> ret(grid);
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    ret._odata[ss] = in._odata[ss] - G5*in._odata[ss];
-  }
+  auto ret_v = ret.View();
+  auto in_v  =  in.View();
+  thread_for(ss,grid->oSites(),{
+    ret_v[ss] = in_v[ss] - G5*in_v[ss];
+  });
  return ret;
 }

 template <class Impl>
 void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
 {
-  conformable(GaugeGrid(), J5q._grid);
-  conformable(q_in._grid, FermionGrid());
+  conformable(GaugeGrid(), J5q.Grid());
+  conformable(q_in.Grid(), FermionGrid());

  // 4d field
  int Ls = this->Ls;
@@ -990,8 +923,8 @@ void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
 template <class Impl>
 void WilsonFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
 {
-  conformable(GaugeGrid(), J5q._grid);
-  conformable(q_in._grid, FermionGrid());
+  conformable(GaugeGrid(), J5q.Grid());
+  conformable(q_in.Grid(), FermionGrid());

  // 4d field
  int Ls = this->Ls;
@@ -1015,20 +948,26 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                     Current curr_type,
                                                     unsigned int mu)
 {
-    conformable(q_in_1._grid, FermionGrid());
-    conformable(q_in_1._grid, q_in_2._grid);
-    conformable(_FourDimGrid, q_out._grid);
+    conformable(q_in_1.Grid(), FermionGrid());
+    conformable(q_in_1.Grid(), q_in_2.Grid());
+    conformable(_FourDimGrid, q_out.Grid());

    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
-    unsigned int LLs = q_in_1._grid->_rdimensions[0];
-    q_out = zero;
+    unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
+    q_out = Zero();

    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
    tmp1 = Cshift(q_in_1, mu + 1, 1);
    tmp2 = Cshift(q_in_2, mu + 1, 1);
-    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    {
+    auto q_in_1_v = q_in_1.View();
+    auto q_in_2_v = q_in_2.View();
+    auto tmp1_v   = tmp1.View();
+    auto tmp2_v   = tmp2.View();
+    auto q_out_v  = q_out.View();
+    auto Umu_v    = Umu.View();
+    thread_for(sU, Umu.Grid()->oSites(),{
+
        unsigned int sF1 = sU * LLs;
        unsigned int sF2 = (sU + 1) * LLs - 1;

@@ -1042,26 +981,26 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
            // sites correctly.
            if (Impl::LsVectorised)
            {
-                REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
-                REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
+                REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
+                REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
            }
            else
            {
-                qSite2   = q_in_2._odata[sF2];
-                qmuSite2 = tmp2._odata[sF2];
+                qSite2   = q_in_2_v[sF2];
+                qmuSite2 = tmp2_v[sF2];
            }
-            Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1], 
+            Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1], 
                                                     qSite2, 
-                                                     q_out._odata[sU],
-                                                     Umu, sU, mu, axial_sign);
-            Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
+                                                     q_out_v[sU],
+                                                     Umu_v, sU, mu, axial_sign);
+            Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
                                                     qmuSite2,
-                                                     q_out._odata[sU],
-                                                     Umu, sU, mu, axial_sign);
+                                                     q_out_v[sU],
+                                                     Umu_v, sU, mu, axial_sign);
            sF1++;
            sF2--;
        }
-    }
+    });
 }


@@ -1074,18 +1013,21 @@ void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                unsigned int tmax,
 						ComplexField &lattice_cmplx)
 {
-    conformable(q_in._grid, FermionGrid());
-    conformable(q_in._grid, q_out._grid);
+    conformable(q_in.Grid(), FermionGrid());
+    conformable(q_in.Grid(), q_out.Grid());
    PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
    unsigned int tshift = (mu == Tp) ? 1 : 0;
-    unsigned int LLs = q_in._grid->_rdimensions[0];
+    unsigned int LLs = q_in.Grid()->_rdimensions[0];
    unsigned int LLt    = GridDefaultLatt()[Tp];

-    q_out = zero;
+    q_out = Zero();
    LatticeInteger coords(_FourDimGrid);
    LatticeCoordinate(coords, Tp);
-
-
+    
+    auto q_out_v = q_out.View();
+    auto tmp2_v  = tmp2.View();
+    auto coords_v= coords.View();
+    auto Umu_v   = Umu.View();
    for (unsigned int s = 0; s < LLs; ++s)
    {
        bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
@@ -1098,59 +1040,51 @@ void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
        tmp = Cshift(tmp2, mu, 1);	 //q(x+mu,s)
        tmp2 = tmp*lattice_cmplx;	 //q(x+mu,s)*A(x)	

-    	parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    	{
+    	thread_for(sU, Umu.Grid()->oSites(),{
            // Compute the sequential conserved current insertion only if our simd
            // object contains a timeslice we need.
-            vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
-                	         (coords._odata[sU] <= tmax));
-            Integer timeSlices = Reduce(t_mask);
+            vPredicate t_mask;
+	    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
+            Integer timeSlices = Reduce(t_mask());

            if (timeSlices > 0)
            {
 		unsigned int sF = sU * LLs + s;
-                Kernels::SeqConservedCurrentSiteFwd(tmp2._odata[sU], 
-                                              q_out._odata[sF], Umu, sU,
-                                              mu, t_mask, switch_sgn);
+                Kernels::SeqConservedCurrentSiteFwd(tmp2_v[sU], 
+						    q_out_v[sF], Umu_v, sU,
+						    mu, t_mask, switch_sgn);
            }

-        }
+        });

        //backward direction: Need q(x - mu, s)*A(x-mu)
        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s)
        tmp = lattice_cmplx*tmp2;	 //q(x,s)*A(x)
        tmp2 = Cshift(tmp, mu, -1);	 //q(x-mu,s)*A(x-mu,s)

-    	parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    	thread_for(sU, Umu.Grid()->oSites(),
    	{
-            vInteger  t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
-                   	  	    (coords._odata[sU] <= (tmax + tshift)));
+	  vPredicate t_mask;
+	  t_mask()= ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift)));

-	    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-	    unsigned int t0 = 0;
-	    if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
-
-            Integer timeSlices = Reduce(t_mask);
-
-            if (timeSlices > 0)
-            {
-		unsigned int sF = sU * LLs + s; 
-        	Kernels::SeqConservedCurrentSiteBwd(tmp2._odata[sU], 
-                                             q_out._odata[sF], Umu, sU,
-                                             mu, t_mask, axial_sign);
-            }
-	}
+	  //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
+	  unsigned int t0 = 0;
+	  if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
+	  
+	  Integer timeSlices = Reduce(t_mask());
+	  
+	  if (timeSlices > 0) {
+	    unsigned int sF = sU * LLs + s; 
+	    Kernels::SeqConservedCurrentSiteBwd(tmp2_v[sU], 
+						q_out_v[sF], Umu_v, sU,
+						mu, t_mask, axial_sign);
+	  }
+	});
    }
 }
-
-
-
-
-
-FermOpTemplateInstantiate(WilsonFermion5D);
-GparityFermOpTemplateInstantiate(WilsonFermion5D);
  
-}}
+NAMESPACE_END(Grid);
+



--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -1,4 +1,3 @@
-
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid
@@ -29,16 +28,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>

-namespace Grid {
-namespace QCD {
-
-const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
-int WilsonFermionStatic::HandOptDslash;
+NAMESPACE_BEGIN(Grid);

 /////////////////////////////////
 // Constructor and gauge import
@@ -49,18 +43,19 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                                   GridRedBlackCartesian &Hgrid, RealD _mass,
                                   const ImplParams &p,
                                   const WilsonAnisotropyCoefficients &anis)
-    : Kernels(p),
-      _grid(&Fgrid),
-      _cbgrid(&Hgrid),
-      Stencil(&Fgrid, npoint, Even, directions, displacements),
-      StencilEven(&Hgrid, npoint, Even, directions,displacements),  // source is Even
-      StencilOdd(&Hgrid, npoint, Odd, directions,displacements),  // source is Odd
-      mass(_mass),
-      Lebesgue(_grid),
-      LebesgueEvenOdd(_cbgrid),
-      Umu(&Fgrid),
-      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid),
+  : 
+    Kernels(p),
+    _grid(&Fgrid),
+    _cbgrid(&Hgrid),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
+    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
+    mass(_mass),
+    Lebesgue(_grid),
+    LebesgueEvenOdd(_cbgrid),
+    Umu(&Fgrid),
+    UmuEven(&Hgrid),
+    UmuOdd(&Hgrid),
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
@@ -76,8 +71,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 }

 template <class Impl>
-void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
-  GaugeField HUmu(_Umu._grid);
+void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) 
+{
+  GaugeField HUmu(_Umu.Grid());

  //Here multiply the anisotropy coefficients
  if (anisotropyCoeff.isAnisotropic)
@@ -107,21 +103,21 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {

 template <class Impl>
 RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, diag_mass, in, out);
 }

 template <class Impl>
 RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, diag_mass, in, out);
 }

 template <class Impl>
 void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
@@ -130,7 +126,7 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {

 template <class Impl>
 void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
@@ -139,26 +135,26 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
  
 template <class Impl>
 void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(diag_mass);
  out = scal * in;
 }

 template <class Impl>
 void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }

 template<class Impl>
 void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  out = (1.0/(diag_mass))*in;
 }
  
 template<class Impl>
 void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in,out);
 }
 template<class Impl>
@@ -169,7 +165,7 @@ void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const Fermi
  typedef Lattice<iSinglet<vector_type> > LatComplex;
  
  // what type LatticeComplex 
-  conformable(_grid,out._grid);
+  conformable(_grid,out.Grid());
  
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@@ -178,13 +174,13 @@ void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const Fermi
    Gamma::Algebra::GammaT
  };
  
-  std::vector<int> latt_size   = _grid->_fdimensions;
+  Coordinate latt_size   = _grid->_fdimensions;
  
-  FermionField   num  (_grid); num  = zero;
-  LatComplex    wilson(_grid); wilson= zero;
+  FermionField   num  (_grid); num  = Zero();
+  LatComplex    wilson(_grid); wilson= Zero();
  LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);
  
-  LatComplex denom(_grid); denom= zero;
+  LatComplex denom(_grid); denom= Zero();
  LatComplex kmu(_grid); 
  ScalComplex ci(0.0,1.0);
  // momphase = n * 2pi / L
@@ -229,9 +225,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,

  Compressor compressor(dag);

-  FermionField Btilde(B._grid);
-  FermionField Atilde(B._grid);
-  Atilde = A;//redundant
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());
+  Atilde = A;

  st.HaloExchange(B, compressor);

@@ -242,12 +238,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;

-    ////////////////////////
-    // Call the single hop
-    ////////////////////////
-    parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
-      Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma);
-    }
+    int Ls=1;
+    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);

    //////////////////////////////////////////////////
    // spin trace outer product
@@ -258,70 +250,70 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,

 template <class Impl>
 void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U._grid, _grid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _grid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  mat.checkerboard = U.checkerboard;
+  mat.Checkerboard() = U.Checkerboard();

  DerivInternal(Stencil, Umu, mat, U, V, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  //conformable(U._grid, mat._grid); not general, leaving as a comment (Guido)
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
  // Motivation: look at the SchurDiff operator
  
-  assert(V.checkerboard == Even);
-  assert(U.checkerboard == Odd);
-  mat.checkerboard = Odd;
+  assert(V.Checkerboard() == Even);
+  assert(U.Checkerboard() == Odd);
+  mat.Checkerboard() = Odd;

  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  //conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  //conformable(U.Grid(), mat.Grid());

-  assert(V.checkerboard == Odd);
-  assert(U.checkerboard == Even);
-  mat.checkerboard = Even;
+  assert(V.Checkerboard() == Odd);
+  assert(U.Checkerboard() == Even);
+  mat.Checkerboard() = Even;

  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
-  conformable(in._grid, _grid);  // verifies full grid
-  conformable(in._grid, out._grid);
+  conformable(in.Grid(), _grid);  // verifies full grid
+  conformable(in.Grid(), out.Grid());

-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard() == Even);
+  out.Checkerboard() = Odd;

  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard() == Odd);
+  out.Checkerboard() = Even;

  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
 }
@@ -332,7 +324,8 @@ void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int di
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  int skip = (disp == 1) ? 0 : 1;
  int dirdisp = dir + skip * 4;
  int gamma = dir + (1 - skip) * 4;
@@ -341,16 +334,16 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
 };

 template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) {
+void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+{
  Compressor compressor(dag);

  Stencil.HaloExchange(in, compressor);
+  int Ls=1;
+  int Nsite=in.oSites();
+  Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma);
+};

-  parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-    Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
-  }
-} 
-/*Change starts*/
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
@@ -367,71 +360,51 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,

 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-                                       DoubledGaugeField &U,
-                                       const FermionField &in,
-                                       FermionField &out, int dag) {
+						      DoubledGaugeField &U,
+						      const FermionField &in,
+						      FermionField &out, int dag) {
  assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
-  Compressor compressor;
-  int len =  U._grid->oSites();
-  const int LLs =  1;
-
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-     if (dag == DaggerYes) {
-
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-         Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-       }
-     } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-         Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-       }
-    } //else
-
-    } else {
-      st.CommunicateThreaded();
-    }

  Compressor compressor(dag);
+  int len =  U.Grid()->oSites();

+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommunicateBegin(requests);
+
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  st.CommsMergeSHM(compressor);
+
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
-  }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } 

-  }  //pragma
-#else
-  assert(0);
-#endif
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  st.CommsMerge(compressor);
+
+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
+  }
 };


@@ -444,14 +417,11 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
  Compressor compressor(dag);
  st.HaloExchange(in, compressor);

+  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
 };
 /*Change ends */
@@ -468,28 +438,33 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
-    Gamma g5(Gamma::Algebra::Gamma5);
-    conformable(_grid, q_in_1._grid);
-    conformable(_grid, q_in_2._grid);
-    conformable(_grid, q_out._grid);
-    PropagatorField tmp1(_grid), tmp2(_grid);
-    q_out = zero;
+  Gamma g5(Gamma::Algebra::Gamma5);
+  conformable(_grid, q_in_1.Grid());
+  conformable(_grid, q_in_2.Grid());
+  conformable(_grid, q_out.Grid());
+  PropagatorField tmp1(_grid), tmp2(_grid);
+  q_out = Zero();

-    // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
-    // Inefficient comms method but not performance critical.
-    tmp1 = Cshift(q_in_1, mu, 1);
-    tmp2 = Cshift(q_in_2, mu, 1);
-    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    {
-        Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
-                                                 q_in_2._odata[sU],
-                                                 q_out._odata[sU],
-                                                 Umu, sU, mu);
-        Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
-                                                 tmp2._odata[sU],
-                                                 q_out._odata[sU],
-                                                 Umu, sU, mu);
-    }
+  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
+  // Inefficient comms method but not performance critical.
+  tmp1 = Cshift(q_in_1, mu, 1);
+  tmp2 = Cshift(q_in_2, mu, 1);
+  auto tmp1_v  =  tmp1.View();
+  auto tmp2_v  =  tmp2.View();
+  auto q_in_1_v=q_in_1.View();
+  auto q_in_2_v=q_in_2.View();
+  auto q_out_v = q_out.View();
+  auto Umu_v   =   Umu.View();
+  thread_for(sU, Umu.Grid()->oSites(),{
+      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
+					       q_in_2_v[sU],
+					       q_out_v[sU],
+					       Umu_v, sU, mu);
+      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
+					       tmp2_v[sU],
+					       q_out_v[sU],
+					       Umu_v, sU, mu);
+  });
 }


@@ -502,61 +477,61 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-    conformable(_grid, q_in._grid);
-    conformable(_grid, q_out._grid);
-    PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
-    unsigned int tshift = (mu == Tp) ? 1 : 0;
-    unsigned int LLt    = GridDefaultLatt()[Tp];
+  conformable(_grid, q_in.Grid());
+  conformable(_grid, q_out.Grid());

-    q_out = zero;
-    LatticeInteger coords(_grid);
-    LatticeCoordinate(coords, Tp);
+  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
+  Complex i(0.0,1.0);
+  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
+  unsigned int tshift = (mu == Tp) ? 1 : 0;
+  unsigned int LLt    = GridDefaultLatt()[Tp];

-    // Need q(x + mu) and q(x - mu).
-    tmp = Cshift(q_in, mu, 1);
-    tmpFwd = tmp*lattice_cmplx;
-    tmp = lattice_cmplx*q_in;
-    tmpBwd = Cshift(tmp, mu, -1);
+  q_out = Zero();
+  LatticeInteger coords(_grid);
+  LatticeCoordinate(coords, Tp);

-    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    {
-        // Compute the sequential conserved current insertion only if our simd
-        // object contains a timeslice we need.
-        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
-                             (coords._odata[sU] <= tmax));
-        Integer timeSlices = Reduce(t_mask);
+  // Need q(x + mu) and q(x - mu).
+  tmp    = Cshift(q_in, mu, 1);
+  tmpFwd = tmp*lattice_cmplx;
+  tmp    = lattice_cmplx*q_in;
+  tmpBwd = Cshift(tmp, mu, -1);

-        if (timeSlices > 0)
-        {
-            Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU], 
-                                                q_out._odata[sU], 
-                                                Umu, sU, mu, t_mask);
-        }
+  auto coords_v = coords.View();
+  auto tmpFwd_v = tmpFwd.View();
+  auto tmpBwd_v = tmpBwd.View();
+  auto Umu_v    = Umu.View();
+  auto q_out_v  = q_out.View();

-        // Repeat for backward direction.
-        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
-                      (coords._odata[sU] <= (tmax + tshift)));
+  thread_for(sU, Umu.Grid()->oSites(), {

-	//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-	unsigned int t0 = 0;
-	if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+    // Compute the sequential conserved current insertion only if our simd
+    // object contains a timeslice we need.
+    vPredicate t_mask;
+    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
+    Integer timeSlices = Reduce(t_mask());

-        timeSlices = Reduce(t_mask);
-
-        if (timeSlices > 0)
-        {
-            Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU], 
-                                                q_out._odata[sU], 
-                                                Umu, sU, mu, t_mask);
-        }
+    if (timeSlices > 0) {
+      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
+					  q_out_v[sU], 
+					  Umu_v, sU, mu, t_mask);
    }

+    // Repeat for backward direction.
+    t_mask()     = ((coords_v[sU] >= (tmin + tshift)) && 
+		    (coords_v[sU] <= (tmax + tshift)));
+    
+    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
+    unsigned int t0 = 0;
+    if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
+    
+    timeSlices = Reduce(t_mask());

+    if (timeSlices > 0) {
+      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
+					  q_out_v[sU], 
+					  Umu_v, sU, mu, t_mask);
+    }
+  });
 }

-FermOpTemplateInstantiate(WilsonFermion);
-AdjointFermOpTemplateInstantiate(WilsonFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonFermion);
-GparityFermOpTemplateInstantiate(WilsonFermion);
-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
@@ -0,0 +1,716 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#if defined(AVX512) 
+    ///////////////////////////////////////////////////////////
+    // If we are AVX512 specialise the single precision routine
+    ///////////////////////////////////////////////////////////
+#include <simd/Intel512wilson.h>
+#include <simd/Intel512single.h>
+
+/// Switch off the 5d vectorised code optimisations
+#undef DWFVEC5D
+
+static Vector<vComplexF> signsF;
+
+  template<typename vtype>    // Helper that sizes and fills a sign table; vtype is the SIMD complex type (vComplexF / vComplexD in this file).
+  int setupSigns(Vector<vtype>& signs ){ // signs: output table, overwritten with two entries. Always returns 1.
+    Vector<vtype> bother(2); // two-element temporary used only to give `signs` its size
+    signs = bother; // after this assignment signs holds two elements
+    vrsign(signs[0]); // entry 0 is set by vrsign (defined elsewhere; presumably the real-part sign pattern -- confirm)
+    visign(signs[1]); // entry 1 is set by visign (defined elsewhere; presumably the imaginary-part sign pattern -- confirm)
+    return 1; // the int result lets callers run this from a static initialiser (e.g. `static int signInitF = setupSigns(signsF);`)
+  }
+
+  static int signInitF = setupSigns(signsF);
+
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];  
+  
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+      
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#endif  // VEC 5D
+
+#undef COMPLEX_SIGNS
+#undef MAYBEPERM
+#undef MULT_2SPIN
+	
+
+
+///////////////////////////////////////////////////////////
+// If we are AVX512 specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#include <simd/Intel512double.h>
+    
+static Vector<vComplexD> signsD;
+static int signInitD = setupSigns(signsD);
+    
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];  
+
+
+#define INTERIOR_AND_EXTERIOR    
+#undef  INTERIOR
+#undef  EXTERIOR
+  
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+      
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+      
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#endif  // VEC 5D
+
+#undef COMPLEX_SIGNS
+#undef MAYBEPERM
+#undef MULT_2SPIN
+
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef UChi_00
+#undef UChi_01
+#undef UChi_02
+#undef UChi_10
+#undef UChi_11
+#undef UChi_12
+#undef UChi_20
+#undef UChi_21
+#undef UChi_22
+#undef UChi_30
+#undef UChi_31
+#undef UChi_32
+
+#undef Psi_00
+#undef Psi_01
+#undef Psi_02
+#undef Psi_10
+#undef Psi_11
+#undef Psi_12
+#undef Psi_20
+#undef Psi_21
+#undef Psi_22
+#undef Psi_30
+#undef Psi_31
+#undef Psi_32
+
+#undef Phi_00
+#undef Phi_01
+#undef Phi_02
+#undef Phi_10
+#undef Phi_11
+#undef Phi_12
+#undef Phi_20
+#undef Phi_21
+#undef Phi_22
+#undef Phi_30
+#undef Phi_31
+#undef Phi_32
+
+
+#endif //AVX512
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h
@@ -130,16 +130,18 @@
  int local,perm, ptype;
  uint64_t base;
  uint64_t basep;
-  const uint64_t plocal =(uint64_t) & in._odata[0];
+  const uint64_t plocal =(uint64_t) & in[0];

  COMPLEX_SIGNS(isigns);
  MASK_REGS;
-  int nmax=U._grid->oSites();
+  int nmax=U.oSites();
  for(int site=0;site<Ns;site++) {
 #ifndef EXTERIOR
-    int sU =lo.Reorder(ssU);
+    //    int sU =lo.Reorder(ssU);
+    int sU =ssU;
    int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
-    int sUn=lo.Reorder(ssn);
+    //    int sUn=lo.Reorder(ssn);
+    int sUn=ssn;
    LOCK_GAUGE(0);
 #else
    int sU =ssU;
@@ -166,7 +168,7 @@
      if (nmu==0) break;
      //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
 #endif
-      base = (uint64_t) &out._odata[ss];
+      base = (uint64_t) &out[ss];
      basep= st.GetPFInfo(nent,plocal); nent++;
      RESULT(base,basep);
    }
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.ab
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.ab
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.abc
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.abc
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
@@ -0,0 +1,86 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////
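+// Default to no assembler implementation: the generic kernels below only assert(0).
+// Architecture-specific headers supply template<> specialisations for particular Impl types.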
+///////////////////////////////////////////////////////////
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic AsmDhopSite: placeholder used when no template<> specialisation is provided for Impl.
+  assert(0); // Traps when assertions are enabled; no argument is read and `out` is never written.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic AsmDhopSiteDag: placeholder used when no template<> specialisation is provided for Impl.
+  assert(0); // Traps when assertions are enabled; no argument is read and `out` is never written.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic AsmDhopSiteInt: placeholder used when no template<> specialisation is provided for Impl.
+  assert(0); // Traps when assertions are enabled; no argument is read and `out` is never written.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic AsmDhopSiteDagInt: placeholder used when no template<> specialisation is provided for Impl.
+  assert(0); // Traps when assertions are enabled; no argument is read and `out` is never written.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic AsmDhopSiteExt: placeholder used when no template<> specialisation is provided for Impl.
+  assert(0); // Traps when assertions are enabled; no argument is read and `out` is never written.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic AsmDhopSiteDagExt: placeholder used when no template<> specialisation is provided for Impl.
+  assert(0); // Traps when assertions are enabled; no argument is read and `out` is never written.
+}
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h
@@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-
+#pragma once

 #if defined(QPX) 

@@ -52,18 +52,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
      
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 				    
 #undef MAYBEPERM
 #undef MULT_2SPIN
@@ -75,18 +75,18 @@ WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,Do
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 				    
 /////////////////////////////////////////////////////////////////
 // Ls vectorised, dag Kernel, single
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef MAYBEPERM
 #undef MULT_2SPIN
 	
@@ -104,9 +104,9 @@ WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrde
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////
      

@@ -115,9 +115,9 @@ WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doubl
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////

 #undef MAYBEPERM
@@ -129,9 +129,9 @@ WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,Do
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////
 				    
 /////////////////////////////////////////////////////////////////
@@ -139,9 +139,9 @@ WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder &
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////
 	
 #undef MAYBEPERM
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
+#pragma once
+
 #include <Grid/qcd/action/fermion/FermionCore.h>

 #define REGISTER
@@ -45,7 +48,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Chimu_32=ref(F)(3)(2)

 #define LOAD_CHIMU(DIR,F,PERM)						\
-  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
+  { const SiteSpinor & ref (in[offset]); LOAD_CHIMU_BODY(F); }

 #define LOAD_CHI_BODY(F)				\
    Chi_00 = ref(F)(0)(0);\
@@ -92,9 +95,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  g = F;								\
  direction = st._directions[DIR];				\
  distance = st._distances[DIR];				\
-  sl = st._grid->_simd_layout[direction];			\
+  sl = st._simd_layout[direction];			        \
  inplace_twist = 0;						\
-  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
+  if(SE->_around_the_world && st.parameters.twists[DIR % 4]){		\
    if(sl == 1){							\
      g = (F+1) % 2;							\
    }else{								\
@@ -103,7 +106,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  }  

 #define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
-  { const SiteSpinor &ref(in._odata[offset]);				\
+  { const SiteSpinor &ref(in[offset]);				\
    LOAD_CHI_SETUP(DIR,F);						\
    if(!inplace_twist){							\
      LOAD_CHIMU_BODY(g);						\
@@ -201,10 +204,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 #define MULT_2SPIN(A,F)					\
-  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
+  {auto & ref(U[sU](A)); MULT_2SPIN_BODY; }

 #define MULT_2SPIN_GPARITY(A,F)				\
-  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
+  {auto & ref(U[sU](F)(A)); MULT_2SPIN_BODY; }


 #define PERMUTE_DIR(dir)			\
@@ -468,8 +471,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
+  perm   = SE->_permute;				\
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
    LOAD_CHI_IMPL(DIR,F,PERM);			\
    MULT_2SPIN_IMPL(DIR,F);			\
@@ -479,7 +481,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT(ss,F)			\
  {						\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    vstream(ref(F)(0)(0),result_00);		\
    vstream(ref(F)(0)(1),result_01);		\
    vstream(ref(F)(0)(2),result_02);		\
@@ -496,7 +498,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT_EXT(ss,F)			\
  if (nmu){					\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    ref(F)(0)(0)+=result_00;		\
    ref(F)(0)(1)+=result_01;		\
    ref(F)(0)(2)+=result_02;		\
@@ -545,18 +547,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Simd U_21;

 #define ZERO_RESULT				\
-  result_00=zero;				\
-  result_01=zero;				\
-  result_02=zero;				\
-  result_10=zero;				\
-  result_11=zero;				\
-  result_12=zero;				\
-  result_20=zero;				\
-  result_21=zero;				\
-  result_22=zero;				\
-  result_30=zero;				\
-  result_31=zero;				\
-  result_32=zero;			
+  result_00=Zero();				\
+  result_01=Zero();				\
+  result_02=Zero();				\
+  result_10=Zero();				\
+  result_11=Zero();				\
+  result_12=Zero();				\
+  result_20=Zero();				\
+  result_21=Zero();				\
+  result_22=Zero();				\
+  result_30=Zero();				\
+  result_31=Zero();				\
+  result_32=Zero();			

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -571,21 +573,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12

-namespace Grid {
-namespace QCD {
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
+NAMESPACE_BEGIN(Grid);

 #define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@@ -598,21 +586,6 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
 #define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@@ -624,22 +597,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
 #define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@@ -652,21 +609,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
 #define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
  ZERO_RESULT;							\
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
@@ -678,23 +620,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
  HAND_RESULT(ss,F)
-  
-  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;

 #define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
@@ -708,22 +633,6 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT_EXT(ss,F)

-  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  int nmu=0;
-
 #define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@@ -736,13 +645,10 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT_EXT(ss,F)

-  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
 #define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-				    int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -756,9 +662,9 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-					    int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -772,9 +678,9 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -788,9 +694,9 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -805,8 +711,8 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
  }									\
 									\
  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
+  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -814,16 +720,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 									\
    HAND_DECLARATIONS(ignore);						\
 									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
    StencilEntry *SE;							\
    int nmu=0;								\
    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
    nmu = 0;								\
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -832,47 +738,11 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DECLARATIONS(ignore);						\
 									\
    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
    int nmu=0;								\
    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
    nmu = 0;								\
    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }

-
-HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
-
-
-
-
-
-
-
-
-
-
-  
-////////////// Wilson ; uses this implementation /////////////////////
-
-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); 
-
-INSTANTIATE_THEM(GparityWilsonImplF);
-INSTANTIATE_THEM(GparityWilsonImplD);
-INSTANTIATE_THEM(GparityWilsonImplFH);
-INSTANTIATE_THEM(GparityWilsonImplDF);
-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -26,12 +26,58 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
+#pragma once
+
 #include <Grid/qcd/action/fermion/FermionCore.h>

+
+#undef LOAD_CHIMU  
+#undef LOAD_CHI 
+#undef MULT_2SPIN
+#undef PERMUTE_DIR
+#undef XP_PROJ  
+#undef YP_PROJ  
+#undef ZP_PROJ  
+#undef TP_PROJ  
+#undef XM_PROJ  
+#undef YM_PROJ  
+#undef ZM_PROJ  
+#undef TM_PROJ  
+#undef XP_RECON 
+#undef XP_RECON_ACCUM 
+#undef XM_RECON 
+#undef XM_RECON_ACCUM 
+#undef YP_RECON_ACCUM 
+#undef YM_RECON_ACCUM 
+#undef ZP_RECON_ACCUM 
+#undef ZM_RECON_ACCUM 
+#undef TP_RECON_ACCUM 
+#undef TM_RECON_ACCUM 
+#undef ZERO_RESULT				 
+#undef Chimu_00
+#undef Chimu_01
+#undef Chimu_02
+#undef Chimu_10
+#undef Chimu_11
+#undef Chimu_12
+#undef Chimu_20
+#undef Chimu_21
+#undef Chimu_22
+#undef Chimu_30
+#undef Chimu_31
+#undef Chimu_32
+#undef HAND_STENCIL_LEG
+#undef HAND_STENCIL_LEG_INT
+#undef HAND_STENCIL_LEG_EXT
+#undef HAND_RESULT
+#undef HAND_RESULT_INT
+#undef HAND_RESULT_EXT
+
 #define REGISTER

 #define LOAD_CHIMU \
-  {const SiteSpinor & ref (in._odata[offset]);	\
+  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
    Chimu_02=ref()(0)(2);\
@@ -56,7 +102,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 // To splat or not to splat depends on the implementation
 #define MULT_2SPIN(A)\
-  {auto & ref(U._odata[sU](A));			\
+  {auto & ref(U[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));	\
   Impl::loadLinkElement(U_10,ref()(1,0));	\
   Impl::loadLinkElement(U_20,ref()(2,0));	\
@@ -355,7 +401,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT(ss)				\
  {						\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    vstream(ref()(0)(0),result_00);		\
    vstream(ref()(0)(1),result_01);		\
    vstream(ref()(0)(2),result_02);		\
@@ -372,7 +418,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT_EXT(ss)			\
  if (nmu){					\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    ref()(0)(0)+=result_00;		\
    ref()(0)(1)+=result_01;		\
    ref()(0)(2)+=result_02;		\
@@ -421,18 +467,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Simd U_21;

 #define ZERO_RESULT				\
-  result_00=zero;				\
-  result_01=zero;				\
-  result_02=zero;				\
-  result_10=zero;				\
-  result_11=zero;				\
-  result_12=zero;				\
-  result_20=zero;				\
-  result_21=zero;				\
-  result_22=zero;				\
-  result_30=zero;				\
-  result_31=zero;				\
-  result_32=zero;			
+  result_00=Zero();				\
+  result_01=Zero();				\
+  result_02=Zero();				\
+  result_10=Zero();				\
+  result_11=Zero();				\
+  result_12=Zero();				\
+  result_20=Zero();				\
+  result_21=Zero();				\
+  result_22=Zero();				\
+  result_30=Zero();				\
+  result_31=Zero();				\
+  result_32=Zero();			

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -447,12 +493,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
+WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
@@ -475,8 +520,8 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
 }

 template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -498,8 +543,8 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
 }

 template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
+WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
@@ -522,8 +567,8 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 }

 template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -545,8 +590,8 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
 }

 template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
+WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
@@ -554,7 +599,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa

  HAND_DECLARATIONS(ignore);

-  int offset,local,perm, ptype;
+  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -570,8 +615,8 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 }

 template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
+void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -579,7 +624,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
  HAND_DECLARATIONS(ignore);

  StencilEntry *SE;
-  int offset,local,perm, ptype;
+  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@@ -595,37 +640,45 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D

 ////////////// Wilson ; uses this implementation /////////////////////

-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); 
-
-INSTANTIATE_THEM(WilsonImplF);
-INSTANTIATE_THEM(WilsonImplD);
-INSTANTIATE_THEM(ZWilsonImplF);
-INSTANTIATE_THEM(ZWilsonImplD);
-INSTANTIATE_THEM(DomainWallVec5dImplF);
-INSTANTIATE_THEM(DomainWallVec5dImplD);
-INSTANTIATE_THEM(ZDomainWallVec5dImplF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplD);
-INSTANTIATE_THEM(WilsonImplFH);
-INSTANTIATE_THEM(WilsonImplDF);
-INSTANTIATE_THEM(ZWilsonImplFH);
-INSTANTIATE_THEM(ZWilsonImplDF);
-INSTANTIATE_THEM(DomainWallVec5dImplFH);
-INSTANTIATE_THEM(DomainWallVec5dImplDF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
-INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
-
-}}
+NAMESPACE_END(Grid);
+#undef LOAD_CHIMU  
+#undef LOAD_CHI 
+#undef MULT_2SPIN
+#undef PERMUTE_DIR
+#undef XP_PROJ  
+#undef YP_PROJ  
+#undef ZP_PROJ  
+#undef TP_PROJ  
+#undef XM_PROJ  
+#undef YM_PROJ  
+#undef ZM_PROJ  
+#undef TM_PROJ  
+#undef XP_RECON 
+#undef XP_RECON_ACCUM 
+#undef XM_RECON 
+#undef XM_RECON_ACCUM 
+#undef YP_RECON_ACCUM 
+#undef YM_RECON_ACCUM 
+#undef ZP_RECON_ACCUM 
+#undef ZM_RECON_ACCUM 
+#undef TP_RECON_ACCUM 
+#undef TM_RECON_ACCUM 
+#undef ZERO_RESULT				 
+#undef Chimu_00
+#undef Chimu_01
+#undef Chimu_02
+#undef Chimu_10
+#undef Chimu_11
+#undef Chimu_12
+#undef Chimu_20
+#undef Chimu_21
+#undef Chimu_22
+#undef Chimu_30
+#undef Chimu_31
+#undef Chimu_32
+#undef HAND_STENCIL_LEG
+#undef HAND_STENCIL_LEG_INT
+#undef HAND_STENCIL_LEG_EXT
+#undef HAND_RESULT
+#undef HAND_RESULT_INT
+#undef HAND_RESULT_EXT
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -0,0 +1,551 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+////////////////////////////////////////////
+// Generic implementation; move to different file?
+////////////////////////////////////////////
+
+// Copy the stencil entry at *mem into chip.
+// When __CUDA_ARCH__ is defined the entry is copied through a uint4 pun so that
+// the transfer is a single 128 bit load; the static_assert guards that pun.
+// Otherwise a plain StencilEntry assignment is used.
+accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
+{
+#ifndef __CUDA_ARCH__
+  chip = *mem;
+#else
+  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
+  uint4 *src = reinterpret_cast<uint4 *>(mem);   // force 128 bit loads
+  uint4 *dst = reinterpret_cast<uint4 *>(&chip);
+  *dst = *src;
+#endif
+}
+  
+#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			/* one stencil leg; expects SE, ptype, chi, Uchi, result, lane, st, U, buf, in, sF, sU in the enclosing scope */ \
+  SE = st.GetEntry(ptype, Dir, sF);				/* stencil entry for direction Dir at site sF */ \
+  if (SE->_is_local) {						/* local leg: source comes from `in` */ \
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);						/* spProj fills chi from the spinor just read */ \
+  } else {							/* non-local leg: chi is read as-is from buf */ \
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+  }								\
+  synchronise();						/* NOTE(review): presumably reconverges SIMT lanes after the branch - confirm */ \
+  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		/* Uchi is produced from U[sU] and chi */ \
+  Recon(result, Uchi);	// the caller picks Recon: the kernels pass spRecon* for the first leg and accumRecon* afterwards
+  
+#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		/* interior leg: contributes only when the entry is local or st.same_node[Dir] is set */ \
+  SE = st.GetEntry(ptype, Dir, sF);				/* stencil entry for direction Dir at site sF */ \
+  if (SE->_is_local) {						/* local leg: read from `in` and project */ \
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);						\
+  } else if ( st.same_node[Dir] ) {				/* same-node leg: chi is read as-is from buf */ \
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+  }								/* neither case: chi is left untouched and the leg is skipped below */ \
+  synchronise();						\
+  if (SE->_is_local || st.same_node[Dir] ) {			/* same condition as the two read branches above */ \
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+    Recon(result, Uchi);					/* callers zero result first and pass accumRecon* for every leg */ \
+  }								\
+  synchronise();						
+
+#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		/* exterior leg: the complement of GENERIC_STENCIL_LEG_INT's condition; spProj is unused here */ \
+  SE = st.GetEntry(ptype, Dir, sF);				/* stencil entry for direction Dir at site sF */ \
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		/* only legs that are neither local nor same-node */ \
+    auto chi = coalescedRead(buf[SE->_offset],lane);		/* chi is declared here; the Ext kernels have no outer chi */ \
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+    Recon(result, Uchi);					\
+    nmu++;							/* counts contributing legs; the kernels update out[sF] only if nmu != 0 */ \
+  }								\
+  synchronise();						
+
+#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			/* runs only when gamma == Dir; SE must already be set by the caller (no GetEntry here) */ \
+  if (gamma == Dir) {						\
+    if (SE->_is_local ) {					/* local: read from `in` and project */ \
+      int perm= SE->_permute;					\
+      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+      spProj(chi,tmp);						\
+    } else {							/* otherwise chi is read as-is from buf */ \
+      chi = coalescedRead(buf[SE->_offset],lane);		\
+    }								\
+    synchronise();						\
+    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		/* note: the link direction is the enclosing scope's `dir`, not Dir */ \
+    Recon(result, Uchi);					\
+    synchronise();						\
+  }
+
+  ////////////////////////////////////////////////////////////////////
+  // All legs kernels ; comms then compute
+  ////////////////////////////////////////////////////////////////////
+// Full-stencil site kernel, "Dag" variant: every direction is paired with the
+// projector/reconstruction of the same name (Xp with spProjXp, ..., Tm with spProjTm).
+// The Xp leg initialises result via spReconXp, the remaining seven legs accumulate,
+// and the sum is written to out[sF].
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
+					     SiteHalfSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  // The local names below are referenced by GENERIC_STENCIL_LEG; do not rename them.
+  const int lane = SIMTlane(SiteHalfSpinor::Nsimd());
+  StencilEntry *SE;
+  int ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor result;
+
+  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
+  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF],result,lane);
+}
+
+// Full-stencil site kernel, non-Dag variant: each direction is paired with the
+// projector/reconstruction of the opposite sign (Xm with spProjXp, ..., Tp with spProjTm).
+// The first leg initialises result via spReconXp, the remaining seven legs accumulate,
+// and the sum is written to out[sF].
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
+					  SiteHalfSpinor *buf, int sF,
+					  int sU, const FermionFieldView &in, FermionFieldView &out) 
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  // The local names below are referenced by GENERIC_STENCIL_LEG; do not rename them.
+  const int lane = SIMTlane(SiteHalfSpinor::Nsimd());
+  StencilEntry *SE;
+  int ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor result;
+
+  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
+  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF], result,lane);
+}
+  ////////////////////////////////////////////////////////////////////
+  // Interior kernels
+  ////////////////////////////////////////////////////////////////////
+// Interior site kernel, "Dag" variant: result starts at Zero() and only legs that
+// GENERIC_STENCIL_LEG_INT accepts (local entry or st.same_node[Dir]) are accumulated.
+// out[sF] is always overwritten with the partial sum.
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
+						SiteHalfSpinor *buf, int sF,
+						int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  // The local names below are referenced by GENERIC_STENCIL_LEG_INT; do not rename them.
+  const int lane = SIMTlane(SiteHalfSpinor::Nsimd());
+  StencilEntry *SE;
+  int ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF], result,lane);
+}
+
+// Interior site kernel, non-Dag variant: result starts at Zero() and only legs that
+// GENERIC_STENCIL_LEG_INT accepts (local entry or st.same_node[Dir]) are accumulated,
+// with each direction paired with the opposite-sign projector.
+// out[sF] is always overwritten with the partial sum.
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
+							 SiteHalfSpinor *buf, int sF,
+							 int sU, const FermionFieldView &in, FermionFieldView &out) 
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  // The local names below are referenced by GENERIC_STENCIL_LEG_INT; do not rename them.
+  const int lane = SIMTlane(SiteHalfSpinor::Nsimd());
+  StencilEntry *SE;
+  int ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF], result,lane);
+}
+////////////////////////////////////////////////////////////////////
+// Exterior kernels
+////////////////////////////////////////////////////////////////////
+// Exterior site kernel, "Dag" variant: accumulates only the legs that
+// GENERIC_STENCIL_LEG_EXT accepts (neither local nor same-node) and, if at least
+// one leg contributed (nmu != 0), adds the sum onto the existing out[sF].
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
+						SiteHalfSpinor *buf, int sF,
+						int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  // The local names below are referenced by GENERIC_STENCIL_LEG_EXT; do not rename them.
+  // (chi is declared inside the macro itself.)
+  const int lane = SIMTlane(SiteHalfSpinor::Nsimd());
+  StencilEntry *SE;
+  int ptype;
+  int nmu=0;
+  calcHalfSpinor Uchi;
+  calcSpinor result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
+
+  // Read-modify-write: out[sF] already holds a value that this kernel adds to.
+  if ( nmu ) {
+    auto updated = coalescedRead(out[sF],lane);
+    updated = updated + result;
+    coalescedWrite(out[sF],updated,lane);
+  }
+}
+
+// Exterior site kernel, non-Dag variant: accumulates only the legs that
+// GENERIC_STENCIL_LEG_EXT accepts (neither local nor same-node) and, if at least
+// one leg contributed (nmu != 0), adds the sum onto the existing out[sF].
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
+					     SiteHalfSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out) 
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  // The local names below are referenced by GENERIC_STENCIL_LEG_EXT; do not rename them.
+  // (chi is declared inside the macro itself.)
+  const int lane = SIMTlane(SiteHalfSpinor::Nsimd());
+  StencilEntry *SE;
+  int ptype;
+  int nmu=0;
+  calcHalfSpinor Uchi;
+  calcSpinor result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
+
+  // Read-modify-write: out[sF] already holds a value that this kernel adds to.
+  if ( nmu ) {
+    auto updated = coalescedRead(out[sF],lane);
+    updated = updated + result;
+    coalescedWrite(out[sF],updated,lane);
+  }
+}
+
+// Single-leg site kernel: applies the hop along stencil direction `dir` using the
+// spin projector/reconstruction selected by `gamma` (one of Xp..Tm), and writes the
+// result to out[sF].
+//   st, U, buf, in : stencil view, gauge view, comms buffer and source field view
+//   sF, sU         : site indices used for out[] and U[] respectively
+//   dir            : stencil direction passed to GetEntry and multLink
+//   gamma          : selects which projector leg runs; exactly one leg is expected to
+//                    match, otherwise result is written without being assigned
+// The Xp leg used to be a hand-expanded copy of GENERIC_DHOPDIR_LEG that left out both
+// synchronise() calls; it now goes through the macro like the other seven legs.
+template <class Impl>
+void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
+				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
+{
+  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  // The local names below are referenced by GENERIC_DHOPDIR_LEG; do not rename them.
+  calcHalfSpinor chi;
+  calcSpinor result;
+  calcHalfSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=SIMTlane(Nsimd);
+
+  // The macro does not fetch the stencil entry itself; do it once for `dir`.
+  SE = st.GetEntry(ptype, dir, sF);
+
+  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
+  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
+  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
+  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
+  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
+  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
+  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
+  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
+  coalescedWrite(out[sF], result,lane);
+}
+
+// Host-side driver for DhopDirK: for each of the Nsite sites and each s in [0,Ls)
+// it calls DhopDirK with sU = site and sF = s + Ls*site.
+// dirdisp must lie in [0,7]; gamma is forwarded unchanged.
+// NOTE(review): the `buf` parameter is not used; st.CommBuf() is passed to DhopDirK
+// instead - confirm that this is intended.
+template <class Impl>
+void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
+					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) 
+{
+  assert(dirdisp>=0);
+  assert(dirdisp<=7);
+
+  auto gauge_v   = U.View();
+  auto source_v  = in.View();
+  auto result_v  = out.View();
+  auto stencil_v = st.View();
+
+  accelerator_for(ss,Nsite,Simd::Nsimd(),{
+    for(int s5=0; s5<Ls; s5++){
+      int sU = ss;
+      int sF = s5+Ls*sU;
+      DhopDirK(stencil_v,gauge_v,st.CommBuf(),sF,sU,source_v,result_v,dirdisp,gamma);
+    }
+  });
+}
+
+#define KERNEL_CALLNB(A) /* launch member kernel A once per (site,s) pair via accelerator_forNB; no barrier is issued here */ \
+  const uint64_t    NN = Nsite*Ls;					/* declares NN: expand at most once per scope */ \
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;							/* sF runs over all Nsite*Ls entries */ \
+      int sU = ss/Ls;							/* sU is the site index shared by Ls consecutive sF */ \
+      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		/* st_v, U_v, in_v, out_v, buf come from the expanding scope */ \
+  });
+
+#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); // KERNEL_CALLNB followed by accelerator_barrier()
+
+#define ASM_CALL(A)	/* thread_for over the Nsite sites; A receives Ls and a literal 1 as two extra arguments */	\
+  thread_for( ss, Nsite, {						\
+    int sU = ss;							\
+    int sF = ss*Ls;							/* first sF of this site; A is called once per site */ \
+    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
+  });
+
+// Dispatch the (non-Dag) Wilson hopping kernel selected by Opt.
+//   Opt               : one of WilsonKernelsStatic::OptGeneric / OptHandUnroll / OptInlineAsm;
+//                       the latter two are compiled out when GRID_NVCC is defined
+//   st, U, buf        : stencil, doubled gauge field and comms buffer
+//   Ls, Nsite         : extent of the inner index and number of sites to process
+//   in, out           : source and destination fermion fields
+//   interior/exterior : both set  -> full kernel (launch followed by a barrier)
+//                       interior  -> *Int kernel; Generic/HandUnroll are launched with
+//                                    KERNEL_CALLNB, i.e. without a barrier
+//                       exterior  -> *Ext kernel (launch followed by a barrier)
+// Any combination not handled above trips the assert at the end.
+// The debug printf(".") / printf("-") / printf("+") calls that used to follow the
+// assembler paths have been removed; they wrote to stdout on every call.
+template <class Impl>
+void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+				     int Ls, int Nsite, const FermionField &in, FermionField &out,
+				     int interior,int exterior) 
+{
+    // These view names (and buf, Ls, Nsite) are referenced by KERNEL_CALL* / ASM_CALL.
+    auto U_v   =   U.View();
+    auto in_v  =  in.View();
+    auto out_v = out.View();
+    auto st_v  =  st.View();
+
+   if( interior && exterior ) { 
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
+#ifndef GRID_NVCC
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
+#endif
+   } else if( interior ) {
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
+#ifndef GRID_NVCC
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
+#endif
+   } else if( exterior ) { 
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
+#ifndef GRID_NVCC
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
+#endif
+   }
+   assert(0 && " Kernel optimisation case not covered ");
+  }
+  // Dispatch the "Dag" Wilson hopping kernel selected by Opt.
+  // interior && exterior -> full kernel; interior only -> *DagInt; exterior only -> *DagExt.
+  // Every Generic/HandUnroll launch here goes through KERNEL_CALL (launch + barrier).
+  // OptHandUnroll and OptInlineAsm are compiled out when GRID_NVCC is defined.
+  // Any combination not handled trips the assert at the end.
+  template <class Impl>
+  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+					  int Ls, int Nsite, const FermionField &in, FermionField &out,
+					  int interior,int exterior) 
+  {
+    // These view names (and buf, Ls, Nsite) are referenced by KERNEL_CALL / ASM_CALL.
+    auto U_v   = U.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    auto st_v  = st.View();
+
+    if( interior && exterior ) {
+      if (Opt == WilsonKernelsStatic::OptGeneric) {
+        KERNEL_CALL(GenericDhopSiteDag);
+        return;
+      }
+#ifndef GRID_NVCC
+      if (Opt == WilsonKernelsStatic::OptHandUnroll) {
+        KERNEL_CALL(HandDhopSiteDag);
+        return;
+      }
+      if (Opt == WilsonKernelsStatic::OptInlineAsm) {
+        ASM_CALL(AsmDhopSiteDag);
+        return;
+      }
+#endif
+    } else if( interior ) {
+      if (Opt == WilsonKernelsStatic::OptGeneric) {
+        KERNEL_CALL(GenericDhopSiteDagInt);
+        return;
+      }
+#ifndef GRID_NVCC
+      if (Opt == WilsonKernelsStatic::OptHandUnroll) {
+        KERNEL_CALL(HandDhopSiteDagInt);
+        return;
+      }
+      if (Opt == WilsonKernelsStatic::OptInlineAsm) {
+        ASM_CALL(AsmDhopSiteDagInt);
+        return;
+      }
+#endif
+    } else if( exterior ) {
+      if (Opt == WilsonKernelsStatic::OptGeneric) {
+        KERNEL_CALL(GenericDhopSiteDagExt);
+        return;
+      }
+#ifndef GRID_NVCC
+      if (Opt == WilsonKernelsStatic::OptHandUnroll) {
+        KERNEL_CALL(HandDhopSiteDagExt);
+        return;
+      }
+      if (Opt == WilsonKernelsStatic::OptInlineAsm) {
+        ASM_CALL(AsmDhopSiteDagExt);
+        return;
+      }
+#endif
+    }
+    assert(0 && " Kernel optimisation case not covered ");
+  }
+
+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially. Common to both 4D and 5D.
+ ******************************************************************************/
+// N.B. Functions below assume a -1/2 factor within U.
+// expr -/+ gamma_mu * expr. The argument is parenthesised so that a compound
+// expression binds correctly; note that expr is still evaluated twice.
+#define WilsonCurrentFwd(expr, mu) (((expr) - Gamma::gmu[mu]*(expr)))
+#define WilsonCurrentBwd(expr, mu) (((expr) + Gamma::gmu[mu]*(expr)))
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteFwd
+ * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_1 shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+							  const SitePropagator &q_in_2,
+							  SitePropagator &q_out,
+							  DoubledGaugeFieldView &U,
+							  unsigned int sU,
+							  unsigned int mu,
+							  bool switch_sign)
+{
+  Gamma g5(Gamma::Algebra::Gamma5);
+
+  // Link in direction mu at site sU applied to q_in_1.
+  SitePropagator linked;
+  Impl::multLink(linked, U[sU], q_in_1, mu);
+
+  // g5 * adj(q_in_2) * g5 contracted with the forward current term.
+  SitePropagator contribution;
+  contribution = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(linked, mu);
+
+  // Forward term: added by default, subtracted when switch_sign is set.
+  if (!switch_sign) {
+    q_out += contribution;
+  } else {
+    q_out -= contribution;
+  }
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteBwd
+ * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_2 shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+							  const SitePropagator &q_in_2,
+							  SitePropagator &q_out,
+							  DoubledGaugeFieldView &U,
+							  unsigned int sU,
+							  unsigned int mu,
+							  bool switch_sign)
+{
+  Gamma g5(Gamma::Algebra::Gamma5);
+
+  // Link index mu + Nd at site sU applied to q_in_1.
+  SitePropagator linked;
+  Impl::multLink(linked, U[sU], q_in_1, mu + Nd);
+
+  // g5 * adj(q_in_2) * g5 contracted with the backward current term.
+  SitePropagator contribution;
+  contribution = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(linked, mu);
+
+  // Backward term: subtracted by default, added when switch_sign is set.
+  if (!switch_sign) {
+    q_out -= contribution;
+  } else {
+    q_out += contribution;
+  }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteFwd
+ * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeFieldView &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vPredicate t_mask,
+                                                     bool switch_sign)
+{
+  // Link in direction mu at site sU applied to q_in, then the forward current term.
+  SitePropagator current;
+  Impl::multLink(current, U[sU], q_in, mu);
+  current = WilsonCurrentFwd(current, mu);
+
+  // Keep entries selected by t_mask; replace the rest with 0.*current.
+  current = predicatedWhere(t_mask, current, 0.*current);
+
+  // Forward term: added by default, subtracted when switch_sign is set.
+  if (!switch_sign) {
+    q_out += current;
+  } else {
+    q_out -= current;
+  }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteBwd
+ * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in -ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeFieldView &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vPredicate t_mask,
+                                                     bool switch_sign)
+{
+  // Link index mu + Nd at site sU applied to q_in, then the backward current term.
+  SitePropagator current;
+  Impl::multLink(current, U[sU], q_in, mu + Nd);
+  current = WilsonCurrentBwd(current, mu);
+
+  // Keep entries selected by t_mask; replace the rest with 0.*current.
+  current = predicatedWhere(t_mask, current, 0.*current);
+
+  // Backward term: subtracted by default, added when switch_sign is set.
+  if (!switch_sign) {
+    q_out -= current;
+  } else {
+    q_out += current;
+  }
+}
+
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
@@ -0,0 +1,97 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * BF sequence
+ *
+ void bfmbase<Float>::MooeeInv(Fermion_t psi, 
+ Fermion_t chi, 
+ int dag, int cb)
+
+ double m    = this->mass;
+ double tm   = this->twistedmass;
+ double mtil = 4.0+this->mass;
+
+ double sq = mtil*mtil + tm*tm;
+
+ double a = mtil/sq;
+ double b = -tm /sq;
+ if(dag) b=-b;
+ axpibg5x(chi,psi,a,b);
+
+ void bfmbase<Float>::Mooee(Fermion_t psi, 
+ Fermion_t chi, 
+ int dag,int cb)
+ double a = 4.0+this->mass;
+ double b = this->twistedmass;
+ if(dag) b=-b;
+ axpibg5x(chi,psi,a,b);
+*/
+
+// Calls axpibg5x(out,in,a,b) with a = 4 + mass and b = mu, after copying the
+// checkerboard tag from in to out.
+// NOTE(review): presumably out = a*in + i*b*gamma5*in, as the helper's name suggests - confirm.
+template<class Impl>
+void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+
+  RealD diag  = 4.0+this->mass;
+  RealD twist = this->mu;
+  axpibg5x(out,in,diag,twist);
+}
+// Same as Mooee but with the sign of the mu coefficient flipped:
+// axpibg5x(out,in,a,b) with a = 4 + mass and b = -mu.
+template<class Impl>
+void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+
+  RealD diag  = 4.0+this->mass;
+  RealD twist = -this->mu;
+  axpibg5x(out,in,diag,twist);
+}
+// Calls axpibg5x(out,in,a,b) with
+//   a =  (4 + mass) / ((4 + mass)^2 + mu^2)
+//   b =  -mu        / ((4 + mass)^2 + mu^2)
+// Unlike Mooee, the checkerboard tag of out is not assigned here.
+template<class Impl>
+void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  RealD twist   = this->mu;
+  RealD shifted = 4.0+this->mass;
+  RealD norm2   = shifted*shifted+twist*twist;
+
+  RealD a = shifted/norm2;
+  RealD b = -twist /norm2;
+  axpibg5x(out,in,a,b);
+}
+// Same as MooeeInv but with the sign of the mu coefficient flipped:
+//   a = (4 + mass) / ((4 + mass)^2 + mu^2)
+//   b =  mu        / ((4 + mass)^2 + mu^2)
+// The checkerboard tag of out is not assigned here.
+template<class Impl>
+void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  RealD twist   = this->mu;
+  RealD shifted = 4.0+this->mass;
+  RealD norm2   = shifted*shifted+twist*twist;
+
+  RealD a = shifted/norm2;
+  RealD b = twist /norm2;
+  axpibg5x(out,in,a,b);
+}
+
+NAMESPACE_END(Grid);
--- a/Show More
+++ b/Show More