Merge GPU support (upstream/develop) into distillation branch.

This compiles and looks right ... but may need some testing * develop: (762 commits) Tensor ambiguous fix Fix for GCC preprocessor/pragma handling bug Trips up NVCC for reasons I don't understand on summit Fix GCC complaint Zero() change Force a couple of things to compile on NVCC Remove debug code nvcc error suppress Merge develop Reduction finished and hopefully fixes CI regression fail on single precision and force Double precision variants for summation accuracy Update todo list Freeze the seed Fix compiling of MSource::Gauss for single precision Think the reduction is now sorted and cleaned up Fix force term Printing improvement GPU reduction fix and also exit backtrace option GPU friendly Simplify the comms benchmark ... # Conflicts: # Grid/communicator/SharedMemoryMPI.cc # Grid/qcd/action/fermion/WilsonKernelsAsm.cc # Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h # Grid/qcd/smearing/StoutSmearing.h # Hadrons/Modules.hpp # Hadrons/Utilities/Contractor.cc # Hadrons/modules.inc # tests/forces/Test_dwf_force_eofa.cc # tests/forces/Test_dwf_gpforce_eofa.cc
2025-06-20 00:36:55 +01:00 · 2019-09-13 13:30:00 +01:00
parent 04a661cafe b473405652
commit 61d017d0a5
796 changed files with 41536 additions and 52391 deletions
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -27,114 +27,112 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_QCD_BASE_H
-#define GRID_QCD_BASE_H
-namespace Grid{
-namespace QCD {
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-    static const int Xdir = 0;
-    static const int Ydir = 1;
-    static const int Zdir = 2;
-    static const int Tdir = 3;
+NAMESPACE_BEGIN(Grid);

-  
-    static const int Xp = 0;
-    static const int Yp = 1;
-    static const int Zp = 2;
-    static const int Tp = 3;
-    static const int Xm = 4;
-    static const int Ym = 5;
-    static const int Zm = 6;
-    static const int Tm = 7;
+static constexpr int Xdir = 0;
+static constexpr int Ydir = 1;
+static constexpr int Zdir = 2;
+static constexpr int Tdir = 3;

-    static const int Nc=3;
-    static const int Ns=4;
-    static const int Nd=4;
-    static const int Nhs=2; // half spinor
-    static const int Nds=8; // double stored gauge field
-    static const int Ngp=2; // gparity index range
+static constexpr int Xp = 0;
+static constexpr int Yp = 1;
+static constexpr int Zp = 2;
+static constexpr int Tp = 3;
+static constexpr int Xm = 4;
+static constexpr int Ym = 5;
+static constexpr int Zm = 6;
+static constexpr int Tm = 7;

-    //////////////////////////////////////////////////////////////////////////////
-    // QCD iMatrix types
-    // Index conventions:                            Lorentz x Spin x Colour
-    // note: static const int or constexpr will work for type deductions
-    //       with the intel compiler (up to version 17)
-    //////////////////////////////////////////////////////////////////////////////
-    #define ColourIndex  2
-    #define SpinIndex    1
-    #define LorentzIndex 0
+static constexpr int Nc=3;
+static constexpr int Ns=4;
+static constexpr int Nd=4;
+static constexpr int Nhs=2; // half spinor
+static constexpr int Nds=8; // double stored gauge field
+static constexpr int Ngp=2; // gparity index range

-    // Also should make these a named enum type
-    static const int DaggerNo=0;
-    static const int DaggerYes=1;
-    static const int InverseNo=0;
-    static const int InverseYes=1;
+//////////////////////////////////////////////////////////////////////////////
+// QCD iMatrix types
+// Index conventions:                            Lorentz x Spin x Colour
+// note: static const int or static constexpr int will work for type deductions
+//       with the intel compiler (up to version 17)
+//////////////////////////////////////////////////////////////////////////////
+#define ColourIndex  (2)
+#define SpinIndex    (1)
+#define LorentzIndex (0)

-    // Useful traits is this a spin index
-    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
+// Also should make these a named enum type
+static constexpr int DaggerNo=0;
+static constexpr int DaggerYes=1;
+static constexpr int InverseNo=0;
+static constexpr int InverseYes=1;

-    const int SpinorIndex = 2;
-    template<typename T> struct isSpinor {
-      static const bool value = (SpinorIndex==T::TensorLevel);
-    };
-    template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
-    template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
+// Useful trait: is this a spin index?
+//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

-    // ChrisK very keen to add extra space for Gparity doubling.
-    //
-    // Also add domain wall index, in a way where Wilson operator 
-    // naturally distributes across the 5th dimensions.
-    //
-    // That probably makes for GridRedBlack4dCartesian grid.
+const int SpinorIndex = 2;
+template<typename T> struct isSpinor {
+  static constexpr bool value = (SpinorIndex==T::TensorLevel);
+};
+template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
+template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;

-    // s,sp,c,spc,lc
+// ChrisK very keen to add extra space for Gparity doubling.
+//
+// Also add domain wall index, in a way where Wilson operator 
+// naturally distributes across the 5th dimensions.
+//
+// That probably makes for GridRedBlack4dCartesian grid.

-    template<typename vtype> using iSinglet                     = iScalar<iScalar<iScalar<vtype> > >;
-    template<typename vtype> using iSpinMatrix                  = iScalar<iMatrix<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourMatrix                = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
-    template<typename vtype> using iSpinColourMatrix            = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
-    template<typename vtype> using iLorentzColourMatrix         = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-    template<typename vtype> using iDoubleStoredColourMatrix    = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
-    template<typename vtype> using iSpinVector                  = iScalar<iVector<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourVector                = iScalar<iScalar<iVector<vtype, Nc> > >;
-    template<typename vtype> using iSpinColourVector            = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
-    template<typename vtype> using iHalfSpinVector              = iScalar<iVector<iScalar<vtype>, Nhs> >;
-    template<typename vtype> using iHalfSpinColourVector        = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+// s,sp,c,spc,lc
+
+template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
+template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
+template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
+template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
+template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
+template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
+template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
+template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
+template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;


-    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
-    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
+template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;

-    // Spin matrix
-    typedef iSpinMatrix<Complex  >          SpinMatrix;
-    typedef iSpinMatrix<ComplexF >          SpinMatrixF;
-    typedef iSpinMatrix<ComplexD >          SpinMatrixD;
+// Spin matrix
+typedef iSpinMatrix<Complex  >          SpinMatrix;
+typedef iSpinMatrix<ComplexF >          SpinMatrixF;
+typedef iSpinMatrix<ComplexD >          SpinMatrixD;

-    typedef iSpinMatrix<vComplex >          vSpinMatrix;
-    typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
-    typedef iSpinMatrix<vComplexD>          vSpinMatrixD;
+typedef iSpinMatrix<vComplex >          vSpinMatrix;
+typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
+typedef iSpinMatrix<vComplexD>          vSpinMatrixD;

-    // Colour Matrix
-    typedef iColourMatrix<Complex  >        ColourMatrix;
-    typedef iColourMatrix<ComplexF >        ColourMatrixF;
-    typedef iColourMatrix<ComplexD >        ColourMatrixD;
+// Colour Matrix
+typedef iColourMatrix<Complex  >        ColourMatrix;
+typedef iColourMatrix<ComplexF >        ColourMatrixF;
+typedef iColourMatrix<ComplexD >        ColourMatrixD;

-    typedef iColourMatrix<vComplex >        vColourMatrix;
-    typedef iColourMatrix<vComplexF>        vColourMatrixF;
-    typedef iColourMatrix<vComplexD>        vColourMatrixD;
+typedef iColourMatrix<vComplex >        vColourMatrix;
+typedef iColourMatrix<vComplexF>        vColourMatrixF;
+typedef iColourMatrix<vComplexD>        vColourMatrixD;
+
+// SpinColour matrix
+typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
+typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
+typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
+
+typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
+typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
+typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;

-    // SpinColour matrix
-    typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
-    typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
-    typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
-    
-    typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
-    typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
-    typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
-    
    // SpinColourSpinColour matrix
    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
@ -153,383 +151,379 @@ namespace QCD {
    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;

-    // LorentzColour
-    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
-    typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
-    typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
+// LorentzColour
+typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
+typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
+typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;

-    typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
-    typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
-    typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
+typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
+typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
+typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;

-    // DoubleStored gauge field
-    typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
-    typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
-    typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
+// DoubleStored gauge field
+typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
+typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
+typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;

-    typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
-    typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
-    typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
+typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
+typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
+typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;

-    // Spin vector
-    typedef iSpinVector<Complex >           SpinVector;
-    typedef iSpinVector<ComplexF>           SpinVectorF;
-    typedef iSpinVector<ComplexD>           SpinVectorD;
+// Spin vector
+typedef iSpinVector<Complex >           SpinVector;
+typedef iSpinVector<ComplexF>           SpinVectorF;
+typedef iSpinVector<ComplexD>           SpinVectorD;

-    typedef iSpinVector<vComplex >           vSpinVector;
-    typedef iSpinVector<vComplexF>           vSpinVectorF;
-    typedef iSpinVector<vComplexD>           vSpinVectorD;
+typedef iSpinVector<vComplex >           vSpinVector;
+typedef iSpinVector<vComplexF>           vSpinVectorF;
+typedef iSpinVector<vComplexD>           vSpinVectorD;

-    // Colour vector
-    typedef iColourVector<Complex >         ColourVector;
-    typedef iColourVector<ComplexF>         ColourVectorF;
-    typedef iColourVector<ComplexD>         ColourVectorD;
+// Colour vector
+typedef iColourVector<Complex >         ColourVector;
+typedef iColourVector<ComplexF>         ColourVectorF;
+typedef iColourVector<ComplexD>         ColourVectorD;

-    typedef iColourVector<vComplex >         vColourVector;
-    typedef iColourVector<vComplexF>         vColourVectorF;
-    typedef iColourVector<vComplexD>         vColourVectorD;
+typedef iColourVector<vComplex >         vColourVector;
+typedef iColourVector<vComplexF>         vColourVectorF;
+typedef iColourVector<vComplexD>         vColourVectorD;

-    // SpinColourVector
-    typedef iSpinColourVector<Complex >     SpinColourVector;
-    typedef iSpinColourVector<ComplexF>     SpinColourVectorF;
-    typedef iSpinColourVector<ComplexD>     SpinColourVectorD;
+// SpinColourVector
+typedef iSpinColourVector<Complex >     SpinColourVector;
+typedef iSpinColourVector<ComplexF>     SpinColourVectorF;
+typedef iSpinColourVector<ComplexD>     SpinColourVectorD;

-    typedef iSpinColourVector<vComplex >     vSpinColourVector;
-    typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
-    typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;
+typedef iSpinColourVector<vComplex >     vSpinColourVector;
+typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
+typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;

-    // HalfSpin vector
-    typedef iHalfSpinVector<Complex >       HalfSpinVector;
-    typedef iHalfSpinVector<ComplexF>       HalfSpinVectorF;
-    typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;
+// HalfSpin vector
+typedef iHalfSpinVector<Complex >       HalfSpinVector;
+typedef iHalfSpinVector<ComplexF>       HalfSpinVectorF;
+typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;

-    typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
-    typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
-    typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;
+typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
+typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
+typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;

-    // HalfSpinColour vector
-    typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
-    typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
-    typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
+// HalfSpinColour vector
+typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
+typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
+typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
    
-    typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
-    typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
-    typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
+typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
+typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
+typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
    
-    // singlets
-    typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
-    typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
-    typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tensor singlet complex type.
+// singlets
+typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
+typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
+typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tensor singlet complex type.

-    typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
-    typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
-    typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure
+typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
+typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
+typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure

-    typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
-    typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
-    typedef iSinglet<RealD>            TRealD;       // Shouldn't need these; can I make it work without?
+typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
+typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
+typedef iSinglet<RealD>            TRealD;       // Shouldn't need these; can I make it work without?

-    typedef iSinglet<vReal >           vTReal;      
-    typedef iSinglet<vRealF>           vTRealF;      
-    typedef iSinglet<vRealD>           vTRealD;      
+typedef iSinglet<vReal >           vTReal;      
+typedef iSinglet<vRealF>           vTRealF;      
+typedef iSinglet<vRealD>           vTRealD;      

-    typedef iSinglet<vInteger>         vTInteger;
-    typedef iSinglet<Integer >         TInteger;
+typedef iSinglet<vInteger>         vTInteger;
+typedef iSinglet<Integer >         TInteger;


-    // Lattices of these
-    typedef Lattice<vColourMatrix>          LatticeColourMatrix;
-    typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
-    typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;
+// Lattices of these
+typedef Lattice<vColourMatrix>          LatticeColourMatrix;
+typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
+typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;

-    typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
-    typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
-    typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;
+typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
+typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
+typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;

-    typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
-    typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
-    typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
+typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
+typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
+typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;

-    typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
-    typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
-    typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;
+typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
+typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
+typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;

-    typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
-    typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
-    typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
+typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
+typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
+typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;

-    // DoubleStored gauge field
-    typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
-    typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
-    typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
+// DoubleStored gauge field
+typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
+typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
+typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;

-    typedef Lattice<vSpinVector>            LatticeSpinVector;
-    typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
-    typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;
+typedef Lattice<vSpinVector>            LatticeSpinVector;
+typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
+typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;

-    typedef Lattice<vColourVector>          LatticeColourVector;
-    typedef Lattice<vColourVectorF>         LatticeColourVectorF;
-    typedef Lattice<vColourVectorD>         LatticeColourVectorD;
+typedef Lattice<vColourVector>          LatticeColourVector;
+typedef Lattice<vColourVectorF>         LatticeColourVectorF;
+typedef Lattice<vColourVectorD>         LatticeColourVectorD;

-    typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
-    typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
-    typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;
+typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
+typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
+typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;

-    typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
-    typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
-    typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;
+typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
+typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
+typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;

-    typedef Lattice<vHalfSpinColourVector>  LatticeHalfSpinColourVector;
-    typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
-    typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
+typedef Lattice<vHalfSpinColourVector>  LatticeHalfSpinColourVector;
+typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
+typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;

-    typedef Lattice<vTReal>            LatticeReal;
-    typedef Lattice<vTRealF>           LatticeRealF;
-    typedef Lattice<vTRealD>           LatticeRealD;
+typedef Lattice<vTReal>            LatticeReal;
+typedef Lattice<vTRealF>           LatticeRealF;
+typedef Lattice<vTRealD>           LatticeRealD;

-    typedef Lattice<vTComplex>         LatticeComplex;
-    typedef Lattice<vTComplexF>        LatticeComplexF;
-    typedef Lattice<vTComplexD>        LatticeComplexD;
+typedef Lattice<vTComplex>         LatticeComplex;
+typedef Lattice<vTComplexF>        LatticeComplexF;
+typedef Lattice<vTComplexD>        LatticeComplexD;

-    typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
+typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"


-    ///////////////////////////////////////////
-    // Physical names for things
-    ///////////////////////////////////////////
-    typedef LatticeHalfSpinColourVector  LatticeHalfFermion;
-    typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
-    typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
+///////////////////////////////////////////
+// Physical names for things
+///////////////////////////////////////////
+typedef LatticeHalfSpinColourVector  LatticeHalfFermion;
+typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
+typedef LatticeHalfSpinColourVectorD LatticeHalfFermionD; // was aliased to the ...F lattice type; every sibling ...D alias in this header maps to its ...D lattice type

-    typedef LatticeSpinColourVector      LatticeFermion;
-    typedef LatticeSpinColourVectorF     LatticeFermionF;
-    typedef LatticeSpinColourVectorD     LatticeFermionD;
+typedef LatticeSpinColourVector      LatticeFermion;
+typedef LatticeSpinColourVectorF     LatticeFermionF;
+typedef LatticeSpinColourVectorD     LatticeFermionD;

-    typedef LatticeSpinColourMatrix                LatticePropagator;
-    typedef LatticeSpinColourMatrixF               LatticePropagatorF;
-    typedef LatticeSpinColourMatrixD               LatticePropagatorD;
+typedef LatticeSpinColourMatrix                LatticePropagator;
+typedef LatticeSpinColourMatrixF               LatticePropagatorF;
+typedef LatticeSpinColourMatrixD               LatticePropagatorD;

-    typedef LatticeLorentzColourMatrix             LatticeGaugeField;
-    typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
-    typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;
+typedef LatticeLorentzColourMatrix             LatticeGaugeField;
+typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
+typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;

-    typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
-    typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
-    typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;
+typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
+typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
+typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;

-    template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
+template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;

-    // Uhgg... typing this hurt  ;)
-    // (my keyboard got burning hot when I typed this, must be the anti-Fermion)
-    typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
-    typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
-    typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    
+// Uhgg... typing this hurt  ;)
+// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
+typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
+typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
+typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    

-    typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
-    typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
-    typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 
+typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
+typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
+typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 

-    //////////////////////////////////////////////////////////////////////////////
-    // Peek and Poke named after physics attributes
-    //////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+// Peek and Poke named after physics attributes
+//////////////////////////////////////////////////////////////////////////////

-    //spin
-    template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i);
-    }
-    template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i,j);
-    }
-    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i);
-    }
-    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
-    {
-      return PeekIndex<SpinIndex>(rhs,i,j);
-    }
-    //colour
-    template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i);
-    }
-    template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i,j);
-    }
-    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i);
-    }
-    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
-    {
-      return PeekIndex<ColourIndex>(rhs,i,j);
-    }
-    //lorentz
-    template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
-    {
-      return PeekIndex<LorentzIndex>(rhs,i);
-    }
-    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
-    {
-      return PeekIndex<LorentzIndex>(rhs,i);
-    }
+//spin
+template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i);
+}
+template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i,j);
+}
+template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i);
+}
+template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
+{
+  return PeekIndex<SpinIndex>(rhs,i,j);
+}
+// Colour-index accessors. Same shape as the spin accessors: every overload
+// forwards to PeekIndex<ColourIndex> and returns whatever that call returns.
+// Plain-object and Lattice<vobj> arguments are both accepted, with one index
+// (i) or two (i,j).
+template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i);
+}
+template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i,j);
+}
+template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i);
+}
+template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
+{
+  return PeekIndex<ColourIndex>(rhs,i,j);
+}
+// Lorentz-index accessors. Only single-index overloads are provided (plain
+// object and Lattice<vobj>); both forward to PeekIndex<LorentzIndex> and
+// return what that call returns.
+template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
+{
+  return PeekIndex<LorentzIndex>(rhs,i);
+}
+template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
+{
+  return PeekIndex<LorentzIndex>(rhs,i);
+}

-    //////////////////////////////////////////////
-    // Poke lattice
-    //////////////////////////////////////////////
-    template<class vobj> 
-      void pokeColour(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
+//////////////////////////////////////////////
+// Poke lattice
+//////////////////////////////////////////////
+// Lattice colour pokes: forward (lhs, rhs, indices) to PokeIndex<ColourIndex>.
+// The type of rhs is a Lattice of whatever peekIndex<ColourIndex> yields for
+// a vobj with the same number of indices. vobj() appears only inside
+// decltype, so it is never evaluated, but vobj must be default-constructible
+// for the signature to be well-formed.
+template<class vobj> 
+void pokeColour(Lattice<vobj> &lhs,
+		const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0))> & rhs,
+		int i)
+{
+  PokeIndex<ColourIndex>(lhs,rhs,i);
+}
+// Two-index variant: same forwarding, addressed by (i,j).
+template<class vobj> 
+void pokeColour(Lattice<vobj> &lhs,
+		const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0,0))> & rhs,
+		int i,int j)
+{
+  PokeIndex<ColourIndex>(lhs,rhs,i,j);
+}
+// Lattice spin pokes: forward (lhs, rhs, indices) to PokeIndex<SpinIndex>,
+// once for a single index (i) and once for an index pair (i,j). The type of
+// rhs is a Lattice of whatever peekIndex<SpinIndex> yields for a vobj with
+// the same number of indices; vobj() is used only inside decltype and is
+// never evaluated.
+template<class vobj> 
+void pokeSpin(Lattice<vobj> &lhs,
+              const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0))> & rhs,
              int i)
-    {
-      PokeIndex<ColourIndex>(lhs,rhs,i);
-    }
-    template<class vobj> 
-      void pokeColour(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
+{
+  PokeIndex<SpinIndex>(lhs,rhs,i);
+}
+template<class vobj> 
+void pokeSpin(Lattice<vobj> &lhs,
+              const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0,0))> & rhs,
              int i,int j)
-    {
-      PokeIndex<ColourIndex>(lhs,rhs,i,j);
-    }
-    template<class vobj> 
-      void pokeSpin(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
-              int i)
-    {
-      PokeIndex<SpinIndex>(lhs,rhs,i);
-    }
-    template<class vobj> 
-      void pokeSpin(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
-              int i,int j)
-    {
-      PokeIndex<SpinIndex>(lhs,rhs,i,j);
-    }
-    template<class vobj> 
-      void pokeLorentz(Lattice<vobj> &lhs,
-              const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
-              int i)
-    {
-      PokeIndex<LorentzIndex>(lhs,rhs,i);
-    }
+{
+  PokeIndex<SpinIndex>(lhs,rhs,i,j);
+}
+// Lattice Lorentz poke: forwards (lhs, rhs, i) to PokeIndex<LorentzIndex>.
+// rhs is a Lattice of whatever peekIndex<LorentzIndex> yields for a vobj;
+// vobj() is used only inside decltype and is never evaluated.
+template<class vobj> 
+void pokeLorentz(Lattice<vobj> &lhs,
+		 const Lattice<decltype(peekIndex<LorentzIndex>(vobj(),0))> & rhs,
+		 int i)
+{
+  PokeIndex<LorentzIndex>(lhs,rhs,i);
+}

-    //////////////////////////////////////////////
-    // Poke scalars
-    //////////////////////////////////////////////
-    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
-    {
-      pokeIndex<SpinIndex>(lhs,rhs,i);
-    }
-    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
-    {
-      pokeIndex<SpinIndex>(lhs,rhs,i,j);
-    }
+//////////////////////////////////////////////
+// Poke scalars
+//////////////////////////////////////////////
+// Non-Lattice spin pokes: forward to pokeIndex<SpinIndex> (lower-case p) with
+// one index (i) or two (i,j). The type of rhs is whatever
+// peekIndex<SpinIndex> yields for lhs with the same number of indices.
+template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
+{
+  pokeIndex<SpinIndex>(lhs,rhs,i);
+}
+template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
+{
+  pokeIndex<SpinIndex>(lhs,rhs,i,j);
+}

-    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
-    {
-      pokeIndex<ColourIndex>(lhs,rhs,i);
-    }
-    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
-    {
-      pokeIndex<ColourIndex>(lhs,rhs,i,j);
-    }
+// Non-Lattice colour pokes: forward to pokeIndex<ColourIndex> (lower-case p)
+// with one index (i) or two (i,j). The type of rhs is whatever
+// peekIndex<ColourIndex> yields for lhs with the same number of indices.
+template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
+{
+  pokeIndex<ColourIndex>(lhs,rhs,i);
+}
+template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
+{
+  pokeIndex<ColourIndex>(lhs,rhs,i,j);
+}

-    template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
-    {
-      pokeIndex<LorentzIndex>(lhs,rhs,i);
-    }
+// Non-Lattice Lorentz poke: forwards to pokeIndex<LorentzIndex> (lower-case
+// p) with a single index i. The type of rhs is whatever
+// peekIndex<LorentzIndex>(lhs,0) yields.
+template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
+{
+  pokeIndex<LorentzIndex>(lhs,rhs,i);
+}

-    //////////////////////////////////////////////
-    // Fermion <-> propagator assignements
-    //////////////////////////////////////////////
+//////////////////////////////////////////////
+// Fermion <-> propagator assignments
+//////////////////////////////////////////////
    //template <class Prop, class Ferm>
+// Copy fermion field f into propagator p at fixed second spin index s and
+// fixed second colour index c. For each spin index j in [0,Ns) and colour
+// index i in [0,Fimpl::Dimension), colour component i of spin component j of
+// f is written to colour (i,c) of spin (j,s) of p. Each spin (j,s) element is
+// read from p, modified and written back, so its other colour entries keep
+// the values that were read.
    template <class Fimpl>
      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
+{
+  for(int j = 0; j < Ns; ++j)
    {
-      for(int j = 0; j < Ns; ++j)
-        {
-            auto pjs = peekSpin(p, j, s);
-            auto fj  = peekSpin(f, j);
+      // Working copies: spin (j,s) element of p and spin j element of f.
+      auto pjs = peekSpin(p, j, s);
+      auto fj  = peekSpin(f, j);
            
            for(int i = 0; i < Fimpl::Dimension; ++i)
-            {
-                pokeColour(pjs, peekColour(fj, i), i, c);
-            }
-            pokeSpin(p, pjs, j, s);
-        }
+	{
+	  pokeColour(pjs, peekColour(fj, i), i, c);
+	}
+      // Store the updated spin element back into the propagator.
+      pokeSpin(p, pjs, j, s);
    }
+}
    
    //template <class Prop, class Ferm>
+// Inverse of the copy above: extract from propagator p, at fixed second spin
+// index s and fixed second colour index c, into fermion field f. For each
+// spin index j in [0,Ns) and colour index i in [0,Fimpl::Dimension), colour
+// (i,c) of spin (j,s) of p is written to colour i of spin j of f.
    template <class Fimpl>
      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
+{
+  for(int j = 0; j < Ns; ++j)
    {
-        for(int j = 0; j < Ns; ++j)
-        {
-            auto pjs = peekSpin(p, j, s);
-            auto fj  = peekSpin(f, j);
+      auto pjs = peekSpin(p, j, s);
+      // fj starts as the current spin j element of f; the loop below then
+      // overwrites its colour components 0..Fimpl::Dimension-1.
+      auto fj  = peekSpin(f, j);
            
            for(int i = 0; i < Fimpl::Dimension; ++i)
-            {
-                pokeColour(fj, peekColour(pjs, i, c), i);
-            }
-            pokeSpin(f, fj, j);
-        }
+	{
+	  pokeColour(fj, peekColour(pjs, i, c), i);
+	}
+      // Store the updated spin element back into the fermion field.
+      pokeSpin(f, fj, j);
    }
+}
    
-    //////////////////////////////////////////////
-    // transpose array and scalar
-    //////////////////////////////////////////////
-    template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
-      return transposeIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
-      return transposeIndex<ColourIndex>(lhs);
-    }
-    template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
-      return transposeIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
-      return transposeIndex<ColourIndex>(lhs);
-    }
+//////////////////////////////////////////////
+// transpose array and scalar
+//////////////////////////////////////////////
+// Transpose wrappers: forward to transposeIndex<SpinIndex> or
+// transposeIndex<ColourIndex> and return the same type they are given
+// (Lattice<vobj> or vobj).
+// NOTE(review): the Index template parameter is not used in any signature or
+// body, so it cannot be deduced and has to be supplied explicitly at the call
+// site (e.g. transposeSpin<0>(x)) - confirm whether it is meant to be there.
+template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
+  return transposeIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
+  return transposeIndex<ColourIndex>(lhs);
+}
+template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
+  return transposeIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
+  return transposeIndex<ColourIndex>(lhs);
+}

-    //////////////////////////////////////////
-    // Trace lattice and non-lattice
-    //////////////////////////////////////////
-    template<int Index,class vobj>
-    inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))>
-    {
-      return traceIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj>
-    inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))>
-    {
-      return traceIndex<ColourIndex>(lhs);
-    }
-    template<int Index,class vobj>
-    inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
-    {
-      return traceIndex<SpinIndex>(lhs);
-    }
-    template<int Index,class vobj>
-    inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
-    {
-      return traceIndex<ColourIndex>(lhs);
-    }
+//////////////////////////////////////////
+// Trace lattice and non-lattice
+//////////////////////////////////////////
+// Trace wrappers: forward to traceIndex<SpinIndex> or traceIndex<ColourIndex>.
+// The Lattice overloads return a Lattice of whatever traceIndex yields for a
+// vobj; vobj() is used only inside decltype and is never evaluated.
+// NOTE(review): the Index template parameter is not used in any signature or
+// body, so it cannot be deduced and has to be supplied explicitly at the call
+// site (e.g. traceSpin<0>(x)) - confirm whether it is meant to be there.
+template<int Index,class vobj>
+inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
+{
+  return traceIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj>
+inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
+{
+  return traceIndex<ColourIndex>(lhs);
+}
+// NOTE(review): the two non-Lattice overloads below declare a Lattice<...>
+// return type while returning traceIndex<...>(lhs) of a plain object. That
+// looks unintended - confirm before relying on them.
+template<int Index,class vobj>
+inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
+{
+  return traceIndex<SpinIndex>(lhs);
+}
+template<int Index,class vobj>
+inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
+{
+  return traceIndex<ColourIndex>(lhs);
+}

-    //////////////////////////////////////////
-    // Current types
-    //////////////////////////////////////////
-    GRID_SERIALIZABLE_ENUM(Current, undef,
-                           Vector,  0,
-                           Axial,   1,
-                           Tadpole, 2);
+//////////////////////////////////////////
+// Current types
+//////////////////////////////////////////
+GRID_SERIALIZABLE_ENUM(Current, undef,
+		       Vector,  0,
+		       Axial,   1,
+		       Tadpole, 2);

-}   //namespace QCD
-} // Grid
+NAMESPACE_END(Grid);

-
-
-#endif
--- a/Grid/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@ -37,14 +37,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // Abstract base interface
 ////////////////////////////////////////////
 #include <Grid/qcd/action/ActionCore.h>
+NAMESPACE_CHECK(ActionCore);
 ////////////////////////////////////////////////////////////////////////
 // Fermion actions; prevent coupling fermion.cc files to other headers
 ////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/FermionCore.h>
+NAMESPACE_CHECK(FermionCore);
 #include <Grid/qcd/action/fermion/Fermion.h>
+NAMESPACE_CHECK(Fermion);
 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
 #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
+NAMESPACE_CHECK(PseudoFermion);

 #endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -27,19 +27,18 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */

 #ifndef ACTION_BASE_H
 #define ACTION_BASE_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <class GaugeField >
 class Action 
 {

- public:
+public:
  bool is_smeared = false;
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
@ -50,7 +49,6 @@ class Action
  virtual ~Action(){}
 };

-}
-}
+NAMESPACE_END(Grid);

 #endif // ACTION_BASE_H
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@ -31,29 +31,37 @@ directory
 #define QCD_ACTION_CORE

 #include <Grid/qcd/action/ActionBase.h>
+NAMESPACE_CHECK(ActionBase);
 #include <Grid/qcd/action/ActionSet.h>
+NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
+NAMESPACE_CHECK(ActionParams);

 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/gauge/Gauge.h>
+NAMESPACE_CHECK(Gauge);

 ////////////////////////////////////////////
 // Fermion prereqs
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/FermionCore.h>
+NAMESPACE_CHECK(ActionFermionCore);

 ////////////////////////////////////////////
 // Scalar Actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/scalar/Scalar.h>
+NAMESPACE_CHECK(Scalar);

 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
 #include <Grid/qcd/utils/Metric.h>
+NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
+NAMESPACE_CHECK(CovariantLaplacian);



--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@ -27,37 +27,35 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */

 #ifndef GRID_QCD_ACTION_PARAMS_H
 #define GRID_QCD_ACTION_PARAMS_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  // These can move into a params header and be given MacroMagic serialisation
-  struct GparityWilsonImplParams {
-    bool overlapCommsCompute;
-    std::vector<int> twists;
-    GparityWilsonImplParams() : twists(Nd, 0), overlapCommsCompute(false){};
-  };
+// These can move into a params header and be given MacroMagic serialisation
+// GparityWilsonImplParams carries a single Coordinate member, `twists`, which
+// the default constructor builds from the arguments (Nd, 0).
+struct GparityWilsonImplParams {
+  Coordinate twists;
+  GparityWilsonImplParams() : twists(Nd, 0) {};
+};
  
-  struct WilsonImplParams {
-    bool overlapCommsCompute;
-    std::vector<Real> twist_n_2pi_L;
-    std::vector<Complex> boundary_phases;
-    WilsonImplParams() : overlapCommsCompute(false) {
-      boundary_phases.resize(Nd, 1.0);
+// Parameter bundle for Wilson-type implementations.
+//   overlapCommsCompute : flag, false after either constructor.
+//   twist_n_2pi_L       : resized to Nd with fill value 0.0 by either constructor.
+//   boundary_phases     : resized to Nd with fill value 1.0 by default, or
+//                         copied from the caller's phi.
+struct WilsonImplParams {
+  bool overlapCommsCompute;
+  AcceleratorVector<Real,Nd> twist_n_2pi_L;
+  AcceleratorVector<Complex,Nd> boundary_phases;
+  // Initialise the flag here as well: a bool member with no initialiser would
+  // otherwise hold an indeterminate value after default construction.
+  WilsonImplParams() : overlapCommsCompute(false) {
+    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
-    };
-    WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) {
-      twist_n_2pi_L.resize(Nd, 0.0);
-    }
  };
+  // Initialisers are listed in declaration order (flag first, then phases),
+  // which is the order in which members are actually initialised.
+  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : overlapCommsCompute(false), boundary_phases(phi) {
+    twist_n_2pi_L.resize(Nd, 0.0);
+  }
+};

-  struct StaggeredImplParams {
-    StaggeredImplParams()  {};
-  };
+struct StaggeredImplParams {
+  StaggeredImplParams()  {};
+};
  
  struct OneFlavourRationalParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams, 
@ -69,10 +67,10 @@ namespace QCD {
 				    int,   precision,
 				    int,   BoundsCheckFreq);
    
-    // MaxIter and tolerance, vectors??
+  // MaxIter and tolerance, vectors??
    
-    // constructor 
-    OneFlavourRationalParams(	RealD _lo      = 0.0, 
+  // constructor 
+  OneFlavourRationalParams(	RealD _lo      = 0.0, 
 				RealD _hi      = 1.0, 
 				int _maxit     = 1000,
 				RealD tol      = 1.0e-8, 
@ -88,11 +86,6 @@ namespace QCD {
        BoundsCheckFreq(_BoundsCheckFreq){};
  };
  
-  
-}
-}
-
-
-
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/ActionSet.h
+++ b/Grid/qcd/action/ActionSet.h
@ -26,14 +26,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef ACTION_SET_H
 #define ACTION_SET_H

-namespace Grid {
-
-// Should drop this namespace here
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 //////////////////////////////////
 // Indexing of tuple types
@ -62,7 +59,7 @@ struct Index<T, std::tuple<U, Types...>> {

 template <class Field, class Repr = NoHirep >
 struct ActionLevel {
- public:
+public:
  unsigned int multiplier;

  // Fundamental repr actions separated because of the smearing
@ -77,7 +74,7 @@ struct ActionLevel {
  std::vector<ActPtr>& actions;

  explicit ActionLevel(unsigned int mul = 1) : 
-  actions(std::get<0>(actions_hirep)), multiplier(mul) {
+    actions(std::get<0>(actions_hirep)), multiplier(mul) {
    // initialize the hirep vectors to zero.
    // apply(this->resize, actions_hirep, 0); //need a working resize
    assert(mul >= 1);
@ -87,7 +84,7 @@ struct ActionLevel {
  void push_back(Action<GenField>* ptr) {
    // insert only in the correct vector
    std::get< Index < GenField, action_hirep_types>::value >(actions_hirep).push_back(ptr);
-  };
+  }

  template <class ActPtr>
  static void resize(ActPtr ap, unsigned int n) {
@ -110,7 +107,6 @@ struct ActionLevel {
 template <class GaugeField, class R>
 using ActionSet = std::vector<ActionLevel<GaugeField, R> >;

-} // QCD
-} // Grid
+NAMESPACE_END(Grid);

 #endif  // ACTION_SET_H
--- a/Grid/qcd/action/fermion/AbstractEOFAFermion.h
+++ b/Grid/qcd/action/fermion/AbstractEOFAFermion.h
@ -26,75 +26,75 @@ with this program; if not, write to the Free Software Foundation, Inc.,

 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef  GRID_QCD_ABSTRACT_EOFA_FERMION_H
 #define  GRID_QCD_ABSTRACT_EOFA_FERMION_H

 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  // DJM: Abstract base class for EOFA fermion types.
-  // Defines layout of additional EOFA-specific parameters and operators.
-  // Use to construct EOFA pseudofermion actions that are agnostic to
-  // Shamir / Mobius / etc., and ensure that no one can construct EOFA
-  // pseudofermion action with non-EOFA fermion type.
-  template<class Impl>
-  class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+// DJM: Abstract base class for EOFA fermion types.
+// Defines layout of additional EOFA-specific parameters and operators.
+// Use to construct EOFA pseudofermion actions that are agnostic to
+// Shamir / Mobius / etc., and ensure that no one can construct EOFA
+// pseudofermion action with non-EOFA fermion type.
+template<class Impl>
+class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    public:
-      // Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
-      RealD mq1;
-      RealD mq2;
-      RealD mq3;
-      RealD shift;
-      int pm;
+public:
+  // Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
+  RealD mq1;
+  RealD mq2;
+  RealD mq3;
+  RealD shift;
+  int pm;

-      RealD alpha; // Mobius scale
-      RealD k;     // EOFA normalization constant
+  RealD alpha; // Mobius scale
+  RealD k;     // EOFA normalization constant

-      virtual void Instantiatable(void) = 0;
+  virtual void Instantiatable(void) = 0;

-      // EOFA-specific operations
-      // Force user to implement in derived classes
-      virtual void  Omega    (const FermionField& in, FermionField& out, int sign, int dag) = 0;
-      virtual void  Dtilde   (const FermionField& in, FermionField& out) = 0;
-      virtual void  DtildeInv(const FermionField& in, FermionField& out) = 0;
+  // EOFA-specific operations
+  // Force user to implement in derived classes
+  virtual void  Omega    (const FermionField& in, FermionField& out, int sign, int dag) = 0;
+  virtual void  Dtilde   (const FermionField& in, FermionField& out) = 0;
+  virtual void  DtildeInv(const FermionField& in, FermionField& out) = 0;

-      // Implement derivatives in base class:
-      // for EOFA both DWF and Mobius just need d(Dw)/dU
-      virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
-        this->DhopDeriv(mat, U, V, dag);
-      };
-      virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
-        this->DhopDerivOE(mat, U, V, dag);
-      };
-      virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
-        this->DhopDerivEO(mat, U, V, dag);
-      };
-
-      // Recompute 5D coefficients for different value of shift constant
-      // (needed for heatbath loop over poles)
-      virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
-
-      // Constructors
-      AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
-        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
-        : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
-          _mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
-      {
-        int Ls = this->Ls;
-        this->alpha = _b + _c;
-        this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
-                    ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
-                    ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
-      };
+  // Implement derivatives in base class:
+  // for EOFA both DWF and Mobius just need d(Dw)/dU
+  virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
+    this->DhopDeriv(mat, U, V, dag);
  };
-}}
+  virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
+    this->DhopDerivOE(mat, U, V, dag);
+  };
+  virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
+    this->DhopDerivEO(mat, U, V, dag);
+  };
+
+  // Recompute 5D coefficients for different value of shift constant
+  // (needed for heatbath loop over poles)
+  virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
+
+  // Constructors
+  // Forwards the gauge field, the four grids, _mq1, _M5 and p to the
+  // CayleyFermion5D base, stores mq1, mq2, mq3, shift and pm, then derives
+  //   alpha = _b + _c
+  //   k     = alpha*(_mq3-_mq2)*(alpha+1)^(2*Ls)
+  //           / ( (alpha+1)^Ls + _mq2*(alpha-1)^Ls )
+  //           / ( (alpha+1)^Ls + _mq3*(alpha-1)^Ls )
+  AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+		      GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+		      RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
+		      RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
+    : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
+			    _mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
+  {
+    // Local copy of the Ls member, used as the exponent below.
+    int Ls = this->Ls;
+    this->alpha = _b + _c;
+    this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
+      ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
+      ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
+  };
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@ -24,203 +24,146 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_CAYLEY_FERMION_H
-#define  GRID_QCD_CAYLEY_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class CayleyFermion5D : public WilsonFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-     template<typename T> struct switcheroo   {
-       static inline int iscomplex()  { return 0; }
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out);
+  virtual RealD  Mdag (const FermionField &in, FermionField &out);

-       template<class vec>
-       static inline vec mult(vec a, vec b) {
-	 return real_mult(a,b);
-       }
-     };
-     template<> struct switcheroo<ComplexD> {
-       static inline int iscomplex()  { return 1; }
+  // half checkerboard operations
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  virtual void   Meo5D (const FermionField &psi, FermionField &chi);

-       template<class vec>
-       static inline vec mult(vec a, vec b) {
-	 return a*b;
-       }
-     };
-     template<> struct switcheroo<ComplexF> {
-       static inline int iscomplex()  { return 1; }
-       template<class vec>
-       static inline vec mult(vec a, vec b) {
-	 return a*b;
-       }
-     };
+  virtual void   M5D   (const FermionField &psi, FermionField &chi);
+  virtual void   M5Ddag(const FermionField &psi, FermionField &chi);

+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  virtual void Dminus(const FermionField &psi, FermionField &chi);
+  virtual void DminusDag(const FermionField &psi, FermionField &chi);
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
+  virtual void ImportUnphysicalFermion(const FermionField &solution5d, FermionField &exported4d);

-    template<class Impl>
-    class CayleyFermion5D : public WilsonFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  ///////////////////////////////////////////////////////////////
+  // Support for MADWF tricks
+  ///////////////////////////////////////////////////////////////
+  RealD Mass(void) { return mass; };
+  void  SetMass(RealD _mass) { 
+    mass=_mass; 
+    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
+  } ;
+  void  P(const FermionField &psi, FermionField &chi);
+  void  Pdag(const FermionField &psi, FermionField &chi);
+  
+  /////////////////////////////////////////////////////
+  // Instantiate different versions depending on Impl
+  /////////////////////////////////////////////////////
+  void M5D(const FermionField &psi,
+	   const FermionField &phi,
+	   FermionField &chi,
+	   Vector<Coeff_t> &lower,
+	   Vector<Coeff_t> &diag,
+	   Vector<Coeff_t> &upper);

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  void M5Ddag(const FermionField &psi,
+	      const FermionField &phi,
+	      FermionField &chi,
+	      Vector<Coeff_t> &lower,
+	      Vector<Coeff_t> &diag,
+	      Vector<Coeff_t> &upper);

-      // half checkerboard operations
-      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
-      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
+  virtual void   Instantiatable(void)=0;

-      virtual void   M5D   (const FermionField &psi, FermionField &chi);
-      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      virtual void   Dminus(const FermionField &psi, FermionField &chi);
-      virtual void   DminusDag(const FermionField &psi, FermionField &chi);
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
-      virtual void ImportUnphysicalFermion(const FermionField &solution5d, FermionField &exported4d);
+  // Efficient support for multigrid coarsening
+  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

-      ///////////////////////////////////////////////////////////////
-      // Support for MADWF tricks
-      ///////////////////////////////////////////////////////////////
-      RealD Mass(void) { return mass; };
-      void  SetMass(RealD _mass) { 
-	mass=_mass; 
-	SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
-      } ;
-      void  P(const FermionField &psi, FermionField &chi);
-      void  Pdag(const FermionField &psi, FermionField &chi);
+  void   Meooe5D       (const FermionField &in, FermionField &out);
+  void   MeooeDag5D    (const FermionField &in, FermionField &out);

-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField &psi,
-	       const FermionField &phi,
-	       FermionField &chi,
-	       std::vector<Coeff_t> &lower,
-	       std::vector<Coeff_t> &diag,
-	       std::vector<Coeff_t> &upper);
+  //    protected:
+  RealD mass;

-      void M5Ddag(const FermionField &psi,
-		  const FermionField &phi,
-		  FermionField &chi,
-		  std::vector<Coeff_t> &lower,
-		  std::vector<Coeff_t> &diag,
-		  std::vector<Coeff_t> &upper);
+  // Save arguments to SetCoefficientsInternal
+  Vector<Coeff_t> _gamma;
+  RealD                _zolo_hi;
+  RealD                _b;
+  RealD                _c;

-      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
-      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
+  // Cayley form Moebius (tanh and zolotarev)
+  Vector<Coeff_t> omega;
+  Vector<Coeff_t> bs;    // S dependent coeffs
+  Vector<Coeff_t> cs;
+  Vector<Coeff_t> as;
+  // For preconditioning Cayley form
+  Vector<Coeff_t> bee;
+  Vector<Coeff_t> cee;
+  Vector<Coeff_t> aee;
+  Vector<Coeff_t> beo;
+  Vector<Coeff_t> ceo;
+  Vector<Coeff_t> aeo;
+  // LDU factorisation of the eeoo matrix
+  Vector<Coeff_t> lee;
+  Vector<Coeff_t> leem;
+  Vector<Coeff_t> uee;
+  Vector<Coeff_t> ueem;
+  Vector<Coeff_t> dee;

-      void MooeeInternalAsm(const FermionField &in, FermionField &out,
-			    int LLs, int site,
-			    Vector<iSinglet<Simd> > &Matp,
-			    Vector<iSinglet<Simd> > &Matm);
-      void MooeeInternalZAsm(const FermionField &in, FermionField &out,
-			    int LLs, int site,
-			    Vector<iSinglet<Simd> > &Matp,
-			    Vector<iSinglet<Simd> > &Matm);
+  // Matrices of 5d ee inverse params
+  Vector<iSinglet<Simd> >  MatpInv;
+  Vector<iSinglet<Simd> >  MatmInv;
+  Vector<iSinglet<Simd> >  MatpInvDag;
+  Vector<iSinglet<Simd> >  MatmInvDag;

+  // Constructors
+  CayleyFermion5D(GaugeField &_Umu,
+		  GridCartesian         &FiveDimGrid,
+		  GridRedBlackCartesian &FiveDimRedBlackGrid,
+		  GridCartesian         &FourDimGrid,
+		  GridRedBlackCartesian &FourDimRedBlackGrid,
+		  RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

-      virtual void   Instantiatable(void)=0;
+  void CayleyReport(void);
+  void CayleyZeroCounters(void);

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  double M5Dflops;
+  double M5Dcalls;
+  double M5Dtime;

-      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  double MooeeInvFlops;
+  double MooeeInvCalls;
+  double MooeeInvTime;

-      void   Meooe5D       (const FermionField &in, FermionField &out);
-      void   MeooeDag5D    (const FermionField &in, FermionField &out);
+protected:
+  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
+  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
+  virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
+};

-      //    protected:
-      RealD mass;
+NAMESPACE_END(Grid);

-      // Save arguments to SetCoefficientsInternal
-      std::vector<Coeff_t> _gamma;
-      RealD                _zolo_hi;
-      RealD                _b;
-      RealD                _c;
-
-      // Cayley form Moebius (tanh and zolotarev)
-      std::vector<Coeff_t> omega;
-      std::vector<Coeff_t> bs;    // S dependent coeffs
-      std::vector<Coeff_t> cs;
-      std::vector<Coeff_t> as;
-      // For preconditioning Cayley form
-      std::vector<Coeff_t> bee;
-      std::vector<Coeff_t> cee;
-      std::vector<Coeff_t> aee;
-      std::vector<Coeff_t> beo;
-      std::vector<Coeff_t> ceo;
-      std::vector<Coeff_t> aeo;
-      // LDU factorisation of the eeoo matrix
-      std::vector<Coeff_t> lee;
-      std::vector<Coeff_t> leem;
-      std::vector<Coeff_t> uee;
-      std::vector<Coeff_t> ueem;
-      std::vector<Coeff_t> dee;
-
-      // Matrices of 5d ee inverse params
-      Vector<iSinglet<Simd> >  MatpInv;
-      Vector<iSinglet<Simd> >  MatmInv;
-      Vector<iSinglet<Simd> >  MatpInvDag;
-      Vector<iSinglet<Simd> >  MatmInvDag;
-
-      // Constructors
-      CayleyFermion5D(GaugeField &_Umu,
-		      GridCartesian         &FiveDimGrid,
-		      GridRedBlackCartesian &FiveDimRedBlackGrid,
-		      GridCartesian         &FourDimGrid,
-		      GridRedBlackCartesian &FourDimRedBlackGrid,
-		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
-
-
-
-     void CayleyReport(void);
-     void CayleyZeroCounters(void);
-
-     double M5Dflops;
-     double M5Dcalls;
-     double M5Dtime;
-
-     double MooeeInvFlops;
-     double MooeeInvCalls;
-     double MooeeInvTime;
-
-    protected:
-      virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
-      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
-      virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
-    };
-
-  }
-}
-#define INSTANTIATE_DPERP(A)\
-template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
-template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
-template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
-template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
-// INSTANTIATE_DPERP(A) above expands to explicit instantiations of M5D, M5Ddag, MooeeInv and MooeeInvDag for implementation type A.
-#undef  CAYLEY_DPERP_DENSE   // not defined: code under #ifdef CAYLEY_DPERP_DENSE is compiled out
-#define  CAYLEY_DPERP_CACHE  // defined: enables the INSTANTIATE_DPERP list guarded by #ifdef CAYLEY_DPERP_CACHE in CayleyFermion5Dcache.cc
-#undef  CAYLEY_DPERP_LINALG  // not defined: code under #ifdef CAYLEY_DPERP_LINALG is compiled out
-#define CAYLEY_DPERP_VEC     // defined: presumably the matching guard in CayleyFermion5Dvec.cc -- TODO confirm
-
-#endif
--- a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
@ -1,249 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-  // Pminus forwards
-  // Pplus  backwards..
-template<class Impl>  // Per block of Ls sites: chi[s] = diag[s]*phi[s] + upper[s]*(spProj5m of psi[s+1]) + lower[s]*(spProj5p of psi[s-1]), with s+1 / s-1 taken cyclically in [0,Ls).
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi, // psi: only its s+1 and s-1 neighbours are read
-				const FermionField &phi, // phi: multiplied by diag[s]; asserted to share psi's checkerboard
-				FermionField &chi, // chi: output field
-				std::vector<Coeff_t> &lower, // lower[s]: weight of the projected s-1 neighbour
-				std::vector<Coeff_t> &diag, // diag[s]: weight of phi at s
-				std::vector<Coeff_t> &upper) // upper[s]: weight of the projected s+1 neighbour
-{
-  int Ls =this->Ls;
-  GridBase *grid=psi._grid;
-  assert(phi.checkerboard == psi.checkerboard);
-  chi.checkerboard=psi.checkerboard; // output inherits psi's checkerboard
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond(); // timer: start time subtracted here, end time added after the loop
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls: ss is the first site of each block of Ls consecutive sites
-    for(int s=0;s<Ls;s++){
-      auto tmp = psi._odata[0]; // supplies the type of tmp; every branch hands tmp to spProj5m/spProj5p before reading it
-      if ( s==0 ) { // first slice: the s-1 neighbour wraps to Ls-1
- 	                            spProj5m(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	                    spProj5p(tmp,psi._odata[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) { // last slice: the s+1 neighbour wraps to 0
-	                            spProj5m(tmp,psi._odata[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
- 	                    spProj5p(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { // interior slice: no wrap-around
-	                            spProj5m(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	                    spProj5p(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  }
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  // Per block of Ls sites: chi[s] = diag[s]*phi[s] + upper[s]*(spProj5p of psi[s+1]) + lower[s]*(spProj5m of psi[s-1]), with s+1 / s-1 taken cyclically in [0,Ls).
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi, // psi: only its s+1 and s-1 neighbours are read
-				   const FermionField &phi, // phi: multiplied by diag[s]; asserted to share psi's checkerboard
-				   FermionField &chi, // chi: output field
-				   std::vector<Coeff_t> &lower, // lower[s]: weight of the projected s-1 neighbour
-				   std::vector<Coeff_t> &diag, // diag[s]: weight of phi at s
-				   std::vector<Coeff_t> &upper) // upper[s]: weight of the projected s+1 neighbour
-{
-  int Ls =this->Ls;
-  GridBase *grid=psi._grid;
-  assert(phi.checkerboard == psi.checkerboard);
-  chi.checkerboard=psi.checkerboard; // output inherits psi's checkerboard
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++; // shares the M5D call counter and timer
-  M5Dtime-=usecond(); // timer: start time subtracted here, end time added after the loop
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls: ss is the first site of each block of Ls consecutive sites
-    auto tmp = psi._odata[0]; // supplies the type of tmp; every branch hands tmp to spProj5p/spProj5m before reading it
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) { // first slice: the s-1 neighbour wraps to Ls-1
-	spProj5p(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi._odata[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) { // last slice: the s+1 neighbour wraps to 0
-	spProj5p(tmp,psi._odata[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { // interior slice: no wrap-around
-	spProj5p(tmp,psi._odata[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi._odata[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  }
-  M5Dtime+=usecond();
-}
-
-template<class Impl> // For each block of Ls sites, turns psi into chi through four sweeps over s that use the member tables lee, leem, ueem/dee and uee.
-void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-{
-  GridBase *grid=psi._grid;
-  int Ls=this->Ls;
-
-  chi.checkerboard=psi.checkerboard; // output inherits psi's checkerboard
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond(); // timer: start time subtracted here, end time added after the loop
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls: ss is the first site of each block of Ls consecutive sites
-    auto tmp = psi._odata[0]; // supplies the type of tmp; it is passed to spProj5p/spProj5m before each read
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss]=psi[ss]; // chi[0]=psi[0]
-    for(int s=1;s<Ls;s++){ // forward sweep: each step reads the chi[s-1] written by the previous step
-                            spProj5p(tmp,chi[ss+s-1]);  
-      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
-    }
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-                                   spProj5m(tmp,chi[ss+s]);    
-      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp; // only the last slice is modified in this sweep
-    }
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-                                                spProj5p(tmp,chi[ss+Ls-1]); 
-      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp; // chi[Ls-1] is still unscaled here, hence the explicit 1/dee[Ls-1]
-    }	
-    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1]; // last slice rescaled only after the loop above has read it
-      
-    // Apply U^{-1}
-    for (int s=Ls-2;s>=0;s--){ // backward sweep: each step reads the chi[s+1] written by the previous step
-                            spProj5m(tmp,chi[ss+s+1]);  
-      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
-    }
-  }
-
-  MooeeInvTime+=usecond();
-
-}
-
-template<class Impl> // For each block of Ls sites, turns psi into chi through four sweeps over s that use the complex conjugates of uee, ueem, leem/dee and lee.
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  GridBase *grid=psi._grid;
-  int Ls=this->Ls;
-
-  assert(psi.checkerboard == psi.checkerboard); // NOTE(review): compares psi with itself, so this can never fire
-  chi.checkerboard=psi.checkerboard; // output inherits psi's checkerboard
-
-  std::vector<Coeff_t> ueec(Ls); // conjugated copies of the member tables, built once per call outside the parallel loop
-  std::vector<Coeff_t> deec(Ls);
-  std::vector<Coeff_t> leec(Ls);
-  std::vector<Coeff_t> ueemc(Ls);
-  std::vector<Coeff_t> leemc(Ls);
-  for(int s=0;s<ueec.size();s++){ // int compared against size_t; ueec.size() equals Ls
-    ueec[s] = conjugate(uee[s]);
-    deec[s] = conjugate(dee[s]);
-    leec[s] = conjugate(lee[s]);
-    ueemc[s]= conjugate(ueem[s]);
-    leemc[s]= conjugate(leem[s]);
-  }
-  MooeeInvCalls++; // shares the MooeeInv call counter and timer
-  MooeeInvTime-=usecond(); // timer: start time subtracted here, end time added after the loop
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls: ss is the first site of each block of Ls consecutive sites
-
-    auto tmp = psi._odata[0]; // supplies the type of tmp; it is passed to spProj5m/spProj5p before each read
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss]=psi[ss];
-    for (int s=1;s<Ls;s++){ // forward sweep: each step reads the chi[s-1] written by the previous step
-                            spProj5m(tmp,chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s]-ueec[s-1]*tmp;
-    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-                                   spProj5p(tmp,chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp; // only the last slice is modified in this sweep
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      spProj5m(tmp,chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/deec[s])*chi[ss+s]-(leemc[s]/deec[Ls-1])*tmp; // chi[Ls-1] is still unscaled here, hence the explicit 1/deec[Ls-1]
-    }	
-    chi[ss+Ls-1]= (1.0/deec[Ls-1])*chi[ss+Ls-1]; // last slice rescaled only after the loop above has read it
-  
-    // Apply L^{-dagger}
-    for (int s=Ls-2;s>=0;s--){ // backward sweep: each step reads the chi[s+1] written by the previous step
-      spProj5p(tmp,chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - leec[s]*tmp;
-    }
-  }
-
-  MooeeInvTime+=usecond();
-
-}
-
-#ifdef CAYLEY_DPERP_CACHE // the explicit instantiations below are emitted only when this macro is defined
-  INSTANTIATE_DPERP(WilsonImplF);
-  INSTANTIATE_DPERP(WilsonImplD);
-  INSTANTIATE_DPERP(GparityWilsonImplF);
-  INSTANTIATE_DPERP(GparityWilsonImplD);
-  INSTANTIATE_DPERP(ZWilsonImplF);
-  INSTANTIATE_DPERP(ZWilsonImplD);
-// Second group: the same three implementation families with FH / DF suffixes.
-  INSTANTIATE_DPERP(WilsonImplFH);
-  INSTANTIATE_DPERP(WilsonImplDF);
-  INSTANTIATE_DPERP(GparityWilsonImplFH);
-  INSTANTIATE_DPERP(GparityWilsonImplDF);
-  INSTANTIATE_DPERP(ZWilsonImplFH);
-  INSTANTIATE_DPERP(ZWilsonImplDF);
-#endif // CAYLEY_DPERP_CACHE
-
-}}
--- a/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -1,828 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-namespace Grid {
-namespace QCD {  
-  /*
-   * Dense matrix versions of routines
-   */
-template<class Impl> // Thin wrapper: delegates to MooeeInternal with dagger and inverse both requested.
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerYes,InverseYes); // psi is the input, chi the output argument
-}
-  
-template<class Impl> // Thin wrapper: delegates to MooeeInternal with inverse requested and no dagger.
-void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerNo,InverseYes); // psi is the input, chi the output argument
-}
-template<class Impl>  // Variant with s spread over SIMD lanes. Components 0,1 of chi get diag*phi + lower*psi[s-1]; components 2,3 get diag*phi + upper*psi[s+1]; neighbours are cyclic in s.
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi, // psi: only its s+1 and s-1 neighbours are read
-				const FermionField &phi, // phi: multiplied by diag; asserted to share psi's checkerboard
-				FermionField &chi, // chi: output field, written with vstream
-				std::vector<Coeff_t> &lower, // lower[s], s in [0,Ls)
-				std::vector<Coeff_t> &diag, // diag[s], s in [0,Ls)
-				std::vector<Coeff_t> &upper) // upper[s], s in [0,Ls)
-{
-  GridBase *grid=psi._grid;
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0]; // number of SIMD words per block along s; asserted below to satisfy Ls/LLs==nsimd
-  const int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs); // upper/lower/diag repacked so that one SIMD word holds the coefficients of nsimd values of s
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.checkerboard == psi.checkerboard);
-
-  chi.checkerboard=psi.checkerboard; // output inherits psi's checkerboard
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-  for(int i=0;i<nsimd;i++){ //inner
-    int s  = o+i*LLs; // coefficient index in the caller's tables
-    int ss = o*nsimd+i; // scalar slot i of word o (assumes the scalars of a Simd word are contiguous -- TODO confirm)
-    u_p[ss] = upper[s];
-    l_p[ss] = lower[s];
-    d_p[ss] = diag[s];
-  }}
-
-
-  M5Dcalls++;
-  M5Dtime-=usecond(); // timer: start time subtracted here, end time added after the loop
-
-  assert(Nc==3); // the unrolled body below hard-codes last-index values 0..2
-
-  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-#if 0 // disabled generic version built on spProj5/rotate/spRecon5
-      alignas(64) SiteHalfSpinor hp;
-      alignas(64) SiteHalfSpinor hm;
-      alignas(64) SiteSpinor fp;
-      alignas(64) SiteSpinor fm;
-
-      for(int v=0;v<LLs;v++){
-
-	int vp=(v+1)%LLs;
-	int vm=(v+LLs-1)%LLs;
-
-	spProj5m(hp,psi[ss+vp]);
-	spProj5p(hm,psi[ss+vm]);
-
-	if ( vp<=v ) rotate(hp,hp,1);
-	if ( vm>=v ) rotate(hm,hm,nsimd-1);
-	
-	hp=0.5*hp;
-        hm=0.5*hm;
-
-	spRecon5m(fp,hp);
-	spRecon5p(fm,hm);
-
-	chi[ss+v] = d[v]*phi[ss+v];
-	chi[ss+v] = chi[ss+v]     +u[v]*fp;
-	chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-      }
-#else // active hand-unrolled version
-      for(int v=0;v<LLs;v++){
-
-	vprefetch(psi[ss+v+LLs]); // same word of the next block; NOTE(review): index is >= oSites() for the last block -- confirm this is tolerated
-
-	int vp= (v==LLs-1) ? 0     : v+1; // cyclic successor of v
-	int vm= (v==0    ) ? LLs-1 : v-1; // cyclic predecessor of v
-	
-	Simd hp_00 = psi[ss+vp]()(2)(0); 
-	Simd hp_01 = psi[ss+vp]()(2)(1); 
-	Simd hp_02 = psi[ss+vp]()(2)(2); 
-	Simd hp_10 = psi[ss+vp]()(3)(0); 
-	Simd hp_11 = psi[ss+vp]()(3)(1); 
-	Simd hp_12 = psi[ss+vp]()(3)(2); 
-	
-	Simd hm_00 = psi[ss+vm]()(0)(0); 
-	Simd hm_01 = psi[ss+vm]()(0)(1); 
-	Simd hm_02 = psi[ss+vm]()(0)(2); 
-	Simd hm_10 = psi[ss+vm]()(1)(0); 
-	Simd hm_11 = psi[ss+vm]()(1)(1); 
-	Simd hm_12 = psi[ss+vm]()(1)(2); 
-
-	if ( vp<=v ) { // true only when vp wrapped to 0 (v==LLs-1): the wrapped word gets a lane rotation
-	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-	}
-	if ( vm>=v ) { // true only when vm wrapped to LLs-1 (v==0): rotation in the opposite sense
-	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-	}
-
-	// Can force these to real arithmetic and save 2x.
-	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
-	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
-	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-	vstream(chi[ss+v]()(0)(0),p_00);
-	vstream(chi[ss+v]()(0)(1),p_01);
-	vstream(chi[ss+v]()(0)(2),p_02);
-	vstream(chi[ss+v]()(1)(0),p_10);
-	vstream(chi[ss+v]()(1)(1),p_11);
-	vstream(chi[ss+v]()(1)(2),p_12);
-	vstream(chi[ss+v]()(2)(0),p_20);
-	vstream(chi[ss+v]()(2)(1),p_21);
-	vstream(chi[ss+v]()(2)(2),p_22);
-	vstream(chi[ss+v]()(3)(0),p_30);
-	vstream(chi[ss+v]()(3)(1),p_31);
-	vstream(chi[ss+v]()(3)(2),p_32);
-
-      }
-#endif
-  }
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  // Variant with s spread over SIMD lanes. Components 0,1 of chi get diag*phi + upper*psi[s+1]; components 2,3 get diag*phi + lower*psi[s-1]; neighbours are cyclic in s.
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi, // psi: only its s+1 and s-1 neighbours are read
-				   const FermionField &phi, // phi: multiplied by diag; asserted to share psi's checkerboard
-				   FermionField &chi, // chi: output field, written with vstream
-				   std::vector<Coeff_t> &lower, // lower[s], s in [0,Ls)
-				   std::vector<Coeff_t> &diag, // diag[s], s in [0,Ls)
-				   std::vector<Coeff_t> &upper) // upper[s], s in [0,Ls)
-{
-  GridBase *grid=psi._grid;
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0]; // number of SIMD words per block along s; asserted below to satisfy Ls/LLs==nsimd
-  int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs); // upper/lower/diag repacked so that one SIMD word holds the coefficients of nsimd values of s
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.checkerboard == psi.checkerboard);
-
-  chi.checkerboard=psi.checkerboard; // output inherits psi's checkerboard
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-  for(int i=0;i<nsimd;i++){ //inner
-    int s  = o+i*LLs; // coefficient index in the caller's tables
-    int ss = o*nsimd+i; // scalar slot i of word o (assumes the scalars of a Simd word are contiguous -- TODO confirm)
-    u_p[ss] = upper[s];
-    l_p[ss] = lower[s];
-    d_p[ss] = diag[s];
-  }}
-
-  M5Dcalls++; // shares the M5D call counter and timer
-  M5Dtime-=usecond(); // timer: start time subtracted here, end time added after the loop
-  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-#if 0 // disabled generic version built on spProj5/rotate/spRecon5
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0;v<LLs;v++){
-
-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
-
-      spProj5p(hp,psi[ss+vp]);
-      spProj5m(hm,psi[ss+vm]);
-
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
-      
-      hp=hp*0.5;
-      hm=hm*0.5;
-      spRecon5p(fp,hp);
-      spRecon5m(fm,hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-#else // active hand-unrolled version
-      for(int v=0;v<LLs;v++){
-
-	vprefetch(psi[ss+v+LLs]); // same word of the next block; NOTE(review): index is >= oSites() for the last block -- confirm this is tolerated
-
-	int vp= (v==LLs-1) ? 0     : v+1; // cyclic successor of v
-	int vm= (v==0    ) ? LLs-1 : v-1; // cyclic predecessor of v
-	
-	Simd hp_00 = psi[ss+vp]()(0)(0); 
-	Simd hp_01 = psi[ss+vp]()(0)(1); 
-	Simd hp_02 = psi[ss+vp]()(0)(2); 
-	Simd hp_10 = psi[ss+vp]()(1)(0); 
-	Simd hp_11 = psi[ss+vp]()(1)(1); 
-	Simd hp_12 = psi[ss+vp]()(1)(2); 
-	
-	Simd hm_00 = psi[ss+vm]()(2)(0); 
-	Simd hm_01 = psi[ss+vm]()(2)(1); 
-	Simd hm_02 = psi[ss+vm]()(2)(2); 
-	Simd hm_10 = psi[ss+vm]()(3)(0); 
-	Simd hm_11 = psi[ss+vm]()(3)(1); 
-	Simd hm_12 = psi[ss+vm]()(3)(2); 
-
-	if ( vp<=v ) { // true only when vp wrapped to 0 (v==LLs-1): the wrapped word gets a lane rotation
-	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-	}
-	if ( vm>=v ) { // true only when vm wrapped to LLs-1 (v==0): rotation in the opposite sense
-	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-	}
-
-	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
-	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
-	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-
-	vstream(chi[ss+v]()(0)(0),p_00);
-	vstream(chi[ss+v]()(0)(1),p_01);
-	vstream(chi[ss+v]()(0)(2),p_02);
-	vstream(chi[ss+v]()(1)(0),p_10);
-	vstream(chi[ss+v]()(1)(1),p_11);
-	vstream(chi[ss+v]()(1)(2),p_12);
-	vstream(chi[ss+v]()(2)(0),p_20);
-	vstream(chi[ss+v]()(2)(1),p_21);
-	vstream(chi[ss+v]()(2)(2),p_22);
-	vstream(chi[ss+v]()(3)(0),p_30);
-	vstream(chi[ss+v]()(3)(1),p_31);
-	vstream(chi[ss+v]()(3)(2),p_32);
-      }
-#endif
-  }
-  M5Dtime+=usecond();
-}
-
-
-#ifdef AVX512 
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
-#include <simd/Intel512single.h>
-#endif 
-
-template<class Impl> // For one block (index site) of LLs words: chi[s1] = sum over (s2, lane l) of Mat[LLs*(s2+l*LLs)+s1] times lane l of psi[s2]; Matp acts on components 0,1 and Matm on components 2,3.
-void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
-					     int LLs, int site,
-					     Vector<iSinglet<Simd> > &Matp,
-					     Vector<iSinglet<Simd> > &Matm)
-{
-#ifndef AVX512 // portable path, used when AVX512 is not defined
-  {
-  SiteHalfSpinor BcastP;
-  SiteHalfSpinor BcastM;
-  SiteHalfSpinor SiteChiP; // accumulator for output components 0,1
-  SiteHalfSpinor SiteChiM; // accumulator for output components 2,3
-
-  // Ls*Ls * 2 * 12 * vol flops
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-        int s=s2+l*LLs; // row of Mat addressed by (word s2, lane l)
-	int lex=s2+LLs*site; // index of input word s2 within this block
-	
-	if ( s2==0 && l==0) { // first term for this s1: reset both accumulators
-	  SiteChiP=zero;
-	  SiteChiM=zero;
-	}
-	
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l); // presumably copies lane l into every lane -- TODO confirm
-	}}
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	}}
-
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
-	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
-	}}
-
-    }}
-    {
-      int lex = s1+LLs*site; // index of output word s1 within this block
-      for(int sp=0;sp<2;sp++){
-      for(int co=0;co<Nc;co++){
-	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-      }}
-    }
-  }
-
-  }
-#else // AVX512 inline-assembly path
-  {
-  // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0   %%zmm13
-#define BCAST1   %%zmm14
-#define BCAST2   %%zmm15
-#define BCAST3   %%zmm16
-#define BCAST4   %%zmm17
-#define BCAST5   %%zmm18
-#define BCAST6   %%zmm19
-#define BCAST7   %%zmm20
-#define BCAST8   %%zmm21
-#define BCAST9   %%zmm22
-#define BCAST10  %%zmm23
-#define BCAST11  %%zmm24
-
-  int incr=LLs*LLs*sizeof(iSinglet<Simd>); // byte step from Mat[LLs*s2+s1] to the entry for the next lane, i.e. Mat[LLs*(s2+LLs)+s1]
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      int lex=s2+LLs*site;
-      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-      uint64_t a2 = (uint64_t)&psi[lex];
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	if ( (s2+l)==0 ) { // first term for this s1: uses VMULMEM where the else branch uses VMADDMEM
-	  asm ( // NOTE(review): no zmm clobbers are declared and the Chi_* registers are expected to survive between separate asm statements
-  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
-  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
-  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
-		   VBCASTCDUP(0,%2,BCAST0)   
-		   VBCASTCDUP(1,%2,BCAST1)   
-		   VBCASTCDUP(2,%2,BCAST2)   
-		   VBCASTCDUP(3,%2,BCAST3)   
-		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
-		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
-		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
-		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
-		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
-		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
-		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
-		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
-		   VMULMEM (0,%1,BCAST8,Chi_22)         
-		   VMULMEM (0,%1,BCAST9,Chi_30)
-		   VMULMEM (0,%1,BCAST10,Chi_31)       
-		   VMULMEM (0,%1,BCAST11,Chi_32)
-		   : : "r" (a0), "r" (a1), "r" (a2)  );
-	} else { 
-	  asm (
-		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
-		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
-		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
-		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
-		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
-		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
-		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
-		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
-		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
-		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
-		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
-		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
-		   : : "r" (a0), "r" (a1), "r" (a2)  );
-	}
-	a0 = a0+incr; // next lane: advance both matrix pointers by one lane stride
-	a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type); // next lane: step one scalar further into psi[lex]
-      }}
-    {
-      int lexa = s1+LLs*site; // index of output word s1 within this block
-      asm (
-	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
-	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
-	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
-	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
-	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-    }
-  }
-  }
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-}; // NOTE(review): the ';' after the function body is a stray empty declaration
-
-  // Z-mobius version
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
-					     int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
-{
-#ifndef AVX512
-  {
-  SiteHalfSpinor BcastP;
-  SiteHalfSpinor BcastM;
-  SiteHalfSpinor SiteChiP;
-  SiteHalfSpinor SiteChiM;
-
-  // Ls*Ls * 2 * 12 * vol flops
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-        int s=s2+l*LLs;
-	int lex=s2+LLs*site;
-	
-	if ( s2==0 && l==0) {
-	  SiteChiP=zero;
-	  SiteChiM=zero;
-	}
-	
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
-	}}
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	}}
-
-	for(int sp=0;sp<2;sp++){
-        for(int co=0;co<Nc;co++){
-	  SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
-	  SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
-	}}
-
-
-    }}
-    {
-      int lex = s1+LLs*site;
-      for(int sp=0;sp<2;sp++){
-      for(int co=0;co<Nc;co++){
-	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-      }}
-    }
-  }
-
-  }
-#else
-  {
-  // pointers
-  //  MASK_REGS;
-#define Chi_00 %zmm0
-#define Chi_01 %zmm1
-#define Chi_02 %zmm2
-#define Chi_10 %zmm3
-#define Chi_11 %zmm4
-#define Chi_12 %zmm5
-#define Chi_20 %zmm6
-#define Chi_21 %zmm7
-#define Chi_22 %zmm8
-#define Chi_30 %zmm9
-#define Chi_31 %zmm10
-#define Chi_32 %zmm11
-#define pChi_00 %%zmm0
-#define pChi_01 %%zmm1
-#define pChi_02 %%zmm2
-#define pChi_10 %%zmm3
-#define pChi_11 %%zmm4
-#define pChi_12 %%zmm5
-#define pChi_20 %%zmm6
-#define pChi_21 %%zmm7
-#define pChi_22 %%zmm8
-#define pChi_30 %%zmm9
-#define pChi_31 %%zmm10
-#define pChi_32 %%zmm11
-
-#define BCAST_00   %zmm12
-#define  SHUF_00   %zmm13
-#define BCAST_01   %zmm14
-#define  SHUF_01   %zmm15
-#define BCAST_02   %zmm16
-#define  SHUF_02   %zmm17
-#define BCAST_10   %zmm18
-#define  SHUF_10   %zmm19
-#define BCAST_11   %zmm20
-#define  SHUF_11   %zmm21
-#define BCAST_12   %zmm22
-#define  SHUF_12   %zmm23
-
-#define Mp  %zmm24
-#define Mps %zmm25
-#define Mm  %zmm26
-#define Mms %zmm27
-#define N 8
-  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
-  for(int s1=0;s1<LLs;s1++){ 
-    for(int s2=0;s2<LLs;s2++){ 
-      int lex=s2+LLs*site;
-      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-      uint64_t a2 = (uint64_t)&psi[lex];
-      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	if ( (s2+l)==0 ) {
-	  LOAD64(%r8,a0);
-	  LOAD64(%r9,a1);
-	  LOAD64(%r10,a2);
-	  asm (
-	       VLOAD(0,%r8,Mp)// i r
-	       VLOAD(0,%r9,Mm)
-	       VSHUF(Mp,Mps)  // r i 
-	       VSHUF(Mm,Mms)
-	       VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
-	       VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
-
-	       VMULIDUP(0*N,%r10,Mps,Chi_00)
-	       VMULIDUP(1*N,%r10,Mps,Chi_01)
-	       VMULIDUP(2*N,%r10,Mps,Chi_02)
-	       VMULIDUP(3*N,%r10,Mps,Chi_10)
-	       VMULIDUP(4*N,%r10,Mps,Chi_11)
-	       VMULIDUP(5*N,%r10,Mps,Chi_12)
-
-	       VMULIDUP(6*N ,%r10,Mms,Chi_20)
-	       VMULIDUP(7*N ,%r10,Mms,Chi_21)
-	       VMULIDUP(8*N ,%r10,Mms,Chi_22)
-	       VMULIDUP(9*N ,%r10,Mms,Chi_30)
-	       VMULIDUP(10*N,%r10,Mms,Chi_31)
-	       VMULIDUP(11*N,%r10,Mms,Chi_32)
-
-	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
-	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
-	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-	       VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
-	       VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
-	       VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
-	       VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
-	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-	       );
-	} else { 
-	  LOAD64(%r8,a0);
-	  LOAD64(%r9,a1);
-	  LOAD64(%r10,a2);
-	  asm (
-	       VLOAD(0,%r8,Mp)
-	       VSHUF(Mp,Mps)
-
-	       VLOAD(0,%r9,Mm)
-	       VSHUF(Mm,Mms)
-
-	       VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
-	       VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
-	       VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
-	       VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
-	       VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
-	       VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
-
-	       VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
-	       VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
-	       VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
-	       VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
-	       VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
-	       VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
-
-	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
-	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
-	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-	       VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
-	       VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
-	       VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
-	       VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
-	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-	       );
-	}
-	a0 = a0+incr;
-	a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type);
-      }}
-    {
-      int lexa = s1+LLs*site;
-      /*
-      SiteSpinor tmp;
-      asm (
-	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	       : : "r" ((uint64_t)&tmp) : "memory" );
-      */
-
-      asm (
-	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      //      if ( 1 || (site==0) ) { 
-      //	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
-      //      }
-    }
-  }
-  }
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-};
-
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
-{
-  int Ls=this->Ls;
-  int LLs = psi._grid->_rdimensions[0];
-  int vol = psi._grid->oSites()/LLs;
-
-  chi.checkerboard=psi.checkerboard;
-  
-  Vector<iSinglet<Simd> >  Matp;
-  Vector<iSinglet<Simd> >  Matm;
-  Vector<iSinglet<Simd> >  *_Matp;
-  Vector<iSinglet<Simd> >  *_Matm;
-  
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if ( inv && dag ) { 
-    _Matp = &MatpInvDag;
-    _Matm = &MatmInvDag;
-  }
-  if ( inv && (!dag) ) { 
-    _Matp = &MatpInv;
-    _Matm = &MatmInv;
-  } 
-  if ( !inv ) {
-    MooeeInternalCompute(dag,inv,Matp,Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-  assert(_Matp->size()==Ls*LLs);
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  if ( switcheroo<Coeff_t>::iscomplex() ) {
-    parallel_for(auto site=0;site<vol;site++){
-      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
-    }
-  } else { 
-    parallel_for(auto site=0;site<vol;site++){
-      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
-    }
-  }
-  MooeeInvTime+=usecond();
-}
-
-INSTANTIATE_DPERP(DomainWallVec5dImplD);
-INSTANTIATE_DPERP(DomainWallVec5dImplF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplFH);
-
-template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-template void CayleyFermion5D<DomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-
-
-}}
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
@ -1,323 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
-
-namespace Grid {
-  namespace QCD {
-
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
-    {
-      SetCoefficientsZolotarev(1.0/scale,zdata);
-    }
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
-    {
-      // How to check Ls matches??
-      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-      int Ls = this->Ls;
-      assert(zdata->db==Ls);// Beta has Ls coeffs
-
-      R=(1+this->mass)/(1-this->mass);
-
-      Beta.resize(Ls);
-      cc.resize(Ls);
-      cc_d.resize(Ls);
-      sqrt_cc.resize(Ls);
-      for(int i=0; i < Ls ; i++){
-	Beta[i] = zdata -> beta[i];
-	cc[i] = 1.0/Beta[i];
-	cc_d[i]=sqrt(cc[i]);
-      }
-    
-      cc_d[Ls-1]=1.0;
-      for(int i=0; i < Ls-1 ; i++){
-	sqrt_cc[i]= sqrt(cc[i]*cc[i+1]);
-      }    
-      sqrt_cc[Ls-2]=sqrt(cc[Ls-2]);
-
-
-      ZoloHiInv =1.0/zolo_hi;
-      dw_diag = (4.0-this->M5)*ZoloHiInv;
-    
-      See.resize(Ls);
-      Aee.resize(Ls);
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	Aee[s] = sign * Beta[s] * dw_diag;
-	sign   = - sign;
-      }
-      Aee[Ls-1] += R;
-    
-      See[0] = Aee[0];
-      for(int s=1;s<Ls;s++){
-	See[s] = Aee[s] - 1.0/See[s-1];
-      }
-      for(int s=0;s<Ls;s++){
-	std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
-      }
-    }
-
-
-
-    template<class Impl>
-    RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      FermionField D(psi._grid);
-
-      this->DW(psi,D,DaggerNo); 
-
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==0 ) {
-	  ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-	} else if ( s==(Ls-1) ){
-	  RealD R=(1.0+mass)/(1.0-mass);
-	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
-	  ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
-  	  axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-	}
-	sign=-sign; 
-      }
-      return norm2(chi);
-    }
-    template<class Impl>
-    RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
-    {
-      // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
-      // The rest of matrix is symmetric.
-      // Can ignore "dag"
-      return M(psi,chi);
-    }
-    template<class Impl>
-    void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-      int Ls = this->Ls;
-
-      this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
-
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==(Ls-1) ){
-	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-	}
-	sign=-sign; 
-      }
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      // Apply 4d dslash
-      if ( psi.checkerboard == Odd ) {
-	this->DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
-      } else {
-	this->DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
-      }
-      
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==(Ls-1) ){
-	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-	}
-	sign=-sign; 
-      }
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-    {
-      this->Meooe(psi,chi);
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      int sign=1;
-      for(int s=0;s<Ls;s++){
-	if ( s==0 ) {
-	  ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-	} else if ( s==(Ls-1) ){
-	  // Drop the CC here.
-	  double R=(1+mass)/(1-mass);
-	  ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
-	  ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
-	} else {
-	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
-	  axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-	}
-	sign=-sign; 
-      }
-    }
-
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-    {
-      this->Mooee(psi,chi);
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-    {
-      int Ls = this->Ls;
-
-      // Apply Linv
-      axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); 
-      for(int s=1;s<Ls;s++){
-	axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
-      }
-      // Apply Dinv
-      for(int s=0;s<Ls;s++){
-	ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
-      }
-      // Apply Uinv = (Linv)^T
-      axpby_ssp(chi,1.0/cc_d[Ls-1],chi,0.0,chi,Ls-1,Ls-1);
-      for(int s=Ls-2;s>=0;s--){
-	axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
-      }
-    }
-    template<class Impl>
-    void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-    {
-      this->MooeeInv(psi,chi);
-    }
-
-  // force terms; five routines; default to Dhop on diagonal
-    template<class Impl>
-   void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int sign=1;
-    for(int s=0;s<Ls;s++){
-      if ( s==(Ls-1) ){
-	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-      } else {
-	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-      }
-      sign=-sign; 
-    }
-    this->DhopDeriv(mat,D,V,DaggerNo); 
-  };
-    template<class Impl>
-   void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int sign=1;
-    for(int s=0;s<Ls;s++){
-      if ( s==(Ls-1) ){
-	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-      } else {
-	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-      }
-      sign=-sign; 
-    }
-    this->DhopDerivOE(mat,D,V,DaggerNo); 
-  };
-  template<class Impl>
-  void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int sign=1;
-    for(int s=0;s<Ls;s++){
-      if ( s==(Ls-1) ){
-	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-      } else {
-	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-      }
-      sign=-sign; 
-    }
-    this->DhopDerivEO(mat,D,V,DaggerNo); 
-  };
-    
-    // Constructors
-    template<class Impl>
-    ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
-							   GaugeField &_Umu,
-							   GridCartesian         &FiveDimGrid,
-							   GridRedBlackCartesian &FiveDimRedBlackGrid,
-							   GridCartesian         &FourDimGrid,
-							   GridRedBlackCartesian &FourDimRedBlackGrid,
-							   RealD _mass,RealD M5,const ImplParams &p) :
-      WilsonFermion5D<Impl>(_Umu,
-			    FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid,M5,p),
-      mass(_mass)
-    {
-      int Ls = this->Ls;
-      assert((Ls&0x1)==1); // Odd Ls required
-    }
-
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      int Ls = this->Ls;
-      conformable(solution5d._grid,this->FermionGrid());
-      conformable(exported4d._grid,this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
-    }
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      int Ls = this->Ls;
-      conformable(imported5d._grid,this->FermionGrid());
-      conformable(input4d._grid   ,this->GaugeGrid());
-      FermionField tmp(this->FermionGrid());
-      tmp=zero;
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
-      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
-      this->Dminus(tmp,imported5d);
-    }
-
-    FermOpTemplateInstantiate(ContinuedFractionFermion5D);
-
-  }
-}
-
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,46 +24,44 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_CONTINUED_FRACTION_H
 #define  GRID_QCD_CONTINUED_FRACTION_H

 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out);
+  virtual RealD  Mdag (const FermionField &in, FermionField &out);

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  // half checkerboard operaions
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);

-      // half checkerboard operaions
-      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  //      virtual void   Instantiatable(void)=0;
+  virtual void   Instantiatable(void) =0;

-      //      virtual void   Instantiatable(void)=0;
-      virtual void   Instantiatable(void) =0;
-
-      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  // Efficient support for multigrid coarsening
+  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      ///////////////////////////////////////////////////////////////
      // Physical surface field utilities
@ -73,35 +71,34 @@ namespace Grid {
      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

-      // Constructors
-      ContinuedFractionFermion5D(GaugeField &_Umu,
-				 GridCartesian         &FiveDimGrid,
-				 GridRedBlackCartesian &FiveDimRedBlackGrid,
-				 GridCartesian         &FourDimGrid,
-				 GridRedBlackCartesian &FourDimRedBlackGrid,
-				 RealD _mass,RealD M5,const ImplParams &p= ImplParams());
+  // Constructors
+  ContinuedFractionFermion5D(GaugeField &_Umu,
+			     GridCartesian         &FiveDimGrid,
+			     GridRedBlackCartesian &FiveDimRedBlackGrid,
+			     GridCartesian         &FourDimGrid,
+			     GridRedBlackCartesian &FourDimRedBlackGrid,
+			     RealD _mass,RealD M5,const ImplParams &p= ImplParams());

-    protected:
+protected:

-      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
-      void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);;
+  void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
+  void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);;

-      // Cont frac
-      RealD dw_diag;
-      RealD mass;
-      RealD R;
-      RealD ZoloHiInv;
-      std::vector<double> Beta;
-      std::vector<double> cc;;
-      std::vector<double> cc_d;;
-      std::vector<double> sqrt_cc;
-      std::vector<double> See;
-      std::vector<double> Aee;
+  // Cont frac
+  RealD dw_diag;
+  RealD mass;
+  RealD R;
+  RealD ZoloHiInv;
+  Vector<double> Beta;
+  Vector<double> cc;;
+  Vector<double> cc_d;;
+  Vector<double> sqrt_cc;
+  Vector<double> See;
+  Vector<double> Aee;

-    };
+};


-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
@ -1,438 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    template<class Impl>
-    DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
-      GaugeField            &_Umu,
-      GridCartesian         &FiveDimGrid,
-      GridRedBlackCartesian &FiveDimRedBlackGrid,
-      GridCartesian         &FourDimGrid,
-      GridRedBlackCartesian &FourDimRedBlackGrid,
-      RealD _mq1, RealD _mq2, RealD _mq3,
-      RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
-    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-        _shift, _pm, _M5, 1.0, 0.0, p)
-    {
-        RealD eps = 1.0;
-        Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
-        assert(zdata->n == this->Ls);
-
-        std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-
-        Approx::zolotarev_free(zdata);
-    }
-
-    /***************************************************************
-     * Additional EOFA operators only called outside the inverter.
-     * Since speed is not essential, simple axpby-style
-     * implementations should be fine.
-     ***************************************************************/
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-    {
-        int Ls = this->Ls;
-
-        Din = zero;
-        if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
-        else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-        else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
-        else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-    }
-
-    // This is just the identity for DWF
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-    // This is just the identity for DWF
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-    /*****************************************************************************************************/
-
-    template<class Impl>
-    RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        FermionField Din(psi._grid);
-
-        this->Meooe5D(psi, Din);
-        this->DW(Din, chi, DaggerNo);
-        axpby(chi, 1.0, 1.0, chi, psi);
-        this->M5D(psi, chi);
-        return(norm2(chi));
-    }
-
-    template<class Impl>
-    RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        FermionField Din(psi._grid);
-
-        this->DW(psi, Din, DaggerYes);
-        this->MeooeDag5D(Din, chi);
-        this->M5Ddag(psi, chi);
-        axpby(chi, 1.0, 1.0, chi, psi);
-        return(norm2(chi));
-    }
-
-    /********************************************************************
-     * Performance critical fermion operators called inside the inverter
-     ********************************************************************/
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD shift = this->shift;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-
-        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-        Coeff_t shiftp(0.0), shiftm(0.0);
-        if(shift != 0.0){
-          if(pm == 1){ shiftp = shift*(mq3-mq2); }
-          else{ shiftm = -shift*(mq3-mq2); }
-        }
-
-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
-
-        #if(0)
-            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-            for(int i=0; i<diag.size(); ++i){
-                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-            }
-            for(int i=0; i<upper.size(); ++i){
-                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-            }
-            for(int i=0; i<lower.size(); ++i){
-                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-            }
-        #endif
-
-        this->M5D(psi, chi, chi, lower, diag, upper);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD shift = this->shift;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-
-        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-        Coeff_t shiftp(0.0), shiftm(0.0);
-        if(shift != 0.0){
-          if(pm == 1){ shiftp = shift*(mq3-mq2); }
-          else{ shiftm = -shift*(mq3-mq2); }
-        }
-
-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
-
-        #if(0)
-            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-            for(int i=0; i<diag.size(); ++i){
-                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-            }
-            for(int i=0; i<upper.size(); ++i){
-                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-            }
-            for(int i=0; i<lower.size(); ++i){
-                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-            }
-        #endif
-
-        this->M5Ddag(psi, chi, chi, lower, diag, upper);
-    }
-
-    // half checkerboard operations
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        std::vector<Coeff_t> diag = this->bee;
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
-
-        for(int s=0; s<Ls; s++){
-          upper[s] = -this->cee[s];
-          lower[s] = -this->cee[s];
-        }
-        upper[Ls-1] = this->dm;
-        lower[0]    = this->dp;
-
-        this->M5D(psi, psi, chi, lower, diag, upper);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-    {
-        int Ls = this->Ls;
-
-        std::vector<Coeff_t> diag = this->bee;
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
-
-        for(int s=0; s<Ls; s++){
-          upper[s] = -this->cee[s];
-          lower[s] = -this->cee[s];
-        }
-        upper[Ls-1] = this->dp;
-        lower[0]    = this->dm;
-
-        this->M5Ddag(psi, psi, chi, lower, diag, upper);
-    }
-
-    /****************************************************************************************/
-
-    //Zolo
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
-    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-        RealD shift = this->shift;
-
-        ////////////////////////////////////////////////////////
-        // Constants for the preconditioned matrix Cayley form
-        ////////////////////////////////////////////////////////
-        this->bs.resize(Ls);
-        this->cs.resize(Ls);
-        this->aee.resize(Ls);
-        this->aeo.resize(Ls);
-        this->bee.resize(Ls);
-        this->beo.resize(Ls);
-        this->cee.resize(Ls);
-        this->ceo.resize(Ls);
-
-        for(int i=0; i<Ls; ++i){
-          this->bee[i] = 4.0 - this->M5 + 1.0;
-          this->cee[i] = 1.0;
-        }
-
-        for(int i=0; i<Ls; ++i){
-          this->aee[i] = this->cee[i];
-          this->bs[i] = this->beo[i] = 1.0;
-          this->cs[i] = this->ceo[i] = 0.0;
-        }
-
-        //////////////////////////////////////////
-        // EOFA shift terms
-        //////////////////////////////////////////
-        if(pm == 1){
-          this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-          this->dm = mq1*this->cee[Ls-1];
-        } else if(this->pm == -1) {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-        } else {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1];
-        }
-
-        //////////////////////////////////////////
-        // LDU decomposition of eeoo
-        //////////////////////////////////////////
-        this->dee.resize(Ls+1);
-        this->lee.resize(Ls);
-        this->leem.resize(Ls);
-        this->uee.resize(Ls);
-        this->ueem.resize(Ls);
-
-        for(int i=0; i<Ls; ++i){
-
-          if(i < Ls-1){
-
-            this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-            this->leem[i] = this->dm/this->bee[i];
-            for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-            this->dee[i] = this->bee[i];
-
-            this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-            this->ueem[i] = this->dp / this->bee[0];
-            for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
-          } else {
-
-            this->lee[i]  = 0.0;
-            this->leem[i] = 0.0;
-            this->uee[i]  = 0.0;
-            this->ueem[i] = 0.0;
-
-          }
-        }
-
-        {
-          Coeff_t delta_d = 1.0 / this->bee[0];
-          for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-          this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-          this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-        }
-
-        int inv = 1;
-        this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-        this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-    }
-
-    // Recompute Cayley-form coefficients for different shift
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-    {
-        this->shift = new_shift;
-        Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-        Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        int Ls = this->Ls;
-
-        GridBase* grid = this->FermionRedBlackGrid();
-        int LLs = grid->_rdimensions[0];
-
-        if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-        for(int s=0; s<Ls; s++){
-            Pplus(s,s)  = this->bee[s];
-            Pminus(s,s) = this->bee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pminus(s,s+1) = -this->cee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pplus(s+1,s) = -this->cee[s+1];
-        }
-
-        Pplus (0,Ls-1) = this->dp;
-        Pminus(Ls-1,0) = this->dm;
-
-        Eigen::MatrixXcd PplusMat ;
-        Eigen::MatrixXcd PminusMat;
-
-        #if(0)
-            std::cout << GridLogMessage << "Pplus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pplus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-            std::cout << GridLogMessage << "Pminus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pminus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-        #endif
-
-        if(inv) {
-            PplusMat  = Pplus.inverse();
-            PminusMat = Pminus.inverse();
-        } else {
-            PplusMat  = Pplus;
-            PminusMat = Pminus;
-        }
-
-        if(dag){
-            PplusMat.adjointInPlace();
-            PminusMat.adjointInPlace();
-        }
-
-        typedef typename SiteHalfSpinor::scalar_type scalar_type;
-        const int Nsimd = Simd::Nsimd();
-        Matp.resize(Ls*LLs);
-        Matm.resize(Ls*LLs);
-
-        for(int s2=0; s2<Ls; s2++){
-        for(int s1=0; s1<LLs; s1++){
-            int istride = LLs;
-            int ostride = 1;
-            Simd Vp;
-            Simd Vm;
-            scalar_type *sp = (scalar_type*) &Vp;
-            scalar_type *sm = (scalar_type*) &Vm;
-            for(int l=0; l<Nsimd; l++){
-                if(switcheroo<Coeff_t>::iscomplex()) {
-                    sp[l] = PplusMat (l*istride+s1*ostride,s2);
-                    sm[l] = PminusMat(l*istride+s1*ostride,s2);
-                } else {
-                    // if real
-                    scalar_type tmp;
-                    tmp = PplusMat (l*istride+s1*ostride,s2);
-                    sp[l] = scalar_type(tmp.real(),tmp.real());
-                    tmp = PminusMat(l*istride+s1*ostride,s2);
-                    sm[l] = scalar_type(tmp.real(),tmp.real());
-                }
-            }
-            Matp[LLs*s2+s1] = Vp;
-            Matm[LLs*s2+s1] = Vm;
-        }}
-    }
-
-    FermOpTemplateInstantiate(DomainWallEOFAFermion);
-    GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
@ -26,90 +26,65 @@ with this program; if not, write to the Free Software Foundation, Inc.,

 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
-/*  END LEGAL */
-#ifndef  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
-#define  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
+			   /*  END LEGAL */
+#pragma once

 #include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  template<class Impl>
-  class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
-  {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+template<class Impl>
+class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    public:
-      // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
-      // for red-black preconditioned Shamir EOFA
-      Coeff_t dm;
-      Coeff_t dp;
+public:
+  // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
+  // for red-black preconditioned Shamir EOFA
+  Coeff_t dm;
+  Coeff_t dp;

-      virtual void Instantiatable(void) {};
+  virtual void Instantiatable(void) {};

-      // EOFA-specific operations
-      virtual void  Omega      (const FermionField& in, FermionField& out, int sign, int dag);
-      virtual void  Dtilde     (const FermionField& in, FermionField& out);
-      virtual void  DtildeInv  (const FermionField& in, FermionField& out);
+  // EOFA-specific operations
+  virtual void  Omega      (const FermionField& in, FermionField& out, int sign, int dag);
+  virtual void  Dtilde     (const FermionField& in, FermionField& out);
+  virtual void  DtildeInv  (const FermionField& in, FermionField& out);

-      // override multiply
-      virtual RealD M          (const FermionField& in, FermionField& out);
-      virtual RealD Mdag       (const FermionField& in, FermionField& out);
+  // override multiply
+  virtual RealD M          (const FermionField& in, FermionField& out);
+  virtual RealD Mdag       (const FermionField& in, FermionField& out);

-      // half checkerboard operations
-      virtual void  Mooee      (const FermionField& in, FermionField& out);
-      virtual void  MooeeDag   (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv   (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag(const FermionField& in, FermionField& out);
+  // half checkerboard operations
+  virtual void  Mooee      (const FermionField& in, FermionField& out);
+  virtual void  MooeeDag   (const FermionField& in, FermionField& out);
+  virtual void  MooeeInv   (const FermionField& in, FermionField& out);
+  virtual void  MooeeInvDag(const FermionField& in, FermionField& out);

-      virtual void   M5D       (const FermionField& psi, FermionField& chi);
-      virtual void   M5Ddag    (const FermionField& psi, FermionField& chi);
+  virtual void   M5D       (const FermionField& psi, FermionField& chi);
+  virtual void   M5Ddag    (const FermionField& psi, FermionField& chi);

-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  /////////////////////////////////////////////////////
+  // Instantiate different versions depending on Impl
+  /////////////////////////////////////////////////////
+  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
+  virtual void RefreshShiftCoefficients(RealD new_shift);

-      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+  // Constructors
+  DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+			GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+			RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
+			RealD _M5, const ImplParams& p=ImplParams());

-      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+protected:
+  void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
+};

-      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+NAMESPACE_END(Grid);

-      virtual void RefreshShiftCoefficients(RealD new_shift);
-
-      // Constructors
-      DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
-        RealD _M5, const ImplParams& p=ImplParams());
-
-    protected:
-      void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
-  };
-}}
-
-#define INSTANTIATE_DPERP_DWF_EOFA(A)\
-template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
-template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
-
-#undef  DOMAIN_WALL_EOFA_DPERP_DENSE
-#define DOMAIN_WALL_EOFA_DPERP_CACHE
-#undef  DOMAIN_WALL_EOFA_DPERP_LINALG
-#define DOMAIN_WALL_EOFA_DPERP_VEC
-
-#endif
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
@ -1,248 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-    // Pminus fowards
-    // Pplus  backwards..
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        int Ls = this->Ls;
-        GridBase* grid = psi._grid;
-
-        assert(phi.checkerboard == psi.checkerboard);
-        chi.checkerboard = psi.checkerboard;
-        // Flops = 6.0*(Nc*Ns) *Ls*vol
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-            for(int s=0; s<Ls; s++){
-                auto tmp = psi._odata[0];
-                if(s==0) {
-                    spProj5m(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+Ls-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else if(s==(Ls-1)) {
-                    spProj5m(tmp, psi._odata[ss+0]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else {
-                    spProj5m(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                }
-            }
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        int Ls = this->Ls;
-        GridBase* grid = psi._grid;
-        assert(phi.checkerboard == psi.checkerboard);
-        chi.checkerboard=psi.checkerboard;
-
-        // Flops = 6.0*(Nc*Ns) *Ls*vol
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-            auto tmp = psi._odata[0];
-            for(int s=0; s<Ls; s++){
-                if(s==0) {
-                    spProj5p(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5m(tmp, psi._odata[ss+Ls-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else if(s==(Ls-1)) {
-                    spProj5p(tmp, psi._odata[ss+0]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5m(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else {
-                    spProj5p(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5m(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                }
-            }
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        GridBase* grid = psi._grid;
-        int Ls = this->Ls;
-
-        chi.checkerboard = psi.checkerboard;
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-
-            auto tmp1 = psi._odata[0];
-            auto tmp2 = psi._odata[0];
-
-            // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-            // Apply (L^{\prime})^{-1}
-            chi[ss] = psi[ss]; // chi[0]=psi[0]
-            for(int s=1; s<Ls; s++){
-                spProj5p(tmp1, chi[ss+s-1]);
-                chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-            }
-
-            // L_m^{-1}
-            for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-                spProj5m(tmp1, chi[ss+s]);
-                chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-            }
-
-            // U_m^{-1} D^{-1}
-            for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-                spProj5p(tmp1, chi[ss+Ls-1]);
-                chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
-            }
-            spProj5m(tmp2, chi[ss+Ls-1]);
-            chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
-
-            // Apply U^{-1}
-            for(int s=Ls-2; s>=0; s--){
-                spProj5m(tmp1, chi[ss+s+1]);
-                chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        GridBase* grid = psi._grid;
-        int Ls = this->Ls;
-
-        assert(psi.checkerboard == psi.checkerboard);
-        chi.checkerboard = psi.checkerboard;
-
-        std::vector<Coeff_t> ueec(Ls);
-        std::vector<Coeff_t> deec(Ls+1);
-        std::vector<Coeff_t> leec(Ls);
-        std::vector<Coeff_t> ueemc(Ls);
-        std::vector<Coeff_t> leemc(Ls);
-
-        for(int s=0; s<ueec.size(); s++){
-            ueec[s]  = conjugate(this->uee[s]);
-            deec[s]  = conjugate(this->dee[s]);
-            leec[s]  = conjugate(this->lee[s]);
-            ueemc[s] = conjugate(this->ueem[s]);
-            leemc[s] = conjugate(this->leem[s]);
-        }
-        deec[Ls] = conjugate(this->dee[Ls]);
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-
-            auto tmp1 = psi._odata[0];
-            auto tmp2 = psi._odata[0];
-
-            // Apply (U^{\prime})^{-dagger}
-            chi[ss] = psi[ss];
-            for(int s=1; s<Ls; s++){
-                spProj5m(tmp1, chi[ss+s-1]);
-                chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-            }
-
-            // U_m^{-\dagger}
-            for(int s=0; s<Ls-1; s++){
-                spProj5p(tmp1, chi[ss+s]);
-                chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-            }
-
-            // L_m^{-\dagger} D^{-dagger}
-            for(int s=0; s<Ls-1; s++){
-                spProj5m(tmp1, chi[ss+Ls-1]);
-                chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-            }
-            spProj5p(tmp2, chi[ss+Ls-1]);
-            chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-            // Apply L^{-dagger}
-            for(int s=Ls-2; s>=0; s--){
-                spProj5p(tmp1, chi[ss+s+1]);
-                chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
@ -1,159 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-    * Dense matrix versions of routines
-    */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-    {
-        int Ls = this->Ls;
-        int LLs = psi._grid->_rdimensions[0];
-        int vol = psi._grid->oSites()/LLs;
-
-        chi.checkerboard = psi.checkerboard;
-
-        assert(Ls==LLs);
-
-        Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-        Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
-
-        for(int s=0;s<Ls;s++){
-            Pplus(s,s)  = this->bee[s];
-            Pminus(s,s) = this->bee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pminus(s,s+1) = -this->cee[s];
-        }
-
-        for(int s=0; s<Ls-1; s++){
-            Pplus(s+1,s) = -this->cee[s+1];
-        }
-
-        Pplus (0,Ls-1) = this->dp;
-        Pminus(Ls-1,0) = this->dm;
-
-        Eigen::MatrixXd PplusMat ;
-        Eigen::MatrixXd PminusMat;
-
-        if(inv) {
-            PplusMat  = Pplus.inverse();
-            PminusMat = Pminus.inverse();
-        } else {
-            PplusMat  = Pplus;
-            PminusMat = Pminus;
-        }
-
-        if(dag){
-            PplusMat.adjointInPlace();
-            PminusMat.adjointInPlace();
-        }
-
-        // For the non-vectorised s-direction this is simple
-
-        for(auto site=0; site<vol; site++){
-
-            SiteSpinor     SiteChi;
-            SiteHalfSpinor SitePplus;
-            SiteHalfSpinor SitePminus;
-
-            for(int s1=0; s1<Ls; s1++){
-                SiteChi = zero;
-                for(int s2=0; s2<Ls; s2++){
-                    int lex2 = s2 + Ls*site;
-                    if(PplusMat(s1,s2) != 0.0){
-                        spProj5p(SitePplus,psi[lex2]);
-                        accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
-                    }
-                    if(PminusMat(s1,s2) != 0.0){
-                        spProj5m(SitePminus, psi[lex2]);
-                        accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
-                    }
-                }
-                chi[s1+Ls*site] = SiteChi*0.5;
-            }
-        }
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
-
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-        template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-        template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
@ -1,168 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-    // Pminus forwards
-    // Pplus  backwards
-    // M5D, slice-by-slice form. For each s the first call writes chi[s] from diag[s], phi and the
-    // upper[s] Pminus term; the second call accumulates the lower[s] Pplus term onto chi[s].
-    // NOTE(review): operand roles of axpby_ssp_* are read off the call pattern -- confirm against their definitions.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        Coeff_t unit(1.0);
-        const int nslice = this->Ls;
-
-        for(int s=0; s<nslice; ++s){
-            // Neighbouring slices in s; the last slice wraps forward to 0, the first wraps backward to Ls-1.
-            const int fwd = ((s != 0) && (s == nslice-1)) ? 0 : s+1;
-            const int bwd = (s == 0) ? nslice-1 : s-1;
-
-            axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, fwd);
-            axpby_ssp_pplus (chi, unit, chi, lower[s], psi, s, bwd);
-        }
-    }
-
-    // M5Ddag, slice-by-slice form. Same sweep as M5D with the projectors exchanged: the upper[s]
-    // term uses Pplus on the forward neighbour, the lower[s] term uses Pminus on the backward one.
-    // NOTE(review): operand roles of axpby_ssp_* are read off the call pattern -- confirm against their definitions.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        Coeff_t unit(1.0);
-        const int nslice = this->Ls;
-
-        for(int s=0; s<nslice; ++s){
-            // Neighbouring slices in s; the last slice wraps forward to 0, the first wraps backward to Ls-1.
-            const int fwd = ((s != 0) && (s == nslice-1)) ? 0 : s+1;
-            const int bwd = (s == 0) ? nslice-1 : s-1;
-
-            axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, fwd);
-            axpby_ssp_pminus(chi, unit, chi, lower[s], psi, s, bwd);
-        }
-    }
-
-    template<class Impl>  // Applies the inverse of Mooee to psi (result in chi) as the staged sweeps over s labelled below.
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        Coeff_t one(1.0);
-        Coeff_t czero(0.0);
-        chi.checkerboard = psi.checkerboard;  // output carries the same checkerboard tag as the input
-        int Ls = this->Ls;
-        // tmp is scratch, used only for the two-step update of the last slice further down.
-        FermionField tmp(psi._grid);
-
-        // Apply (L^{\prime})^{-1}: forward sweep in s; every step reads the chi[s-1] written just before it
-        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-        for(int s=1; s<Ls; s++){
-            axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion: chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
-        }
-
-        // L_m^{-1}
-        for(int s=0; s<Ls-1; s++){ // accumulate into the last slice: chi[Ls-1] -= leem[s] P_- chi[s]
-            axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-        }
-
-        // U_m^{-1} D^{-1}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
-        }
-        axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
-        axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);
-        // Last slice: P_- part scaled by 1/dee[Ls-1] (via tmp), P_+ part by 1/dee[Ls]. dee is read at index Ls, so it must hold at least Ls+1 entries.
-        // Apply U^{-1}: backward sweep in s
-        for(int s=Ls-2; s>=0; s--){
-            axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[s] -= uee[s] P_- chi[s+1]
-        }
-    }
-
-    template<class Impl>  // Applies the inverse of the daggered Mooee to psi (result in chi); coefficients enter complex-conjugated.
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        Coeff_t one(1.0);
-        Coeff_t czero(0.0);
-        chi.checkerboard = psi.checkerboard;  // output carries the same checkerboard tag as the input
-        int Ls = this->Ls;
-        // tmp is scratch, used only for the two-step update of the last slice further down.
-        FermionField tmp(psi._grid);
-
-        // Apply (U^{\prime})^{-dagger}: forward sweep in s; every step reads the chi[s-1] written just before it
-        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-        for(int s=1; s<Ls; s++){
-            axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);  // chi[s] = psi[s] - conj(uee[s-1]) P_- chi[s-1]
-        }
-
-        // U_m^{-\dagger}
-        for(int s=0; s<Ls-1; s++){  // accumulate into the last slice: chi[Ls-1] -= conj(ueem[s]) P_+ chi[s]
-            axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-        }
-
-        // L_m^{-\dagger} D^{-dagger}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-        }
-        axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
-        axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);
-        // Last slice: P_- part scaled by 1/conj(dee[Ls-1]) (via tmp), P_+ part by 1/conj(dee[Ls]). dee is read at index Ls, so it must hold at least Ls+1 entries.
-        // Apply L^{-dagger}: backward sweep in s
-        for(int s=Ls-2; s>=0; s--){
-            axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[s] -= conj(lee[s]) P_+ chi[s+1]
-        }
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
-        // Compiled only when this variant is selected. The macro is defined elsewhere; presumably it emits explicit instantiations -- confirm.
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-        // FH / DF suffixed implementation types
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@ -1,605 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-    * Dense matrix versions of routines
-    */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {   // Daggered inverse: forward to the shared dense-matrix routine with both flags set.
-        MooeeInternal(psi, chi, DaggerYes, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {   // Plain inverse: forward to the shared dense-matrix routine with only the inverse flag set.
-        MooeeInternal(psi, chi, DaggerNo, InverseYes);
-    }
-
-    template<class Impl>  // M5D for the layout where s is split over LLs outer sites times nsimd SIMD lanes (see the assert below).
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        GridBase* grid = psi._grid;
-        int Ls  = this->Ls;
-        int LLs = grid->_rdimensions[0];
-        const int nsimd = Simd::Nsimd();
-        // SIMD-word copies of the per-slice coefficients, one word per outer s index.
-        Vector<iSinglet<Simd> > u(LLs);
-        Vector<iSinglet<Simd> > l(LLs);
-        Vector<iSinglet<Simd> > d(LLs);
-
-        assert(Ls/LLs == nsimd);  // s = o + i*LLs: outer index o in [0,LLs), lane i in [0,nsimd)
-        assert(phi.checkerboard == psi.checkerboard);
-
-        chi.checkerboard = psi.checkerboard;
-
-        // Address the SIMD words as flat scalar arrays (type pun) so that individual entries can be filled.
-        typedef typename Simd::scalar_type scalar_type;
-        scalar_type* u_p = (scalar_type*) &u[0];
-        scalar_type* l_p = (scalar_type*) &l[0];
-        scalar_type* d_p = (scalar_type*) &d[0];
-
-        for(int o=0;o<LLs;o++){ // outer
-        for(int i=0;i<nsimd;i++){ //inner
-            int s  = o + i*LLs;       // index into the caller's upper/lower/diag vectors
-            int ss = o*nsimd + i;     // flat scalar offset: word o, entry i
-            u_p[ss] = upper[s];
-            l_p[ss] = lower[s];
-            d_p[ss] = diag[s];
-        }}
-
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();  // start of timed region; the matching += is after the site loop
-
-        assert(Nc == 3);  // the unrolled kernel below hard-codes colour indices 0..2
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // one iteration per block of LLs consecutive outer sites
-
-            #if 0
-                // Disabled reference version written with the spProj5*/rotate/spRecon5* helpers.
-                alignas(64) SiteHalfSpinor hp;
-                alignas(64) SiteHalfSpinor hm;
-                alignas(64) SiteSpinor fp;
-                alignas(64) SiteSpinor fm;
-
-                for(int v=0; v<LLs; v++){
-
-                    int vp = (v+1)%LLs;
-                    int vm = (v+LLs-1)%LLs;
-
-                    spProj5m(hp, psi[ss+vp]);
-                    spProj5p(hm, psi[ss+vm]);
-
-                    if (vp <= v){ rotate(hp, hp, 1); }
-                    if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-                    hp = 0.5*hp;
-                    hm = 0.5*hm;
-
-                    spRecon5m(fp, hp);
-                    spRecon5p(fm, hm);
-
-                    chi[ss+v] = d[v]*phi[ss+v];
-                    chi[ss+v] = chi[ss+v] + u[v]*fp;
-                    chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-                }
-
-            #else
-                // Active version: hand-unrolled over the 4 spin x 3 colour components.
-                for(int v=0; v<LLs; v++){
-                    // NOTE(review): in the last block ss+v+LLs is >= oSites(); confirm that vprefetch on that element is harmless.
-                    vprefetch(psi[ss+v+LLs]);
-                    // neighbouring outer indices in s, wrapping inside the block
-                    int vp = (v==LLs-1) ? 0     : v+1;
-                    int vm = (v==0)     ? LLs-1 : v-1;
-                    // hp_*: spin components 2,3 of psi at vp; hm_*: spin components 0,1 of psi at vm
-                    Simd hp_00 = psi[ss+vp]()(2)(0);
-                    Simd hp_01 = psi[ss+vp]()(2)(1);
-                    Simd hp_02 = psi[ss+vp]()(2)(2);
-                    Simd hp_10 = psi[ss+vp]()(3)(0);
-                    Simd hp_11 = psi[ss+vp]()(3)(1);
-                    Simd hp_12 = psi[ss+vp]()(3)(2);
-
-                    Simd hm_00 = psi[ss+vm]()(0)(0);
-                    Simd hm_01 = psi[ss+vm]()(0)(1);
-                    Simd hm_02 = psi[ss+vm]()(0)(2);
-                    Simd hm_10 = psi[ss+vm]()(1)(0);
-                    Simd hm_11 = psi[ss+vm]()(1)(1);
-                    Simd hm_12 = psi[ss+vm]()(1)(2);
-                    // vp wrapped around: rotate the SIMD word (presumably so each lane sees its s+1 neighbour -- confirm tRotate direction)
-                    if(vp <= v){
-                        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-                        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-                        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-                        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-                        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-                        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-                    }
-                    // vm wrapped around: rotate by the complementary amount
-                    if(vm >= v){
-                        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-                        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-                        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-                        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-                        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-                        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-                    }
-
-                    // Can force these to real arithmetic and save 2x.
-                    Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-                    Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-                    Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-                    Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-                    Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-                    Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-                    Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-                    Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-                    Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-                    Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-                    Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-                    Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-                    // chi spin 0,1 = d*phi + l*hm ; chi spin 2,3 = d*phi + u*hp
-                    vstream(chi[ss+v]()(0)(0), p_00);
-                    vstream(chi[ss+v]()(0)(1), p_01);
-                    vstream(chi[ss+v]()(0)(2), p_02);
-                    vstream(chi[ss+v]()(1)(0), p_10);
-                    vstream(chi[ss+v]()(1)(1), p_11);
-                    vstream(chi[ss+v]()(1)(2), p_12);
-                    vstream(chi[ss+v]()(2)(0), p_20);
-                    vstream(chi[ss+v]()(2)(1), p_21);
-                    vstream(chi[ss+v]()(2)(2), p_22);
-                    vstream(chi[ss+v]()(3)(0), p_30);
-                    vstream(chi[ss+v]()(3)(1), p_31);
-                    vstream(chi[ss+v]()(3)(2), p_32);
-                }
-
-            #endif
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>  // M5Ddag for the layout where s is split over LLs outer sites times nsimd SIMD lanes (see the assert below).
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        GridBase* grid = psi._grid;
-        int Ls  = this->Ls;
-        int LLs = grid->_rdimensions[0];
-        const int nsimd = Simd::Nsimd();
-        // SIMD-word copies of the per-slice coefficients, one word per outer s index.
-        Vector<iSinglet<Simd> > u(LLs);
-        Vector<iSinglet<Simd> > l(LLs);
-        Vector<iSinglet<Simd> > d(LLs);
-
-        assert(Ls/LLs == nsimd);  // s = o + i*LLs: outer index o in [0,LLs), lane i in [0,nsimd)
-        assert(phi.checkerboard == psi.checkerboard);
-
-        chi.checkerboard = psi.checkerboard;
-
-        // Address the SIMD words as flat scalar arrays (type pun) so that individual entries can be filled.
-        typedef typename Simd::scalar_type scalar_type;
-        scalar_type* u_p = (scalar_type*) &u[0];
-        scalar_type* l_p = (scalar_type*) &l[0];
-        scalar_type* d_p = (scalar_type*) &d[0];
-
-        for(int o=0; o<LLs; o++){ // outer
-        for(int i=0; i<nsimd; i++){ //inner
-            int s  = o + i*LLs;       // index into the caller's upper/lower/diag vectors
-            int ss = o*nsimd + i;     // flat scalar offset: word o, entry i
-            u_p[ss] = upper[s];
-            l_p[ss] = lower[s];
-            d_p[ss] = diag[s];
-        }}
-
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();  // start of timed region; the matching += is after the site loop
-        assert(Nc == 3);  // the unrolled kernel below hard-codes colour indices 0..2
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // one iteration per block of LLs consecutive outer sites
-
-        #if 0
-            // Disabled reference version written with the spProj5*/rotate/spRecon5* helpers.
-            alignas(64) SiteHalfSpinor hp;
-            alignas(64) SiteHalfSpinor hm;
-            alignas(64) SiteSpinor fp;
-            alignas(64) SiteSpinor fm;
-
-            for(int v=0; v<LLs; v++){
-
-                int vp = (v+1)%LLs;
-                int vm = (v+LLs-1)%LLs;
-
-                spProj5p(hp, psi[ss+vp]);
-                spProj5m(hm, psi[ss+vm]);
-
-                if(vp <= v){ rotate(hp, hp, 1); }
-                if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-                hp = hp*0.5;
-                hm = hm*0.5;
-                spRecon5p(fp, hp);
-                spRecon5m(fm, hm);
-
-                chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-                chi[ss+v] = chi[ss+v]     +l[v]*fm;
-            }
-
-        #else
-            // Active version: hand-unrolled over the 4 spin x 3 colour components.
-            for(int v=0; v<LLs; v++){
-                // NOTE(review): in the last block ss+v+LLs is >= oSites(); confirm that vprefetch on that element is harmless.
-                vprefetch(psi[ss+v+LLs]);
-                // neighbouring outer indices in s, wrapping inside the block
-                int vp = (v == LLs-1) ? 0     : v+1;
-                int vm = (v == 0    ) ? LLs-1 : v-1;
-                // hp_*: spin components 0,1 of psi at vp; hm_*: spin components 2,3 of psi at vm
-                Simd hp_00 = psi[ss+vp]()(0)(0);
-                Simd hp_01 = psi[ss+vp]()(0)(1);
-                Simd hp_02 = psi[ss+vp]()(0)(2);
-                Simd hp_10 = psi[ss+vp]()(1)(0);
-                Simd hp_11 = psi[ss+vp]()(1)(1);
-                Simd hp_12 = psi[ss+vp]()(1)(2);
-
-                Simd hm_00 = psi[ss+vm]()(2)(0);
-                Simd hm_01 = psi[ss+vm]()(2)(1);
-                Simd hm_02 = psi[ss+vm]()(2)(2);
-                Simd hm_10 = psi[ss+vm]()(3)(0);
-                Simd hm_11 = psi[ss+vm]()(3)(1);
-                Simd hm_12 = psi[ss+vm]()(3)(2);
-                // vp wrapped around: rotate the SIMD word (presumably so each lane sees its s+1 neighbour -- confirm tRotate direction)
-                if (vp <= v){
-                    hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-                    hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-                    hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-                    hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-                    hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-                    hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-                }
-                // vm wrapped around: rotate by the complementary amount
-                if(vm >= v){
-                    hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-                    hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-                    hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-                    hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-                    hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-                    hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-                }
-                // chi spin 0,1 = d*phi + u*hp ; chi spin 2,3 = d*phi + l*hm
-                Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-                Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-                Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-                Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-                Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-                Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-                Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-                Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-                Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-                Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-                Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-                Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-                vstream(chi[ss+v]()(0)(0), p_00);
-                vstream(chi[ss+v]()(0)(1), p_01);
-                vstream(chi[ss+v]()(0)(2), p_02);
-                vstream(chi[ss+v]()(1)(0), p_10);
-                vstream(chi[ss+v]()(1)(1), p_11);
-                vstream(chi[ss+v]()(1)(2), p_12);
-                vstream(chi[ss+v]()(2)(0), p_20);
-                vstream(chi[ss+v]()(2)(1), p_21);
-                vstream(chi[ss+v]()(2)(2), p_22);
-                vstream(chi[ss+v]()(3)(0), p_30);
-                vstream(chi[ss+v]()(3)(1), p_31);
-                vstream(chi[ss+v]()(3)(2), p_32);
-            }
-        #endif
-
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    #ifdef AVX512
-        #include<simd/Intel512common.h>
-        #include<simd/Intel512avx.h>
-        #include<simd/Intel512single.h>
-    #endif
-
-    template<class Impl>  // Dense matrix-vector product in s for the LLs outer sites of group `site`: chi spin 0,1 use Matp, spin 2,3 use Matm.
-    void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
-        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        #ifndef AVX512
-        {
-            SiteHalfSpinor BcastP;
-            SiteHalfSpinor BcastM;
-            SiteHalfSpinor SiteChiP;
-            SiteHalfSpinor SiteChiM;
-            // Generic path. s1 selects the output word; (s2, l) run over every input word and lane.
-            // Ls*Ls * 2 * 12 * vol flops
-            for(int s1=0; s1<LLs; s1++){
-
-                for(int s2=0; s2<LLs; s2++){
-                for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-                    int s = s2 + l*LLs;        // s index of lane l in input word s2; selects the matrix entry below
-                    int lex = s2 + LLs*site;   // outer-site index of input word s2 within this group
-                    // first contribution for this s1: reset the accumulators
-                    if( s2==0 && l==0 ){
-                        SiteChiP=zero;
-                        SiteChiM=zero;
-                    }
-                    // BcastP: lane l of psi spin 0,1 (as passed to vbroadcast)
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-                    }}
-                    // BcastM: lane l of psi spin 2,3 (as passed to vbroadcast)
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-                    }}
-
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-                        SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-                    }}
-                }}
-
-                {
-                    int lex = s1 + LLs*site;
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-                        vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-                    }}
-                }
-            }
-
-        }
-        #else
-        {
-            // pointers
-            //  MASK_REGS;
-            #define Chi_00 %%zmm1
-            #define Chi_01 %%zmm2
-            #define Chi_02 %%zmm3
-            #define Chi_10 %%zmm4
-            #define Chi_11 %%zmm5
-            #define Chi_12 %%zmm6
-            #define Chi_20 %%zmm7
-            #define Chi_21 %%zmm8
-            #define Chi_22 %%zmm9
-            #define Chi_30 %%zmm10
-            #define Chi_31 %%zmm11
-            #define Chi_32 %%zmm12
-
-            #define BCAST0  %%zmm13
-            #define BCAST1  %%zmm14
-            #define BCAST2  %%zmm15
-            #define BCAST3  %%zmm16
-            #define BCAST4  %%zmm17
-            #define BCAST5  %%zmm18
-            #define BCAST6  %%zmm19
-            #define BCAST7  %%zmm20
-            #define BCAST8  %%zmm21
-            #define BCAST9  %%zmm22
-            #define BCAST10 %%zmm23
-            #define BCAST11 %%zmm24
-            // incr: byte stride of LLs*LLs matrix words, added to a0/a1 once per lane l (same entry as index LLs*(s2+l*LLs)+s1).
-            int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-            for(int s1=0; s1<LLs; s1++){
-                // NOTE(review): the asm statements below name zmm1-zmm24 directly, declare no register clobbers, and keep the Chi_* accumulators live across separate asm statements -- confirm this is safe with the compilers in use.
-                for(int s2=0; s2<LLs; s2++){
-
-                    int lex = s2 + LLs*site;
-                    uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-                    uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-                    uint64_t a2 = (uint64_t) &psi[lex];
-                    // a2 is advanced by one scalar_type per lane l at the bottom of this loop
-                    for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-                        if((s2+l)==0) {
-                            asm(
-                                    VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-                                    VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-                                    VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-                                    VBCASTCDUP(0,%2,BCAST0)
-                                    VBCASTCDUP(1,%2,BCAST1)
-                                    VBCASTCDUP(2,%2,BCAST2)
-                                    VBCASTCDUP(3,%2,BCAST3)
-                                    VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-                                    VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-                                    VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-                                    VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-                                    VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-                                    VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-                                    VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-                                    VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-                                    VMULMEM(0,%1,BCAST8,Chi_22)
-                                    VMULMEM(0,%1,BCAST9,Chi_30)
-                                    VMULMEM(0,%1,BCAST10,Chi_31)
-                                    VMULMEM(0,%1,BCAST11,Chi_32)
-                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
-                        } else {
-                            asm(
-                                    VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-                                    VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-                                    VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-                                    VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-                                    VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-                                    VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-                                    VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-                                    VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-                                    VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-                                    VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-                                    VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-                                    VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
-                        }
-                        a0 = a0 + incr;
-                        a1 = a1 + incr;
-                        a2 = a2 + sizeof(typename Simd::scalar_type);
-                    }
-                }
-                // write the 12 accumulator registers out to chi[s1 + LLs*site]
-                {
-                  int lexa = s1+LLs*site;
-                  asm (
-                     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-                     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-                     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-                     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-                     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-                }
-            }
-        }
-        // the register-name macros are local to this function: remove them again
-        #undef Chi_00
-        #undef Chi_01
-        #undef Chi_02
-        #undef Chi_10
-        #undef Chi_11
-        #undef Chi_12
-        #undef Chi_20
-        #undef Chi_21
-        #undef Chi_22
-        #undef Chi_30
-        #undef Chi_31
-        #undef Chi_32
-
-        #undef BCAST0
-        #undef BCAST1
-        #undef BCAST2
-        #undef BCAST3
-        #undef BCAST4
-        #undef BCAST5
-        #undef BCAST6
-        #undef BCAST7
-        #undef BCAST8
-        #undef BCAST9
-        #undef BCAST10
-        #undef BCAST11
-        #endif
-    };
-
-    // Z-mobius version
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-        exit(-1);
-    };
-
-    // Applies Mooee (optionally daggered and/or inverted, per dag/inv) to psi as a dense matrix in s, writing chi.
-    //   inv && dag: cached MatpInvDag/MatmInvDag;  inv && !dag: cached MatpInv/MatmInv;
-    //   !inv: matrices are built on the fly by MooeeInternalCompute.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-    {
-        int Ls  = this->Ls;
-        int LLs = psi._grid->_rdimensions[0];
-        int vol = psi._grid->oSites()/LLs;   // number of groups of LLs outer sites; one kernel call per group
-
-        chi.checkerboard = psi.checkerboard;
-
-        // Scratch storage, filled only on the !inv path.
-        Vector<iSinglet<Simd> > Matp;
-        Vector<iSinglet<Simd> > Matm;
-        // Exactly one branch of the chain below runs, so both pointers are always set before use;
-        // the nullptr initialisers make that explicit to readers and to the compiler.
-        Vector<iSinglet<Simd> > *_Matp = nullptr;
-        Vector<iSinglet<Simd> > *_Matm = nullptr;
-
-        if(!inv){
-            MooeeInternalCompute(dag, inv, Matp, Matm);
-            _Matp = &Matp;
-            _Matm = &Matm;
-        } else if(dag){
-            _Matp = &this->MatpInvDag;
-            _Matm = &this->MatmInvDag;
-        } else {
-            _Matp = &this->MatpInv;
-            _Matm = &this->MatmInv;
-        }
-
-        assert(_Matp->size() == Ls*LLs);
-
-        // The MooeeInv counters are updated for every (dag, inv) combination, not only for inverses.
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        if(switcheroo<Coeff_t>::iscomplex()){   // complex coefficients select the Z kernel
-            parallel_for(auto site=0; site<vol; site++){
-                MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-            }
-        } else {
-            parallel_for(auto site=0; site<vol; site++){
-                MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-            }
-        }
-        this->MooeeInvTime += usecond();
-    }
-
-    #ifdef DOMAIN_WALL_EOFA_DPERP_VEC
-        // Compiled only when this variant is selected. The macro is defined elsewhere; presumably it emits explicit instantiations -- confirm.
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-        // MooeeInternal is explicitly instantiated here for each of the same eight implementation types.
-        template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-        template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    #endif
-
-}}
--- a/Grid/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -25,34 +25,33 @@ Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class DomainWallFermion : public CayleyFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class DomainWallFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist, bool fiveD) {
+	FermionField in_k(in.Grid());
+	FermionField prop_k(in.Grid());

-      void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist, bool fiveD) {
-	FermionField in_k(in._grid);
-	FermionField prop_k(in._grid);
-
-	FFT theFFT((GridCartesian *) in._grid);
+	FFT theFFT((GridCartesian *) in.Grid());

 	//phase for boundary condition
-	ComplexField coor(in._grid);
-	ComplexField ph(in._grid);  ph = zero;
-	FermionField in_buf(in._grid); in_buf = zero;
+	ComplexField coor(in.Grid());
+	ComplexField ph(in.Grid());  ph = Zero();
+	FermionField in_buf(in.Grid()); in_buf = Zero();
+	typedef typename Simd::scalar_type Scalar;
 	Scalar ci(0.0,1.0);
 	assert(twist.size() == Nd);//check that twist is Nd
 	assert(boundary.size() == Nd);//check that boundary conditions is Nd
@ -63,13 +62,12 @@ namespace Grid {
 	  // Shift coordinate lattice index by 1 to account for 5th dimension.
          LatticeCoordinate(coor, nu + shift);
 	  double boundary_phase = ::acos(real(boundary[nu]));
-	  ph = ph + boundary_phase*coor*((1./(in._grid->_fdimensions[nu+shift])));
+	  ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
 	  //momenta for propagator shifted by twist+boundary
 	  twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
 	}
 	in_buf = exp(ci*ph*(-1.0))*in;

-
 	if(fiveD){//FFT only on temporal and spatial dimensions
          std::vector<int> mask(Nd+1,1); mask[0] = 0;
 	  theFFT.FFT_dim_mask(in_k,in_buf,mask,FFT::forward);
@ -82,7 +80,7 @@ namespace Grid {
 	  theFFT.FFT_all_dim(out,prop_k,FFT::backward);
        }
 	//phase for boundary condition
-	out = out * exp(ci*ph);
+	out = out * exp(Scalar(2.0*M_PI)*ci*ph);
      };

      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
@ -105,38 +103,37 @@ namespace Grid {
 	FreePropagator(in,out,mass,boundary,twist,fiveD);
      };

-      virtual void   Instantiatable(void) {};
-      // Constructors
-      DomainWallFermion(GaugeField &_Umu,
-			GridCartesian         &FiveDimGrid,
-			GridRedBlackCartesian &FiveDimRedBlackGrid,
-			GridCartesian         &FourDimGrid,
-			GridRedBlackCartesian &FourDimRedBlackGrid,
-			RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructors
+  DomainWallFermion(GaugeField &_Umu,
+		    GridCartesian         &FiveDimGrid,
+		    GridRedBlackCartesian &FiveDimRedBlackGrid,
+		    GridCartesian         &FourDimGrid,
+		    GridRedBlackCartesian &FourDimRedBlackGrid,
+		    RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) : 


-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = 1.0;
+  {
+    RealD eps = 1.0;

-	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
-	assert(zdata->n==this->Ls);
+    Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
+    assert(zdata->n==this->Ls);
 	
-	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
-	// Call base setter
-	this->SetCoefficientsTanh(zdata,1.0,0.0);
-
-	Approx::zolotarev_free(zdata);
-      }
-
-    };
+    //    std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+    // Call base setter
+    this->SetCoefficientsTanh(zdata,1.0,0.0);

+    Approx::zolotarev_free(zdata);
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
@ -0,0 +1,213 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
+public:
+  /* Implementation ("Impl") policy class for domain-wall fermions that sets
+   * LsVectorised=true. Spinor site types are built on the vector type Simd,
+   * while the doubled gauge field is stored on Simd::scalar_type and expanded
+   * with vsplat when a link is applied (see multLink).
+   *
+   * S              : element type handed to GaugeImplTypes; the typedefs that
+   *                  instantiate this class use vComplex/vComplexF/vComplexD.
+   * Representation : supplies Dimension and isFundamental.
+   * Options        : supplies _Coeff_t, Nhcs and PrecisionMapper.
+   *
+   * NOTE(review): Simd and the Gauge* field types used below presumably come
+   * from INHERIT_GIMPL_TYPES(Gimpl) -- confirm against the macro definition.
+   */
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+  // Compile-time properties forwarded from Representation and Options.
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  static const int Nhcs = Options::Nhcs;
+      
+  typedef typename Options::_Coeff_t Coeff_t;      
+  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
+  // Nested site-object templates, parameterised on the element type vtype.
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  // Concrete site/lattice types; only the half "comm" spinor uses SimdL instead of Simd.
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef Lattice<SiteSpinor>          FermionField;
+  typedef Lattice<SitePropagator>      PropagatorField;
+
+  /////////////////////////////////////////////////
+  // Make the doubled gauge field a *scalar*
+  /////////////////////////////////////////////////
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef Lattice<SiteDoubledGaugeField>                      DoubledGaugeField;
+      
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+  
+  ImplParams Params;
+
+  DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; // keeps a copy of p in Params
+  /* Load one link element: forwards to vsplat(reg, memory).
+   * NOTE(review): vsplat presumably broadcasts the scalar `memory` to every
+   * lane of `reg` -- confirm in its definition.
+   */
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    vsplat(reg, memory);
+  }
+  /* Apply one gauge link: phi = U(mu) * chi via mult().
+   * With GPU_VEC defined the scalar link U(mu) is handed to mult() directly;
+   * otherwise every element (i,j) of U(mu) is first expanded with vsplat into
+   * a local SiteGaugeLink, and that link is multiplied instead.
+   * mu selects the component of the doubled gauge field. SE and St are not
+   * referenced in the body.
+   */
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, int mu, StencilEntry *SE,
+					  StencilView &St) 
+  {
+#ifdef GPU_VEC
+    // Gauge link is scalarised
+    mult(&phi(), &U(mu), &chi());
+#else
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+#endif
+  }
+  /* Fill the doubled gauge field Uds from Umu.
+   * First, for each mu in [0,Nd), Uadj(mu) = adj(Cshift(U_mu, mu, -1)).
+   * Then, for every local site of GaugeGrid, components 0..3 of Uds receive
+   * Umu and components 4..7 receive Uadj, copied through scalar site objects
+   * with peekLocalSite/pokeLocalSite.
+   * NOTE(review): the copy loops use the literal 4 while the first loop uses
+   * Nd; these agree only if Nd == 4 -- confirm.
+   */
+  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
+  {
+    SiteScalarGaugeField  ScalarUmu;
+    SiteDoubledGaugeField ScalarUds;
+    
+    GaugeLinkField U(Umu.Grid());
+    GaugeField  Uadj(Umu.Grid());
+    for (int mu = 0; mu < Nd; mu++) {
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+      U = adj(Cshift(U, mu, -1));
+      PokeIndex<LorentzIndex>(Uadj, U, mu);
+    }
+    // Serial loop over local sites, one scalar site object at a time.
+    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+      Coordinate lcoor;
+      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+      
+      peekLocalSite(ScalarUmu, Umu, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
+      
+      peekLocalSite(ScalarUmu, Uadj, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
+      
+      pokeLocalSite(ScalarUds, Uds, lcoor);
+    }
+  }
+  /* The four members below are unimplemented stubs: each body is assert(0).
+   * If assertions are compiled out (NDEBUG) they return without touching
+   * their output arguments.
+   */
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
+  {
+    assert(0);
+  }
+  // Unimplemented stub (assert(0)); mat is not written.
+  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+    assert(0);
+  } 
+  // Unimplemented stub (assert(0)); mat is not written.
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0);
+  }
+  // Unimplemented stub (assert(0)); mat is not written.
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0);
+  }
+
+  /* Unimplemented stub: the only live statement is assert(0). The earlier
+   * implementation is kept below inside a block comment and is not compiled.
+   */
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+
+    assert(0);
+    // Following lines to be revised after Peter's addition of half prec
+    // missing put lane...
+    /*
+      typedef decltype(traceIndex<SpinIndex>(outerProduct(Btilde[0], Atilde[0]))) result_type;
+      unsigned int LLs = Btilde.Grid()->_rdimensions[0];
+      conformable(Atilde.Grid(),Btilde.Grid());
+      GridBase* grid = mat.Grid();
+      GridBase* Bgrid = Btilde.Grid();
+      unsigned int dimU = grid->Nd();
+      unsigned int dimF = Bgrid->Nd();
+      GaugeLinkField tmp(grid); 
+      tmp = Zero();
+    
+      // FIXME 
+      // Current implementation works, thread safe, probably suboptimal
+      // Passing through the local coordinate for grid transformation
+      // the force grid is in general very different from the Ls vectorized grid
+
+      for (int so = 0; so < grid->oSites(); so++) {
+      std::vector<typename result_type::scalar_object> vres(Bgrid->Nsimd());
+      std::vector<int> ocoor;  grid->oCoorFromOindex(ocoor,so); 
+      for (int si = 0; si < tmp.Grid()->iSites(); si++){
+      typename result_type::scalar_object scalar_object; scalar_object = Zero();
+      std::vector<int> local_coor;      
+      std::vector<int> icoor; grid->iCoorFromIindex(icoor,si);
+      grid->InOutCoorToLocalCoor(ocoor, icoor, local_coor);
+      for (int s = 0; s < LLs; s++) {
+      std::vector<int> slocal_coor(dimF);
+      slocal_coor[0] = s;
+      for (int s4d = 1; s4d< dimF; s4d++) slocal_coor[s4d] = local_coor[s4d-1];
+      int sF = Bgrid->oIndexReduced(slocal_coor);  
+      assert(sF < Bgrid->oSites());
+
+      extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), vres); 
+      // sum across the 5d dimension
+      for (auto v : vres) scalar_object += v;  
+      }
+      tmp[so].putlane(scalar_object, si);
+      }
+      }
+      PokeIndex<LorentzIndex>(mat, tmp, mu);
+    */
+  }
+};
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // R: vComplex, i.e. whichever precision that type has; CoeffReal options
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // F: vComplexF (float); CoeffReal options
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // D: vComplexD (double); CoeffReal options
+ // Same element types with the CoeffRealHalfComms options (suffixes RL/FH/DF).
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // vComplex (whichever prec), HalfComms options
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // vComplexF (float), HalfComms options
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // vComplexD (double), HalfComms options
+ // "Z" prefix: the CoeffComplex options replace CoeffReal.
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // vComplex (whichever prec), CoeffComplex options
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // vComplexF (float), CoeffComplex options
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // vComplexD (double), CoeffComplex options
+ // "Z" prefix with the CoeffComplexHalfComms options (suffixes RL/FH/DF).
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // vComplex (whichever prec), complex HalfComms options
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // vComplexF (float), complex HalfComms options
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // vComplexD (double), complex HalfComms options
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@ -23,10 +23,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_H
-#define  GRID_QCD_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Explicit explicit template instantiation is still required in the .cc files
@ -50,12 +49,17 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////

 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>     // 4d wilson like
+NAMESPACE_CHECK(Wilson);
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
+NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+NAMESPACE_CHECK(Wilson5D);

 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+NAMESPACE_CHECK(Staggered);

 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
@ -63,7 +67,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 #include <Grid/qcd/action/fermion/MobiusFermion.h>
 #include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 #include <Grid/qcd/action/fermion/ZMobiusFermion.h>
-#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
+NAMESPACE_CHECK(DomainWall);
+
 #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
@ -75,6 +80,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+NAMESPACE_CHECK(Overlap);
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
@ -84,14 +90,17 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 // Fourier accelerated Pauli Villars inverse support
 ///////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/WilsonTMFermion5D.h>   
+NAMESPACE_CHECK(WilsonTM5);

 ////////////////////////////////////////////////////////////////////////////////
 // Move this group to a DWF specific tools/algorithms subdir? 
 ////////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
 #include <Grid/qcd/action/fermion/FourierAcceleratedPV.h>
 #include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
 #include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 #include <Grid/qcd/action/fermion/MADWF.h>
+NAMESPACE_CHECK(DWFutils);

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@ -99,8 +108,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 // Cayley 5d
-namespace Grid {
-  namespace QCD {
+NAMESPACE_BEGIN(Grid);

 typedef WilsonFermion<WilsonImplR> WilsonFermionR;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@ -186,46 +194,6 @@ typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
 typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;

 // Ls vectorised
-typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
-typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
-typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
-
-typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
-typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
-typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
-
-typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;
-
-typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
-typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;
-
-typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
-typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
-typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
-
-typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
-typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
-typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
-
-typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
-typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
-typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;
-
-typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
-typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
-typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;
-
-typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
-typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
-typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
-
-typedef ZMobiusFermion<ZDomainWallVec5dImplRL> ZMobiusFermionVec5dRL;
-typedef ZMobiusFermion<ZDomainWallVec5dImplFH> ZMobiusFermionVec5dFH;
-typedef ZMobiusFermion<ZDomainWallVec5dImplDF> ZMobiusFermionVec5dDF;
-
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@ -318,12 +286,13 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

+#ifndef GRID_NVCC
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
+#endif

-
-  }}
+NAMESPACE_END(Grid);

 ////////////////////
 // Scalar QED actions
@ -332,4 +301,4 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion
 #include <Grid/qcd/action/scalar/Scalar.h>
 #include <Grid/qcd/action/gauge/Photon.h>

-#endif
+
--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@ -36,58 +36,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 // Fermion prereqs
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+NAMESPACE_CHECK(Compressor);
 #include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
+NAMESPACE_CHECK(FermionOperatorImpl);
 #include <Grid/qcd/action/fermion/FermionOperator.h>
+NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
-
-#define FermOpStaggeredTemplateInstantiate(A) \
-  template class A<StaggeredImplF>; \
-  template class A<StaggeredImplD>; 
-
-#define FermOpStaggeredVec5dTemplateInstantiate(A) \
-  template class A<StaggeredVec5dImplF>; \
-  template class A<StaggeredVec5dImplD>; 
-
-#define FermOp4dVecTemplateInstantiate(A) \
-  template class A<WilsonImplF>;		\
-  template class A<WilsonImplD>;		\
-  template class A<ZWilsonImplF>;		\
-  template class A<ZWilsonImplD>;		\
-  template class A<GparityWilsonImplF>;		\
-  template class A<GparityWilsonImplD>;		\
-  template class A<WilsonImplFH>;		\
-  template class A<WilsonImplDF>;		\
-  template class A<ZWilsonImplFH>;		\
-  template class A<ZWilsonImplDF>;		\
-  template class A<GparityWilsonImplFH>;		\
-  template class A<GparityWilsonImplDF>;		
-
-
-#define AdjointFermOpTemplateInstantiate(A) \
-  template class A<WilsonAdjImplF>; \
-  template class A<WilsonAdjImplD>; 
-
-#define TwoIndexFermOpTemplateInstantiate(A) \
-  template class A<WilsonTwoIndexSymmetricImplF>; \
-  template class A<WilsonTwoIndexSymmetricImplD>; \
-  template class A<WilsonTwoIndexAntiSymmetricImplF>; \
-  template class A<WilsonTwoIndexAntiSymmetricImplD>;
-
-#define FermOp5dVecTemplateInstantiate(A) \
-  template class A<DomainWallVec5dImplF>;	\
-  template class A<DomainWallVec5dImplD>;	\
-  template class A<ZDomainWallVec5dImplF>;	\
-  template class A<ZDomainWallVec5dImplD>;	\
-  template class A<DomainWallVec5dImplFH>;	\
-  template class A<DomainWallVec5dImplDF>;	\
-  template class A<ZDomainWallVec5dImplFH>;	\
-  template class A<ZDomainWallVec5dImplDF>;	
-
-#define FermOpTemplateInstantiate(A) \
- FermOp4dVecTemplateInstantiate(A) \
- FermOp5dVecTemplateInstantiate(A) 
-
-#define GparityFermOpTemplateInstantiate(A) 
+NAMESPACE_CHECK(Kernels);

 #endif
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -26,86 +26,87 @@ Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_OPERATOR_H
-#define  GRID_QCD_FERMION_OPERATOR_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
-
-    ////////////////////////////////////////////////////////////////
-    // Allow to select  between gauge representation rank bc's, flavours etc.
-    // and single/double precision.
-    ////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+// Allow to select  between gauge representation rank bc's, flavours etc.
+// and single/double precision.
+////////////////////////////////////////////////////////////////
    
-    template<class Impl>
-    class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
-    {
-    public:
+template<class Impl>
+class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
+{
+public:

-      INHERIT_IMPL_TYPES(Impl);
+  INHERIT_IMPL_TYPES(Impl);

-      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
-      virtual ~FermionOperator(void) = default;
+  FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
+  virtual ~FermionOperator(void) = default;

-      virtual FermionField &tmp(void) = 0;
+  virtual FermionField &tmp(void) = 0;

-      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
-      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
+  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
+  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

-      virtual GridBase *FermionGrid(void)         =0;
-      virtual GridBase *FermionRedBlackGrid(void) =0;
-      virtual GridBase *GaugeGrid(void)           =0;
-      virtual GridBase *GaugeRedBlackGrid(void)   =0;
+  virtual GridBase *FermionGrid(void)         =0;
+  virtual GridBase *FermionRedBlackGrid(void) =0;
+  virtual GridBase *GaugeGrid(void)           =0;
+  virtual GridBase *GaugeRedBlackGrid(void)   =0;

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out)=0;
-      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out)=0;
+  virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;

-      // half checkerboard operaions
-      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
-      virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out)=0;
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out)=0;
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out)=0;
+  // half checkerboard operations
+  virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
+  virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out)=0;
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out)=0;
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out)=0;

-      // non-hermitian hopping term; half cb or both
-      virtual void Dhop  (const FermionField &in, FermionField &out,int dag)=0;
-      virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
-      virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
-      virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D
+  // non-hermitian hopping term; half cb or both
+  virtual void Dhop  (const FermionField &in, FermionField &out,int dag)=0;
+  virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
+  virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
+  virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDeriv(mat,U,V,dag);};
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivOE(mat,U,V,dag);};
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivEO(mat,U,V,dag);};
-      virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;}; // Clover can override these
-      virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;};
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDeriv(mat,U,V,dag);};
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivOE(mat,U,V,dag);};
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivEO(mat,U,V,dag);};
+  virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=Zero();}; // Clover can override these
+  virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=Zero();};

-      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
-      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
-      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;


-      virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
-      virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
+  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac


      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

-      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
-	FFT theFFT((GridCartesian *) in._grid);
+      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) 
+      {
+	FFT theFFT((GridCartesian *) in.Grid());

-	FermionField in_k(in._grid);
-	FermionField prop_k(in._grid);
+	typedef typename Simd::scalar_type Scalar;
+
+	FermionField in_k(in.Grid());
+	FermionField prop_k(in.Grid());

 	//phase for boundary condition
-	ComplexField coor(in._grid);
-	ComplexField ph(in._grid);  ph = zero;
-	FermionField in_buf(in._grid); in_buf = zero;
+	ComplexField coor(in.Grid());
+	ComplexField ph(in.Grid());  ph = Zero();
+	FermionField in_buf(in.Grid()); in_buf = Zero();
+
 	Scalar ci(0.0,1.0);
 	assert(twist.size() == Nd);//check that twist is Nd
 	assert(boundary.size() == Nd);//check that boundary conditions is Nd
@ -113,7 +114,7 @@ namespace Grid {
 	{
          LatticeCoordinate(coor, nu);
 	  double boundary_phase = ::acos(real(boundary[nu]));
-	  ph = ph + boundary_phase*coor*((1./(in._grid->_fdimensions[nu])));
+	  ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu])));
 	  //momenta for propagator shifted by twist+boundary
 	  twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
 	}
@ -124,43 +125,42 @@ namespace Grid {
 	theFFT.FFT_all_dim(out,prop_k,FFT::backward);

 	//phase for boundary condition
-	out = out * exp(ci*ph);
+        out = out * exp(Scalar(2.0*M_PI)*ci*ph);

      };

      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
-		std::vector<Complex> boundary;
-		for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
-		std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-	        FreePropagator(in,out,mass,boundary,twist);
+	std::vector<Complex> boundary;
+	for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
+	std::vector<double> twist(Nd,0.0); //default: periodic boundaries in all directions
+	FreePropagator(in,out,mass,boundary,twist);
      };

-      ///////////////////////////////////////////////
-      // Updates gauge field during HMC
-      ///////////////////////////////////////////////
-      virtual void ImportGauge(const GaugeField & _U)=0;
+  ///////////////////////////////////////////////
+  // Updates gauge field during HMC
+  ///////////////////////////////////////////////
+  virtual void ImportGauge(const GaugeField & _U)=0;

-      //////////////////////////////////////////////////////////////////////
-      // Conserved currents, either contract at sink or insert sequentially.
-      //////////////////////////////////////////////////////////////////////
-
-      virtual void ContractConservedCurrent(PropagatorField &q_in_1,
-                                            PropagatorField &q_in_2,
-                                            PropagatorField &q_out,
-                                            Current curr_type,
-                                            unsigned int mu)=0;
-      virtual void SeqConservedCurrent(PropagatorField &q_in, 
-                                       PropagatorField &q_out,
-                                       Current curr_type,
-                                       unsigned int mu,
-                                       unsigned int tmin, 
-                                       unsigned int tmax,
-                                       ComplexField &lattice_cmplx)=0;
+  //////////////////////////////////////////////////////////////////////
+  // Conserved currents, either contract at sink or insert sequentially.
+  //////////////////////////////////////////////////////////////////////
+  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
+					PropagatorField &q_in_2,
+					PropagatorField &q_out,
+					Current curr_type,
+					unsigned int mu)=0;
+  virtual void SeqConservedCurrent(PropagatorField &q_in, 
+				   PropagatorField &q_out,
+				   Current curr_type,
+				   unsigned int mu,
+				   unsigned int tmin, 
+				   unsigned int tmax,
+				   ComplexField &lattice_cmplx)=0;

      // Only reimplemented in Wilson5D 
      // Default to just a zero correlation function
-      virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=zero; };
-      virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=zero; };
+  virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=Zero(); };
+  virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=Zero(); };

      ///////////////////////////////////////////////
      // Physical field import/export
@ -183,9 +183,7 @@ namespace Grid {
      {
 	exported=solution;
      };
-    };
+};

-  }
-}
+NAMESPACE_END(Grid);

-#endif
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
--- a/Grid/qcd/action/fermion/FourierAcceleratedPV.h
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@ -28,8 +28,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);

  template<typename M>
    void get_real_const_bc(M& m, RealD& _b, RealD& _c) {
@ -63,8 +63,8 @@ class FourierAcceleratedPV {
   : dwfPV(_dwfPV), Umu(_Umu), cg(_cg), group_in_s(_group_in_s) 
  {
    assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
-    grid5D = QCD::SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu._grid);
-    gridRB5D = QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu._grid);
+    grid5D   = SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu.Grid());
+    gridRB5D = SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu.Grid());
  }

  void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
@ -72,13 +72,13 @@ class FourierAcceleratedPV {
    GridStopWatch gsw1, gsw2;

    typedef typename Vi::scalar_type Coeff_t;
-    int Ls = dst._grid->_fdimensions[0];
+    int Ls = dst.Grid()->_fdimensions[0];

-    Vi _tmp(dst._grid);
+    Vi _tmp(dst.Grid());
    double phase = M_PI / (double)Ls;
    Coeff_t bzero(0.0,0.0);

-    FFT theFFT((GridCartesian*)dst._grid);
+    FFT theFFT((GridCartesian*)dst.Grid());

    if (!forward) {
      gsw1.Start();
@ -115,7 +115,7 @@ class FourierAcceleratedPV {
    std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;

    typedef typename Vi::scalar_type Coeff_t;
-    int Ls = _dst._grid->_fdimensions[0];
+    int Ls = _dst.Grid()->_fdimensions[0];

    GridStopWatch gswT;
    gswT.Start();
@ -126,12 +126,12 @@ class FourierAcceleratedPV {
    
    // U(true) Rightinv TMinv U(false) = Minv

-    Vi _src_diag(_dst._grid);
+    Vi _src_diag(_dst.Grid());
    Vi _src_diag_slice(dwfPV.GaugeGrid());
    Vi _dst_diag_slice(dwfPV.GaugeGrid());
    Vi _src_diag_slices(grid5D);
    Vi _dst_diag_slices(grid5D);
-    Vi _dst_diag(_dst._grid);
+    Vi _dst_diag(_dst.Grid());

    rotatePV(_src,_src_diag,false);

@ -163,7 +163,7 @@ class FourierAcceleratedPV {
      for (int sidx=0;sidx<group_in_s;sidx++) {

 	int s = sgroup*group_in_s + sidx;
-	int sprime = Ls-s-1;
+	//	int sprime = Ls-s-1;

 	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
 	RealD cosp = ::cos(phase);
@ -196,7 +196,7 @@ class FourierAcceleratedPV {

      GridStopWatch gsw;
      gsw.Start();
-      _dst_diag_slices = zero; // zero guess
+      _dst_diag_slices = Zero(); // zero guess
      sol(tm,_src_diag_slices,_dst_diag_slices);
      gsw.Stop();
      std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
@ -212,7 +212,7 @@ class FourierAcceleratedPV {

 	// now rotate with inverse of
 	Coeff_t pA = b + c*cosp;
-	Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
+	Coeff_t pB = - Coeff_t(0.0,1.0)*Coeff_t(c*sinp);
 	Coeff_t pABden = pA*pA - pB*pB;
 	// (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
      
@ -234,4 +234,5 @@ class FourierAcceleratedPV {
  }

 };
-}}
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@ -0,0 +1,321 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/GparityWilsonImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
+public:
+
+ static const int Dimension = Representation::Dimension;
+ static const bool isFundamental = Representation::isFundamental;
+ static const int Nhcs = Options::Nhcs;
+ static const bool LsVectorised=false;
+
+ typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
+ INHERIT_GIMPL_TYPES(Gimpl);
+ 
+ typedef typename Options::_Coeff_t Coeff_t;
+ typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
+      
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
+ template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
+
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+
+  typedef Lattice<SiteSpinor> FermionField;
+  typedef Lattice<SitePropagator> PropagatorField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+ 
+  typedef GparityWilsonImplParams ImplParams;
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+      
+  ImplParams Params;
+
+  GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
+
+  // provide the multiply by link that is differentiated between Gparity (with
+  // flavour index) and non-Gparity
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, 
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, 
+					  int mu) 
+  {
+    assert(0);
+  } 
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, 
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, 
+					  int mu, 
+					  StencilEntry *SE,
+					  StencilView &St) 
+  {
+    int direction = St._directions[mu];
+    int distance  = St._distances[mu];
+    int ptype     = St._permute_type[mu];
+    int sl        = St._simd_layout[direction];
+    Coordinate icoor;
+
+#ifdef __CUDA_ARCH__
+    _Spinor tmp;
+
+    const int Nsimd =SiteDoubledGaugeField::Nsimd();
+    int s = SIMTlane(Nsimd);
+    St.iCoorFromIindex(icoor,s);
+
+    int mmu = mu % Nd;
+    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+      
+      int permute_lane = (sl==1) 
+    	|| ((distance== 1)&&(icoor[direction]==1))
+	|| ((distance==-1)&&(icoor[direction]==0));
+
+      if ( permute_lane ) { 
+	tmp(0) = chi(1);
+	tmp(1) = chi(0);
+      } else {
+	tmp(0) = chi(0);
+	tmp(1) = chi(1);
+      }
+
+      auto UU0=coalescedRead(U(0)(mu));
+      auto UU1=coalescedRead(U(1)(mu));
+
+      mult(&phi(0),&UU0,&tmp(0));
+      mult(&phi(1),&UU1,&tmp(1));
+
+    } else {
+
+      auto UU0=coalescedRead(U(0)(mu));
+      auto UU1=coalescedRead(U(1)(mu));
+
+      mult(&phi(0),&UU0,&chi(0));
+      mult(&phi(1),&UU1,&chi(1));
+
+    }
+
+#else
+    typedef _Spinor vobj;
+    typedef typename SiteHalfSpinor::scalar_object sobj;
+    typedef typename SiteHalfSpinor::vector_type   vector_type;
+	
+    vobj vtmp;
+    sobj stmp;
+        
+    const int Nsimd =vector_type::Nsimd();
+    
+    // Fixme X.Y.Z.T hardcode in stencil
+    int mmu = mu % Nd;
+        
+    // assert our assumptions
+    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
+    assert((sl == 1) || (sl == 2));
+
+    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+
+      if ( sl == 2 ) {
+       
+	ExtractBuffer<sobj> vals(Nsimd);
+
+	extract(chi,vals);
+	for(int s=0;s<Nsimd;s++){
+
+	  St.iCoorFromIindex(icoor,s);
+              
+	  assert((icoor[direction]==0)||(icoor[direction]==1));
+              
+	  int permute_lane;
+	  if ( distance == 1) {
+	    permute_lane = icoor[direction]?1:0;
+	  } else {
+	    permute_lane = icoor[direction]?0:1;
+	  }
+              
+	  if ( permute_lane ) { 
+	    stmp(0) = vals[s](1);
+	    stmp(1) = vals[s](0);
+	    vals[s] = stmp;
+	  }
+	}
+	merge(vtmp,vals);
+            
+      } else { 
+	vtmp(0) = chi(1);
+	vtmp(1) = chi(0);
+      }
+      mult(&phi(0),&U(0)(mu),&vtmp(0));
+      mult(&phi(1),&U(1)(mu),&vtmp(1));
+     
+    } else { 
+      mult(&phi(0),&U(0)(mu),&chi(0));
+      mult(&phi(1),&U(1)(mu),&chi(1));
+    }
+#endif   
+  }
+
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    reg = memory;
+  }
+
+  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+  {
+    conformable(Uds.Grid(),GaugeGrid);
+    conformable(Umu.Grid(),GaugeGrid);
+   
+    GaugeLinkField Utmp (GaugeGrid);
+    GaugeLinkField U    (GaugeGrid);
+    GaugeLinkField Uconj(GaugeGrid);
+   
+    Lattice<iScalar<vInteger> > coor(GaugeGrid);
+        
+    for(int mu=0;mu<Nd;mu++){
+          
+      LatticeCoordinate(coor,mu);
+          
+      U     = PeekIndex<LorentzIndex>(Umu,mu);
+      Uconj = conjugate(U);
+     
+      // This phase could come from a simple bc 1,1,-1,1 ..
+      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
+      if ( Params.twists[mu] ) { 
+	Uconj = where(coor==neglink,-Uconj,Uconj);
+      }
+	  
+      auto U_v = U.View();
+      auto Uds_v = Uds.View();
+      auto Uconj_v = Uconj.View();
+      auto Utmp_v= Utmp.View();
+      thread_foreach(ss,U_v,{
+	Uds_v[ss](0)(mu) = U_v[ss]();
+	Uds_v[ss](1)(mu) = Uconj_v[ss]();
+      });
+          
+      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
+      Uconj = adj(Cshift(Uconj,mu,-1));
+ 
+      Utmp = U;
+      if ( Params.twists[mu] ) { 
+	Utmp = where(coor==0,Uconj,Utmp);
+      }
+
+      thread_foreach(ss,Utmp_v,{
+	Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
+      });
+          
+      Utmp = Uconj;
+      if ( Params.twists[mu] ) { 
+	Utmp = where(coor==0,U,Utmp);
+      }
+	  
+      thread_foreach(ss,Utmp_v,{
+        Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
+      });
+          
+    }
+  }
+      
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
+
+    // DhopDir provides U or Uconj depending on coor/flavour.
+    GaugeLinkField link(mat.Grid());
+    // use lorentz for flavour as hack.
+    auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
+    auto link_v = link.View();
+    auto tmp_v = tmp.View();
+    thread_foreach(ss,tmp_v,{
+      link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
+    });
+    PokeIndex<LorentzIndex>(mat, link, mu);
+    return;
+  }
+      
+ inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+   //mat = outerProduct(Btilde, A);
+   assert(0);
+  }
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0);
+    /*
+    auto tmp = TraceIndex<SpinIndex>(P);
+    parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+      mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
+    }
+    */
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0);
+  }
+  
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+
+    int Ls = Btilde.Grid()->_fdimensions[0];
+        
+    GaugeLinkField tmp(mat.Grid());
+    tmp = Zero();
+    auto tmp_v = tmp.View();
+    auto Atilde_v = Atilde.View();
+    auto Btilde_v = Btilde.View();
+    thread_for(ss,tmp.Grid()->oSites(),{
+      for (int s = 0; s < Ls; s++) {
+	int sF = s + Ls * ss;
+	auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+	tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      }
+    });
+    PokeIndex<LorentzIndex>(mat, tmp, mu);
+    return;
+  }
+  
+};
+
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
+ 
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@ -25,16 +25,14 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef GRID_QCD_IMPR_STAG_FERMION_H
 #define GRID_QCD_IMPR_STAG_FERMION_H

-namespace Grid {
-
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 class ImprovedStaggeredFermionStatic {
- public:
+public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static const int npoint = 16;
@ -42,7 +40,7 @@ class ImprovedStaggeredFermionStatic {

 template <class Impl>
 class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedStaggeredFermionStatic {
- public:
+public:
  INHERIT_IMPL_TYPES(Impl);
  typedef StaggeredKernels<Impl> Kernels;

@ -139,7 +137,7 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS

  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
-  void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat);
+  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
@ -151,7 +149,7 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
  ///////////////////////////////////////////////////////////////

  //    protected:
- public:
+public:
  // any other parameters of action ???
  virtual int   isTrivialEE(void) { return 1; };
  virtual RealD Mass(void) { return mass; }
@ -188,11 +186,11 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
                                PropagatorField &q_out,
                                Current curr_type,
                                unsigned int mu);
-  void SeqConservedCurrent(PropagatorField &q_in, 
+  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
-                           Current curr_type, 
-                           unsigned int mu,
-                           unsigned int tmin, 
+                           Current curr_type,
+                           unsigned int mu, 
+                           unsigned int tmin,
                           unsigned int tmax,
 			   ComplexField &lattice_cmplx);
 };
@ -200,6 +198,6 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;

-}
-}
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@ -1,5 +1,5 @@

-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -25,101 +25,99 @@ Author: AzusaYamaguchi <ayamaguc@staffmail.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
-#define  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  ////////////////////////////////////////////////////////////////////////////////
-  // This is the 4d red black case appropriate to support
-  ////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// This is the 4d red black case appropriate to support
+////////////////////////////////////////////////////////////////////////////////

-    class ImprovedStaggeredFermion5DStatic { 
-    public:
-      // S-direction is INNERMOST and takes no part in the parity.
-      static const std::vector<int> directions;
-      static const std::vector<int> displacements;
-      const int npoint = 16;
-    };
+class ImprovedStaggeredFermion5DStatic { 
+public:
+  // S-direction is INNERMOST and takes no part in the parity.
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+  const int npoint = 16;
+};

-    template<class Impl>
-    class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public ImprovedStaggeredFermion5DStatic 
-    {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-      typedef StaggeredKernels<Impl> Kernels;
+template<class Impl>
+class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public ImprovedStaggeredFermion5DStatic 
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef StaggeredKernels<Impl> Kernels;

-      FermionField _tmp;
-      FermionField &tmp(void) { return _tmp; }
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }

-      ////////////////////////////////////////
-      // Performance monitoring
-      ////////////////////////////////////////
-      void Report(void);
-      void ZeroCounters(void);
-      double DhopTotalTime;
-      double DhopCalls;
-      double DhopCommTime;
-      double DhopComputeTime;
+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
      double DhopComputeTime2;
      double DhopFaceTime;

-      ///////////////////////////////////////////////////////////////
-      // Implement the abstract base
-      ///////////////////////////////////////////////////////////////
-      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
-      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
-      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
-      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+  GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+  GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+  GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}

-      // full checkerboard operations; leave unimplemented as abstract for now
-      RealD  M    (const FermionField &in, FermionField &out);
-      RealD  Mdag (const FermionField &in, FermionField &out);
+  // full checkerboard operations; leave unimplemented as abstract for now
+  RealD  M    (const FermionField &in, FermionField &out);
+  RealD  Mdag (const FermionField &in, FermionField &out);

-      // half checkerboard operations
-      void   Meooe       (const FermionField &in, FermionField &out);
-      void   Mooee       (const FermionField &in, FermionField &out);
-      void   MooeeInv    (const FermionField &in, FermionField &out);
+  // half checkerboard operations
+  void   Meooe       (const FermionField &in, FermionField &out);
+  void   Mooee       (const FermionField &in, FermionField &out);
+  void   MooeeInv    (const FermionField &in, FermionField &out);

-      void   MeooeDag    (const FermionField &in, FermionField &out);
-      void   MooeeDag    (const FermionField &in, FermionField &out);
-      void   MooeeInvDag (const FermionField &in, FermionField &out);
+  void   MeooeDag    (const FermionField &in, FermionField &out);
+  void   MooeeDag    (const FermionField &in, FermionField &out);
+  void   MooeeInvDag (const FermionField &in, FermionField &out);

-      void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
-      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

-      // These can be overridden by fancy 5d chiral action
-      void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  // These can be overridden by fancy 5d chiral action
+  void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      // Implement hopping term non-hermitian hopping term; half cb or both
-      void Dhop  (const FermionField &in, FermionField &out,int dag);
-      void DhopOE(const FermionField &in, FermionField &out,int dag);
-      void DhopEO(const FermionField &in, FermionField &out,int dag);
+  // Implement the non-hermitian hopping term; half cb or both
+  void Dhop  (const FermionField &in, FermionField &out,int dag);
+  void DhopOE(const FermionField &in, FermionField &out,int dag);
+  void DhopEO(const FermionField &in, FermionField &out,int dag);

    
-    ///////////////////////////////////////////////////////////////
-    // New methods added 
-    ///////////////////////////////////////////////////////////////
-    void DerivInternal(StencilImpl & st,
-		       DoubledGaugeField & U,
-		       DoubledGaugeField & UUU,
-		       GaugeField &mat,
-		       const FermionField &A,
-		       const FermionField &B,
-		       int dag);
+  ///////////////////////////////////////////////////////////////
+  // New methods added 
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl & st,
+		     DoubledGaugeField & U,
+		     DoubledGaugeField & UUU,
+		     GaugeField &mat,
+		     const FermionField &A,
+		     const FermionField &B,
+		     int dag);
    
-    void DhopInternal(StencilImpl & st,
-		      LebesgueOrder &lo,
-		      DoubledGaugeField &U,
-		      DoubledGaugeField &UUU,
-		      const FermionField &in, 
-		      FermionField &out,
-		      int dag);
+  void DhopInternal(StencilImpl & st,
+		    LebesgueOrder &lo,
+		    DoubledGaugeField &U,
+		    DoubledGaugeField &UUU,
+		    const FermionField &in, 
+		    FermionField &out,
+		    int dag);
    
    void DhopInternalOverlappedComms(StencilImpl & st,
 		      LebesgueOrder &lo,
@ -138,17 +136,17 @@ namespace QCD {
 		      int dag);
    
    
-    // Constructors
+  // Constructors
    ////////////////////////////////////////////////////////////////////////////////////////////////
    // Grid internal interface -- Thin link and fat link, with coefficients
    ////////////////////////////////////////////////////////////////////////////////////////////////
-    ImprovedStaggeredFermion5D(GaugeField &_Uthin,
-			       GaugeField &_Ufat,
-			       GridCartesian         &FiveDimGrid,
-			       GridRedBlackCartesian &FiveDimRedBlackGrid,
-			       GridCartesian         &FourDimGrid,
-			       GridRedBlackCartesian &FourDimRedBlackGrid,
-			       double _mass,
+  ImprovedStaggeredFermion5D(GaugeField &_Uthin,
+			     GaugeField &_Ufat,
+			     GridCartesian         &FiveDimGrid,
+			     GridRedBlackCartesian &FiveDimRedBlackGrid,
+			     GridCartesian         &FourDimGrid,
+			     GridRedBlackCartesian &FourDimRedBlackGrid,
+			     double _mass,
 			       RealD _c1, RealD _c2,RealD _u0,
 			       const ImplParams &p= ImplParams());
    ////////////////////////////////////////////////////////////////////////////////////////////////
@ -160,11 +158,11 @@ namespace QCD {
 			       GridRedBlackCartesian &FourDimRedBlackGrid,
 			       double _mass,
 			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
-			       const ImplParams &p= ImplParams());
-
+			     const ImplParams &p= ImplParams());
+    
    // DoubleStore gauge field in operator
    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
-    void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat);
+  void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
    // Give a reference; can be used to do an assignment or copy back out after import
@ -173,62 +171,61 @@ namespace QCD {
    DoubledGaugeField &GetUUU(void) { return UUUmu; };
    void CopyGaugeCheckerboards(void);
    
-    ///////////////////////////////////////////////////////////////
-    // Data members require to support the functionality
-    ///////////////////////////////////////////////////////////////
-  public:
-
+  ///////////////////////////////////////////////////////////////
+  // Data members required to support the functionality
+  ///////////////////////////////////////////////////////////////
+public:
+    
    virtual int   isTrivialEE(void) { return 1; };
    virtual RealD Mass(void) { return mass; }
    
-    GridBase *_FourDimGrid;
-    GridBase *_FourDimRedBlackGrid;
-    GridBase *_FiveDimGrid;
-    GridBase *_FiveDimRedBlackGrid;
+  GridBase *_FourDimGrid;
+  GridBase *_FourDimRedBlackGrid;
+  GridBase *_FiveDimGrid;
+  GridBase *_FiveDimRedBlackGrid;
    
-    RealD mass;
-    RealD c1;
-    RealD c2;
-    RealD u0;
-    int Ls;
+  RealD mass;
+  RealD c1;
+  RealD c2;
+  RealD u0;
+  int Ls;
    
-    //Defines the stencils for even and odd
-    StencilImpl Stencil; 
-    StencilImpl StencilEven; 
-    StencilImpl StencilOdd; 
+  //Defines the stencils for even and odd
+  StencilImpl Stencil; 
+  StencilImpl StencilEven; 
+  StencilImpl StencilOdd; 
    
-    // Copy of the gauge field , with even and odd subsets
-    DoubledGaugeField Umu;
-    DoubledGaugeField UmuEven;
-    DoubledGaugeField UmuOdd;
+  // Copy of the gauge field, with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;

-    DoubledGaugeField UUUmu;
-    DoubledGaugeField UUUmuEven;
-    DoubledGaugeField UUUmuOdd;
+  DoubledGaugeField UUUmu;
+  DoubledGaugeField UUUmuEven;
+  DoubledGaugeField UUUmuOdd;
    
-    LebesgueOrder Lebesgue;
-    LebesgueOrder LebesgueEvenOdd;
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
    
-    // Comms buffer
-    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  // Comms buffer
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
-    ///////////////////////////////////////////////////////////////
-    // Conserved current utilities
-    ///////////////////////////////////////////////////////////////
-    void ContractConservedCurrent(PropagatorField &q_in_1,
-                                  PropagatorField &q_in_2,
-                                  PropagatorField &q_out,
-                                  Current curr_type,
-                                  unsigned int mu);
-    void SeqConservedCurrent(PropagatorField &q_in, 
-                             PropagatorField &q_out,
-                             Current curr_type, 
-                             unsigned int mu,
-                             unsigned int tmin, 
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+				PropagatorField &q_in_2,
+				PropagatorField &q_out,
+				Current curr_type,
+				unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+			   PropagatorField &q_out,
+			   Current curr_type,
+			   unsigned int mu, 
+			   unsigned int tmin,
                             unsigned int tmax,
                 	     ComplexField &lattice_cmplx);
-  };
+};

-}}
+NAMESPACE_END(Grid);

-#endif
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@ -27,8 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0>
 inline void convert(const Fieldi &from,Fieldo &to) 
@ -109,7 +108,7 @@ class MADWF
    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;

    defect = b;
-    sol5=zero;
+    sol5=Zero();
    for (int i=0;i<maxiter;i++) {

      ///////////////////////////////////////
@ -122,7 +121,7 @@ class MADWF
      ////////////////////////////////////////////////
      // Solve the inner system with surface term c0
      ////////////////////////////////////////////////
-      ci = zero;  
+      ci = Zero();  
      convert(c0,c0i); // Possible precison change
      InsertSlice(c0i,ci,0, 0);

@ -190,4 +189,4 @@ class MADWF

 };

-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
@ -1,502 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  template<class Impl>
-    MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-      GaugeField            &_Umu,
-      GridCartesian         &FiveDimGrid,
-      GridRedBlackCartesian &FiveDimRedBlackGrid,
-      GridCartesian         &FourDimGrid,
-      GridRedBlackCartesian &FourDimRedBlackGrid,
-      RealD _mq1, RealD _mq2, RealD _mq3,
-      RealD _shift, int _pm, RealD _M5,
-      RealD _b, RealD _c, const ImplParams &p) :
-    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-        _shift, _pm, _M5, _b, _c, p)
-    {
-      int Ls = this->Ls;
-
-      RealD eps = 1.0;
-      Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
-      assert(zdata->n == this->Ls);
-
-      std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
-        ",c=" << _c << ") with Ls=" << Ls << std::endl;
-      this->SetCoefficientsTanh(zdata, _b, _c);
-      std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
-        ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
-        ",pm=" << _pm << ")" << std::endl;
-
-      Approx::zolotarev_free(zdata);
-
-      if(_shift != 0.0){
-        SetCoefficientsPrecondShiftOps();
-      } else {
-        Mooee_shift.resize(Ls, 0.0);
-        MooeeInv_shift_lc.resize(Ls, 0.0);
-        MooeeInv_shift_norm.resize(Ls, 0.0);
-        MooeeInvDag_shift_lc.resize(Ls, 0.0);
-        MooeeInvDag_shift_norm.resize(Ls, 0.0);
-      }
-    }
-
-    /****************************************************************
-     * Additional EOFA operators only called outside the inverter.  
-     * Since speed is not essential, simple axpby-style
-     * implementations should be fine.
-     ***************************************************************/
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-    {
-      int Ls = this->Ls;
-      RealD alpha = this->alpha;
-
-      Din = zero;
-      if((sign == 1) && (dag == 0)) { // \Omega_{+}
-        for(int s=0; s<Ls; ++s){
-          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
-        }
-      } else if((sign == -1) && (dag == 0)) { // \Omega_{-}
-        for(int s=0; s<Ls; ++s){
-          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
-        }
-      } else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
-        for(int sp=0; sp<Ls; ++sp){
-          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
-        }
-      } else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
-        for(int sp=0; sp<Ls; ++sp){
-          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
-        }
-      }
-    }
-
-    // This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
-    // It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-    {
-      int Ls    = this->Ls;
-      RealD b   = 0.5 * ( 1.0 + this->alpha );
-      RealD c   = 0.5 * ( 1.0 - this->alpha );
-      RealD mq1 = this->mq1;
-
-      for(int s=0; s<Ls; ++s){
-        if(s == 0) {
-          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
-          axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
-        } else if(s == (Ls-1)) {
-          axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
-          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
-        } else {
-          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
-          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
-        }
-      }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-      RealD m = this->mq1;
-      RealD c = 0.5 * this->alpha;
-      RealD d = 0.5;
-
-      RealD DtInv_p(0.0), DtInv_m(0.0);
-      RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
-      FermionField tmp(this->FermionGrid());
-
-      for(int s=0; s<Ls; ++s){
-      for(int sp=0; sp<Ls; ++sp){
-
-        DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-        DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-        DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-        DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-
-        if(sp == 0){
-          axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
-          axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
-        } else {
-          axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
-          axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
-        }
-
-      }}
-    }
-
-    /*****************************************************************************************************/
-
-    template<class Impl>
-    RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      FermionField Din(psi._grid);
-
-      this->Meooe5D(psi, Din);
-      this->DW(Din, chi, DaggerNo);
-      axpby(chi, 1.0, 1.0, chi, psi);
-      this->M5D(psi, chi);
-      return(norm2(chi));
-    }
-
-    template<class Impl>
-    RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      FermionField Din(psi._grid);
-
-      this->DW(psi, Din, DaggerYes);
-      this->MeooeDag5D(Din, chi);
-      this->M5Ddag(psi, chi);
-      axpby(chi, 1.0, 1.0, chi, psi);
-      return(norm2(chi));
-    }
-
-    /********************************************************************
-     * Performance critical fermion operators called inside the inverter
-     ********************************************************************/
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag(Ls,1.0);
-      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag(Ls,1.0);
-      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    // half checkerboard operations
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      // coefficients of Mooee
-      std::vector<Coeff_t> diag = this->bee;
-      std::vector<Coeff_t> upper(Ls);
-      std::vector<Coeff_t> lower(Ls);
-      for(int s=0; s<Ls; s++){
-        upper[s] = -this->cee[s];
-        lower[s] = -this->cee[s];
-      }
-      upper[Ls-1] *= -this->mq1;
-      lower[0]    *= -this->mq1;
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-
-      // coefficients of MooeeDag
-      std::vector<Coeff_t> diag = this->bee;
-      std::vector<Coeff_t> upper(Ls);
-      std::vector<Coeff_t> lower(Ls);
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          upper[s] = -this->cee[s+1];
-          lower[s] = this->mq1*this->cee[Ls-1];
-        } else if(s==(Ls-1)) {
-          upper[s] = this->mq1*this->cee[0];
-          lower[s] = -this->cee[s-1];
-        } else {
-          upper[s] = -this->cee[s+1];
-          lower[s] = -this->cee[s-1];
-        }
-      }
-
-      // no shift term
-      if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
-
-      // fused M + shift operation
-      else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
-    }
-
-    /****************************************************************************************/
-
-    // Computes coefficients for applying Cayley preconditioned shift operators
-    //  (Mooee + \Delta) --> Mooee_shift
-    //  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-    //  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-    // For the latter two cases, the operation takes the form
-    //  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-    //      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-    {
-      int   Ls    = this->Ls;
-      int   pm    = this->pm;
-      RealD alpha = this->alpha;
-      RealD k     = this->k;
-      RealD mq1   = this->mq1;
-      RealD shift = this->shift;
-
-      // Initialize
-      Mooee_shift.resize(Ls);
-      MooeeInv_shift_lc.resize(Ls);
-      MooeeInv_shift_norm.resize(Ls);
-      MooeeInvDag_shift_lc.resize(Ls);
-      MooeeInvDag_shift_norm.resize(Ls);
-
-      // Construct Mooee_shift
-      int idx(0);
-      Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
-                  ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-      for(int s=0; s<Ls; ++s){
-        idx = (pm == 1) ? (s) : (Ls-1-s);
-        Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-      }
-
-      // Tridiagonal solve for MooeeInvDag_shift_lc
-      {
-        Coeff_t m(0.0);
-        std::vector<Coeff_t> d = Mooee_shift;
-        std::vector<Coeff_t> u(Ls,0.0);
-        std::vector<Coeff_t> y(Ls,0.0);
-        std::vector<Coeff_t> q(Ls,0.0);
-        if(pm == 1){ u[0] = 1.0; }
-        else{ u[Ls-1] = 1.0; }
-
-        // Tridiagonal matrix algorithm + Sherman-Morrison formula
-        //
-        // We solve
-        //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-        // where Mooee' is the tridiagonal part of Mooee_{+}, and
-        // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-        // so that the outer-product u \otimes v gives the (0,Ls-1)
-        // entry of Mooee_{+}.
-        //
-        // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-        // and then construct the solution to the original system
-        //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-        if(pm == 1){
-          for(int s=1; s<Ls; ++s){
-            m = -this->cee[s] / this->bee[s-1];
-            d[s] -= m*d[s-1];
-            u[s] -= m*u[s-1];
-          }
-        }
-        y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-        q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-        for(int s=Ls-2; s>=0; --s){
-          if(pm == 1){
-            y[s] = d[s] / this->bee[s];
-            q[s] = u[s] / this->bee[s];
-          } else {
-            y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-            q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-          }
-        }
-
-        // Construct MooeeInvDag_shift_lc
-        for(int s=0; s<Ls; ++s){
-          if(pm == 1){
-            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-              (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-          } else {
-            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-              (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-          }
-        }
-
-        // Compute remaining coefficients
-        N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
-        for(int s=0; s<Ls; ++s){
-
-          // MooeeInv_shift_lc
-          if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
-          else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
-
-          // MooeeInv_shift_norm
-          MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
-
-          // MooeeInvDag_shift_norm
-          if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
-          else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
-        }
-      }
-    }
-
-    // Recompute coefficients for a different value of shift constant
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-    {
-      this->shift = new_shift;
-      if(new_shift != 0.0){
-        SetCoefficientsPrecondShiftOps();
-      } else {
-        int Ls = this->Ls;
-        Mooee_shift.resize(Ls,0.0);
-        MooeeInv_shift_lc.resize(Ls,0.0);
-        MooeeInv_shift_norm.resize(Ls,0.0);
-        MooeeInvDag_shift_lc.resize(Ls,0.0);
-        MooeeInvDag_shift_norm.resize(Ls,0.0);
-      }
-    }
-
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-      Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-      int Ls = this->Ls;
-
-      GridBase* grid = this->FermionRedBlackGrid();
-      int LLs = grid->_rdimensions[0];
-
-      if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-      Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-      Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-      for(int s=0; s<Ls; s++){
-        Pplus(s,s)  = this->bee[s];
-        Pminus(s,s) = this->bee[s];
-      }
-
-      for(int s=0; s<Ls-1; s++){
-        Pminus(s,s+1) = -this->cee[s];
-        Pplus(s+1,s) = -this->cee[s+1];
-      }
-
-      Pplus (0,Ls-1) = this->mq1*this->cee[0];
-      Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-
-      if(this->shift != 0.0){
-        RealD c = 0.5 * this->alpha;
-        RealD d = 0.5;
-        RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-        if(this->pm == 1) {
-          for(int s=0; s<Ls; ++s){
-            Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-          }
-        } else {
-          for(int s=0; s<Ls; ++s){
-            Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-          }
-        }
-      }
-
-      Eigen::MatrixXcd PplusMat ;
-      Eigen::MatrixXcd PminusMat;
-
-      if(inv) {
-        PplusMat  = Pplus.inverse();
-        PminusMat = Pminus.inverse();
-      } else {
-        PplusMat  = Pplus;
-        PminusMat = Pminus;
-      }
-
-      if(dag){
-        PplusMat.adjointInPlace();
-        PminusMat.adjointInPlace();
-      }
-
-      typedef typename SiteHalfSpinor::scalar_type scalar_type;
-      const int Nsimd = Simd::Nsimd();
-      Matp.resize(Ls*LLs);
-      Matm.resize(Ls*LLs);
-
-      for(int s2=0; s2<Ls; s2++){
-      for(int s1=0; s1<LLs; s1++){
-        int istride = LLs;
-        int ostride = 1;
-        Simd Vp;
-        Simd Vm;
-        scalar_type *sp = (scalar_type*) &Vp;
-        scalar_type *sm = (scalar_type*) &Vm;
-        for(int l=0; l<Nsimd; l++){
-          if(switcheroo<Coeff_t>::iscomplex()) {
-            sp[l] = PplusMat (l*istride+s1*ostride,s2);
-            sm[l] = PminusMat(l*istride+s1*ostride,s2);
-          } else {
-            // if real
-            scalar_type tmp;
-            tmp = PplusMat (l*istride+s1*ostride,s2);
-            sp[l] = scalar_type(tmp.real(),tmp.real());
-            tmp = PminusMat(l*istride+s1*ostride,s2);
-            sm[l] = scalar_type(tmp.real(),tmp.real());
-          }
-        }
-        Matp[LLs*s2+s1] = Vp;
-        Matm[LLs*s2+s1] = Vm;
-      }}
-  }
-
-  FermOpTemplateInstantiate(MobiusEOFAFermion);
-  GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h
@ -26,108 +26,79 @@ with this program; if not, write to the Free Software Foundation, Inc.,

 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef  GRID_QCD_MOBIUS_EOFA_FERMION_H
 #define  GRID_QCD_MOBIUS_EOFA_FERMION_H

 #include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  template<class Impl>
-  class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
-  {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+template<class Impl>
+class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    public:
-      // Shift operator coefficients for red-black preconditioned Mobius EOFA
-      std::vector<Coeff_t> Mooee_shift;
-      std::vector<Coeff_t> MooeeInv_shift_lc;
-      std::vector<Coeff_t> MooeeInv_shift_norm;
-      std::vector<Coeff_t> MooeeInvDag_shift_lc;
-      std::vector<Coeff_t> MooeeInvDag_shift_norm;
+public:
+  // Shift operator coefficients for red-black preconditioned Mobius EOFA
+  Vector<Coeff_t> Mooee_shift;
+  Vector<Coeff_t> MooeeInv_shift_lc;
+  Vector<Coeff_t> MooeeInv_shift_norm;
+  Vector<Coeff_t> MooeeInvDag_shift_lc;
+  Vector<Coeff_t> MooeeInvDag_shift_norm;

-      virtual void Instantiatable(void) {};
+  virtual void Instantiatable(void) {};

-      // EOFA-specific operations
-      virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
-      virtual void  Dtilde           (const FermionField& in, FermionField& out);
-      virtual void  DtildeInv        (const FermionField& in, FermionField& out);
+  // EOFA-specific operations
+  virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
+  virtual void  Dtilde           (const FermionField& in, FermionField& out);
+  virtual void  DtildeInv        (const FermionField& in, FermionField& out);

-      // override multiply
-      virtual RealD M                (const FermionField& in, FermionField& out);
-      virtual RealD Mdag             (const FermionField& in, FermionField& out);
+  // override multiply
+  virtual RealD M                (const FermionField& in, FermionField& out);
+  virtual RealD Mdag             (const FermionField& in, FermionField& out);

-      // half checkerboard operations
-      virtual void  Mooee            (const FermionField& in, FermionField& out);
-      virtual void  MooeeDag         (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv         (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);
+  // half checkerboard operations
+  virtual void  Mooee            (const FermionField& in, FermionField& out);
+  virtual void  MooeeDag         (const FermionField& in, FermionField& out);
+  virtual void  MooeeInv         (const FermionField& in, FermionField& out);
+  virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
+  virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
+  virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);

-      virtual void   M5D             (const FermionField& psi, FermionField& chi);
-      virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);
+  virtual void   M5D             (const FermionField& psi, FermionField& chi);
+  virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);

-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  /////////////////////////////////////////////////////
+  // Instantiate different versions depending on Impl
+  /////////////////////////////////////////////////////
+  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-        std::vector<Coeff_t>& shift_coeffs);
+  void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
+		 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+		 Vector<Coeff_t>& shift_coeffs);

-      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
+	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);

-      void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-        std::vector<Coeff_t>& shift_coeffs);
+  void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
+		    Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+		    Vector<Coeff_t>& shift_coeffs);

-      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
+  virtual void RefreshShiftCoefficients(RealD new_shift);

-      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+  // Constructors
+  MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+		    GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+		    RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
+		    RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());

-      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+protected:
+  void SetCoefficientsPrecondShiftOps(void);
+};

-      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
-
-      virtual void RefreshShiftCoefficients(RealD new_shift);
-
-      // Constructors
-      MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
-        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());
-
-    protected:
-      void SetCoefficientsPrecondShiftOps(void);
-  };
-}}
-
-#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
-template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
-template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
-template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
-
-#undef  MOBIUS_EOFA_DPERP_DENSE
-#define MOBIUS_EOFA_DPERP_CACHE
-#undef  MOBIUS_EOFA_DPERP_LINALG
-#define MOBIUS_EOFA_DPERP_VEC
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
@ -1,429 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
-  {
-    int Ls = this->Ls;
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      for(int s=0; s<Ls; s++){
-        auto tmp = psi._odata[0];
-        if(s==0){
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5m(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-    std::vector<Coeff_t> &shift_coeffs)
-  {
-    int Ls = this->Ls;
-    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      for(int s=0; s<Ls; s++){
-        auto tmp = psi._odata[0];
-        if(s==0){
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5m(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
-        else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
-        chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
-  {
-    int Ls = this->Ls;
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      auto tmp = psi._odata[0];
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5p(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-    std::vector<Coeff_t> &shift_coeffs)
-  {
-    int Ls = this->Ls;
-    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      chi[ss+Ls-1] = zero;
-      auto tmp = psi._odata[0];
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5p(tmp, psi._odata[ss+0]);
-          chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
-        else{ spProj5m(tmp, psi._odata[ss+s]); }
-        chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
-  {
-    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
-
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp = psi._odata[0];
-
-      // Apply (L^{\prime})^{-1}
-      chi[ss] = psi[ss]; // chi[0]=psi[0]
-      for(int s=1; s<Ls; s++){
-        spProj5p(tmp, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
-      }
-
-      // L_m^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-        spProj5m(tmp, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
-      }
-
-      // U_m^{-1} D^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-        spProj5p(tmp, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-      // Apply U^{-1}
-      for(int s=Ls-2; s>=0; s--){
-        spProj5m(tmp, chi[ss+s+1]);
-        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
-  {
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp1        = psi._odata[0];
-      auto tmp2        = psi._odata[0];
-      auto tmp2_spProj = psi._odata[0];
-
-      // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
-      chi[ss] = psi[ss]; // chi[0]=psi[0]
-      tmp2 = MooeeInv_shift_lc[0]*psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5p(tmp1, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-        tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
-      }
-      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-      else{ spProj5m(tmp2_spProj, tmp2); }
-
-      // L_m^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-        spProj5m(tmp1, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-      }
-
-      // U_m^{-1} D^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-        spProj5p(tmp1, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
-      }
-      // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-
-      // Apply U^{-1} and add shift term
-      for(int s=Ls-2; s>=0; s--){
-        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-        spProj5m(tmp1, chi[ss+s]);
-        chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
-  {
-    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
-
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp = psi._odata[0];
-
-      // Apply (U^{\prime})^{-dag}
-      chi[ss] = psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5m(tmp, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
-      }
-
-      // U_m^{-\dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5p(tmp, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
-      }
-
-      // L_m^{-\dag} D^{-dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5m(tmp, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-      // Apply L^{-dag}
-      for(int s=Ls-2; s>=0; s--){
-        spProj5p(tmp, chi[ss+s+1]);
-        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
-  {
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp1        = psi._odata[0];
-      auto tmp2        = psi._odata[0];
-      auto tmp2_spProj = psi._odata[0];
-
-      // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
-      chi[ss] = psi[ss];
-      tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5m(tmp1, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
-        tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
-      }
-      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-      else{ spProj5m(tmp2_spProj, tmp2); }
-
-      // U_m^{-\dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5p(tmp1, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
-      }
-
-      // L_m^{-\dag} D^{-dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5m(tmp1, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-      spProj5p(tmp1, chi[ss+Ls-1]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
-
-      // Apply L^{-dag}
-      for(int s=Ls-2; s>=0; s--){
-        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
-        spProj5p(tmp1, chi[ss+s]);
-        chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_CACHE
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
@ -1,184 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  /*
-  * Dense matrix versions of routines
-  */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-  {
-    int Ls = this->Ls;
-    int LLs = psi._grid->_rdimensions[0];
-    int vol = psi._grid->oSites()/LLs;
-
-    int pm      = this->pm;
-    RealD shift = this->shift;
-    RealD alpha = this->alpha;
-    RealD k     = this->k;
-    RealD mq1   = this->mq1;
-
-    chi.checkerboard = psi.checkerboard;
-
-    assert(Ls==LLs);
-
-    Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-    Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
-
-    for(int s=0;s<Ls;s++){
-        Pplus(s,s)  = this->bee[s];
-        Pminus(s,s) = this->bee[s];
-    }
-
-    for(int s=0; s<Ls-1; s++){
-        Pminus(s,s+1) = -this->cee[s];
-    }
-
-    for(int s=0; s<Ls-1; s++){
-        Pplus(s+1,s) = -this->cee[s+1];
-    }
-    Pplus (0,Ls-1) = mq1*this->cee[0];
-    Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
-
-    if(shift != 0.0){
-      Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-      for(int s=0; s<Ls; ++s){
-        if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
-        else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
-      }
-    }
-
-    Eigen::MatrixXd PplusMat ;
-    Eigen::MatrixXd PminusMat;
-
-    if(inv){
-      PplusMat  = Pplus.inverse();
-      PminusMat = Pminus.inverse();
-    } else {
-      PplusMat  = Pplus;
-      PminusMat = Pminus;
-    }
-
-    if(dag){
-      PplusMat.adjointInPlace();
-      PminusMat.adjointInPlace();
-    }
-
-    // For the non-vectorised s-direction this is simple
-
-    for(auto site=0; site<vol; site++){
-
-        SiteSpinor     SiteChi;
-        SiteHalfSpinor SitePplus;
-        SiteHalfSpinor SitePminus;
-
-        for(int s1=0; s1<Ls; s1++){
-            SiteChi = zero;
-            for(int s2=0; s2<Ls; s2++){
-                int lex2 = s2 + Ls*site;
-                if(PplusMat(s1,s2) != 0.0){
-                    spProj5p(SitePplus,psi[lex2]);
-                    accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
-                }
-                if(PminusMat(s1,s2) != 0.0){
-                    spProj5m(SitePminus, psi[lex2]);
-                    accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
-                }
-            }
-            chi[s1+Ls*site] = SiteChi*0.5;
-        }
-    }
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_DENSE
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-    template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
@ -1,290 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-  // Pminus fowards
-  // Pplus  backwards
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
-
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    FermionField tmp(psi._grid);
-
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    axpby_ssp(tmp, czero, tmp, this->MooeeInv_shift_lc[0], psi, 0, 0);
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-      axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply U^{-1} and add shift term
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
-      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
-
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    // Apply (U^{\prime})^{-dagger}
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
-    }
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    FermionField tmp(psi._grid);
-
-    // Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    axpby_ssp(tmp, czero, tmp, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
-      axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply L^{-dagger} and add shift
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
-      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
-    }
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_LINALG
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
@ -1,983 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  /*
-  * Dense matrix versions of routines
-  */
-  // MooeeInv: run MooeeInternal on psi -> chi with the dagger flag off and the inverse flag on.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    const auto dagger_flag  = DaggerNo;
-    const auto inverse_flag = InverseYes;
-    this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-  }
-
-  // MooeeInv_shift: forwards to MooeeInternal with exactly the flags MooeeInv uses
-  // (dagger off, inverse on). NOTE(review): any shift handling would have to live
-  // inside MooeeInternal, which is not visible here -- confirm.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    const auto dagger_flag  = DaggerNo;
-    const auto inverse_flag = InverseYes;
-    this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-  }
-
-  // MooeeInvDag: run MooeeInternal on psi -> chi with both the dagger and the inverse flag on.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    const auto dagger_flag  = DaggerYes;
-    const auto inverse_flag = InverseYes;
-    this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-  }
-
-  // MooeeInvDag_shift: forwards to MooeeInternal with exactly the flags MooeeInvDag uses
-  // (dagger on, inverse on). NOTE(review): any shift handling would have to live
-  // inside MooeeInternal, which is not visible here -- confirm.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    const auto dagger_flag  = DaggerYes;
-    const auto inverse_flag = InverseYes;
-    this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-  }
-
-  template<class Impl> /*
-   * M5D: for every group of LLs consecutive outer sites, stream into chi
-   *   second index 0,1 : diag*phi[v] + lower*psi[v-1]
-   *   second index 2,3 : diag*phi[v] + upper*psi[v+1]
-   * where v-1 / v+1 wrap cyclically inside the group and the wrapped neighbour is
-   * lane-rotated first.  (The two site indices are presumably spin then colour -- confirm.)
-   *   psi   : source of the neighbour terms; its checkerboard is copied onto chi
-   *   phi   : source of the diagonal term; its checkerboard must equal psi's (asserted)
-   *   chi   : output, written with vstream
-   *   lower, diag, upper : coefficient vectors, read at s = o + i*LLs (o < LLs, i < nsimd)
-   * Side effects: increments M5Dcalls and adds the elapsed usecond() to M5Dtime.
-   */
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    GridBase* grid  = psi._grid;
-    int Ls          = this->Ls;
-    int LLs         = grid->_rdimensions[0]; // size of one group of outer sites handled together below
-    const int nsimd = Simd::Nsimd();
-
-    Vector<iSinglet<Simd>> u(LLs); // lane-packed copies of upper / lower / diag: one SIMD word per v
-    Vector<iSinglet<Simd>> l(LLs);
-    Vector<iSinglet<Simd>> d(LLs);
-
-    assert(Ls/LLs == nsimd); // the integer ratio Ls/LLs must equal the SIMD lane count
-    assert(phi.checkerboard == psi.checkerboard);
-
-    chi.checkerboard = psi.checkerboard;
-
-    // just directly address via type pun: each Vector<iSinglet<Simd>> is viewed as a flat scalar_type array (assumes nsimd contiguous scalar_type values per element -- TODO confirm)
-    typedef typename Simd::scalar_type scalar_type;
-    scalar_type* u_p = (scalar_type*) &u[0];
-    scalar_type* l_p = (scalar_type*) &l[0];
-    scalar_type* d_p = (scalar_type*) &d[0];
-
-    for(int o=0; o<LLs; o++){ // outer: position v inside a group
-    for(int i=0; i<nsimd; i++){ //inner: SIMD lane
-      int s   = o + i*LLs; // index into the caller's coefficient vectors
-      int ss  = o*nsimd + i; // flat position: lane i of packed element o
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond(); // opens the timed region; closed by += usecond() at the end
-
-    assert(Nc == 3); // the unrolled kernel below only touches last-index values 0..2
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-      #if 0 // disabled alternative written with spProj5p/m, spRecon5p/m and rotate(); never compiled
-
-        alignas(64) SiteHalfSpinor hp;
-        alignas(64) SiteHalfSpinor hm;
-        alignas(64) SiteSpinor fp;
-        alignas(64) SiteSpinor fm;
-
-        for(int v=0; v<LLs; v++){
-
-          int vp = (v+1)%LLs;
-          int vm = (v+LLs-1)%LLs;
-
-          spProj5m(hp, psi[ss+vp]);
-          spProj5p(hm, psi[ss+vm]);
-
-          if (vp <= v){ rotate(hp, hp, 1); }
-          if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-          hp = 0.5*hp;
-          hm = 0.5*hm;
-
-          spRecon5m(fp, hp);
-          spRecon5p(fm, hm);
-
-          chi[ss+v] = d[v]*phi[ss+v];
-          chi[ss+v] = chi[ss+v] + u[v]*fp;
-          chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-        }
-
-      #else
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]); // matching site of the next group. NOTE(review): index is past oSites()-1 for the last group -- confirm this is harmless
-
-          int vp = (v == LLs-1) ? 0     : v+1; // cyclic successor of v
-          int vm = (v == 0)     ? LLs-1 : v-1; // cyclic predecessor of v
-
-          Simd hp_00 = psi[ss+vp]()(2)(0); // hp_*: second-index 2,3 entries of the v+1 neighbour
-          Simd hp_01 = psi[ss+vp]()(2)(1);
-          Simd hp_02 = psi[ss+vp]()(2)(2);
-          Simd hp_10 = psi[ss+vp]()(3)(0);
-          Simd hp_11 = psi[ss+vp]()(3)(1);
-          Simd hp_12 = psi[ss+vp]()(3)(2);
-
-          Simd hm_00 = psi[ss+vm]()(0)(0); // hm_*: second-index 0,1 entries of the v-1 neighbour
-          Simd hm_01 = psi[ss+vm]()(0)(1);
-          Simd hm_02 = psi[ss+vm]()(0)(2);
-          Simd hm_10 = psi[ss+vm]()(1)(0);
-          Simd hm_11 = psi[ss+vm]()(1)(1);
-          Simd hm_12 = psi[ss+vm]()(1)(2);
-
-          if(vp <= v){ // true only when v+1 wrapped round to 0; tRotate<2> presumably mirrors rotate(hp, hp, 1) of the disabled branch -- confirm
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(vm >= v){ // true only when v-1 wrapped round to LLs-1; presumably mirrors rotate(hm, hm, nsimd-1) -- confirm
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          // Can force these to real arithmetic and save 2x.
-          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00); // rows 0,1: diag*phi + lower*(v-1 neighbour)
-          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00); // rows 2,3: diag*phi + upper*(v+1 neighbour)
-          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00); // write all 12 results to chi[ss+v]
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-        }
-
-      #endif
-    }
-
-    this->M5Dtime += usecond(); // closes the timed region
-  }
-
-  template<class Impl> /*
-   * M5D_shift: the M5D update plus an extra term s[v]*hs on half of the rows, fused into one pass.
-   *   pm == 1   : hs = second-index 2,3 entries of psi[ss+LLs-1]; added to output rows 2,3
-   *   otherwise : hs = second-index 0,1 entries of psi[ss+0];     added to output rows 0,1
-   *   psi   : source of the neighbour and shift terms; its checkerboard is copied onto chi
-   *   phi   : source of the diagonal term; its checkerboard must equal psi's (asserted)
-   *   chi   : output, written with vstream
-   *   lower, diag, upper, shift_coeffs : coefficient vectors, read at s = o + i*LLs
-   * Side effects: increments M5Dcalls and adds the elapsed usecond() to M5Dtime.
-   * NOTE(review): the disabled branch below applies axpby_ssp_pplus for pm == 1, while the
-   * active branch reads the 2,3 entries, i.e. the ones M5D's disabled branch pairs with
-   * spProj5m. Whether the two branches are meant to agree cannot be told from here -- confirm.
-   */
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    #if 0 // disabled alternative: plain M5D followed by a per-s axpby_ssp_pplus / axpby_ssp_pminus loop; never compiled
-
-      this->M5D(psi, phi, chi, lower, diag, upper);
-
-      // FIXME: possible gain from vectorizing shift operation as well?
-      Coeff_t one(1.0);
-      int Ls = this->Ls;
-      for(int s=0; s<Ls; s++){
-        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-      }
-
-    #else
-
-      GridBase* grid  = psi._grid;
-      int Ls          = this->Ls;
-      int LLs         = grid->_rdimensions[0]; // size of one group of outer sites handled together below
-      const int nsimd = Simd::Nsimd();
-
-      Vector<iSinglet<Simd>> u(LLs); // lane-packed copies of upper / lower / diag / shift_coeffs: one SIMD word per v
-      Vector<iSinglet<Simd>> l(LLs);
-      Vector<iSinglet<Simd>> d(LLs);
-      Vector<iSinglet<Simd>> s(LLs);
-
-      assert(Ls/LLs == nsimd); // the integer ratio Ls/LLs must equal the SIMD lane count
-      assert(phi.checkerboard == psi.checkerboard);
-
-      chi.checkerboard = psi.checkerboard;
-
-      // just directly address via type pun: each Vector<iSinglet<Simd>> is viewed as a flat scalar_type array (assumes nsimd contiguous scalar_type values per element -- TODO confirm)
-      typedef typename Simd::scalar_type scalar_type;
-      scalar_type* u_p = (scalar_type*) &u[0];
-      scalar_type* l_p = (scalar_type*) &l[0];
-      scalar_type* d_p = (scalar_type*) &d[0];
-      scalar_type* s_p = (scalar_type*) &s[0];
-
-      for(int o=0; o<LLs; o++){ // outer: position v inside a group
-      for(int i=0; i<nsimd; i++){ //inner: SIMD lane
-        int s   = o + i*LLs; // index into the caller's coefficient vectors; this int shadows the Vector s inside the loop (s_p was taken beforehand)
-        int ss  = o*nsimd + i; // flat position: lane i of packed element o
-        u_p[ss] = upper[s];
-        l_p[ss] = lower[s];
-        d_p[ss] = diag[s];
-        s_p[ss] = shift_coeffs[s];
-      }}
-
-      this->M5Dcalls++;
-      this->M5Dtime -= usecond(); // opens the timed region; closed by += usecond() at the end
-
-      assert(Nc == 3); // the unrolled kernel below only touches last-index values 0..2
-
-      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-        int vs     = (this->pm == 1) ? LLs-1 : 0; // site inside the group that supplies the shift term
-        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0); // hs_*: loaded once per group, outside the v loop
-        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]); // matching site of the next group. NOTE(review): index is past oSites()-1 for the last group -- confirm this is harmless
-
-          int vp = (v == LLs-1) ? 0     : v+1; // cyclic successor of v
-          int vm = (v == 0)     ? LLs-1 : v-1; // cyclic predecessor of v
-
-          Simd hp_00 = psi[ss+vp]()(2)(0); // hp_*: second-index 2,3 entries of the v+1 neighbour
-          Simd hp_01 = psi[ss+vp]()(2)(1);
-          Simd hp_02 = psi[ss+vp]()(2)(2);
-          Simd hp_10 = psi[ss+vp]()(3)(0);
-          Simd hp_11 = psi[ss+vp]()(3)(1);
-          Simd hp_12 = psi[ss+vp]()(3)(2);
-
-          Simd hm_00 = psi[ss+vm]()(0)(0); // hm_*: second-index 0,1 entries of the v-1 neighbour
-          Simd hm_01 = psi[ss+vm]()(0)(1);
-          Simd hm_02 = psi[ss+vm]()(0)(2);
-          Simd hm_10 = psi[ss+vm]()(1)(0);
-          Simd hm_11 = psi[ss+vm]()(1)(1);
-          Simd hm_12 = psi[ss+vm]()(1)(2);
-
-          if(vp <= v){ // true only when v+1 wrapped round to 0
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(this->pm == 1 && vs <= v){ // with vs == LLs-1 this fires only at v == LLs-1; hs_* is rotated in place
-            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-          }
-
-          if(vm >= v){ // true only when v-1 wrapped round to LLs-1
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          if(this->pm == -1 && vs >= v){ // with vs == 0 this fires only at v == 0; hs_* is loop-carried, so the rotated value is what every later v sees. NOTE(review): confirm that is intended
-            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-          }
-
-          // Can force these to real arithmetic and save 2x.
-          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00) // rows 0,1: shift term added only when pm != 1
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00) // rows 2,3: shift term added only when pm == 1
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00); // write all 12 results to chi[ss+v]
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-        }
-      }
-
-      this->M5Dtime += usecond(); // closes the timed region
-
-    #endif
-  }
-
-  template<class Impl> /*
-   * M5Ddag: same structure as M5D with the two row pairs exchanged. For every group of LLs
-   * consecutive outer sites it streams into chi
-   *   second index 0,1 : diag*phi[v] + upper*psi[v+1]
-   *   second index 2,3 : diag*phi[v] + lower*psi[v-1]
-   * where v-1 / v+1 wrap cyclically inside the group and the wrapped neighbour is
-   * lane-rotated first.  (The two site indices are presumably spin then colour -- confirm.)
-   *   psi   : source of the neighbour terms; its checkerboard is copied onto chi
-   *   phi   : source of the diagonal term; its checkerboard must equal psi's (asserted)
-   *   chi   : output, written with vstream
-   *   lower, diag, upper : coefficient vectors, read at s = o + i*LLs (o < LLs, i < nsimd)
-   * Side effects: increments M5Dcalls and adds the elapsed usecond() to M5Dtime.
-   * NOTE(review): unlike M5D there is no assert(Nc == 3) here, although the kernel is unrolled
-   * over last-index values 0..2 in the same way -- confirm whether that is intentional.
-   */
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    GridBase* grid = psi._grid;
-    int Ls  = this->Ls;
-    int LLs = grid->_rdimensions[0]; // size of one group of outer sites handled together below
-    int nsimd = Simd::Nsimd();
-
-    Vector<iSinglet<Simd>> u(LLs); // lane-packed copies of upper / lower / diag: one SIMD word per v
-    Vector<iSinglet<Simd>> l(LLs);
-    Vector<iSinglet<Simd>> d(LLs);
-
-    assert(Ls/LLs == nsimd); // the integer ratio Ls/LLs must equal the SIMD lane count
-    assert(phi.checkerboard == psi.checkerboard);
-
-    chi.checkerboard = psi.checkerboard;
-
-    // just directly address via type pun: each Vector<iSinglet<Simd>> is viewed as a flat scalar_type array (assumes nsimd contiguous scalar_type values per element -- TODO confirm)
-    typedef typename Simd::scalar_type scalar_type;
-    scalar_type* u_p = (scalar_type*) &u[0];
-    scalar_type* l_p = (scalar_type*) &l[0];
-    scalar_type* d_p = (scalar_type*) &d[0];
-
-    for(int o=0; o<LLs; o++){ // outer: position v inside a group
-    for(int i=0; i<nsimd; i++){ //inner: SIMD lane
-      int s  = o + i*LLs; // index into the caller's coefficient vectors
-      int ss = o*nsimd + i; // flat position: lane i of packed element o
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond(); // opens the timed region; closed by += usecond() at the end
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-      #if 0 // disabled alternative written with spProj5p/m, spRecon5p/m and rotate(); never compiled
-
-        alignas(64) SiteHalfSpinor hp;
-        alignas(64) SiteHalfSpinor hm;
-        alignas(64) SiteSpinor fp;
-        alignas(64) SiteSpinor fm;
-
-        for(int v=0; v<LLs; v++){
-
-          int vp = (v+1)%LLs;
-          int vm = (v+LLs-1)%LLs;
-
-          spProj5p(hp, psi[ss+vp]);
-          spProj5m(hm, psi[ss+vm]);
-
-          if(vp <= v){ rotate(hp, hp, 1); }
-          if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-          hp = hp*0.5;
-          hm = hm*0.5;
-          spRecon5p(fp, hp);
-          spRecon5m(fm, hm);
-
-          chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-          chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-        }
-
-      #else
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]); // matching site of the next group. NOTE(review): index is past oSites()-1 for the last group -- confirm this is harmless
-
-          int vp = (v == LLs-1) ? 0     : v+1; // cyclic successor of v
-          int vm = (v == 0    ) ? LLs-1 : v-1; // cyclic predecessor of v
-
-          Simd hp_00 = psi[ss+vp]()(0)(0); // hp_*: second-index 0,1 entries of the v+1 neighbour
-          Simd hp_01 = psi[ss+vp]()(0)(1);
-          Simd hp_02 = psi[ss+vp]()(0)(2);
-          Simd hp_10 = psi[ss+vp]()(1)(0);
-          Simd hp_11 = psi[ss+vp]()(1)(1);
-          Simd hp_12 = psi[ss+vp]()(1)(2);
-
-          Simd hm_00 = psi[ss+vm]()(2)(0); // hm_*: second-index 2,3 entries of the v-1 neighbour
-          Simd hm_01 = psi[ss+vm]()(2)(1);
-          Simd hm_02 = psi[ss+vm]()(2)(2);
-          Simd hm_10 = psi[ss+vm]()(3)(0);
-          Simd hm_11 = psi[ss+vm]()(3)(1);
-          Simd hm_12 = psi[ss+vm]()(3)(2);
-
-          if (vp <= v){ // true only when v+1 wrapped round to 0; tRotate<2> presumably mirrors rotate(hp, hp, 1) of the disabled branch -- confirm
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(vm >= v){ // true only when v-1 wrapped round to LLs-1; presumably mirrors rotate(hm, hm, nsimd-1) -- confirm
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00); // rows 0,1: diag*phi + upper*(v+1 neighbour)
-          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00); // rows 2,3: diag*phi + lower*(v-1 neighbour)
-          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00); // write all 12 results to chi[ss+v]
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-
-        }
-
-      #endif
-
-    }
-
-    this->M5Dtime += usecond(); // closes the timed region
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    #if 0
-
-      this->M5Ddag(psi, phi, chi, lower, diag, upper);
-
-      // FIXME: possible gain from vectorizing shift operation as well?
-      Coeff_t one(1.0);
-      int Ls = this->Ls;
-      for(int s=0; s<Ls; s++){
-        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-      }
-
-    #else
-
-      GridBase* grid = psi._grid;
-      int Ls  = this->Ls;
-      int LLs = grid->_rdimensions[0];
-      int nsimd = Simd::Nsimd();
-
-      Vector<iSinglet<Simd>> u(LLs);
-      Vector<iSinglet<Simd>> l(LLs);
-      Vector<iSinglet<Simd>> d(LLs);
-      Vector<iSinglet<Simd>> s(LLs);
-
-      assert(Ls/LLs == nsimd);
-      assert(phi.checkerboard == psi.checkerboard);
-
-      chi.checkerboard = psi.checkerboard;
-
-      // just directly address via type pun
-      typedef typename Simd::scalar_type scalar_type;
-      scalar_type* u_p = (scalar_type*) &u[0];
-      scalar_type* l_p = (scalar_type*) &l[0];
-      scalar_type* d_p = (scalar_type*) &d[0];
-      scalar_type* s_p = (scalar_type*) &s[0];
-
-      for(int o=0; o<LLs; o++){ // outer
-      for(int i=0; i<nsimd; i++){ //inner
-        int s  = o + i*LLs;
-        int ss = o*nsimd + i;
-        u_p[ss] = upper[s];
-        l_p[ss] = lower[s];
-        d_p[ss] = diag[s];
-        s_p[ss] = shift_coeffs[s];
-      }}
-
-      this->M5Dcalls++;
-      this->M5Dtime -= usecond();
-
-      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-        int vs     = (this->pm == 1) ? LLs-1 : 0;
-        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]);
-
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0    ) ? LLs-1 : v-1;
-
-          Simd hp_00 = psi[ss+vp]()(0)(0);
-          Simd hp_01 = psi[ss+vp]()(0)(1);
-          Simd hp_02 = psi[ss+vp]()(0)(2);
-          Simd hp_10 = psi[ss+vp]()(1)(0);
-          Simd hp_11 = psi[ss+vp]()(1)(1);
-          Simd hp_12 = psi[ss+vp]()(1)(2);
-
-          Simd hm_00 = psi[ss+vm]()(2)(0);
-          Simd hm_01 = psi[ss+vm]()(2)(1);
-          Simd hm_02 = psi[ss+vm]()(2)(2);
-          Simd hm_10 = psi[ss+vm]()(3)(0);
-          Simd hm_11 = psi[ss+vm]()(3)(1);
-          Simd hm_12 = psi[ss+vm]()(3)(2);
-
-          if (vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-
-          if(this->pm == 1 && vs <= v){
-            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-          }
-
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          if(this->pm == -1 && vs >= v){
-            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-          }
-
-          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-
-        }
-
-      }
-
-      this->M5Dtime += usecond();
-
-    #endif
-  }
-
-  #ifdef AVX512
-    #include<simd/Intel512common.h>
-    #include<simd/Intel512avx.h>
-    #include<simd/Intel512single.h>
-  #endif
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
-    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-  {
-    #ifndef AVX512
-      {
-        SiteHalfSpinor BcastP;
-        SiteHalfSpinor BcastM;
-        SiteHalfSpinor SiteChiP;
-        SiteHalfSpinor SiteChiM;
-
-        // Ls*Ls * 2 * 12 * vol flops
-        for(int s1=0; s1<LLs; s1++){
-
-          for(int s2=0; s2<LLs; s2++){
-          for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-            int s = s2 + l*LLs;
-            int lex = s2 + LLs*site;
-
-            if( s2==0 && l==0 ){
-              SiteChiP=zero;
-              SiteChiM=zero;
-            }
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-          }}
-
-          {
-            int lex = s1 + LLs*site;
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-              vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-            }}
-          }
-        }
-      }
-    #else
-      {
-        // pointers
-        //  MASK_REGS;
-        #define Chi_00 %%zmm1
-        #define Chi_01 %%zmm2
-        #define Chi_02 %%zmm3
-        #define Chi_10 %%zmm4
-        #define Chi_11 %%zmm5
-        #define Chi_12 %%zmm6
-        #define Chi_20 %%zmm7
-        #define Chi_21 %%zmm8
-        #define Chi_22 %%zmm9
-        #define Chi_30 %%zmm10
-        #define Chi_31 %%zmm11
-        #define Chi_32 %%zmm12
-
-        #define BCAST0  %%zmm13
-        #define BCAST1  %%zmm14
-        #define BCAST2  %%zmm15
-        #define BCAST3  %%zmm16
-        #define BCAST4  %%zmm17
-        #define BCAST5  %%zmm18
-        #define BCAST6  %%zmm19
-        #define BCAST7  %%zmm20
-        #define BCAST8  %%zmm21
-        #define BCAST9  %%zmm22
-        #define BCAST10 %%zmm23
-        #define BCAST11 %%zmm24
-
-        int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-
-        for(int s1=0; s1<LLs; s1++){
-
-          for(int s2=0; s2<LLs; s2++){
-
-            int lex = s2 + LLs*site;
-            uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-            uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-            uint64_t a2 = (uint64_t) &psi[lex];
-
-            for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-
-              if((s2+l)==0) {
-                asm(
-                      VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-                      VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-                      VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-                      VBCASTCDUP(0,%2,BCAST0)
-                      VBCASTCDUP(1,%2,BCAST1)
-                      VBCASTCDUP(2,%2,BCAST2)
-                      VBCASTCDUP(3,%2,BCAST3)
-                      VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-                      VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-                      VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-                      VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-                      VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-                      VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-                      VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-                      VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-                      VMULMEM(0,%1,BCAST8,Chi_22)
-                      VMULMEM(0,%1,BCAST9,Chi_30)
-                      VMULMEM(0,%1,BCAST10,Chi_31)
-                      VMULMEM(0,%1,BCAST11,Chi_32)
-                      : : "r" (a0), "r" (a1), "r" (a2)                            );
-              } else {
-                asm(
-                      VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-                      VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-                      VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-                      VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-                      VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-                      VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-                      VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-                      VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-                      VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-                      VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-                      VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-                      VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-                      : : "r" (a0), "r" (a1), "r" (a2)                            );
-              }
-
-              a0 = a0 + incr;
-              a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type);
-            }
-          }
-
-          {
-            int lexa = s1+LLs*site;
-            asm (
-               VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-               VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-               VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-               VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-               : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-          }
-        }
-      }
-
-      #undef Chi_00
-      #undef Chi_01
-      #undef Chi_02
-      #undef Chi_10
-      #undef Chi_11
-      #undef Chi_12
-      #undef Chi_20
-      #undef Chi_21
-      #undef Chi_22
-      #undef Chi_30
-      #undef Chi_31
-      #undef Chi_32
-
-      #undef BCAST0
-      #undef BCAST1
-      #undef BCAST2
-      #undef BCAST3
-      #undef BCAST4
-      #undef BCAST5
-      #undef BCAST6
-      #undef BCAST7
-      #undef BCAST8
-      #undef BCAST9
-      #undef BCAST10
-      #undef BCAST11
-
-    #endif
-  };
-
-  // Z-mobius version
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-  {
-    std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-    exit(-1);
-  };
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-  {
-    int Ls  = this->Ls;
-    int LLs = psi._grid->_rdimensions[0];
-    int vol = psi._grid->oSites()/LLs;
-
-    chi.checkerboard = psi.checkerboard;
-
-    Vector<iSinglet<Simd>>   Matp;
-    Vector<iSinglet<Simd>>   Matm;
-    Vector<iSinglet<Simd>>* _Matp;
-    Vector<iSinglet<Simd>>* _Matm;
-
-    //  MooeeInternalCompute(dag,inv,Matp,Matm);
-    if(inv && dag){
-      _Matp = &this->MatpInvDag;
-      _Matm = &this->MatmInvDag;
-    }
-
-    if(inv && (!dag)){
-      _Matp = &this->MatpInv;
-      _Matm = &this->MatmInv;
-    }
-
-    if(!inv){
-      MooeeInternalCompute(dag, inv, Matp, Matm);
-      _Matp = &Matp;
-      _Matm = &Matm;
-    }
-
-    assert(_Matp->size() == Ls*LLs);
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    if(switcheroo<Coeff_t>::iscomplex()){
-      parallel_for(auto site=0; site<vol; site++){
-        MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-      }
-    } else {
-      parallel_for(auto site=0; site<vol; site++){
-        MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_VEC
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-
-    template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-  #endif
-
-}}
--- a/Grid/qcd/action/fermion/MobiusFermion.h
+++ b/Grid/qcd/action/fermion/MobiusFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,57 +24,54 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_MOBIUS_FERMION_H
 #define  GRID_QCD_MOBIUS_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class MobiusFermion : public CayleyFermion5D<Impl>  // CayleyFermion5D whose coefficients are set from the Higham (tanh) approximation and the parameters b, c
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class MobiusFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void) {};
-      // Constructors
-      MobiusFermion(GaugeField &_Umu,
-		    GridCartesian         &FiveDimGrid,
-		    GridRedBlackCartesian &FiveDimRedBlackGrid,
-		    GridCartesian         &FourDimGrid,
-		    GridRedBlackCartesian &FourDimRedBlackGrid,
-		    RealD _mass,RealD _M5,
-		    RealD b, RealD c,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructor: forwards the gauge field, the four grids, _mass, _M5 and p to CayleyFermion5D, then sets the tanh coefficients from b and c
+  MobiusFermion(GaugeField &_Umu,
+		GridCartesian         &FiveDimGrid,
+		GridRedBlackCartesian &FiveDimRedBlackGrid,
+		GridCartesian         &FourDimGrid,
+		GridRedBlackCartesian &FourDimRedBlackGrid,
+		RealD _mass,RealD _M5,
+		RealD b, RealD c,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = 1.0;
+  {
+    RealD eps = 1.0; // placeholder argument: higham() ignores it (see the call below)

-	std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
-	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
-	assert(zdata->n==this->Ls);
+    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
+    Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
+    assert(zdata->n==this->Ls); // the returned data must describe exactly Ls terms
 	
-	// Call base setter
-	this->SetCoefficientsTanh(zdata,b,c);
+    // Call base setter with the approximation data and the caller's b, c
+    this->SetCoefficientsTanh(zdata,b,c);

-	Approx::zolotarev_free(zdata);
+    Approx::zolotarev_free(zdata); // release the data obtained from higham(); it is not used after this point
 
-      }
-
-    };
-
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,58 +24,55 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class MobiusZolotarevFermion : public CayleyFermion5D<Impl>  // CayleyFermion5D whose coefficients are set from a Zolotarev approximation over [lo,hi] and the parameters b, c
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class MobiusZolotarevFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void) {};
-      // Constructors
-       MobiusZolotarevFermion(GaugeField &_Umu,
-			      GridCartesian         &FiveDimGrid,
-			      GridRedBlackCartesian &FiveDimRedBlackGrid,
-			      GridCartesian         &FourDimGrid,
-			      GridRedBlackCartesian &FourDimRedBlackGrid,
-			      RealD _mass,RealD _M5,
-			      RealD b, RealD c,
-			      RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructor: forwards the gauge field, the four grids, _mass, _M5 and p to CayleyFermion5D, then sets the Zolotarev coefficients from lo, hi, b and c
+  MobiusZolotarevFermion(GaugeField &_Umu,
+			 GridCartesian         &FiveDimGrid,
+			 GridRedBlackCartesian &FiveDimRedBlackGrid,
+			 GridCartesian         &FourDimGrid,
+			 GridRedBlackCartesian &FourDimRedBlackGrid,
+			 RealD _mass,RealD _M5,
+			 RealD b, RealD c,
+			 RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = lo/hi;
+  {
+    RealD eps = lo/hi; // ratio of the two range bounds, handed to zolotarev() below

-	Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0);
-	assert(zdata->n==this->Ls);
+    Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0); // NOTE(review): meaning of the trailing 0 is not visible here — confirm against Approx::zolotarev
+    assert(zdata->n==this->Ls); // the returned data must describe exactly Ls terms

-	std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
+    std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
 	
-	// Call base setter
-	this->SetCoefficientsZolotarev(hi,zdata,b,c);
+    // Call base setter with the upper bound, the approximation data and the caller's b, c
+    this->SetCoefficientsZolotarev(hi,zdata,b,c);
 
-	Approx::zolotarev_free(zdata);
-      }
-
-    };
-
+    Approx::zolotarev_free(zdata); // release the data obtained from zolotarev(); it is not used after this point
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,46 +24,44 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
 #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
-
-    template<class Impl>
-    class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+template<class Impl>
+class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>  // MobiusFermion constructed with b = c = scale/2
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

     void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
       this->MomentumSpacePropagatorHw(out,in,_m,twist);
-     };
+  }; // MomentumSpacePropagator above only forwards its arguments to MomentumSpacePropagatorHw

-     // Constructors
-    OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
-				   GridCartesian         &FiveDimGrid,
-				   GridRedBlackCartesian &FiveDimRedBlackGrid,
-				   GridCartesian         &FourDimGrid,
-				   GridRedBlackCartesian &FourDimRedBlackGrid,
-				   RealD _mass,RealD _M5,
-				   RealD scale,const ImplParams &p= ImplParams()) :
+  // Constructor: delegates everything to MobiusFermion, passing 0.5*scale for both b and c
+  OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
+				 GridCartesian         &FiveDimGrid,
+				 GridRedBlackCartesian &FiveDimRedBlackGrid,
+				 GridCartesian         &FourDimGrid,
+				 GridRedBlackCartesian &FourDimRedBlackGrid,
+				 RealD _mass,RealD _M5,
+				 RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      MobiusFermion<Impl>(_Umu,
-			  FiveDimGrid,
-			  FiveDimRedBlackGrid,
-			  FourDimGrid,
-			  FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale,p)
-	{
-	}
-    };
+    // b+c=scale, b-c = 0 <=> b = c = scale/2
+    MobiusFermion<Impl>(_Umu,
+			FiveDimGrid,
+			FiveDimRedBlackGrid,
+			FourDimGrid,
+			FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale,p)
+  { // empty body: all setup happens in the MobiusFermion constructor
  }
-}
+};
+
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,45 +24,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
 #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>  // MobiusZolotarevFermion constructed with b = c = 1/2
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
+  // Constructor: delegates everything to MobiusZolotarevFermion, fixing b = c = 0.5 and forwarding lo, hi

-      // Constructors
+  OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
+				      GridCartesian         &FiveDimGrid,
+				      GridRedBlackCartesian &FiveDimRedBlackGrid,
+				      GridCartesian         &FourDimGrid,
+				      GridRedBlackCartesian &FourDimRedBlackGrid,
+				      RealD _mass,RealD _M5,
+				      RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
+    // b+c=1.0, b-c = 0 <=> b = c = 1/2
+    MobiusZolotarevFermion<Impl>(_Umu,
+				 FiveDimGrid,
+				 FiveDimRedBlackGrid,
+				 FourDimGrid,
+				 FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi,p)

-    OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
-					GridCartesian         &FiveDimGrid,
-					GridRedBlackCartesian &FiveDimRedBlackGrid,
-					GridCartesian         &FourDimGrid,
-					GridRedBlackCartesian &FourDimRedBlackGrid,
-					RealD _mass,RealD _M5,
-					RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
-      // b+c=1.0, b-c = 0 <=> b =c = 1/2
-      MobiusZolotarevFermion<Impl>(_Umu,
-				   FiveDimGrid,
-				   FiveDimRedBlackGrid,
-				   FourDimGrid,
-				   FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi,p)
+  {} // empty body: all setup happens in the MobiusZolotarevFermion constructor

-      {}
+};

-    };
-
-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,48 +24,47 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>  // ContinuedFractionFermion5D whose coefficients are set from the Higham (tanh) approximation and scale
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
-				     GridCartesian         &FiveDimGrid,
-				     GridRedBlackCartesian &FiveDimRedBlackGrid,
-				     GridCartesian         &FourDimGrid,
-				     GridRedBlackCartesian &FourDimRedBlackGrid,
-				     RealD _mass,RealD _M5,
-				     RealD scale,const ImplParams &p= ImplParams()) :
+  virtual void   Instantiatable(void){};
+  // Constructor: forwards the gauge field, the four grids, _mass, _M5 and p to ContinuedFractionFermion5D, then sets the tanh coefficients using scale
+  OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
+				   GridCartesian         &FiveDimGrid,
+				   GridRedBlackCartesian &FiveDimRedBlackGrid,
+				   GridCartesian         &FourDimGrid,
+				   GridRedBlackCartesian &FourDimRedBlackGrid,
+				   RealD _mass,RealD _M5,
+				   RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D<Impl>(_Umu,
-				       FiveDimGrid,
-				       FiveDimRedBlackGrid,
-				       FourDimGrid,
-				       FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=this->Ls-1;// Even rational order
-	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  this->SetCoefficientsTanh(zdata,scale);
-	  Approx::zolotarev_free(zdata);
-	}
-    };
+    // NOTE(review): the inherited remark "b+c=scale, b-c = 0 <=> b =c = scale/2" is misleading here: no b or c is passed to this base constructor; scale is only used in SetCoefficientsTanh below
+    ContinuedFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required
+    int nrational=this->Ls-1;// Even rational order
+    Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
+    this->SetCoefficientsTanh(zdata,scale); // base setter: takes the approximation data and scale
+    Approx::zolotarev_free(zdata); // release the data obtained from higham(); it is not used after this point
  }
-}
+};
+
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,51 +24,49 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>  // ContinuedFractionFermion5D whose coefficients are set from a Zolotarev approximation over [lo,hi]
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
-					  GridCartesian         &FiveDimGrid,
-					  GridRedBlackCartesian &FiveDimRedBlackGrid,
-					  GridCartesian         &FourDimGrid,
-					  GridRedBlackCartesian &FourDimRedBlackGrid,
-					  RealD _mass,RealD _M5,
-					  RealD lo,RealD hi,const ImplParams &p= ImplParams()):
+  virtual void   Instantiatable(void){};
+  // Constructor: forwards the gauge field, the four grids, _mass, _M5 and p to ContinuedFractionFermion5D, then sets the Zolotarev coefficients from lo and hi
+  OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
+					GridCartesian         &FiveDimGrid,
+					GridRedBlackCartesian &FiveDimRedBlackGrid,
+					GridCartesian         &FourDimGrid,
+					GridRedBlackCartesian &FourDimRedBlackGrid,
+					RealD _mass,RealD _M5,
+					RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D<Impl>(_Umu,
-				       FiveDimGrid,
-				       FiveDimRedBlackGrid,
-				       FourDimGrid,
-				       FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
+    // NOTE(review): the inherited remark "b+c=scale, b-c = 0 <=> b =c = scale/2" is stale here: this constructor takes lo/hi rather than scale, and passes no b or c to the base
+    ContinuedFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=this->Ls;// Odd rational order
-	  RealD eps = lo/hi;
+    int nrational=this->Ls;// Odd rational order
+    RealD eps = lo/hi; // ratio of the two range bounds, handed to zolotarev() below

-	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  this->SetCoefficientsZolotarev(hi,zdata);
-	  Approx::zolotarev_free(zdata);
+    Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0); // NOTE(review): meaning of the trailing 0 is not visible here — confirm against Approx::zolotarev
+    this->SetCoefficientsZolotarev(hi,zdata); // base setter: takes the upper bound and the approximation data
+    Approx::zolotarev_free(zdata); // release the data obtained from zolotarev(); it is not used after this point

-	}
-    };
  }
-}
+};
+
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,48 +24,46 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
-					    GridCartesian         &FiveDimGrid,
-					    GridRedBlackCartesian &FiveDimRedBlackGrid,
-					    GridCartesian         &FourDimGrid,
-					    GridRedBlackCartesian &FourDimRedBlackGrid,
-					    RealD _mass,RealD _M5,
-					    RealD scale,const ImplParams &p= ImplParams()) :
+  virtual void   Instantiatable(void){};
+  // Constructors
+  OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
+					  GridCartesian         &FiveDimGrid,
+					  GridRedBlackCartesian &FiveDimRedBlackGrid,
+					  GridCartesian         &FourDimGrid,
+					  GridRedBlackCartesian &FourDimRedBlackGrid,
+					  RealD _mass,RealD _M5,
+					  RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D<Impl>(_Umu,
-				     FiveDimGrid,
-				     FiveDimRedBlackGrid,
-				     FourDimGrid,
-				     FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=this->Ls-1;// Even rational order
-	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  this->SetCoefficientsTanh(zdata,scale);
-	  Approx::zolotarev_free(zdata);
-	}
-    };
+    // b+c=scale, b-c = 0 <=> b =c = scale/2
+    PartialFractionFermion5D<Impl>(_Umu,
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required
+    int nrational=this->Ls-1;// Even rational order
+    Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
+    this->SetCoefficientsTanh(zdata,scale);
+    Approx::zolotarev_free(zdata);
  }
-}
+};
+
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,51 +24,50 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      virtual void   Instantiatable(void){};
-      // Constructors
-    OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
-						 GridCartesian         &FiveDimGrid,
-						 GridRedBlackCartesian &FiveDimRedBlackGrid,
-						 GridCartesian         &FourDimGrid,
-						 GridRedBlackCartesian &FourDimRedBlackGrid,
-						 RealD _mass,RealD _M5,
-						 RealD lo,RealD hi,const ImplParams &p= ImplParams()):
+  virtual void   Instantiatable(void){};
+  // Constructors
+  OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
+					       GridCartesian         &FiveDimGrid,
+					       GridRedBlackCartesian &FiveDimRedBlackGrid,
+					       GridCartesian         &FourDimGrid,
+					       GridRedBlackCartesian &FourDimRedBlackGrid,
+					       RealD _mass,RealD _M5,
+					       RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
-      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D<Impl>(_Umu,
-				     FiveDimGrid,
-				     FiveDimRedBlackGrid,
-				     FourDimGrid,
-				     FourDimRedBlackGrid,_mass,_M5,p)
-	{
-	  assert((this->Ls&0x1)==1); // Odd Ls required
+    // b+c=scale, b-c = 0 <=> b =c = scale/2
+    PartialFractionFermion5D<Impl>(_Umu,
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,p)
+  {
+    assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=this->Ls;// Odd rational order
-	  RealD eps = lo/hi;
+    int nrational=this->Ls;// Odd rational order
+    RealD eps = lo/hi;

-	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  this->SetCoefficientsZolotarev(hi,zdata);
-	  Approx::zolotarev_free(zdata);
+    Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
+    this->SetCoefficientsZolotarev(hi,zdata);
+    Approx::zolotarev_free(zdata);

-	}
-    };
  }
-}
+};
+
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
@ -1,459 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
-
-namespace Grid {
-  namespace QCD {
-
-
-    template<class Impl>
-    void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-      // this does both dag and undag but is trivial; make a common helper routing
-
-      int sign = 1;
-      int Ls = this->Ls;
-
-      this->DhopDir(psi,chi,dir,disp);
-
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-	ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-      }
-      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-
-    }
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      int Ls = this->Ls;
-      int sign = dag ? (-1) : 1;
-
-      if ( psi.checkerboard == Odd ) {
-	this->DhopEO(psi,chi,DaggerNo);
-      } else {
-	this->DhopOE(psi,chi,DaggerNo);
-      }
-
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-	ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-      }
-      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-    }
-
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      // again dag and undag are trivially related
-      int sign = dag ? (-1) : 1;
-      int Ls = this->Ls;
-      
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	
-	// Do each 2x2 block aligned at s and multiplies Dw site diagonal by G5 so Hw
-	ag5xpby_ssp(chi,-dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s  ,s+1); 
-	ag5xpby_ssp(chi, dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s+1,s);
-	axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-      }
-      
-      {
-	RealD R=(1+mass)/(1-mass);
-	//R g5 psi[Ls-1] + p[0] H
-	ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
-	
-	for(int b=0;b<nblock;b++){
-	  int s = 2*b+1;
-	  RealD pp = p[nblock-1-b];
-	  axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-	}
-      }
-    }
-
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      int sign = dag ? (-1) : 1;
-      int Ls = this->Ls;
-
-      FermionField tmp(psi._grid);
-      
-      ///////////////////////////////////////////////////////////////////////////////////////
-      //Linv
-      ///////////////////////////////////////////////////////////////////////////////////////
-      int nblock=(Ls-1)/2;
-
-      axpy(chi,0.0,psi,psi); // Identity piece
-      
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-	RealD coeff2=sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-	axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s);
-	axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
-      }
-      
-      ///////////////////////////////////////////////////////////////////////////////////////
-      //Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
-      // Compute Seeinv (coeff of gamma5)
-      ///////////////////////////////////////////////////////////////////////////////////////
-      RealD R=(1+mass)/(1-mass);
-      RealD Seeinv = R + p[nblock]*dw_diag/amax;
-      for(int b=0;b<nblock;b++){
-	Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
-      }    
-      Seeinv = 1.0/Seeinv;
-      
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	RealD coeff1=dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-	RealD coeff2=amax*sqrt(qq) / ( dw_diag*dw_diag + amax*amax* qq);
-	ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
-	ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
-      }
-      ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
-      
-      ///////////////////////////////////////////////////////////////////////////////////////
-      // Uinv
-      ///////////////////////////////////////////////////////////////////////////////////////
-      for(int b=0;b<nblock;b++){
-	int s = 2*b;
-	RealD pp = p[nblock-1-b];
-	RealD qq = q[nblock-1-b];
-	RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-	RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-	axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1);
-	axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
-      }
-      axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
-    }
-
-    template<class Impl>
-    void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
-    {
-      FermionField D(psi._grid);
-  
-      int Ls = this->Ls;
-      int sign = dag ? (-1) : 1;
-
-      // For partial frac Hw case (b5=c5=1) chroma quirkily computes
-      //
-      // Conventions for partfrac appear to be a mess.
-      // Tony's Nara lectures have
-      //
-      // BlockDiag(  H/p_i  1             | 1       )    
-      //          (  1      p_i H / q_i^2 | 0       )  
-      //           ---------------------------------
-      //           ( -1      0                | R  +p0 H  )
-      //
-      //Chroma     ( -2H    2sqrt(q_i)    |   0         )
-      //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
-      //           ---------------------------------
-      //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
-      //
-      // Edwards/Joo/Kennedy/Wenger
-      //
-      // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
-      // incorporate the approx scale factor. This is obtained by propagating the
-      // scale on "H" out to the off diagonal elements as follows:
-      //
-      // BlockDiag(  H/p_i  1             | 1       ) 
-      //          (  1      p_i H / q_i^2 | 0       )  
-      //           ---------------------------------
-      //          ( -1      0                | R  + p_0 H  )
-      //
-      // becomes:
-      // BlockDiag(  H/ sp_i  1               | 1             ) 
-      //          (  1      sp_i H / s^2q_i^2 | 0             )  
-      //           ---------------------------------
-      //           ( -1      0                | R + p_0/s H   )
-      //
-      //
-      // This is implemented in Chroma by
-      //           p0' = p0/approxMax
-      //           p_i' = p_i*approxMax
-      //           q_i' = q_i*approxMax*approxMax
-      //
-      // After the equivalence transform is applied the matrix becomes
-      // 
-      //Chroma     ( -2H    sqrt(q'_i)    |   0         )
-      //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
-      //           ---------------------------------
-      //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
-      //
-      //     =     ( -2H    sqrt(q_i)amax    |   0              )
-      //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
-      //           ---------------------------------
-      //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
-      //
-
-      this->DW(psi,D,DaggerNo); 
-
-      int nblock=(Ls-1)/2;
-      for(int b=0;b<nblock;b++){
-	
-	int s = 2*b;
-	double pp = p[nblock-1-b];
-	double qq = q[nblock-1-b];
-	
-	// Do each 2x2 block aligned at s and
-	ag5xpby_ssp(chi,-1.0*scale,D,amax*sqrt(qq)*scale,psi, s  ,s+1); // Multiplies Dw by G5 so Hw
-	ag5xpby_ssp(chi, 1.0*scale,D,amax*sqrt(qq)*scale,psi, s+1,s);
-	
-	// Pick up last column
-	axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-      }
-	
-      {
-	double R=(1+this->mass)/(1-this->mass);
-	//R g5 psi[Ls] + p[0] H
-	ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
-	for(int b=0;b<nblock;b++){
-	  int s = 2*b+1;
-	  double pp = p[nblock-1-b];
-	  axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-	}
-      }
-
-    }
-
-    template<class Impl>
-    RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
-    {
-      M_internal(in,out,DaggerNo);
-      return norm2(out);
-    }
-    template<class Impl>
-    RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
-    {
-      M_internal(in,out,DaggerYes);
-      return norm2(out);
-    }
-
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
-    {
-      Meooe_internal(in,out,DaggerNo);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
-    {
-      Meooe_internal(in,out,DaggerYes);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
-    {
-      Mooee_internal(in,out,DaggerNo);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
-    {
-      Mooee_internal(in,out,DaggerYes);
-    }
-
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
-    {
-      MooeeInv_internal(in,out,DaggerNo);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
-    {
-      MooeeInv_internal(in,out,DaggerYes);
-    }
-
-
-  // force terms; five routines; default to Dhop on diagonal
-    template<class Impl>
-   void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int nblock=(Ls-1)/2;
-    for(int b=0;b<nblock;b++){
-      int s = 2*b;
-      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-    }
-    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-    this->DhopDeriv(mat,D,V,DaggerNo); 
-  };
-    template<class Impl>
-   void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int nblock=(Ls-1)/2;
-    for(int b=0;b<nblock;b++){
-      int s = 2*b;
-      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-    }
-    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-    this->DhopDerivOE(mat,D,V,DaggerNo); 
-  };
-    template<class Impl>
-   void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    int Ls = this->Ls;
-
-    FermionField D(V._grid);
-
-    int nblock=(Ls-1)/2;
-    for(int b=0;b<nblock;b++){
-      int s = 2*b;
-      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-    }
-    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-    this->DhopDerivEO(mat,D,V,DaggerNo); 
-  };
-
-    template<class Impl>
-    void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
-      SetCoefficientsZolotarev(1.0/scale,zdata);
-    }
-    template<class Impl>
-    void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
-
-      // check on degree matching
-      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-      int Ls = this->Ls;
-
-      assert(Ls == (2*zdata->da -1) );
-
-      // Part frac
-      //      RealD R;
-      R=(1+mass)/(1-mass);
-      dw_diag = (4.0-this->M5);
-
-      //      std::vector<RealD> p; 
-      //      std::vector<RealD> q;
-      p.resize(zdata->da);
-      q.resize(zdata->dd);
-	
-      for(int n=0;n<zdata->da;n++){
-	p[n] = zdata -> alpha[n];
-      }
-      for(int n=0;n<zdata->dd;n++){
-	q[n] = -zdata -> ap[n];
-      }
-      
-      scale= part_frac_chroma_convention ? 2.0 : 1.0; // Chroma conventions annoy me
-
-      amax=zolo_hi;
-    }
-
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      int Ls = this->Ls;
-      conformable(solution5d._grid,this->FermionGrid());
-      conformable(exported4d._grid,this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      int Ls = this->Ls;
-      conformable(imported5d._grid,this->FermionGrid());
-      conformable(input4d._grid   ,this->GaugeGrid());
-      FermionField tmp(this->FermionGrid());
-      tmp=zero;
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
-      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
-      this->Dminus(tmp,imported5d);
-    }
-
-      // Constructors
-    template<class Impl>
-    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
-							     GridCartesian         &FiveDimGrid,
-							     GridRedBlackCartesian &FiveDimRedBlackGrid,
-							     GridCartesian         &FourDimGrid,
-							     GridRedBlackCartesian &FourDimRedBlackGrid,
-							     RealD _mass,RealD M5,
-							     const ImplParams &p) :
-      WilsonFermion5D<Impl>(_Umu,
-			    FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid,M5,p),
-      mass(_mass)
-
-    {
-      int Ls = this->Ls;
-
-      assert((Ls&0x1)==1); // Odd Ls required
-      int nrational=Ls-1;
-
-
-      Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);
-
-      // NB: chroma uses a cast to "float" for the zolotarev range(!?).
-      // this creates a real difference in the operator which I do not like but we can replicate here
-      // to demonstrate compatibility
-      //      RealD eps = (zolo_lo / zolo_hi);
-      //      zdata = bfm_zolotarev(eps,nrational,0);
-      
-      SetCoefficientsTanh(zdata,1.0);
-
-      Approx::zolotarev_free(zdata);
-
-    }
- 
-    FermOpTemplateInstantiate(PartialFractionFermion5D);
-
- }
-}
-
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,51 +24,49 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_PARTIAL_FRACTION_H
 #define  GRID_QCD_PARTIAL_FRACTION_H

 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class PartialFractionFermion5D : public WilsonFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class PartialFractionFermion5D : public WilsonFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
+  const int part_frac_chroma_convention=1;

-      const int part_frac_chroma_convention=1;
+  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
+  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
+  void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
+  void   M_internal(const FermionField &in, FermionField &out,int dag);

-      void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
-      void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
-      void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
-      void   M_internal(const FermionField &in, FermionField &out,int dag);
+  // override multiply
+  virtual RealD  M    (const FermionField &in, FermionField &out);
+  virtual RealD  Mdag (const FermionField &in, FermionField &out);

-      // override multiply
-      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  // half checkerboard operations
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);

-      // half checkerboard operaions
-      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      // force terms; five routines; default to Dhop on diagonal
-      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void   Instantiatable(void) =0; // ensure no make-eee

-      virtual void   Instantiatable(void) =0; // ensure no make-eee
-
-      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  // Efficient support for multigrid coarsening
+  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      ///////////////////////////////////////////////////////////////
      // Physical surface field utilities
@ -76,32 +74,30 @@ namespace Grid {
      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

-      // Constructors
-      PartialFractionFermion5D(GaugeField &_Umu,
-			       GridCartesian         &FiveDimGrid,
-			       GridRedBlackCartesian &FiveDimRedBlackGrid,
-			       GridCartesian         &FourDimGrid,
-			       GridRedBlackCartesian &FourDimRedBlackGrid,
-			       RealD _mass,RealD M5,const ImplParams &p= ImplParams());
+  // Constructors
+  PartialFractionFermion5D(GaugeField &_Umu,
+			   GridCartesian         &FiveDimGrid,
+			   GridRedBlackCartesian &FiveDimRedBlackGrid,
+			   GridCartesian         &FourDimGrid,
+			   GridRedBlackCartesian &FourDimRedBlackGrid,
+			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

-    protected:
+protected:

-      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
-      virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
+  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
+  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

-      // Part frac
-      RealD mass;
-      RealD dw_diag;
-      RealD R;
-      RealD amax;
-      RealD scale;
-      std::vector<double> p; 
-      std::vector<double> q;
+  // Part frac
+  RealD mass;
+  RealD dw_diag;
+  RealD R;
+  RealD amax;
+  RealD scale;
+  Vector<double> p; 
+  Vector<double> q;

-    };
+};

-
-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/PauliVillarsInverters.h
+++ b/Grid/qcd/action/fermion/PauliVillarsInverters.h
@ -27,8 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template<class Field>
 class PauliVillarsSolverUnprec
@ -90,6 +89,4 @@ class PauliVillarsSolverFourierAccel
  };
 };

-
-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Reconstruct5Dprop.h
+++ b/Grid/qcd/action/fermion/Reconstruct5Dprop.h
@ -27,8 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 private:
@ -131,5 +130,5 @@ template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
  }
 };

-}
-}
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/ScaledShamirFermion.h
+++ b/Grid/qcd/action/fermion/ScaledShamirFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,46 +24,43 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
 #define  GRID_QCD_SCALED_SHAMIR_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ScaledShamirFermion : public MobiusFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class ScaledShamirFermion : public MobiusFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      // Constructors
-    ScaledShamirFermion(GaugeField &_Umu,
-			GridCartesian         &FiveDimGrid,
-			GridRedBlackCartesian &FiveDimRedBlackGrid,
-			GridCartesian         &FourDimGrid,
-			GridRedBlackCartesian &FourDimRedBlackGrid,
-			RealD _mass,RealD _M5,
-//			RealD scale):
-			RealD scale,const ImplParams &p= ImplParams()) :
+  // Constructors
+  ScaledShamirFermion(GaugeField &_Umu,
+		      GridCartesian         &FiveDimGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridCartesian         &FourDimGrid,
+		      GridRedBlackCartesian &FourDimRedBlackGrid,
+		      RealD _mass,RealD _M5,
+		      //			RealD scale):
+		      RealD scale,const ImplParams &p= ImplParams()) :
      
-      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
-      MobiusFermion<Impl>(_Umu,
-		    FiveDimGrid,
-		    FiveDimRedBlackGrid,
-		    FourDimGrid,
-	FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
-//		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
-      {
-      }
-
-    };
-
+    // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
+    MobiusFermion<Impl>(_Umu,
+			FiveDimGrid,
+			FiveDimRedBlackGrid,
+			FourDimGrid,
+			FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
+    //		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
+  {
  }
-}
+
+};
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
+++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,40 +24,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  _SCHUR_DIAG_TWO_KAPPA_H
-#define  _SCHUR_DIAG_TWO_KAPPA_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  // This is specific to (Z)mobius fermions
-  template<class Matrix, class Field>
-    class KappaSimilarityTransform {
-  public:
-    INHERIT_IMPL_TYPES(Matrix);
-    std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
+// This is specific to (Z)mobius fermions
+template<class Matrix, class Field>
+class KappaSimilarityTransform {
+public:
+  INHERIT_IMPL_TYPES(Matrix);
+  Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;

-    KappaSimilarityTransform (Matrix &zmob) {
-      for (int i=0;i<(int)zmob.bs.size();i++) {
-	Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) );
-	kappa.push_back( k );
-	kappaDag.push_back( conj(k) );
-	kappaInv.push_back( 1.0 / k );
-	kappaInvDag.push_back( 1.0 / conj(k) );
-      }
+  KappaSimilarityTransform (Matrix &zmob) {
+    for (int i=0;i<(int)zmob.bs.size();i++) {
+      Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) );
+      kappa.push_back( k );
+      kappaDag.push_back( conj(k) );
+      kappaInv.push_back( 1.0 / k );
+      kappaInvDag.push_back( 1.0 / conj(k) );
    }
+  }

  template<typename vobj>
-    void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) {
-    GridBase *grid=out._grid;
-    out.checkerboard = in.checkerboard;
+  void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) { // out[ss] = s[ss % Ls] * in[ss] for every outer site
+    GridBase *grid=out.Grid();
+    out.Checkerboard() = in.Checkerboard(); // result carries the checkerboard of the input
    assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
    int Ls = grid->_rdimensions[0];
-    parallel_for(int ss=0;ss<grid->oSites();ss++){
-      vobj tmp = s[ss % Ls]*in._odata[ss];
-      vstream(out._odata[ss],tmp);
-    }
+    thread_for(ss, grid->oSites(),
+    {
+      vobj tmp = s[ss % Ls]*in[ss]; // coefficient chosen by ss % Ls; presumably the index along dimension 0 - confirm site ordering
+      vstream(out[ss],tmp);
+    });
  }

  RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) {
@ -70,33 +70,33 @@ namespace Grid {
  virtual RealD MInv    (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInv[0]);}
  virtual RealD MInvDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInvDag[0]);}

-  };
+};

-  template<class Matrix,class Field>
-    class SchurDiagTwoKappaOperator :  public SchurOperatorBase<Field> {
-  public:
-    KappaSimilarityTransform<Matrix, Field> _S;
-    SchurDiagTwoOperator<Matrix, Field> _Mat;
+template<class Matrix,class Field>
+class SchurDiagTwoKappaOperator :  public SchurOperatorBase<Field> { // wraps SchurDiagTwoOperator between applications of the kappa transform _S
+public:
+  KappaSimilarityTransform<Matrix, Field> _S; // kappa scaling built from the same Matrix
+  SchurDiagTwoOperator<Matrix, Field> _Mat; // the operator being wrapped

-    SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {};
+  SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {};

-    virtual  RealD Mpc      (const Field &in, Field &out) {
-      Field tmp(in._grid);
+  virtual  RealD Mpc      (const Field &in, Field &out) { // applies _S.MInv, then _Mat.Mpc, then _S.M; returns the value from _S.M
+    Field tmp(in.Grid());

-      _S.MInv(in,out);
-      _Mat.Mpc(out,tmp);
-      return _S.M(tmp,out);
+    _S.MInv(in,out); // out doubles as scratch for the intermediate result
+    _Mat.Mpc(out,tmp);
+    return _S.M(tmp,out);

-    }
-    virtual  RealD MpcDag   (const Field &in, Field &out){
-      Field tmp(in._grid);
+  }
+  virtual  RealD MpcDag   (const Field &in, Field &out){ // applies _S.MDag, then _Mat.MpcDag, then _S.MInvDag
+    Field tmp(in.Grid());

-      _S.MDag(in,out);
-      _Mat.MpcDag(out,tmp);
-      return _S.MInvDag(tmp,out);
-    }
-  };
+    _S.MDag(in,out); // out doubles as scratch for the intermediate result
+    _Mat.MpcDag(out,tmp);
+    return _S.MInvDag(tmp,out);
+  }
+};
+
+NAMESPACE_END(Grid);

-}

-#endif
--- a/Grid/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/ShamirZolotarevFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,46 +24,43 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);

-    template<class Impl>
-    class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-
-      // Constructors
+  // Constructors


-    ShamirZolotarevFermion(GaugeField &_Umu,
-			   GridCartesian         &FiveDimGrid,
-			   GridRedBlackCartesian &FiveDimRedBlackGrid,
-			   GridCartesian         &FourDimGrid,
-			   GridRedBlackCartesian &FourDimRedBlackGrid,
-			   RealD _mass,RealD _M5,
-			   RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
+  ShamirZolotarevFermion(GaugeField &_Umu,
+			 GridCartesian         &FiveDimGrid,
+			 GridRedBlackCartesian &FiveDimRedBlackGrid,
+			 GridCartesian         &FourDimGrid,
+			 GridRedBlackCartesian &FourDimRedBlackGrid,
+			 RealD _mass,RealD _M5,
+			 RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      
-      // b+c = 1; b-c = 1 => b=1, c=0
-      MobiusZolotarevFermion<Impl>(_Umu,
-				   FiveDimGrid,
-				   FiveDimRedBlackGrid,
-				   FourDimGrid,
-				   FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi,p)
+    // b+c = 1; b-c = 1 => b=1, c=0
+    MobiusZolotarevFermion<Impl>(_Umu,
+				 FiveDimGrid,
+				 FiveDimRedBlackGrid,
+				 FourDimGrid,
+				 FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi,p)
      
-      {}
+  {}

-    };
+};

-  }
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@ -0,0 +1,175 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/StaggeredImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class S, class Representation = FundamentalRepresentation >
+class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > 
+{
+
+public:
+
+  typedef RealD  _Coeff_t ;
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=false;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+      
+  //Necessary?
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+    
+  typedef _Coeff_t Coeff_t;
+
+  INHERIT_GIMPL_TYPES(Gimpl);
+      
+  template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+    
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+  typedef iImplPropagator<Simd>        SitePropagator;
+    
+  typedef Lattice<SiteSpinor>            FermionField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+  typedef Lattice<SitePropagator> PropagatorField;
+    
+  typedef StaggeredImplParams ImplParams;
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+
+  ImplParams Params;
+    
+  StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
+      
+  static accelerator_inline void multLink(SiteSpinor &phi, // output site, written via mult()
+		       const SiteDoubledGaugeField &U,
+		       const SiteSpinor &chi,
+		       int mu) // selects component U(mu) of the doubled gauge site
+  {
+    mult(&phi(), &U(mu), &chi()); // presumably phi = U(mu) * chi; multLinkAdd is the mac() counterpart
+  }
+  static accelerator_inline void multLinkAdd(SiteSpinor &phi, // in/out site, updated via mac()
+			  const SiteDoubledGaugeField &U,
+			  const SiteSpinor &chi,
+			  int mu) // selects component U(mu) of the doubled gauge site
+  {
+    mac(&phi(), &U(mu), &chi()); // presumably phi += U(mu) * chi (multiply-accumulate)
+  }
+      
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) // copy one link element from memory into reg
+  {
+    reg = memory; // plain assignment; no broadcast is done in this implementation
+  }
+      
+    inline void InsertGaugeField(DoubledGaugeField &U_ds, // destination doubled gauge field
+				 const GaugeLinkField &U,int mu) // U is written into Lorentz component mu of U_ds
+    {
+      PokeIndex<LorentzIndex>(U_ds, U, mu);
+    }
+  // Build the doubled gauge fields for the staggered stencil. For each direction mu the
+  // phase-multiplied forward link is stored in slot mu and the adjoint of the back-shifted
+  // link in slot mu+Nd: Uds takes 1-hop fat links, UUUds takes 3-hop products of thin links.
+  inline void DoubleStore(GridBase *GaugeGrid,
+			  DoubledGaugeField &UUUds, // for Naik term
+			  DoubledGaugeField &Uds,
+			  const GaugeField &Uthin,
+			  const GaugeField &Ufat) {
+    conformable(Uds.Grid(), GaugeGrid);
+    conformable(Uthin.Grid(), GaugeGrid);
+    conformable(Ufat.Grid(), GaugeGrid);
+    GaugeLinkField U(GaugeGrid);
+    GaugeLinkField UU(GaugeGrid);
+    GaugeLinkField UUU(GaugeGrid);
+    GaugeLinkField Udag(GaugeGrid);
+    GaugeLinkField UUUdag(GaugeGrid);
+
+    // The coordinate sums entering the phases do not depend on mu, so build them once.
+    Lattice<iScalar<vInteger> > x(GaugeGrid); LatticeCoordinate(x,0);
+    Lattice<iScalar<vInteger> > y(GaugeGrid); LatticeCoordinate(y,1);
+    Lattice<iScalar<vInteger> > z(GaugeGrid); LatticeCoordinate(z,2);
+    Lattice<iScalar<vInteger> > lin_z(GaugeGrid); lin_z=x+y;
+    Lattice<iScalar<vInteger> > lin_t(GaugeGrid); lin_t=x+y+z;
+
+    for (int mu = 0; mu < Nd; mu++) {
+
+      // Staggered phase: +1 everywhere for mu==0, otherwise negated on sites where
+      // the relevant coordinate sum (x, x+y or x+y+z) is odd.
+      ComplexField phases(GaugeGrid); phases=1.0;
+      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
+      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
+      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
+
+      // 1 hop based on fat links
+      U      = PeekIndex<LorentzIndex>(Ufat, mu);
+      Udag   = adj( Cshift(U, mu, -1));
+
+      U    = U    *phases;
+      Udag = Udag *phases;
+
+      InsertGaugeField(Uds,U,mu);
+      InsertGaugeField(Uds,Udag,mu+Nd);
+
+      // 3 hop based on thin links. Crazy huh ?
+      U  = PeekIndex<LorentzIndex>(Uthin, mu);
+      UU = Gimpl::CovShiftForward(U,mu,U);
+      UUU= Gimpl::CovShiftForward(U,mu,UU);
+
+      UUUdag = adj( Cshift(UUU, mu, -3));
+
+      UUU    = UUU    *phases;
+      UUUdag = UUUdag *phases;
+
+      InsertGaugeField(UUUds,UUU,mu);
+      InsertGaugeField(UUUds,UUUdag,mu+Nd);
+
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ // sets Lorentz component mu of mat
+    GaugeLinkField link(mat.Grid());
+    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); // spin-index trace of the outer product of the two fermion fields
+    PokeIndex<LorentzIndex>(mat,link,mu);
+  }   
+      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ // not implemented for this Impl
+    assert (0); 
+    // Must never hit: the arguments are ignored and the call aborts when assertions are enabled
+  }
+};
+typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
+typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
+typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD;  // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@ -26,11 +26,9 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_QCD_STAGGERED_KERNELS_H
-#define GRID_QCD_STAGGERED_KERNELS_H
+#pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid)

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Helper routines that implement Staggered stencil for a single site.
@ -51,72 +49,69 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   
 public:
    
-   void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
-		      int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
+   void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
+		      int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
-			DoubledGaugeField &U, DoubledGaugeField &UUU, 
+			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionField &in, FermionField &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
-			   DoubledGaugeField &U, DoubledGaugeField &UUU, 
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
-			   const FermionField &in, FermionField &out,int dag);
+			   const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
-			   DoubledGaugeField &U, DoubledGaugeField &UUU,
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
-			   const FermionField &in, FermionField &out,int dag);
+			   const FermionFieldView &in, FermionFieldView &out,int dag);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
-		     DoubledGaugeField &U,DoubledGaugeField &UUU, 
+		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
-		     const FermionField &in, FermionField &out,int dag);
+		     const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
-			DoubledGaugeField &U,DoubledGaugeField &UUU, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionField &in, FermionField &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);
   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
-			DoubledGaugeField &U,DoubledGaugeField &UUU, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionField &in, FermionField &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-		    DoubledGaugeField &U,DoubledGaugeField &UUU, 
+		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
-		    const FermionField &in, FermionField &out,int dag);
+		    const FermionFieldView &in, FermionFieldView &out,int dag);
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   // Generic interface; fan out to right routine
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
-		 DoubledGaugeField &U, DoubledGaugeField &UUU, 
+		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
-		 const FermionField &in, FermionField &out, int interior=1,int exterior=1);
+		 const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);

   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
-		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
+		    DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU,
-		    const FermionField &in, FermionField &out, int interior=1,int exterior=1);
+		    const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);

   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
-		 DoubledGaugeField &U, DoubledGaugeField &UUU, 
+		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
-		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
+		 const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
  
 public:

  StaggeredKernels(const ImplParams &p = ImplParams());

 };
-    
-}}
-
-#endif
+NAMESPACE_END(Grid);    
--- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
@ -0,0 +1,203 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class S, class Representation = FundamentalRepresentation >
+class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+
+public:
+
+  static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  typedef RealD   Coeff_t ;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+      
+  //Necessary?
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+
+
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+
+  // Make the doubled gauge field a *scalar*
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef iImplPropagator<Simd>        SitePropagator;
+
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+  typedef Lattice<SitePropagator> PropagatorField;
+    
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+
+    
+  typedef Lattice<SiteSpinor>            FermionField;
+    
+  typedef StaggeredImplParams ImplParams;
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+    
+  ImplParams Params;
+    
+  StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) // load one link element from memory into reg
+  {
+    vsplat(reg, memory); // vsplat rather than assignment: the doubled gauge field is stored as scalars in this Impl
+  }
+
+  static accelerator_inline void multLink(SiteHalfSpinor &phi, // output site, written via mult()
+					  const SiteDoubledGaugeField &U, // scalar doubled gauge site (built on Simd::scalar_type)
+					  const SiteHalfSpinor &chi, 
+					  int mu) // selects component U(mu)
+  {
+    SiteGaugeLink UU; // link temporary filled element by element from U(mu)
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+	vsplat(UU()()(i, j), U(mu)()(i, j)); // splat each scalar matrix element of U(mu) into UU
+      }
+    }
+    mult(&phi(), &UU(), &chi()); // presumably phi = UU * chi
+  }
+  static accelerator_inline void multLinkAdd(SiteHalfSpinor &phi, // in/out site, updated via mac()
+					     const SiteDoubledGaugeField &U, // scalar doubled gauge site (built on Simd::scalar_type)
+					     const SiteHalfSpinor &chi, 
+					     int mu) // selects component U(mu)
+  {
+    SiteGaugeLink UU; // link temporary filled element by element from U(mu)
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+	vsplat(UU()()(i, j), U(mu)()(i, j)); // splat each scalar matrix element of U(mu) into UU
+      }
+    }
+    mac(&phi(), &UU(), &chi()); // presumably phi += UU * chi (multiply-accumulate)
+  }
+      
+  // Copy link field U into component mu of the scalar doubled gauge field U_ds, one local site at a time.
+  inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
+  {
+    GridBase *GaugeGrid = U_ds.Grid();
+    thread_for(lidx, GaugeGrid->lSites(),{
+	SiteScalarGaugeLink   ScalarU;
+	SiteDoubledGaugeField ScalarUds;
+	Coordinate lcoor;
+	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+
+	// Read-modify-write: fetch the whole doubled site, replace component mu, store it back.
+	peekLocalSite(ScalarUds, U_ds, lcoor);
+	peekLocalSite(ScalarU, U, lcoor);
+	ScalarUds(mu) = ScalarU();
+	pokeLocalSite(ScalarUds, U_ds, lcoor); // without the write-back only the local copy changes
+    });
+  }
+  // Build the doubled gauge fields for the staggered stencil. For each direction mu the
+  // phase-multiplied forward link goes to slot mu and the adjoint of the back-shifted link
+  // to slot mu+Nd. GaugeGrid is not read here: all temporaries live on Uthin's grid.
+  inline void DoubleStore(GridBase *GaugeGrid,
+			  DoubledGaugeField &UUUds, // for Naik term
+			  DoubledGaugeField &Uds,
+			  const GaugeField &Uthin,
+			  const GaugeField &Ufat) 
+  {
+    GridBase * InputGrid = Uthin.Grid();
+    conformable(InputGrid,Ufat.Grid());
+
+    GaugeLinkField U(InputGrid);
+    GaugeLinkField UU(InputGrid);
+    GaugeLinkField UUU(InputGrid);
+    GaugeLinkField Udag(InputGrid);
+    GaugeLinkField UUUdag(InputGrid);
+
+    // The coordinate sums entering the phases do not depend on mu, so build them once.
+    Lattice<iScalar<vInteger> > x(InputGrid); LatticeCoordinate(x,0);
+    Lattice<iScalar<vInteger> > y(InputGrid); LatticeCoordinate(y,1);
+    Lattice<iScalar<vInteger> > z(InputGrid); LatticeCoordinate(z,2);
+    Lattice<iScalar<vInteger> > lin_z(InputGrid); lin_z=x+y;
+    Lattice<iScalar<vInteger> > lin_t(InputGrid); lin_t=x+y+z;
+
+    for (int mu = 0; mu < Nd; mu++) {
+
+      // Staggered phase: +1 everywhere for mu==0, otherwise negated on sites where
+      // the relevant coordinate sum (x, x+y or x+y+z) is odd.
+      ComplexField phases(InputGrid); phases=1.0;
+      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
+      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
+      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
+
+      // 1 hop based on fat links
+      U      = PeekIndex<LorentzIndex>(Ufat, mu);
+      Udag   = adj( Cshift(U, mu, -1));
+
+      U    = U    *phases;
+      Udag = Udag *phases;
+
+      InsertGaugeField(Uds,U,mu);
+      InsertGaugeField(Uds,Udag,mu+Nd);
+
+      // 3 hop based on thin links. Crazy huh ?
+      U  = PeekIndex<LorentzIndex>(Uthin, mu);
+      UU = Gimpl::CovShiftForward(U,mu,U);
+      UUU= Gimpl::CovShiftForward(U,mu,UU);
+
+      UUUdag = adj( Cshift(UUU, mu, -3));
+
+      UUU    = UUU    *phases;
+      UUUdag = UUUdag *phases;
+
+      InsertGaugeField(UUUds,UUU,mu);
+      InsertGaugeField(UUUds,UUUdag,mu+Nd);
+
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ // not implemented for this Impl
+    assert(0); // arguments are ignored; aborts when assertions are enabled
+  }   
+      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ // not implemented for this Impl
+    assert (0); // arguments are ignored; aborts when assertions are enabled
+  }
+};
+typedef StaggeredVec5dImpl<vComplex,  FundamentalRepresentation > StaggeredVec5dImplR;   // Real.. whichever prec
+typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF;  // Float
+typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD;  // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@ -27,15 +27,11 @@
    *************************************************************************************/
 /*  END LEGAL */

-#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
-#define GRID_QCD_WILSON_CLOVER_FERMION_H
+#pragma once

 #include <Grid/Grid.h>

-namespace Grid
-{
-namespace QCD
-{
+NAMESPACE_BEGIN(Grid);

 ///////////////////////////////////////////////////////////////////
 // Wilson Clover
@ -131,22 +127,22 @@ public:
  // Derivative parts unpreconditioned pseudofermions
  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
  {
-    conformable(X._grid, Y._grid);
-    conformable(X._grid, force._grid);
-    GaugeLinkField force_mu(force._grid), lambda(force._grid);
-    GaugeField clover_force(force._grid);
-    PropagatorField Lambda(force._grid);
+    conformable(X.Grid(), Y.Grid());
+    conformable(X.Grid(), force.Grid());
+    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+    GaugeField clover_force(force.Grid());
+    PropagatorField Lambda(force.Grid());

    // Guido: Here we are hitting some performance issues:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
-    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
+    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());

    Impl::extractLinkField(U, this->Umu);

-    force = zero;
+    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);

@ -179,10 +175,10 @@ public:
    */

    int count = 0;
-    clover_force = zero;
+    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
-      force_mu = zero;
+      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
@ -212,8 +208,8 @@ public:
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
-    conformable(lambda._grid, U[0]._grid);
-    GaugeLinkField out(lambda._grid), tmp(lambda._grid);
+    conformable(lambda.Grid(), U[0].Grid());
+    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations

@ -266,102 +262,113 @@ private:
  // using the DeGrand-Rossi basis for the gamma matrices
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
-    }
+      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = -F._odata[i]()();
-      T._odata[i]()(1, 0) = F._odata[i]()();
-      T._odata[i]()(2, 3) = -F._odata[i]()();
-      T._odata[i]()(3, 2) = F._odata[i]()();
-    }
+      T_v[i]()(0, 1) = -F_v[i]()();
+      T_v[i]()(1, 0) = F_v[i]()();
+      T_v[i]()(2, 3) = -F_v[i]()();
+      T_v[i]()(3, 2) = F_v[i]()();
+    });

    return T;
  }

  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
-    {
+    CloverFieldType T(F.Grid());
+    T = Zero();

-      T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(1, 1) = timesI(F._odata[i]()());
-      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
-    }
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
+    {
+      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 1) = timesI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = timesI(F._odata[i]()());
-      T._odata[i]()(1, 0) = timesI(F._odata[i]()());
-      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
-    }
+      T_v[i]()(0, 1) = timesI(F_v[i]()());
+      T_v[i]()(1, 0) = timesI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 1) = -(F._odata[i]()());
-      T._odata[i]()(1, 0) = (F._odata[i]()());
-      T._odata[i]()(2, 3) = (F._odata[i]()());
-      T._odata[i]()(3, 2) = -(F._odata[i]()());
-    }
+      T_v[i]()(0, 1) = -(F_v[i]()());
+      T_v[i]()(1, 0) = (F_v[i]()());
+      T_v[i]()(2, 3) = (F_v[i]()());
+      T_v[i]()(3, 2) = -(F_v[i]()());
+    });

    return T;
  }

  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
+    CloverFieldType T(F.Grid());
+
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_for(i, CloverTerm.Grid()->oSites(),
    {
-      T._odata[i]()(0, 0) = timesI(F._odata[i]()());
-      T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
-    }
+      T_v[i]()(0, 0) = timesI(F_v[i]()());
+      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });

    return T;
  }
 };
-}
-}
+NAMESPACE_END(Grid);
+
+

-#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -25,13 +25,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_WILSON_COMPRESSOR_H
 #define  GRID_QCD_WILSON_COMPRESSOR_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 /////////////////////////////////////////////////////////////////////////////////////////////
 // optimised versions supporting half precision too
@ -43,9 +42,9 @@ class WilsonCompressorTemplate;

 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
-  typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
+				typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
 {
- public:
+public:
  
  int mu,dag;  

@ -62,15 +61,16 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);

-  inline int CommDatumSize(void) {
+  accelerator_inline int CommDatumSize(void) {
    return sizeof(SiteHalfCommSpinor);
  }

  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
-    SiteHalfSpinor tmp;
+  template<class _SiteHalfSpinor, class _SiteSpinor> // Site types are now deduced from the call instead of being fixed to the class typedefs.
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { // Projects `in` via projector::Proj for this object's (mu,dag) and streams the result into buf[o]; __restrict__ on buf was dropped.
+    _SiteHalfSpinor tmp; // Scratch for the projected value before it is streamed out.
    projector::Proj(tmp,in,mu,dag);
    vstream(buf[o],tmp);
  }
@ -78,10 +78,10 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Exchange(SiteHalfSpinor * __restrict__ mp,
-                       const SiteHalfSpinor * __restrict__ vp0,
-                       const SiteHalfSpinor * __restrict__ vp1,
-		       Integer type,Integer o){
+  accelerator_inline void Exchange(SiteHalfSpinor *mp,
+				   const SiteHalfSpinor * __restrict__ vp0,
+				   const SiteHalfSpinor * __restrict__ vp1,
+				   Integer type,Integer o){
    SiteHalfSpinor tmp1;
    SiteHalfSpinor tmp2;
    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
@ -92,19 +92,21 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  inline void Decompress(SiteHalfSpinor * __restrict__ out,
-			 SiteHalfSpinor * __restrict__ in, Integer o) {    
+  accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
+				     SiteHalfSpinor * __restrict__ in, Integer o) {    
    assert(0);
  }

  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
-  inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
-			       SiteHalfSpinor * __restrict__ out1,
-			       const SiteSpinor * __restrict__ in,
-			       Integer j,Integer k, Integer m,Integer type){
-    SiteHalfSpinor temp1, temp2,temp3,temp4;
+  accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
+					   SiteHalfSpinor * __restrict__ out1,
+					   const SiteSpinor * __restrict__ in,
+					   Integer j,Integer k, Integer m,Integer type)
+  {
+    SiteHalfSpinor temp1, temp2;
+    SiteHalfSpinor temp3, temp4;
    projector::Proj(temp1,in[k],mu,dag);
    projector::Proj(temp2,in[m],mu,dag);
    exchange(temp3,temp4,temp1,temp2,type);
@ -115,15 +117,15 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  inline bool DecompressionStep(void) { return false; }
+  accelerator_inline bool DecompressionStep(void) { return false; }

 };

 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
-  typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
+				typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
 {
- public:
+public:
  
  int mu,dag;  

@ -140,15 +142,16 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);

-  inline int CommDatumSize(void) {
+  accelerator_inline int CommDatumSize(void) {
    return sizeof(SiteHalfCommSpinor);
  }

  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
-    SiteHalfSpinor hsp;
+  template<class _SiteHalfSpinor, class _SiteSpinor>
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
+    _SiteHalfSpinor hsp;
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
    projector::Proj(hsp,in,mu,dag);
    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
@ -157,7 +160,7 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
-  inline void Exchange(SiteHalfSpinor *mp,
+  accelerator_inline void Exchange(SiteHalfSpinor *mp,
                       SiteHalfSpinor *vp0,
                       SiteHalfSpinor *vp1,
 		       Integer type,Integer o){
@ -172,8 +175,7 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  inline void Decompress(SiteHalfSpinor *out,
-			 SiteHalfSpinor *in, Integer o){
+  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){ // Element o of `in`, reinterpreted as SiteHalfCommSpinor, is converted into out[o] by precisionChange over Nw vector words (vComplexLow source, vComplexHigh destination).
    SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
    precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
  }
@ -181,7 +183,7 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
-  inline void CompressExchange(SiteHalfSpinor *out0,
+  accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
 			       SiteHalfSpinor *out1,
 			       const SiteSpinor *in,
 			       Integer j,Integer k, Integer m,Integer type){
@ -198,19 +200,19 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  inline bool DecompressionStep(void) { return true; }
+  accelerator_inline bool DecompressionStep(void) { return true; }

 };

 #define DECLARE_PROJ(Projector,Compressor,spProj)			\
  class Projector {							\
  public:								\
-    template<class hsp,class fsp>					\
-    static void Proj(hsp &result,const fsp &in,int mu,int dag){			\
-      spProj(result,in);						\
-    }									\
+  template<class hsp,class fsp>						\
+  static accelerator void Proj(hsp &result,const fsp &in,int mu,int dag){ \
+    spProj(result,in);							\
+  }									\
  };									\
-template<typename HCS,typename HS,typename S> using Compressor = WilsonCompressorTemplate<HCS,HS,S,Projector>;
+  template<typename HCS,typename HS,typename S> using Compressor = WilsonCompressorTemplate<HCS,HS,S,Projector>;

 DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp);
 DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp);
@ -222,9 +224,9 @@ DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm);
 DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm);

 class WilsonProjector {
- public:
+public:
  template<class hsp,class fsp>
-  static void Proj(hsp &result,const fsp &in,int mu,int dag){
+  static accelerator void Proj(hsp &result,const fsp &in,int mu,int dag){
    int mudag=dag? mu : (mu+Nd)%(2*Nd);
    switch(mudag) {
    case Xp:	spProjXp(result,in);	break;
@ -243,9 +245,14 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom

 // Fast comms buffer manipulation which should inline right through (avoid direction
 // dependent logic that prevents inlining
-template<class vobj,class cobj>
-class WilsonStencil : public CartesianStencil<vobj,cobj> {
+template<class vobj,class cobj,class Parameters>
+class WilsonStencil : public CartesianStencil<vobj,cobj,Parameters> {
 public:
+
+  typedef CartesianStencil<vobj,cobj,Parameters> Base;
+  typedef typename Base::View_type View_type;
+  typedef typename Base::StencilVector StencilVector;
+
  double timer0;
  double timer1;
  double timer2;
@ -274,16 +281,40 @@ public:
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
  }

+  std::vector<int> surface_list;
+
  WilsonStencil(GridBase *grid,
 		int npoints,
 		int checkerboard,
 		const std::vector<int> &directions,
-		const std::vector<int> &distances)  
-    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
+		const std::vector<int> &distances,Parameters p)  // New trailing argument p is taken by value and handed straight to the base class.
+    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) // All arguments are forwarded unchanged to the CartesianStencil base.
  { 
    ZeroCountersi();
+    surface_list.resize(0); // Start with an empty surface list; it is only filled by BuildSurfaceList().
+    this->same_node.resize(npoints); // One same_node slot per stencil point; values are assigned in BuildSurfaceList().
  };

+  void BuildSurfaceList(int Ls,int vol4){ // Fills same_node[] for every stencil point, then appends to surface_list each site in [0,vol4) that has at least one point which is neither node-local nor on the same node.
+
+    // find same node for SHM
+    // Here we know the distance is 1 for WilsonStencil
+    for(int point=0;point<this->_npoints;point++){
+      this->same_node[point] = this->SameNode(point); // Cached once per point so the site loop below does not call SameNode() again.
+    }
+    
+    for(int site = 0 ;site< vol4;site++){ // NOTE(review): surface_list is appended to without being cleared here; the constructor empties it once, so a second call would accumulate duplicates -- confirm this is only called once.
+      int local = 1; // Stays 1 only if no point of this site fails the test below.
+      for(int point=0;point<this->_npoints;point++){
+	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ // Index is scaled by Ls; presumably the first fifth-dimension entry of this 4d site (TODO confirm layout).
+	  local = 0;
+	}
+      }
+      if(local == 0) { 
+	surface_list.push_back(site); // The unscaled site index is stored, not site*Ls.
+      }
+    }
+  }

  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
@ -292,8 +323,6 @@ public:
    this->HaloExchangeOptGather(source,compress);
    double t1=usecond();
    // Asynchronous MPI calls multidirectional, Isend etc...
-    //    this->CommunicateBegin(reqs);
-    //    this->CommunicateComplete(reqs);
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
    this->Communicate();
    double t2=usecond(); timer1 += t2-t1;
@ -327,7 +356,7 @@ public:
    this->_grid->StencilBarrier();
    this->mpi3synctime_g+=usecond();

-    assert(source._grid==this->_grid);
+    assert(source.Grid()==this->_grid);
    this->halogtime-=usecond();
    
    this->u_comm_offset=0;
@ -365,9 +394,10 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    this->halogtime+=usecond();
+    accelerator_barrier();
  }

- };
+};

-}} // namespace close
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@ -27,16 +27,13 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_QCD_WILSON_FERMION_H
-#define GRID_QCD_WILSON_FERMION_H
+			   /*  END LEGAL */
+#pragma once

-namespace Grid {
-
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 class WilsonFermionStatic {
- public:
+public:
  static int HandOptDslash;  // these are a temporary hack
  static int MortonOrder;
  static const std::vector<int> directions;
@ -60,8 +57,9 @@ class WilsonFermionStatic {
 };

 template <class Impl>
-class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
- public:
+class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic 
+{
+public:
  INHERIT_IMPL_TYPES(Impl);
  typedef WilsonKernels<Impl> Kernels;

@ -138,10 +136,10 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-                GridRedBlackCartesian &Hgrid, RealD _mass, 
+                GridRedBlackCartesian &Hgrid, RealD _mass,
                const ImplParams &p = ImplParams(), 
                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
-  
+
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);

@ -150,7 +148,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  ///////////////////////////////////////////////////////////////

  //    protected:
- public:
+public:
  virtual RealD Mass(void) { return mass; }
  virtual int   isTrivialEE(void) { return 1; };
  RealD mass;
@ -171,7 +169,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
-
+  
  WilsonAnisotropyCoefficients anisotropyCoeff;
  
  ///////////////////////////////////////////////////////////////
@ -182,11 +180,11 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
                                PropagatorField &q_out,
                                Current curr_type,
                                unsigned int mu);
-  void SeqConservedCurrent(PropagatorField &q_in, 
-                             PropagatorField &q_out,
-                             Current curr_type, 
-                             unsigned int mu,
-                             unsigned int tmin, 
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           Current curr_type,
+                           unsigned int mu, 
+                           unsigned int tmin,
                             unsigned int tmax,
 			     ComplexField &lattice_cmplx);
 };
@ -194,7 +192,6 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;

+NAMESPACE_END(Grid);
+

-}
-}
-#endif
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -1,5 +1,5 @@

-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -26,216 +26,215 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef  GRID_QCD_WILSON_FERMION_5D_H
 #define  GRID_QCD_WILSON_FERMION_5D_H

 #include <Grid/perfmon/Stat.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  ////////////////////////////////////////////////////////////////////////////////
-  // This is the 4d red black case appropriate to support
-  //
-  // parity = (x+y+z+t)|2;
-  // generalised five dim fermions like mobius, zolotarev etc..	
-  //
-  // i.e. even even contains fifth dim hopping term.
-  //
-  // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
-  ////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// This is the 4d red black case appropriate to support
+//
+// parity = (x+y+z+t)|2;
+// generalised five dim fermions like mobius, zolotarev etc..	
+//
+// i.e. even even contains fifth dim hopping term.
+//
+// [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
+////////////////////////////////////////////////////////////////////////////////

-    ////////////////////////////////////////////////////////////////////////////////
-    // This is the 4d red black case appropriate to support
-    //
-    // parity = (x+y+z+t)|2;
-    // generalised five dim fermions like mobius, zolotarev etc..	
-    //
-    // i.e. even even contains fifth dim hopping term.
-    //
-    // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
-    ////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// This is the 4d red black case appropriate to support
+//
+// parity = (x+y+z+t)|2;
+// generalised five dim fermions like mobius, zolotarev etc..	
+//
+// i.e. even even contains fifth dim hopping term.
+//
+// [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
+////////////////////////////////////////////////////////////////////////////////

-    class WilsonFermion5DStatic { 
-    public:
-      // S-direction is INNERMOST and takes no part in the parity.
-      static const std::vector<int> directions;
-      static const std::vector<int> displacements;
-      const int npoint = 8;
-    };
+class WilsonFermion5DStatic { // Non-template holder for the direction/displacement tables and npoint; WilsonFermion5D<Impl> inherits from it.
+public:
+  // S-direction is INNERMOST and takes no part in the parity.
+  static const std::vector<int> directions; // Declared only; the definition is not visible in this header.
+  static const std::vector<int> displacements; // Declared only; the definition is not visible in this header.
+  static constexpr int npoint = 8; // Now a class-level compile-time constant; the removed version was a per-object const member. Presumably the stencil point count (TODO confirm).
+};

-    template<class Impl>
-    class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-     typedef WilsonKernels<Impl> Kernels;
-     PmuStat stat;
+template<class Impl>
+class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef WilsonKernels<Impl> Kernels;
+  PmuStat stat;

-     FermionField _tmp;
-     FermionField &tmp(void) { return _tmp; }
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }

-     void Report(void);
-     void ZeroCounters(void);
-     double DhopCalls;
-     double DhopCommTime;
-     double DhopComputeTime;
-     double DhopComputeTime2;
-     double DhopFaceTime;
-     double DhopTotalTime;
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+  double DhopTotalTime;

-     double DerivCalls;
-     double DerivCommTime;
-     double DerivComputeTime;
-     double DerivDhopComputeTime;
+  double DerivCalls;
+  double DerivCommTime;
+  double DerivComputeTime;
+  double DerivDhopComputeTime;

-      ///////////////////////////////////////////////////////////////
-      // Implement the abstract base
-      ///////////////////////////////////////////////////////////////
-      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
-      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
-      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
-      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+  GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+  GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+  GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}

-      // full checkerboard operations; leave unimplemented as abstract for now
-      virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-      virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+  // full checkerboard operations; leave unimplemented as abstract for now
+  virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+  virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};

-      // half checkerboard operations; leave unimplemented as abstract for now
-      virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+  // half checkerboard operations; leave unimplemented as abstract for now
+  virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};

-      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

-      // These can be overridden by fancy 5d chiral action
-      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  // These can be overridden by fancy 5d chiral action
+  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;

-      // Implement hopping term non-hermitian hopping term; half cb or both
-      // Implement s-diagonal DW
-      void DW    (const FermionField &in, FermionField &out,int dag);
-      void Dhop  (const FermionField &in, FermionField &out,int dag);
-      void DhopOE(const FermionField &in, FermionField &out,int dag);
-      void DhopEO(const FermionField &in, FermionField &out,int dag);
+  // Implement non-hermitian hopping term; half cb or both
+  // Implement s-diagonal DW
+  void DW    (const FermionField &in, FermionField &out,int dag);
+  void Dhop  (const FermionField &in, FermionField &out,int dag);
+  void DhopOE(const FermionField &in, FermionField &out,int dag);
+  void DhopEO(const FermionField &in, FermionField &out,int dag);

-      // add a DhopComm
-      // -- suboptimal interface will presently trigger multiple comms.
-    void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  // add a DhopComm
+  // -- suboptimal interface will presently trigger multiple comms.
+  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
    
-    ///////////////////////////////////////////////////////////////
-    // New methods added 
-    ///////////////////////////////////////////////////////////////
-    void DerivInternal(StencilImpl & st,
-		       DoubledGaugeField & U,
-		       GaugeField &mat,
-		       const FermionField &A,
-		       const FermionField &B,
-		       int dag);
+  ///////////////////////////////////////////////////////////////
+  // New methods added 
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl & st,
+		     DoubledGaugeField & U,
+		     GaugeField &mat,
+		     const FermionField &A,
+		     const FermionField &B,
+		     int dag);
    
-    void DhopInternal(StencilImpl & st,
-		      LebesgueOrder &lo,
-		      DoubledGaugeField &U,
-		      const FermionField &in, 
-		      FermionField &out,
-		      int dag);
+  void DhopInternal(StencilImpl & st,
+		    LebesgueOrder &lo,
+		    DoubledGaugeField &U,
+		    const FermionField &in, 
+		    FermionField &out,
+		    int dag);

-    void DhopInternalOverlappedComms(StencilImpl & st,
-				     LebesgueOrder &lo,
-				     DoubledGaugeField &U,
-				     const FermionField &in, 
-				     FermionField &out,
-				     int dag);
+  void DhopInternalOverlappedComms(StencilImpl & st,
+				   LebesgueOrder &lo,
+				   DoubledGaugeField &U,
+				   const FermionField &in, 
+				   FermionField &out,
+				   int dag);

-    void DhopInternalSerialComms(StencilImpl & st,
-				 LebesgueOrder &lo,
-				 DoubledGaugeField &U,
-				 const FermionField &in, 
-				 FermionField &out,
-				 int dag);
+  void DhopInternalSerialComms(StencilImpl & st,
+			       LebesgueOrder &lo,
+			       DoubledGaugeField &U,
+			       const FermionField &in, 
+			       FermionField &out,
+			       int dag);
    
-    // Constructors
-    WilsonFermion5D(GaugeField &_Umu,
-		    GridCartesian         &FiveDimGrid,
-		    GridRedBlackCartesian &FiveDimRedBlackGrid,
-		    GridCartesian         &FourDimGrid,
-		    GridRedBlackCartesian &FourDimRedBlackGrid,
-		    double _M5,const ImplParams &p= ImplParams());
+  // Constructors
+  WilsonFermion5D(GaugeField &_Umu,
+		  GridCartesian         &FiveDimGrid,
+		  GridRedBlackCartesian &FiveDimRedBlackGrid,
+		  GridCartesian         &FourDimGrid,
+		  GridRedBlackCartesian &FourDimRedBlackGrid,
+		  double _M5,const ImplParams &p= ImplParams());
    
-    // Constructors
-    /*
-      WilsonFermion5D(int simd, 
-      GaugeField &_Umu,
-      GridCartesian         &FiveDimGrid,
-      GridRedBlackCartesian &FiveDimRedBlackGrid,
-      GridCartesian         &FourDimGrid,
-      double _M5,const ImplParams &p= ImplParams());
-    */
+  // Constructors
+  /*
+    WilsonFermion5D(int simd, 
+    GaugeField &_Umu,
+    GridCartesian         &FiveDimGrid,
+    GridRedBlackCartesian &FiveDimRedBlackGrid,
+    GridCartesian         &FourDimGrid,
+    double _M5,const ImplParams &p= ImplParams());
+  */
    
-    // DoubleStore
-    void ImportGauge(const GaugeField &_Umu);
+  // DoubleStore
+  void ImportGauge(const GaugeField &_Umu);
    
-    ///////////////////////////////////////////////////////////////
-    // Data members require to support the functionality
-    ///////////////////////////////////////////////////////////////
-  public:
+  ///////////////////////////////////////////////////////////////
+  // Data members required to support the functionality
+  ///////////////////////////////////////////////////////////////
+public:
    
-    // Add these to the support from Wilson
-    GridBase *_FourDimGrid;
-    GridBase *_FourDimRedBlackGrid;
-    GridBase *_FiveDimGrid;
-    GridBase *_FiveDimRedBlackGrid;
+  // Add these to the support from Wilson
+  GridBase *_FourDimGrid;
+  GridBase *_FourDimRedBlackGrid;
+  GridBase *_FiveDimGrid;
+  GridBase *_FiveDimRedBlackGrid;
    
-    double                        M5;
-    int Ls;
+  double                        M5;
+  int Ls;
    
-    //Defines the stencils for even and odd
-    StencilImpl Stencil; 
-    StencilImpl StencilEven; 
-    StencilImpl StencilOdd; 
+  //Defines the stencils for even and odd
+  StencilImpl Stencil; 
+  StencilImpl StencilEven; 
+  StencilImpl StencilOdd; 
    
-    // Copy of the gauge field , with even and odd subsets
-    DoubledGaugeField Umu;
-    DoubledGaugeField UmuEven;
-    DoubledGaugeField UmuOdd;
+  // Copy of the gauge field , with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;
    
-    LebesgueOrder Lebesgue;
-    LebesgueOrder LebesgueEvenOdd;
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
    
-    // Comms buffer
-    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  // Comms buffer
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
-    ///////////////////////////////////////////////////////////////
-    // Conserved current utilities
-    ///////////////////////////////////////////////////////////////
-    void ContractConservedCurrent(PropagatorField &q_in_1,
-                                  PropagatorField &q_in_2,
-                                  PropagatorField &q_out,
-                                  Current curr_type, 
-                                  unsigned int mu);
-    void SeqConservedCurrent(PropagatorField &q_in, 
-                             PropagatorField &q_out,
-                             Current curr_type, 
-                             unsigned int mu,
-                             unsigned int tmin, 
-                             unsigned int tmax,
-			     ComplexField &lattice_cmplx);
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+				PropagatorField &q_in_2,
+				PropagatorField &q_out,
+				Current curr_type, 
+				unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+			   PropagatorField &q_out,
+			   Current curr_type,
+			   unsigned int mu,
+			   unsigned int tmin,
+			   unsigned int tmax,
+			   ComplexField &lattice_cmplx);

-    void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
-    void ContractJ5q(FermionField &q_in,ComplexField &J5q);
+  void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
+  void ContractJ5q(FermionField &q_in,ComplexField &J5q);

-  };
+};

-}}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@ -0,0 +1,226 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/WilsonImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+  
+/////////////////////////////////////////////////////////////////////////////
+// Single flavour four spinors with colour index
+/////////////////////////////////////////////////////////////////////////////
+// Implementation policy class for Wilson-type fermions.
+//   S              : SIMD complex type handed to GaugeImplTypes.
+//   Representation : supplies Dimension and isFundamental.
+//   Options        : supplies _Coeff_t, Nhcs and PrecisionMapper (used to pick
+//                    the SIMD type of the communicated half spinor).
+template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
+class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+public:
+
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=false;
+  static const int Nhcs = Options::Nhcs;
+
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  // True when the representation dimension equals Nc.
+  // NOTE(review): overlaps with the static isFundamental flag above; the
+  // original author already questioned whether this is necessary.
+  constexpr bool is_fundamental() const { return Dimension == Nc; }
+
+  typedef typename Options::_Coeff_t Coeff_t;
+  // SIMD type used for the communicated half spinor (SiteHalfCommSpinor).
+  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
+
+  // Per-site tensor layouts: scalar Lorentz index, then spin, then colour.
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+
+  typedef Lattice<SiteSpinor>            FermionField;
+  typedef Lattice<SitePropagator>        PropagatorField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+
+  ImplParams Params;
+
+  // Stores the parameters; requires exactly one boundary phase per direction.
+  WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
+    assert(Params.boundary_phases.size() == Nd);
+  }
+
+  // phi = U(mu) * chi for a single site.
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
+                                          const SiteDoubledGaugeField &U,
+                                          const _Spinor &chi,
+                                          int mu)
+  {
+    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &UU, &chi());
+  }
+
+  // Overload taking the stencil entry and view; both are ignored here and the
+  // call forwards to the four-argument multLink.
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
+                                          const SiteDoubledGaugeField &U,
+                                          const _Spinor &chi,
+                                          int mu,
+                                          StencilEntry *SE,
+                                          StencilView &St)
+  {
+    multLink(phi,U,chi,mu);
+  }
+
+  // Plain copy of a link element from memory into reg.
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
+  {
+    reg = memory;
+  }
+
+  // Fills the doubled gauge field Uds from Umu.
+  // For each direction mu:
+  //   - the link is multiplied by exp(i*theta), theta = twist_n_2pi_L[mu]*2*pi/L,
+  //     when theta is non-zero;
+  //   - slot mu receives the link, with the boundary phase applied on sites
+  //     whose mu-coordinate is L-1;
+  //   - slot mu+Nd receives adj(Cshift(U,mu,-1)), with the conjugate boundary
+  //     phase applied on sites whose mu-coordinate is 0.
+  inline void DoubleStore(GridBase *GaugeGrid,
+                          DoubledGaugeField &Uds,
+                          const GaugeField &Umu)
+  {
+    typedef typename Simd::scalar_type scalar_type;
+
+    conformable(Uds.Grid(), GaugeGrid);
+    conformable(Umu.Grid(), GaugeGrid);
+
+    GaugeLinkField U(GaugeGrid);
+    GaugeLinkField tmp(GaugeGrid);
+
+    Lattice<iScalar<vInteger> > coor(GaugeGrid);
+
+    ////////////////////////////////////////////////////
+    // apply any boundary phase or twists
+    ////////////////////////////////////////////////////
+    for (int mu = 0; mu < Nd; mu++) {
+
+      ////////// boundary phase /////////////
+      auto pha = Params.boundary_phases[mu];
+      scalar_type phase( real(pha),imag(pha) );
+
+      int L   = GaugeGrid->GlobalDimensions()[mu];
+      int Lmu = L - 1;
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+
+      // apply any twists
+      RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
+      if ( theta != 0.0) {
+        scalar_type twphase(::cos(theta),::sin(theta));
+        U = twphase*U;
+        // Report the twist phase that was just applied (not the boundary phase).
+        std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<twphase <<std::endl;
+      }
+
+      // Forward link: boundary phase only on the last slice in direction mu.
+      tmp = where(coor == Lmu, phase * U, U);
+      PokeIndex<LorentzIndex>(Uds, tmp, mu);
+
+      // Backward link: adjoint of the link shifted by -1, conjugate phase on slice 0.
+      U = adj(Cshift(U, mu, -1));
+      U = where(coor == 0, conjugate(phase) * U, U);
+      PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
+    }
+  }
+
+  // Writes the spin-traced outer product of Btilde and A into component mu of mat.
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+    GaugeLinkField link(mat.Grid());
+    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
+    PokeIndex<LorentzIndex>(mat,link,mu);
+  }
+
+  // mat = outerProduct(B,A).
+  inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
+    mat = outerProduct(B,A);
+  }
+
+  // mat = trace over the spin index of P.
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    mat = TraceIndex<SpinIndex>(P);
+  }
+
+  // Copies the first Nd Lorentz components of Uds into mat[0..Nd-1].
+  // mat must already hold at least Nd elements; it is indexed, not resized.
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    for (int mu = 0; mu < Nd; mu++) {
+      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
+    }
+  }
+
+  // Five-dimensional force insertion. Ls is read from the first full dimension
+  // of Btilde's grid. For every outer site sU of mat's grid, the spin-traced
+  // outer products at sites sF = s + Ls*sU are summed over s = 0..Ls-1, and the
+  // result is written into component mu of mat.
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+
+    int Ls=Btilde.Grid()->_fdimensions[0];
+    GaugeLinkField tmp(mat.Grid());
+    tmp = Zero();
+    auto tmp_v = tmp.View();
+    auto Btilde_v = Btilde.View();
+    auto Atilde_v = Atilde.View();
+    thread_for(sss,tmp.Grid()->oSites(),{
+      int sU=sss;
+      for(int s=0;s<Ls;s++){
+        int sF = s+Ls*sU;
+        tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
+      }
+    });
+    PokeIndex<LorentzIndex>(mat,tmp,mu);
+
+  }
+};
+
+
+// Concrete WilsonImpl instantiations. Within each group the three aliases use
+// vComplex (whichever precision vComplex is), vComplexF (float) and
+// vComplexD (double), in that order.
+
+// Fundamental representation, CoeffReal options
+using WilsonImplR = WilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal>;
+using WilsonImplF = WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal>;
+using WilsonImplD = WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal>;
+
+// Fundamental representation, CoeffRealHalfComms options
+using WilsonImplRL = WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms>;
+using WilsonImplFH = WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms>;
+using WilsonImplDF = WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms>;
+
+// Fundamental representation, CoeffComplex options
+using ZWilsonImplR = WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex>;
+using ZWilsonImplF = WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex>;
+using ZWilsonImplD = WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex>;
+
+// Fundamental representation, CoeffComplexHalfComms options
+using ZWilsonImplRL = WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms>;
+using ZWilsonImplFH = WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms>;
+using ZWilsonImplDF = WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms>;
+
+// Adjoint representation, CoeffReal options
+using WilsonAdjImplR = WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal>;
+using WilsonAdjImplF = WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal>;
+using WilsonAdjImplD = WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal>;
+
+// Two-index symmetric representation, CoeffReal options
+using WilsonTwoIndexSymmetricImplR = WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation, CoeffReal>;
+using WilsonTwoIndexSymmetricImplF = WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal>;
+using WilsonTwoIndexSymmetricImplD = WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal>;
+
+// Two-index antisymmetric representation, CoeffReal options
+using WilsonTwoIndexAntiSymmetricImplR = WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal>;
+using WilsonTwoIndexAntiSymmetricImplF = WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal>;
+using WilsonTwoIndexAntiSymmetricImplD = WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal>;
+
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/WilsonKernels.cc
+++ b/Grid/qcd/action/fermion/WilsonKernels.cc
@ -1,455 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-namespace Grid {
-namespace QCD {
-
-int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
-int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
-
-template <class Impl>
-WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
-
-////////////////////////////////////////////
-// Generic implementation; move to different file?
-////////////////////////////////////////////
-  
-#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in._odata[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in._odata[SE->_offset]);			\
-    }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
-  Recon(result, Uchi);
-  
-#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in._odata[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in._odata[SE->_offset]);			\
-    }								\
-  } else if ( st.same_node[Dir] ) {				\
-      chi_p = &buf[SE->_offset];				\
-  }								\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    chi_p = &buf[SE->_offset];					\
-    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-    nmu++;							\
-  }
-
-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
-  if (gamma == Dir) {						\
-    if (SE->_is_local && SE->_permute) {			\
-      spProj(tmp, in._odata[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else if (SE->_is_local) {					\
-      spProj(chi, in._odata[SE->_offset]);			\
-    } else {							\
-      chi = buf[SE->_offset];					\
-    }								\
-    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-  ////////////////////////////////////////////////////////////////////
-  // All legs kernels ; comms then compute
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionField &in, FermionField &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					  SiteHalfSpinor *buf, int sF,
-					  int sU, const FermionField &in, FermionField &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-  ////////////////////////////////////////////////////////////////////
-  // Interior kernels
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-						SiteHalfSpinor *buf, int sF,
-						int sU, const FermionField &in, FermionField &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  result=zero;
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionField &in, FermionField &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  result=zero;
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
-  vstream(out._odata[sF], result);
-};
-////////////////////////////////////////////////////////////////////
-// Exterior kernels
-////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-						SiteHalfSpinor *buf, int sF,
-						int sU, const FermionField &in, FermionField &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=zero;
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out._odata[sF] = out._odata[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionField &in, FermionField &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=zero;
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out._odata[sF] = out._odata[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
-					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
-
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteSpinor result;
-  SiteHalfSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-
-  SE = st.GetEntry(ptype, dir, sF);
-  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
-  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
-  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
-  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
-  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
-  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
-  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
-  vstream(out._odata[sF], result);
-}
-
-/*******************************************************************************
- * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current 
- * sequentially. Common to both 4D and 5D.
- ******************************************************************************/
-// N.B. Functions below assume a -1/2 factor within U.
-#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
-#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteFwd
- * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_1 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeField &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-    SitePropagator result, tmp;
-    Gamma g5(Gamma::Algebra::Gamma5);
-    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
-    result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
-    if (switch_sign)
-    {
-        q_out -= result;
-    }
-    else
-    {
-        q_out += result;
-    }
-}
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteBwd
- * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_2 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeField &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-    SitePropagator result, tmp;
-    Gamma g5(Gamma::Algebra::Gamma5);
-    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
-    result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
-    if (switch_sign)
-    {
-        q_out += result;
-    }
-    else
-    {
-        q_out -= result;
-    }
-}
-
-// G-parity requires more specialised implementation.
-#define NO_CURR_SITE(Impl) \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeField &U,         \
-                                                  unsigned int sU,              \
-                                                  unsigned int mu,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-} \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeField &U,         \
-                                                  unsigned int mu,              \
-                                                  unsigned int sU,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-}
-
-NO_CURR_SITE(GparityWilsonImplF);
-NO_CURR_SITE(GparityWilsonImplD);
-NO_CURR_SITE(GparityWilsonImplFH);
-NO_CURR_SITE(GparityWilsonImplDF);
-
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeField &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-    SitePropagator result;
-    Impl::multLinkProp(result, U._odata[sU], q_in, mu);
-    result = WilsonCurrentFwd(result, mu);
-
-    // Zero any unwanted timeslice entries.
-    result = predicatedWhere(t_mask, result, 0.*result);
-
-    if (switch_sign)
-    {
-        q_out -= result;
-    }
-    else
-    {
-        q_out += result;
-    }
-}
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in -ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeField &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-    SitePropagator result;
-    Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
-    result = WilsonCurrentBwd(result, mu);
-
-    // Zero any unwanted timeslice entries.
-    result = predicatedWhere(t_mask, result, 0.*result);
-
-    if (switch_sign)
-    {
-        q_out += result;
-    }
-    else
-    {
-        q_out -= result;
-    }
-}
-
-FermOpTemplateInstantiate(WilsonKernels);
-AdjointFermOpTemplateInstantiate(WilsonKernels);
-TwoIndexFermOpTemplateInstantiate(WilsonKernels);
-
-}}
-
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@ -27,19 +27,17 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_QCD_DHOP_H
-#define GRID_QCD_DHOP_H
+			   /*  END LEGAL */
+#pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Helper routines that implement Wilson stencil for a single site.
-  // Common to both the WilsonFermion and WilsonFermion5D
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Helper routines that implement Wilson stencil for a single site.
+// Common to both the WilsonFermion and WilsonFermion5D
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 class WilsonKernelsStatic { 
- public:
+public:
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  enum { CommsAndCompute, CommsThenCompute };
  static int Opt;  
@ -47,235 +45,123 @@ class WilsonKernelsStatic {
 };
 
 template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
- public:
-   
+public:
+
  INHERIT_IMPL_TYPES(Impl);
  typedef FermionOperator<Impl> Base;
   
 public:

-  template <bool EnableBool = true>
-  typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
-  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
-  {
-    bgq_l1p_optimisation(1);
-    switch(Opt) {
-#if defined(AVX512) || defined (QPX)
-    case OptInlineAsm:
-      if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSite   (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (interior)     WilsonKernels<Impl>::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (exterior)     WilsonKernels<Impl>::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else assert(0);
-      break;
-#endif
-    case OptHandUnroll:
-         for (int site = 0; site < Ns; site++) {
-	   for (int s = 0; s < Ls; s++) {
-	     if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
-	     else if (interior)     WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
-	     else if (exterior)     WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
-	     sF++;
-	   }
-	   sU++;
-         }
-      break;
-    case OptGeneric:
-         for (int site = 0; site < Ns; site++) {
-	   for (int s = 0; s < Ls; s++) {
-	     if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
-	     else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
-	     else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
-	     else assert(0);
-	     sF++;
-	   }
-	   sU++;
-       } 
-      break;
-    default:
-      assert(0);
-    }
-    bgq_l1p_optimisation(0);
-  }
-     
-  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
-  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-	   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
-    // no kernel choice  
-    for (int site = 0; site < Ns; site++) {
-      for (int s = 0; s < Ls; s++) {
-	if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
-	else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
-	else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
-	else assert(0);
-	sF++;
-      }
-      sU++;
-    }
-  }
-     
-  template <bool EnableBool = true>
-  typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
-  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-	      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
-{
-    bgq_l1p_optimisation(1);
-    switch(Opt) {
-#if defined(AVX512) || defined (QPX)
-    case OptInlineAsm:
-      if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSiteDag   (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (interior)     WilsonKernels<Impl>::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else if (exterior)     WilsonKernels<Impl>::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-      else assert(0);
-      break;
-#endif
-    case OptHandUnroll:
-      for (int site = 0; site < Ns; site++) {
-	for (int s = 0; s < Ls; s++) {
-	  if(interior&&exterior) WilsonKernels<Impl>::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-	  else if (interior)     WilsonKernels<Impl>::HandDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
-	  else if (exterior)     WilsonKernels<Impl>::HandDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
-	  else assert(0);
-	  sF++;
-	}
-	sU++;
-      }
-      break;
-    case OptGeneric:
-      for (int site = 0; site < Ns; site++) {
-	for (int s = 0; s < Ls; s++) {
-	  if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-	  else if (interior)     WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
-	  else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
-	  else assert(0);
-	  sF++;
-	}
-	sU++;
-      }
-      break;
-    default:
-      assert(0);
-    }
-    bgq_l1p_optimisation(0);
-  }
+  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, // Static entry point: takes FermionField/DoubledGaugeField objects (not views) and no per-site sF/sU indices.
+			 int Ls, int Nsite, const FermionField &in, FermionField &out, // NOTE(review): the Opt parameter shadows the inherited static WilsonKernelsStatic::Opt inside this function.
+			 int interior=1,int exterior=1) ; // interior/exterior default to 1; presumably they select the Int/Ext site kernels -- confirm in the implementation.

-  template <bool EnableBool = true>
-  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
-  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
+  static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, // Same parameter list as DhopKernel; presumably dispatches to the *Dag site kernels -- confirm in the implementation.
+			    int Ls, int Nsite, const FermionField &in, FermionField &out, // NOTE(review): the Opt parameter shadows the inherited static WilsonKernelsStatic::Opt inside this function.
+			    int interior=1,int exterior=1) ; // interior/exterior default to 1.

-    for (int site = 0; site < Ns; site++) {
-      for (int s = 0; s < Ls; s++) {
-	if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-	else if (interior)     WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
-	else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
-	else assert(0);
-	sF++;
-      }
-      sU++;
-    }
-  }
+  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, // Static entry point with no Opt or interior/exterior arguments.
+			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma); // dirdisp/gamma: presumably a direction-displacement index and a gamma-matrix index, forwarded to the per-site DhopDirK -- TODO confirm.

-  void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
-      
  //////////////////////////////////////////////////////////////////////////////
  // Utilities for inserting Wilson conserved current.
  //////////////////////////////////////////////////////////////////////////////
-  void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+  static void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1, // static: callable without a WilsonKernels instance; q_in_1/q_in_2 are read-only, q_out is written through a non-const reference.
                                       const SitePropagator &q_in_2,
                                       SitePropagator &q_out,
-                                       DoubledGaugeField &U,
+                                       DoubledGaugeFieldView &U, // the gauge links arrive as a DoubledGaugeFieldView rather than the field object.
                                       unsigned int sU,
                                       unsigned int mu,
                                       bool switch_sign = false);
-  void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+
+  static void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1, // static: callable without a WilsonKernels instance; presumably the backward-direction counterpart of the Fwd form -- confirm.
                                       const SitePropagator &q_in_2,
                                       SitePropagator &q_out,
-                                       DoubledGaugeField &U,
+                                       DoubledGaugeFieldView &U, // the gauge links arrive as a DoubledGaugeFieldView rather than the field object.
                                       unsigned int sU,
                                       unsigned int mu,
                                       bool switch_sign = false);
-  void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
+
+  static void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, // static: callable without a WilsonKernels instance; q_in is read-only, q_out is written through a non-const reference.
                                  SitePropagator &q_out,
-                                  DoubledGaugeField &U,
+                                  DoubledGaugeFieldView &U, // the gauge links arrive as a DoubledGaugeFieldView rather than the field object.
                                  unsigned int sU,
                                  unsigned int mu,
-                                  vInteger t_mask,
+                                  vPredicate t_mask, // passed by value; presumably a timeslice selection mask -- TODO confirm against the implementation.
                                  bool switch_sign = false);
-  void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
+
+  static void SeqConservedCurrentSiteBwd(const SitePropagator &q_in, // static: callable without a WilsonKernels instance; presumably the backward-direction counterpart of the Fwd form -- confirm.
                                  SitePropagator &q_out,
-                                  DoubledGaugeField &U,
+                                  DoubledGaugeFieldView &U, // the gauge links arrive as a DoubledGaugeFieldView rather than the field object.
                                  unsigned int sU,
                                  unsigned int mu,
-                                  vInteger t_mask,
+                                  vPredicate t_mask, // passed by value; presumably a timeslice selection mask -- TODO confirm against the implementation.
                                  bool switch_sign = false);

 private:
-     // Specialised variants
-  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
+
+  static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, // Private per-site form: works on view types; `accelerator` is a macro defined elsewhere (presumably a device-callable qualifier -- confirm).
+				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma); // sF/sU are explicit site indices; dirdisp/gamma have the same names as the DhopDirKernel arguments.
      
-  void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
-
-  void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
+  // Specialised variants (Generic*, Asm*, Hand*): private static site kernels that operate on view types.
+  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // `accelerator` is a macro defined elsewhere (presumably a device-callable qualifier -- confirm).
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out); // sF/sU: explicit site indices, presumably fermion-field and gauge-field sites respectively -- TODO confirm.
      
-  void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			     int sF, int sU, const FermionField &in, FermionField &out);
-
-  void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
+  static accelerator void GenericDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Dag" form; presumably the daggered operator -- confirm in the implementation.
+						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Int" form; presumably the interior-only contribution -- confirm in the implementation.
+						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			     int sF, int sU, const FermionField &in, FermionField &out);
-
-
-  void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
-
-  void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
-
-  void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
-
-  void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
-
-  void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
-
-  void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
-
-
-  void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		    int sF, int sU, const FermionField &in, FermionField &out);
-
-  void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
+  static accelerator void GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "DagInt" form; presumably daggered, interior-only -- confirm in the implementation.
+						int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Ext" form; presumably the exterior-only contribution -- confirm in the implementation.
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  void HandDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
-  
-  void HandDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
-  
-  void HandDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-		       int sF, int sU, const FermionField &in, FermionField &out);
-  
-  void HandDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-			  int sF, int sU, const FermionField &in, FermionField &out);
-  
-public:
+  static accelerator void GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "DagExt" form; presumably daggered, exterior-only -- confirm in the implementation.
+						       int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

-  WilsonKernels(const ImplParams &p = ImplParams());
+  static void AsmDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // Asm* kernels: static, but declared without the `accelerator` qualifier used on the Generic*/Hand* kernels.
+			  int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out); // Takes Ls and Nsite in addition to sF/sU; presumably iterates over sites internally -- TODO confirm.
+  
+  static void AsmDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Dag" form; presumably the daggered operator -- confirm.
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
+  
+  static void AsmDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Int" form; presumably interior-only -- confirm.
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
+  
+  static void AsmDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "DagInt" form; presumably daggered, interior-only -- confirm.
+				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
+  
+  static void AsmDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Ext" form; presumably exterior-only -- confirm.
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
+  
+  static void AsmDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "DagExt" form; presumably daggered, exterior-only -- confirm.
+				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);

+// Keep Hand unrolled temporarily: the Hand* kernels share the Generic* parameter list (static accelerator, view types, sF/sU site indices).
+  static accelerator void HandDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // `accelerator` is a macro defined elsewhere (presumably a device-callable qualifier -- confirm).
+				       int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Dag" form; presumably the daggered operator -- confirm.
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Int" form; presumably interior-only -- confirm.
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "DagInt" form; presumably daggered, interior-only -- confirm.
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "Ext" form; presumably exterior-only -- confirm.
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+  
+  static accelerator void HandDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf, // "DagExt" form; presumably daggered, exterior-only -- confirm.
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+ public: // everything from here to the end of the class is public again.
+ WilsonKernels(const ImplParams &p = ImplParams()) : Base(p){}; // Forwards p to the FermionOperator<Impl> base (typedef Base); the default argument makes this the default constructor too. NOTE(review): the ';' after '{}' is a redundant empty declaration.
 };
    
-}}
+NAMESPACE_END(Grid);
+

-#endif
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@ -1,127 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-namespace Grid {
-namespace QCD {
-
-
-///////////////////////////////////////////////////////////
-// Default to no assembler implementation
-///////////////////////////////////////////////////////////
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
-#include <Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h>
-#include <Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h>
-
-#define INSTANTIATE_ASM(A)\
-template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-
-INSTANTIATE_ASM(WilsonImplF);
-INSTANTIATE_ASM(WilsonImplD);
-INSTANTIATE_ASM(ZWilsonImplF);
-INSTANTIATE_ASM(ZWilsonImplD);
-INSTANTIATE_ASM(GparityWilsonImplF);
-INSTANTIATE_ASM(GparityWilsonImplD);
-INSTANTIATE_ASM(DomainWallVec5dImplF);
-INSTANTIATE_ASM(DomainWallVec5dImplD);
-INSTANTIATE_ASM(ZDomainWallVec5dImplF);
-INSTANTIATE_ASM(ZDomainWallVec5dImplD);
-
-INSTANTIATE_ASM(WilsonImplFH);
-INSTANTIATE_ASM(WilsonImplDF);
-INSTANTIATE_ASM(ZWilsonImplFH);
-INSTANTIATE_ASM(ZWilsonImplDF);
-INSTANTIATE_ASM(GparityWilsonImplFH);
-INSTANTIATE_ASM(GparityWilsonImplDF);
-INSTANTIATE_ASM(DomainWallVec5dImplFH);
-INSTANTIATE_ASM(DomainWallVec5dImplDF);
-INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
-INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
-
-}}
-
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
@ -1,650 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#if defined(AVX512) 
-    ///////////////////////////////////////////////////////////
-    // If we are AVX512 specialise the single precision routine
-    ///////////////////////////////////////////////////////////
-#include <simd/Intel512wilson.h>
-#include <simd/Intel512single.h>
-    
-static Vector<vComplexF> signsF;
-
-  template<typename vtype>    
-  int setupSigns(Vector<vtype>& signs ){
-    Vector<vtype> bother(2);
-    signs = bother;
-    vrsign(signs[0]);
-    visign(signs[1]);
-    return 1;
-  }
-
-  static int signInitF = setupSigns(signsF);
-
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];  
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG // KERNEL_DAG stays undefined until the next "#define KERNEL_DAG"
-#define INTERIOR_AND_EXTERIOR // of the three mode macros only this one is defined for the AsmDhopSite group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // only INTERIOR is defined for the AsmDhopSiteInt group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // only EXTERIOR is defined for the AsmDhopSiteExt group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) // from this group on MULT_2SPIN forwards to the _LSNOPF variant instead of _LS
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG // NOTE(review): MULT_2SPIN still maps to the _LSNOPF variant chosen for the preceding AsmDhopSiteExt group; nothing in between restores _LS - confirm this is intended
-#define INTERIOR_AND_EXTERIOR // only this mode macro is defined for the AsmDhopSiteDag group
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // only INTERIOR is defined for the AsmDhopSiteDagInt group; KERNEL_DAG is still defined from above
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // only EXTERIOR is defined for the AsmDhopSiteDagExt group; KERNEL_DAG is still defined from above
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS // helper macros are dropped here; all three are defined again further down for the double-precision specialisations
-#undef MAYBEPERM
-#undef MULT_2SPIN
-	
-
-
-///////////////////////////////////////////////////////////
-// If we are AVX512 specialise the double precision routine
-///////////////////////////////////////////////////////////
-
-#include <simd/Intel512double.h>
-    
-static Vector<vComplexD> signsD;
-static int signInitD = setupSigns(signsD); // setupSigns(signsD) runs during static initialisation; signInitD itself is not read in the visible code
-    
-#define MAYBEPERM(A,perm) if (perm) { A ; } // executes A only when perm is non-zero
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];  // declares a local pointer named by the argument, aimed at the first element of signsD
-
-
-#define INTERIOR_AND_EXTERIOR    
-#undef  INTERIOR
-#undef  EXTERIOR
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG // KERNEL_DAG stays undefined until the next "#define KERNEL_DAG"
-#define INTERIOR_AND_EXTERIOR // of the three mode macros only this one is defined for the AsmDhopSite group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // only INTERIOR is defined for the AsmDhopSiteInt group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // only EXTERIOR is defined for the AsmDhopSiteExt group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG // KERNEL_DAG is defined for the three AsmDhopSiteDag* groups that follow
-#define INTERIOR_AND_EXTERIOR // only this mode macro is defined for the AsmDhopSiteDag group
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // only INTERIOR is defined for the AsmDhopSiteDagInt group; KERNEL_DAG is still defined from above
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // only EXTERIOR is defined for the AsmDhopSiteDagExt group; KERNEL_DAG is still defined from above
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) // expands to nothing for the DomainWallVec5d specialisations that follow
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) // MULT_2SPIN now forwards to the _LS variant
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG // KERNEL_DAG stays undefined until the next "#define KERNEL_DAG"
-#define INTERIOR_AND_EXTERIOR // of the three mode macros only this one is defined for the AsmDhopSite group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // only INTERIOR is defined for the AsmDhopSiteInt group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // only EXTERIOR is defined for the AsmDhopSiteExt group; presumably tested inside WilsonKernelsAsmBody.h (not shown)
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) // from this group on MULT_2SPIN forwards to the _LSNOPF variant instead of _LS
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG // NOTE(review): MULT_2SPIN still maps to the _LSNOPF variant chosen for the preceding AsmDhopSiteExt group; nothing in between restores _LS - confirm this is intended
-#define INTERIOR_AND_EXTERIOR // only this mode macro is defined for the AsmDhopSiteDag group
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // only INTERIOR is defined for the AsmDhopSiteDagInt group; KERNEL_DAG is still defined from above
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // only EXTERIOR is defined for the AsmDhopSiteDagExt group; KERNEL_DAG is still defined from above
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the signature above has no braces of its own: this header supplies the function body
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS // helper macros are dropped once the last specialisation has been emitted
-#undef MAYBEPERM
-#undef MULT_2SPIN
-
-#endif //AVX512
--- a/Grid/qcd/action/fermion/WilsonTMFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.cc
@ -1,99 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-     * BF sequence
-     *
-      void bfmbase<Float>::MooeeInv(Fermion_t psi, 
-			       Fermion_t chi, 
-			      int dag, int cb)
-
-    double m    = this->mass;
-    double tm   = this->twistedmass;
-    double mtil = 4.0+this->mass;
-
-    double sq = mtil*mtil + tm*tm;
-
-    double a = mtil/sq;
-    double b = -tm /sq;
-    if(dag) b=-b;
-    axpibg5x(chi,psi,a,b);
-
-      void bfmbase<Float>::Mooee(Fermion_t psi, 
-			   Fermion_t chi, 
-			   int dag,int cb)
-    double a = 4.0+this->mass;
-    double b = this->twistedmass;
-    if(dag) b=-b;
-    axpibg5x(chi,psi,a,b);
-    */
-
-  // Site-diagonal twisted-mass term: hands a = 4 + mass and b = mu to axpibg5x,
-  // with the result placed on the same checkerboard as the input.
-  template<class Impl>
-  void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-    out.checkerboard = in.checkerboard;
-    RealD diag  = 4.0+this->mass;
-    RealD twist = this->mu;
-    axpibg5x(out,in,diag,twist);
-  }
-  // Daggered site-diagonal term: same a = 4 + mass as Mooee, but b = -mu.
-  template<class Impl>
-  void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-    RealD diag     = 4.0+this->mass;
-    RealD negTwist = -this->mu;
-    out.checkerboard = in.checkerboard;
-    axpibg5x(out,in,diag,negTwist);
-  }
-  // Inverse of the site-diagonal term. With mtil = 4 + mass and sq = mtil^2 + mu^2
-  // the coefficients passed to axpibg5x are a = mtil/sq and b = -mu/sq.
-  // The unused local copy of mass has been dropped, and the output checkerboard is
-  // now set explicitly, as Mooee and MooeeDag already do.
-  template<class Impl>
-  void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-    RealD tm   = this->mu;
-    RealD mtil = 4.0+this->mass;
-    RealD sq   = mtil*mtil+tm*tm;
-    RealD a    = mtil/sq;
-    RealD b    = -tm /sq;
-    out.checkerboard = in.checkerboard;
-    axpibg5x(out,in,a,b);
-  }
-  // Daggered inverse of the site-diagonal term. With mtil = 4 + mass and
-  // sq = mtil^2 + mu^2 the coefficients passed to axpibg5x are a = mtil/sq and b = +mu/sq.
-  // The unused local copy of mass has been dropped, and the output checkerboard is
-  // now set explicitly, as Mooee and MooeeDag already do.
-  template<class Impl>
-  void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-    RealD tm   = this->mu;
-    RealD mtil = 4.0+this->mass;
-    RealD sq   = mtil*mtil+tm*tm;
-    RealD a    = mtil/sq;
-    RealD b    = tm /sq;
-    out.checkerboard = in.checkerboard;
-    axpibg5x(out,in,a,b);
-  }
-
-  FermOpTemplateInstantiate(WilsonTMFermion);
-
-}
-}
--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -23,55 +23,52 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_WILSON_TM_FERMION_H
-#define  GRID_QCD_WILSON_TM_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once 

 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class WilsonTMFermion : public WilsonFermion<Impl> // WilsonFermion plus a stored twisted-mass parameter mu and its own Mooee* declarations
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class WilsonTMFermion : public WilsonFermion<Impl>
-    {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-    public:
+  virtual void   Instantiatable(void) {}; // empty body; presumably overrides a hook in a base class (base declaration not visible here)
+  // Constructor: forwards _Umu, Fgrid, Hgrid, _mass and p to WilsonFermion<Impl>; _mu is stored in mu by the body
+  WilsonTMFermion(GaugeField &_Umu,
+		  GridCartesian         &Fgrid,
+		  GridRedBlackCartesian &Hgrid, 
+		  RealD _mass,
+		  RealD _mu,
+		  const ImplParams &p= ImplParams()
+		  ) :
+    WilsonFermion<Impl>(_Umu,
+			Fgrid,
+			Hgrid,
+			_mass,p)

-      virtual void   Instantiatable(void) {};
-      // Constructors
-      WilsonTMFermion(GaugeField &_Umu,
-		    GridCartesian         &Fgrid,
-		    GridRedBlackCartesian &Hgrid, 
-		    RealD _mass,
-		    RealD _mu,
-		    const ImplParams &p= ImplParams()
-		      ) :
-	WilsonFermion<Impl>(_Umu,
-			    Fgrid,
-			    Hgrid,
-			    _mass,p)
-
-      {
-	mu = _mu;
-      }
+  {
+    mu = _mu;
+  }


-    // allow override for twisted mass and clover
-    virtual void Mooee(const FermionField &in, FermionField &out) ;
-    virtual void MooeeDag(const FermionField &in, FermionField &out) ;
-    virtual void MooeeInv(const FermionField &in, FermionField &out) ;
-    virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
+  // allow override for twisted mass and clover (declarations only; the four bodies are not in this header)
+  virtual void Mooee(const FermionField &in, FermionField &out) ;
+  virtual void MooeeDag(const FermionField &in, FermionField &out) ;
+  virtual void MooeeInv(const FermionField &in, FermionField &out) ;
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;

-  private:
-     RealD mu; // TwistedMass parameter
+private:
+  RealD mu; // TwistedMass parameter, assigned once in the constructor from _mu

-  };
+};
+
+NAMESPACE_END(Grid);

-}}

-#endif
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@ -30,126 +30,123 @@ Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>

-
-namespace Grid {
-
-  namespace QCD {
+NAMESPACE_BEGIN(Grid);
    
-    template<class Impl>
-      class WilsonTMFermion5D : public WilsonFermion5D<Impl>
-      {
-      public:
-	INHERIT_IMPL_TYPES(Impl);
-      public:
-
-	virtual void   Instantiatable(void) {};
-
-	// Constructors
-        WilsonTMFermion5D(GaugeField &_Umu,
-			  GridCartesian         &Fgrid,
-			  GridRedBlackCartesian &Frbgrid, 
-			  GridCartesian         &Ugrid,
-			  GridRedBlackCartesian &Urbgrid, 
-			  const std::vector<RealD> _mass,
-			  const std::vector<RealD> _mu,
-			  const ImplParams &p= ImplParams()
-			  ) :
-	WilsonFermion5D<Impl>(_Umu,
-			      Fgrid,
-			      Frbgrid,
-			      Ugrid,
-			      Urbgrid,
-			      4.0,p)
-	
-	  {
-	    update(_mass,_mu);
-	  }
-
-	virtual void Meooe(const FermionField &in, FermionField &out) {
-	  if (in.checkerboard == Odd) {
-	    this->DhopEO(in, out, DaggerNo);
-	  } else {
-	    this->DhopOE(in, out, DaggerNo);
-	  }
-	}
-
-	virtual void MeooeDag(const FermionField &in, FermionField &out) {
-	  if (in.checkerboard == Odd) {
-	    this->DhopEO(in, out, DaggerYes);
-	  } else {
-	    this->DhopOE(in, out, DaggerYes);
-	  }
-	}	
-	
-	// allow override for twisted mass and clover
-	virtual void Mooee(const FermionField &in, FermionField &out) {
-	  out.checkerboard = in.checkerboard;
-	  //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    ComplexD a = 4.0+this->mass[s];
-	    ComplexD b(0.0,this->mu[s]);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-
-	virtual void MooeeDag(const FermionField &in, FermionField &out) {
-	  out.checkerboard = in.checkerboard;
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    ComplexD a = 4.0+this->mass[s];
-	    ComplexD b(0.0,-this->mu[s]);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-	virtual void MooeeInv(const FermionField &in, FermionField &out) {
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    RealD m    = this->mass[s];
-	    RealD tm   = this->mu[s];
-	    RealD mtil = 4.0+this->mass[s];
-	    RealD sq   = mtil*mtil+tm*tm;
-	    ComplexD a    = mtil/sq;
-	    ComplexD b(0.0, -tm /sq);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-	virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    RealD m    = this->mass[s];
-	    RealD tm   = this->mu[s];
-	    RealD mtil = 4.0+this->mass[s];
-	    RealD sq   = mtil*mtil+tm*tm;
-	    ComplexD a    = mtil/sq;
-	    ComplexD b(0.0,tm /sq);
-	    axpbg5y_ssp(out,a,in,b,in,s,s);
-	  }
-	}
-
-	virtual RealD M(const FermionField &in, FermionField &out) {
-	  out.checkerboard = in.checkerboard;
-	  this->Dhop(in, out, DaggerNo);
-	  FermionField tmp(out._grid);
-	  for (int s=0;s<(int)this->mass.size();s++) {
-	    ComplexD a = 4.0+this->mass[s];
-	    ComplexD b(0.0,this->mu[s]);
-	    axpbg5y_ssp(tmp,a,in,b,in,s,s);
-	  }
-	  return axpy_norm(out, 1.0, tmp, out);
-	}
-	
-	// needed for fast PV
-	void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
-	  assert(_mass.size() == _mu.size());
-	  assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
-	  this->mass = _mass;
-	  this->mu = _mu;
-	}
-	
-      private:
-	std::vector<RealD> mu;
-	std::vector<RealD> mass;
-	
-      };
+template<class Impl>
+class WilsonTMFermion5D : public WilsonFermion5D<Impl>
+{
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+ public:
+  
+  virtual void   Instantiatable(void) {};
+  
+  // Constructors
+ WilsonTMFermion5D(GaugeField &_Umu, // builds the 5D base, then stores the per-slice mass and mu lists
+		   GridCartesian         &Fgrid,
+		   GridRedBlackCartesian &Frbgrid, 
+		   GridCartesian         &Ugrid,
+		   GridRedBlackCartesian &Urbgrid, 
+		   const std::vector<RealD> &_mass, // by reference: update() makes the only copy, into this->mass
+		   const std::vector<RealD> &_mu,   // by reference: update() makes the only copy, into this->mu
+		   const ImplParams &p= ImplParams()
+		   ) :
+  WilsonFermion5D<Impl>(_Umu,
+			Fgrid,
+			Frbgrid,
+			Ugrid,
+			Urbgrid,
+			4.0,p) // base receives the constant 4.0 (presumably its M5 slot -- TODO confirm); masses stay in this class
   
-    typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
-    typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 
+    {
+      update(_mass,_mu); // asserts the sizes are consistent before storing
+    }
+  
+  virtual void Meooe(const FermionField &in, FermionField &out) { // hopping term between checkerboards, no dagger
+    if (in.Checkerboard() == Odd) {
+      this->DhopEO(in, out, DaggerNo); // input is on the Odd checkerboard
+    } else {
+      this->DhopOE(in, out, DaggerNo); // any other checkerboard value
+    }
+  }
+  
+  virtual void MeooeDag(const FermionField &in, FermionField &out) { // same dispatch as Meooe, with DaggerYes
+    if (in.Checkerboard() == Odd) {
+      this->DhopEO(in, out, DaggerYes); // input is on the Odd checkerboard
+    } else {
+      this->DhopOE(in, out, DaggerYes); // any other checkerboard value
+    }
+  }	
+  
+  // allow override for twisted mass and clover
+  virtual void Mooee(const FermionField &in, FermionField &out) { // per-slice diagonal term built from mass[s] and mu[s]
+    out.Checkerboard() = in.Checkerboard();
+    //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
+    for (int s=0;s<(int)this->mass.size();s++) { // one pass per entry of mass (update() asserts mu has the same length)
+      ComplexD a = 4.0+this->mass[s];  // real coefficient: 4 + mass[s]
+      ComplexD b(0.0,this->mu[s]);     // purely imaginary coefficient: +i*mu[s]
+      axpbg5y_ssp(out,a,in,b,in,s,s);  // slice s of out from slice s of in; presumably a*in + b*G5*in -- helper defined elsewhere
+    }
+  }
+  
+  virtual void MooeeDag(const FermionField &in, FermionField &out) { // as Mooee, with the sign of the mu coefficient flipped
+    out.Checkerboard() = in.Checkerboard();
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s];  // real coefficient: 4 + mass[s]
+      ComplexD b(0.0,-this->mu[s]);    // purely imaginary coefficient: -i*mu[s]
+      axpbg5y_ssp(out,a,in,b,in,s,s);  // slice s of out from slice s of in
+    }
+  }
+  virtual void MooeeInv(const FermionField &in, FermionField &out) { // per-slice inverse of the Mooee coefficients
+    for (int s=0;s<(int)this->mass.size();s++) {
+      // Mooee uses a = mtil, b = +i*tm; here both are conjugated and divided by sq = mtil^2 + tm^2.
+      RealD tm   = this->mu[s];
+      RealD mtil = 4.0+this->mass[s];
+      RealD sq   = mtil*mtil+tm*tm;
+      ComplexD a    = mtil/sq;
+      ComplexD b(0.0, -tm /sq);
+      axpbg5y_ssp(out,a,in,b,in,s,s); // slice s of out from slice s of in
+    }
+  }
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out) { // per-slice inverse of the MooeeDag coefficients
+    for (int s=0;s<(int)this->mass.size();s++) {
+      // MooeeDag uses a = mtil, b = -i*tm; here both are conjugated and divided by sq = mtil^2 + tm^2.
+      RealD tm   = this->mu[s];
+      RealD mtil = 4.0+this->mass[s];
+      RealD sq   = mtil*mtil+tm*tm;
+      ComplexD a    = mtil/sq;
+      ComplexD b(0.0,tm /sq);
+      axpbg5y_ssp(out,a,in,b,in,s,s); // slice s of out from slice s of in
+    }
+  }
+  
+  virtual RealD M(const FermionField &in, FermionField &out) { // hopping term plus the per-slice mass/mu term; returns axpy_norm's result
+    out.Checkerboard() = in.Checkerboard();
+    this->Dhop(in, out, DaggerNo);   // out first receives the hopping term
+    FermionField tmp(out.Grid());    // scratch field on out's grid for the diagonal term
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s]; // same coefficients as Mooee
+      ComplexD b(0.0,this->mu[s]);
+      axpbg5y_ssp(tmp,a,in,b,in,s,s); // slice s of tmp from slice s of in
+    }
+    return axpy_norm(out, 1.0, tmp, out); // presumably out = 1.0*tmp + out, returning its norm -- helper defined elsewhere
+  }
+  
+  // needed for fast PV
+  void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) { // replace the stored per-slice mass and mu lists
+    assert(_mass.size() == _mu.size());                              // one mu per mass entry
+    assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);    // list length must equal _fdimensions[0] of the fermion grid
+    this->mass = _mass; // copies into the member
+    this->mu = _mu;     // copies into the member
+  }
+  
+ private:
+  std::vector<RealD> mu;
+  std::vector<RealD> mass;
+  
+};
+   
+typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
+typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 

-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ZMobiusFermion.h
+++ b/Grid/qcd/action/fermion/ZMobiusFermion.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,56 +24,50 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_ZMOBIUS_FERMION_H
-#define  GRID_QCD_ZMOBIUS_FERMION_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once

 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
+NAMESPACE_BEGIN(Grid);

-  namespace QCD {
+template<class Impl>
+class ZMobiusFermion : public CayleyFermion5D<Impl>
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+public:

-    template<class Impl>
-    class ZMobiusFermion : public CayleyFermion5D<Impl>
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-    public:
-
-      virtual void   Instantiatable(void) {};
-      // Constructors
-      ZMobiusFermion(GaugeField &_Umu,
-		     GridCartesian         &FiveDimGrid,
-		     GridRedBlackCartesian &FiveDimRedBlackGrid,
-		     GridCartesian         &FourDimGrid,
-		     GridRedBlackCartesian &FourDimRedBlackGrid,
-		     RealD _mass,RealD _M5,
-		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
+  virtual void   Instantiatable(void) {};
+  // Constructors
+  ZMobiusFermion(GaugeField &_Umu, // builds the CayleyFermion5D base, then sets coefficients from the caller's gamma list
+		 GridCartesian         &FiveDimGrid,
+		 GridRedBlackCartesian &FiveDimRedBlackGrid,
+		 GridCartesian         &FourDimGrid,
+		 GridRedBlackCartesian &FourDimRedBlackGrid,
+		 RealD _mass,RealD _M5,
+		 std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D<Impl>(_Umu,
-			    FiveDimGrid,
-			    FiveDimRedBlackGrid,
-			    FourDimGrid,
-			    FourDimRedBlackGrid,_mass,_M5,p)
+    CayleyFermion5D<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,p)

-      {
-	RealD eps = 1.0;
-	
-	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
-	std::vector<Coeff_t> zgamma(this->Ls);
-	for(int s=0;s<this->Ls;s++){
-	  zgamma[s] = gamma[s];
-	}
-
-	// Call base setter
-	this->SetCoefficientsInternal(1.0,zgamma,b,c);
-      }
-
-    };
+  {
+    assert((int)gamma.size() >= this->Ls); // gamma[0..Ls-1] is read below; a shorter list would be an out-of-bounds read
+    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+    Vector<Coeff_t> zgamma(this->Ls); // copy into the container/element type the base setter takes
+    for(int s=0;s<this->Ls;s++){
+      zgamma[s] = gamma[s]; // element-wise conversion ComplexD -> Coeff_t
+    }

+    // Call base setter
+    this->SetCoefficientsInternal(1.0,zgamma,b,c);
  }
-}

-#endif
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -26,19 +26,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>


-namespace Grid {
-namespace QCD {
-  /*
-   * Dense matrix versions of routines
-   */
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
@ -54,10 +54,10 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
  int Ls=this->Ls;
-  int LLs = psi._grid->_rdimensions[0];
-  int vol = psi._grid->oSites()/LLs;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs;
  
-  chi.checkerboard=psi.checkerboard;
+  chi.Checkerboard()=psi.Checkerboard();
  
  assert(Ls==LLs);
  
@ -96,15 +96,14 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  }

  // For the non-vectorised s-direction this is simple
-  
-  for(auto site=0;site<vol;site++){
+  thread_loop( (auto site=0;site<vol;site++), {
    
    SiteSpinor     SiteChi;
    SiteHalfSpinor SitePplus;
    SiteHalfSpinor SitePminus;
    
    for(int s1=0;s1<Ls;s1++){
-      SiteChi =zero;
+      SiteChi =Zero();
      for(int s2=0;s2<Ls;s2++){
 	int lex2 = s2+Ls*site;
 	
@ -120,7 +119,7 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
      }
      chi[s1+Ls*site] = SiteChi*0.5;
    }
-  }
+  });
 }

 #ifdef CAYLEY_DPERP_DENSE
@ -153,4 +152,4 @@ template void CayleyFermion5D<ZWilsonImplFH>::MooeeInternal(const FermionField &
 template void CayleyFermion5D<ZWilsonImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 #endif

-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -26,26 +26,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>


-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-  // Pminus fowards
-  // Pplus  backwards
+// Pminus forwards
+// Pplus  backwards
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<Coeff_t> &lower,
-				std::vector<Coeff_t> &diag,
-				std::vector<Coeff_t> &upper)
+				Vector<Coeff_t> &lower,
+				Vector<Coeff_t> &diag,
+				Vector<Coeff_t> &upper)
 {
  Coeff_t one(1.0);
  int Ls=this->Ls;
@ -66,9 +64,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<Coeff_t> &lower,
-				   std::vector<Coeff_t> &diag,
-				   std::vector<Coeff_t> &upper)
+				   Vector<Coeff_t> &lower,
+				   Vector<Coeff_t> &diag,
+				   Vector<Coeff_t> &upper)
 {
  Coeff_t one(1.0);
  int Ls=this->Ls;
@ -91,7 +89,7 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &
 {
  Coeff_t one(1.0);
  Coeff_t czero(0.0);
-  chi.checkerboard=psi.checkerboard;
+  chi.Checkerboard()=psi.Checkerboard();
  int Ls=this->Ls;
  // Apply (L^{\prime})^{-1}
  axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0]
@ -120,7 +118,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
 {
  Coeff_t one(1.0);
  Coeff_t czero(0.0);
-  chi.checkerboard=psi.checkerboard;
+  chi.Checkerboard()=psi.Checkerboard();
  int Ls=this->Ls;
  // Apply (U^{\prime})^{-dagger}
  axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0]
@ -145,20 +143,19 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &


 #ifdef CAYLEY_DPERP_LINALG
-  INSTANTIATE_DPERP(WilsonImplF);
-  INSTANTIATE_DPERP(WilsonImplD);
-  INSTANTIATE_DPERP(GparityWilsonImplF);
-  INSTANTIATE_DPERP(GparityWilsonImplD);
-  INSTANTIATE_DPERP(ZWilsonImplF);
-  INSTANTIATE_DPERP(ZWilsonImplD);
+INSTANTIATE_DPERP(WilsonImplF);
+INSTANTIATE_DPERP(WilsonImplD);
+INSTANTIATE_DPERP(GparityWilsonImplF);
+INSTANTIATE_DPERP(GparityWilsonImplD);
+INSTANTIATE_DPERP(ZWilsonImplF);
+INSTANTIATE_DPERP(ZWilsonImplD);

-  INSTANTIATE_DPERP(WilsonImplFH);
-  INSTANTIATE_DPERP(WilsonImplDF);
-  INSTANTIATE_DPERP(GparityWilsonImplFH);
-  INSTANTIATE_DPERP(GparityWilsonImplDF);
-  INSTANTIATE_DPERP(ZWilsonImplFH);
-  INSTANTIATE_DPERP(ZWilsonImplDF);
+INSTANTIATE_DPERP(WilsonImplFH);
+INSTANTIATE_DPERP(WilsonImplDF);
+INSTANTIATE_DPERP(GparityWilsonImplFH);
+INSTANTIATE_DPERP(GparityWilsonImplDF);
+INSTANTIATE_DPERP(ZWilsonImplFH);
+INSTANTIATE_DPERP(ZWilsonImplDF);
 #endif

-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
@ -0,0 +1,158 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // dense path with both the dagger and inverse flags set
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes); // dense path with the inverse flag set, no dagger
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+{ // Builds two dense Ls x Ls matrices, optionally inverts/adjoints them, and applies them to psi site by site.
+  int Ls = this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0]; // extent of grid dimension 0; asserted equal to Ls below
+  int vol = psi.Grid()->oSites()/LLs;    // site-loop bound: oSites() divided by LLs
+
+  chi.Checkerboard() = psi.Checkerboard();
+
+  assert(Ls==LLs);
+
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+
+  for(int s=0;s<Ls;s++){ // both matrices share the diagonal bee[s]
+    Pplus(s,s)  = this->bee[s];
+    Pminus(s,s) = this->bee[s];
+  }
+
+  for(int s=0; s<Ls-1; s++){
+    Pminus(s,s+1) = -this->cee[s]; // super-diagonal of Pminus
+  }
+
+  for(int s=0; s<Ls-1; s++){
+    Pplus(s+1,s) = -this->cee[s+1]; // sub-diagonal of Pplus
+  }
+
+  Pplus (0,Ls-1) = this->dp; // corner entries come from the members dp and dm
+  Pminus(Ls-1,0) = this->dm;
+
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+
+  if(inv) { // any non-zero inv selects the dense inverse
+    PplusMat  = Pplus.inverse();
+    PminusMat = Pminus.inverse();
+  } else {
+    PplusMat  = Pplus;
+    PminusMat = Pminus;
+  }
+
+  if(dag){ // any non-zero dag replaces each matrix by its adjoint
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple
+
+  for(auto site=0; site<vol; site++){
+
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+
+    for(int s1=0; s1<Ls; s1++){ // output slice
+      SiteChi = Zero();
+      for(int s2=0; s2<Ls; s2++){ // input slice
+	int lex2 = s2 + Ls*site; // s index runs fastest in the field index
+	if(PplusMat(s1,s2) != 0.0){ // exactly-zero entries are skipped
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
+	}
+	if(PminusMat(s1,s2) != 0.0){ // exactly-zero entries are skipped
+	  spProj5m(SitePminus, psi[lex2]);
+	  accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      chi[s1+Ls*site] = SiteChi*0.5; // accumulated sum is stored with an overall factor 0.5
+    }
+  }
+}
+
+#ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
+// Explicit instantiations, emitted only when DOMAIN_WALL_EOFA_DPERP_DENSE is defined.
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+// MooeeInternal is instantiated explicitly here, separately from the macro above (macro body is defined elsewhere).
+template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+// Same two groups again for the FH / DF implementation variants.
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+#endif // DOMAIN_WALL_EOFA_DPERP_DENSE
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
@ -0,0 +1,167 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME -- make a version of these routines with site loop outermost for cache reuse.
+// Pminus forwards
+// Pplus  backwards
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+				      FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{ // For each slice s: chi[s] from diag[s]*phi[s], upper[s] times the pminus part of psi[s+1], lower[s] times the pplus part of psi[s-1] (indices wrap).
+  Coeff_t one(1.0);
+  int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    if(s==0) { // first slice: the "s-1" neighbour wraps to Ls-1
+      axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
+      axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
+    } else if (s==(Ls-1)) { // last slice: the "s+1" neighbour wraps to 0
+      axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
+      axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
+    } else { // interior slice: plain s+1 / s-1 neighbours
+      axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
+      axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
+    }
+  }
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+					 FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{ // Same slice/neighbour pattern as M5D with the pplus and pminus helpers exchanged.
+  Coeff_t one(1.0);
+  int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    if(s==0) { // first slice: the "s-1" neighbour wraps to Ls-1
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
+      axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
+    } else if (s==(Ls-1)) { // last slice: the "s+1" neighbour wraps to 0
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
+      axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
+    } else { // interior slice: plain s+1 / s-1 neighbours
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
+      axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
+    }
+  }
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{ // Slice-by-slice inverse applied in four stages (see the stage comments), using the members lee, leem, ueem, dee, uee.
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  FermionField tmp(psi.Grid()); // scratch field; only slice Ls-1 of it is written and read below
+
+  // Apply (L^{\prime})^{-1}
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+  }
+
+  // L_m^{-1}
+  for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+    axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
+  }
+
+  // U_m^{-1} D^{-1}
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1); // NOTE(review): dee[Ls] is read, so dee needs at least Ls+1 entries -- confirm where dee is sized
+  }
+  axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1); // last slice: pminus term scaled by 1/dee[Ls-1], into tmp
+  axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);     // last slice: tmp plus pplus term scaled by 1/dee[Ls]
+
+  // Apply U^{-1}
+  for(int s=Ls-2; s>=0; s--){ // runs downward from Ls-2 to 0
+    axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
+  }
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{ // Daggered counterpart of MooeeInv: same four-stage shape, conjugated coefficients, pplus/pminus helpers exchanged.
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  FermionField tmp(psi.Grid()); // scratch field; only slice Ls-1 of it is written and read below
+
+  // Apply (U^{\prime})^{-dagger}
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+  }
+
+  // U_m^{-\dagger}
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
+  }
+
+  // L_m^{-\dagger} D^{-dagger}
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1); // NOTE(review): divides by dee[Ls-1] where MooeeInv's matching step uses dee[Ls] -- presumably intentional, confirm
+  }
+  axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1); // last slice: pminus term, into tmp
+  axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);     // last slice: dee[Ls] is read here as well
+
+  // Apply L^{-dagger}
+  for(int s=Ls-2; s>=0; s--){ // runs downward from Ls-2 to 0
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
+  }
+}
+
+#ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
+// Explicit instantiations, emitted only when DOMAIN_WALL_EOFA_DPERP_LINALG is defined (macro body is defined elsewhere).
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+// Same list for the FH / DF implementation variants.
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+#endif // DOMAIN_WALL_EOFA_DPERP_LINALG
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
@ -0,0 +1,183 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes); // dense path with the inverse flag set, no dagger
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes); // same call as MooeeInv; MooeeInternal itself reads this->shift
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // dense path with both the dagger and inverse flags set
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // same call as MooeeInvDag; MooeeInternal itself reads this->shift
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+{ // Builds two dense Ls x Ls matrices (including the shift term), optionally inverts/adjoints them, and applies them to psi site by site.
+  int Ls = this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0]; // extent of grid dimension 0; asserted equal to Ls below
+  int vol = psi.Grid()->oSites()/LLs;    // site-loop bound: oSites() divided by LLs
+
+  int pm      = this->pm;    // local copies of the members used for the corner and shift terms below
+  RealD shift = this->shift;
+  RealD alpha = this->alpha;
+  RealD k     = this->k;
+  RealD mq1   = this->mq1;
+
+  chi.Checkerboard() = psi.Checkerboard();
+
+  assert(Ls==LLs);
+
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+
+  for(int s=0;s<Ls;s++){ // both matrices share the diagonal bee[s]
+    Pplus(s,s)  = this->bee[s];
+    Pminus(s,s) = this->bee[s];
+  }
+
+  for(int s=0; s<Ls-1; s++){
+    Pminus(s,s+1) = -this->cee[s]; // super-diagonal of Pminus
+  }
+
+  for(int s=0; s<Ls-1; s++){
+    Pplus(s+1,s) = -this->cee[s+1]; // sub-diagonal of Pplus
+  }
+  Pplus (0,Ls-1) = mq1*this->cee[0];    // corner entries scaled by mq1
+  Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
+
+  if(shift != 0.0){ // shift term: added to the last column of Pplus (pm == 1) or subtracted from the last column of Pminus, rows reversed
+    Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) ); // NOTE(review): N is Coeff_t but the matrices are real (MatrixXd) -- confirm for complex Coeff_t
+    for(int s=0; s<Ls; ++s){
+      if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
+      else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
+    }
+  }
+
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+
+  if(inv){ // any non-zero inv selects the dense inverse
+    PplusMat  = Pplus.inverse();
+    PminusMat = Pminus.inverse();
+  } else {
+    PplusMat  = Pplus;
+    PminusMat = Pminus;
+  }
+
+  if(dag){ // any non-zero dag replaces each matrix by its adjoint
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple
+
+  for(auto site=0; site<vol; site++){
+
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+
+    for(int s1=0; s1<Ls; s1++){ // output slice
+      SiteChi = Zero();
+      for(int s2=0; s2<Ls; s2++){ // input slice
+	int lex2 = s2 + Ls*site; // s index runs fastest in the field index
+	if(PplusMat(s1,s2) != 0.0){ // exactly-zero entries are skipped
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
+	}
+	if(PminusMat(s1,s2) != 0.0){ // exactly-zero entries are skipped
+	  spProj5m(SitePminus, psi[lex2]);
+	  accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      chi[s1+Ls*site] = SiteChi*0.5; // accumulated sum is stored with an overall factor 0.5
+    }
+  }
+}
+
+#ifdef MOBIUS_EOFA_DPERP_DENSE
+// Explicit instantiations, emitted only when MOBIUS_EOFA_DPERP_DENSE is defined.
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+// MooeeInternal is instantiated explicitly here, separately from the macro above (macro body is defined elsewhere).
+template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+// Same two groups again for the FH / DF implementation variants.
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+#endif // MOBIUS_EOFA_DPERP_DENSE
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
@ -0,0 +1,289 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME -- make a version of these routines with site loop outermost for cache reuse.
+// Pminus forwards
+// Pplus  backwards
+// Fifth-dimension stencil, applied one s-slice at a time:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_- psi[s_up] + lower[s]*P_+ psi[s_dn]
+// where s_up / s_dn are the forward / backward neighbours of s, wrapping
+// around at the two ends of the fifth dimension.
+//   psi   : field supplying the neighbouring slices
+//   phi   : field supplying the diagonal term
+//   chi   : output field (slice s is written, then updated in place)
+//   lower, diag, upper : per-slice coefficients, indexed by s
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+                                  FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  Coeff_t unit(1.0);
+  int Ls   = this->Ls;
+  int last = Ls-1;
+  for(int s=0; s<Ls; s++){
+    // The s==0 rule wins over the s==last rule when both hold.
+    int s_up = ((s!=0) && (s==last)) ? 0    : s+1;
+    int s_dn = (s==0)                ? last : s-1;
+    axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s_up);
+    axpby_ssp_pplus (chi, unit,    chi, lower[s], psi, s, s_dn);
+  }
+}
+
+// Same stencil as M5D, followed by the shift term on every slice:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_- psi[s_up] + lower[s]*P_+ psi[s_dn]
+//   chi[s] += shift_coeffs[s] * P_+ psi[Ls-1]   when this->pm == 1
+//   chi[s] += shift_coeffs[s] * P_- psi[0]      otherwise
+//   shift_coeffs : per-slice coefficients of the shift term, indexed by s
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
+                                        FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+                                        Vector<Coeff_t>& shift_coeffs)
+{
+  Coeff_t unit(1.0);
+  int Ls   = this->Ls;
+  int last = Ls-1;
+  for(int s=0; s<Ls; s++){
+    // The s==0 rule wins over the s==last rule when both hold.
+    int s_up = ((s!=0) && (s==last)) ? 0    : s+1;
+    int s_dn = (s==0)                ? last : s-1;
+    axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s_up);
+    axpby_ssp_pplus (chi, unit,    chi, lower[s], psi, s, s_dn);
+
+    // Shift term: couples every slice to one boundary slice of psi.
+    if(this->pm == 1){
+      axpby_ssp_pplus(chi, unit, chi, shift_coeffs[s], psi, s, last);
+    } else {
+      axpby_ssp_pminus(chi, unit, chi, shift_coeffs[s], psi, s, 0);
+    }
+  }
+}
+
+// Daggered fifth-dimension stencil; the projectors are swapped relative to M5D:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_+ psi[s_up] + lower[s]*P_- psi[s_dn]
+// where s_up / s_dn are the forward / backward neighbours of s, wrapping
+// around at the two ends of the fifth dimension.
+// The coefficient vectors are used exactly as passed in.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+                                     FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  Coeff_t unit(1.0);
+  int Ls   = this->Ls;
+  int last = Ls-1;
+  for(int s=0; s<Ls; s++){
+    // The s==0 rule wins over the s==last rule when both hold.
+    int s_up = ((s!=0) && (s==last)) ? 0    : s+1;
+    int s_dn = (s==0)                ? last : s-1;
+    axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s_up);
+    axpby_ssp_pminus(chi, unit,    chi, lower[s], psi, s, s_dn);
+  }
+}
+
+// Daggered stencil plus the daggered shift term.
+//
+// Stencil (same as M5Ddag), for every slice s:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_+ psi[s_up] + lower[s]*P_- psi[s_dn]
+// Shift term (transpose of the one in M5D_shift): every slice of psi feeds a
+// single boundary slice of chi,
+//   chi[Ls-1] += sum_s shift_coeffs[s] * P_+ psi[s]   when this->pm == 1
+//   chi[0]    += sum_s shift_coeffs[s] * P_- psi[s]   otherwise
+//
+// The shift accumulation is done in a second pass, after every slice of chi has
+// been assigned by the stencil. Interleaving it with the stencil loop (as was
+// done previously) is wrong for pm == 1: the target slice chi[Ls-1] is only
+// assigned on the final stencil iteration, which overwrote the contributions
+// accumulated for s < Ls-1. For pm != 1 the target chi[0] is assigned first,
+// so the result and the summation order are unchanged.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
+                                           FermionField& chi, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+                                           Vector<Coeff_t>& shift_coeffs)
+{
+  Coeff_t one(1.0);
+  int Ls = this->Ls;
+
+  // Pass 1: tridiagonal (wrap-around) stencil, assigns every chi[s].
+  for(int s=0; s<Ls; s++){
+    if(s==0) {
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
+      axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
+    } else if (s==(Ls-1)) {
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
+      axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
+    } else {
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
+      axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
+    }
+  }
+
+  // Pass 2: accumulate the shift term into the (now fully assigned) boundary slice.
+  for(int s=0; s<Ls; s++){
+    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
+    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
+  }
+}
+
+// Applies the inverse of Mooee to psi, writing the result to chi.
+// When this->shift is non-zero the work is delegated to MooeeInv_shift.
+// Otherwise the inverse is applied as a sequence of in-place sweeps over the
+// fifth-dimension slices of chi, using the precomputed factor coefficients
+// lee, leem, ueem, dee and uee (factor names as in the section comments below).
+// Every sweep reads slices written by the previous one, so statement order matters.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
+
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  // Apply (L^{\prime})^{-1}: forward substitution, slice s uses the already-updated slice s-1
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+  }
+
+  // L_m^{-1}: only the last slice is modified, accumulating from all earlier slices
+  for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+    axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
+  }
+
+  // U_m^{-1} D^{-1}: chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
+  }
+  // The last slice is rescaled only now, because the loop above reads it unscaled.
+  axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
+
+  // Apply U^{-1}: back substitution from s=Ls-2 down to 0, slice s uses the updated slice s+1
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
+  }
+}
+
+// Applies the inverse of Mooee to psi for the shifted operator, writing the result to chi.
+//
+// The unshifted inverse is applied exactly as in MooeeInv (same sweeps, same
+// order). In addition, slice 0 of the scratch field tmp accumulates
+//   tmp[0] = sum_s MooeeInv_shift_lc[s] * psi[s]
+// and, during the final back-substitution, each slice receives
+//   chi[s] += MooeeInv_shift_norm[s] * P tmp[0]
+// with P = P_+ when this->pm == 1 and P = P_- otherwise.
+//
+// tmp[0] is seeded from psi rather than from tmp itself: tmp is freshly
+// constructed, and scaling its (possibly uninitialised) contents by zero would
+// still propagate any NaN/Inf bit patterns into the accumulator.
+// NOTE(review): only slice 0 of tmp is ever written or read here.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
+{
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  FermionField tmp(psi.Grid());
+
+  // Apply (L^{\prime})^{-1} and accumulate (MooeeInv_shift_lc)_{j} \psi_{j} in tmp[0]
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  axpby_ssp(tmp, czero, psi, this->MooeeInv_shift_lc[0], psi, 0, 0); // tmp[0]=lc[0]*psi[0], tmp never read before being set
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+    axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
+  }
+
+  // L_m^{-1}: only the last slice is modified, accumulating from all earlier slices
+  for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+    axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
+  }
+
+  // U_m^{-1} D^{-1}: the last slice is rescaled after the loop, which reads it unscaled
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
+  }
+  axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
+
+  // Apply U^{-1} and add shift term
+  if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
+    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
+    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
+  }
+}
+
+// Applies the inverse of the daggered Mooee to psi, writing the result to chi.
+// When this->shift is non-zero the work is delegated to MooeeInvDag_shift.
+// Otherwise it mirrors MooeeInv with the roles of the factors exchanged: the
+// uee/ueem coefficients drive the forward sweeps, lee/leem the backward ones,
+// every coefficient is complex-conjugated and the P_+ / P_- projectors are swapped.
+// Every sweep reads slices written by the previous one, so statement order matters.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
+
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  // Apply (U^{\prime})^{-dagger}: forward substitution, slice s uses the already-updated slice s-1
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+  }
+
+  // U_m^{-\dagger}: only the last slice is modified, accumulating from all earlier slices
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
+  }
+
+  // L_m^{-\dagger} D^{-dagger}: chi[s] = chi[s]/conj(dee[s]) - conj(leem[s]/dee[Ls-1]) P_- chi[Ls-1]
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
+  }
+  // The last slice is rescaled only now, because the loop above reads it unscaled.
+  axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
+
+  // Apply L^{-dagger}: back substitution from s=Ls-2 down to 0, slice s uses the updated slice s+1
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
+  }
+}
+
+// Applies the inverse of the daggered Mooee to psi for the shifted operator,
+// writing the result to chi.
+//
+// The unshifted daggered inverse is applied exactly as in MooeeInvDag (same
+// sweeps, same order). In addition, slice 0 of the scratch field tmp accumulates
+//   tmp[0] = sum_s MooeeInvDag_shift_lc[s] * psi[s]
+// and, during the final back-substitution, each slice receives
+//   chi[s] += MooeeInvDag_shift_norm[s] * P tmp[0]
+// with P = P_+ when this->pm == 1 and P = P_- otherwise.
+//
+// tmp[0] is seeded from psi rather than from tmp itself: tmp is freshly
+// constructed, and scaling its (possibly uninitialised) contents by zero would
+// still propagate any NaN/Inf bit patterns into the accumulator.
+// NOTE(review): only slice 0 of tmp is ever written or read here.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
+{
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls = this->Ls;
+
+  FermionField tmp(psi.Grid());
+
+  // Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
+  axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+  axpby_ssp(tmp, czero, psi, this->MooeeInvDag_shift_lc[0], psi, 0, 0); // tmp[0]=lc[0]*psi[0], tmp never read before being set
+  for(int s=1; s<Ls; s++){
+    axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+    axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
+  }
+
+  // U_m^{-\dagger}: only the last slice is modified, accumulating from all earlier slices
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
+  }
+
+  // L_m^{-\dagger} D^{-dagger}: the last slice is rescaled after the loop, which reads it unscaled
+  for(int s=0; s<Ls-1; s++){
+    axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
+  }
+  axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
+
+  // Apply L^{-dagger} and add shift
+  if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
+  for(int s=Ls-2; s>=0; s--){
+    axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
+    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
+    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
+  }
+}
+
+#ifdef MOBIUS_EOFA_DPERP_LINALG
+
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/g5HermitianLinop.h
+++ b/Grid/qcd/action/fermion/g5HermitianLinop.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -23,13 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef G5_HERMITIAN_LINOP
 #define G5_HERMITIAN_LINOP

-namespace Grid {
-  namespace QCD {
+NAMESPACE_BEGIN(Grid);

 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
@ -46,12 +45,12 @@ public:
    HermOp(in,out);
  }
  void OpDiag (const Field &in, Field &out) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdiag(in,tmp);
    G5R5(out,tmp);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdir(in,tmp,dir,disp);
    G5R5(out,tmp);
  }
@ -68,7 +67,7 @@ public:
    n2=real(dot);
  }
  void HermOp(const Field &in, Field &out){
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    G5R5(out,tmp);
  }
@ -80,7 +79,7 @@ class Gamma5HermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Gamma g5;
 public:
-    Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat), g5(Gamma::Algebra::Gamma5) {};
+  Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat), g5(Gamma::Algebra::Gamma5) {};
  void Op     (const Field &in, Field &out){
    HermOp(in,out);
  }
@ -88,12 +87,12 @@ public:
    HermOp(in,out);
  }
  void OpDiag (const Field &in, Field &out) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdiag(in,tmp);
    out=g5*tmp;
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.Mdir(in,tmp,dir,disp);
    out=g5*tmp;
  }
@ -110,12 +109,11 @@ public:
    n2=real(dot);
  }
  void HermOp(const Field &in, Field &out){
-    Field tmp(in._grid);
+    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    out=g5*tmp;
  }
 };

-
-}}
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -26,31 +26,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

- template<class Impl>
- CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
-					GridCartesian         &FiveDimGrid,
-					GridRedBlackCartesian &FiveDimRedBlackGrid,
-					GridCartesian         &FourDimGrid,
-					GridRedBlackCartesian &FourDimRedBlackGrid,
-					RealD _mass,RealD _M5,const ImplParams &p) :
-   WilsonFermion5D<Impl>(_Umu,
-		   FiveDimGrid,
-		   FiveDimRedBlackGrid,
-		   FourDimGrid,
- 	 	   FourDimRedBlackGrid,_M5,p),
-   mass(_mass)
- { 
- }
+template<class Impl>
+CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
+				       GridCartesian         &FiveDimGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridCartesian         &FourDimGrid,
+				       GridRedBlackCartesian &FourDimRedBlackGrid,
+				       RealD _mass,RealD _M5,const ImplParams &p) :
+  WilsonFermion5D<Impl>(_Umu,
+			FiveDimGrid,
+			FiveDimRedBlackGrid,
+			FourDimGrid,
+			FourDimRedBlackGrid,_M5,p),
+  mass(_mass)
+{ 
+}

 ///////////////////////////////////////////////////////////////
 // Physical surface field utilities
@ -61,8 +60,8 @@ void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &so
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  tmp = solution5d;
-  conformable(solution5d._grid,this->FermionGrid());
-  conformable(exported4d._grid,this->GaugeGrid());
+  conformable(solution5d.Grid(),this->FermionGrid());
+  conformable(exported4d.Grid(),this->GaugeGrid());
  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
  ExtractSlice(exported4d, tmp, 0, 0);
@ -71,7 +70,7 @@ template<class Impl>
 void CayleyFermion5D<Impl>::P(const FermionField &psi, FermionField &chi)
 {
  int Ls= this->Ls;
-  chi=zero;
+  chi=Zero();
  for(int s=0;s<Ls;s++){
    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s+1)%Ls);
@ -81,7 +80,7 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Pdag(const FermionField &psi, FermionField &chi)
 {
  int Ls= this->Ls;
-  chi=zero;
+  chi=Zero();
  for(int s=0;s<Ls;s++){
    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s-1+Ls)%Ls);
@ -93,8 +92,8 @@ void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solu
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  tmp = solution5d;
-  conformable(solution5d._grid,this->FermionGrid());
-  conformable(exported4d._grid,this->GaugeGrid());
+  conformable(solution5d.Grid(),this->FermionGrid());
+  conformable(exported4d.Grid(),this->GaugeGrid());
  axpby_ssp_pplus (tmp, 0., solution5d, 1., solution5d, 0, 0);
  axpby_ssp_pminus(tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
  ExtractSlice(exported4d, tmp, 0, 0);
@ -104,9 +103,9 @@ void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
-  conformable(imported5d._grid,this->FermionGrid());
-  conformable(input4d._grid   ,this->GaugeGrid());
-  tmp = zero;
+  conformable(imported5d.Grid(),this->FermionGrid());
+  conformable(input4d.Grid()   ,this->GaugeGrid());
+  tmp = Zero();
  InsertSlice(input4d, tmp, 0   , 0);
  InsertSlice(input4d, tmp, Ls-1, 0);
  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
@ -119,9 +118,9 @@ void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &inpu
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
-  conformable(imported5d._grid,this->FermionGrid());
-  conformable(input4d._grid   ,this->GaugeGrid());
-  tmp = zero;
+  conformable(imported5d.Grid(),this->FermionGrid());
+  conformable(input4d.Grid()   ,this->GaugeGrid());
+  tmp = Zero();
  InsertSlice(input4d, tmp, 0   , 0);
  InsertSlice(input4d, tmp, Ls-1, 0);
  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
@ -156,7 +155,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
 template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 {
  this->Report();
-  std::vector<int> latt = GridDefaultLatt();          
+  Coordinate latt = GridDefaultLatt();          
  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP     = this->_FourDimGrid->_Nprocessors;
  if ( M5Dcalls > 0 ) {
@ -164,10 +163,16 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;

-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
+    // Flops = 10.0*(Nc*Ns) *Ls*vol
+    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+
+    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
+    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
+    // write = 1
+    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
+    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
  }

  if ( MooeeInvCalls > 0 ) {
@ -175,11 +180,16 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-
+#ifdef GRID_NVCC
+    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+#else
    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+#endif
  }

 }
@ -198,18 +208,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag (Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
+  Vector<Coeff_t> diag (Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bs;
-  std::vector<Coeff_t> upper= cs;
-  std::vector<Coeff_t> lower= cs; 
+  Vector<Coeff_t> diag = bs;
+  Vector<Coeff_t> upper= cs;
+  Vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@ -218,9 +228,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = beo;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = beo;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@ -233,9 +243,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@ -248,9 +258,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);

  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@ -278,9 +288,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0);
-  std::vector<Coeff_t> lower(Ls,-1.0);
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@ -290,9 +300,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<Coeff_t> diag =bs;
-  std::vector<Coeff_t> upper=cs;
-  std::vector<Coeff_t> lower=cs; 
+  Vector<Coeff_t> diag =bs;
+  Vector<Coeff_t> upper=cs;
+  Vector<Coeff_t> lower=cs; 

  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
@ -315,9 +325,7 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
 template<class Impl>
 RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
-  int Ls=this->Ls;
-  
-  FermionField Din(psi._grid);
+  FermionField Din(psi.Grid());
  
  // Assemble Din
  Meooe5D(psi,Din);
@ -337,7 +345,7 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
  //D2- P+     D2+            P-D1-^dag D2+dag
  
-  FermionField Din(psi._grid);
+  FermionField Din(psi.Grid());
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
  
@ -353,11 +361,9 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
-  int Ls=this->Ls;
-
  Meooe5D(psi,this->tmp()); 

-  if ( psi.checkerboard == Odd ) {
+  if ( psi.Checkerboard() == Odd ) {
    this->DhopEO(this->tmp(),chi,DaggerNo);
  } else {
    this->DhopOE(this->tmp(),chi,DaggerNo);
@ -368,7 +374,7 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
 {
  // Apply 4d dslash
-  if ( psi.checkerboard == Odd ) {
+  if ( psi.Checkerboard() == Odd ) {
    this->DhopEO(psi,this->tmp(),DaggerYes);
  } else {
    this->DhopOE(psi,this->tmp(),DaggerYes);
@ -386,7 +392,7 @@ void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,in
 template<class Impl>
 void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
 {
-  FermionField Din(V._grid);
+  FermionField Din(V.Grid());
  
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
@ -401,7 +407,7 @@ void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
 {
-  FermionField Din(V._grid);
+  FermionField Din(V.Grid());
  
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
@ -416,7 +422,7 @@ void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
 {
-  FermionField Din(V._grid);
+  FermionField Din(V.Grid());
  
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
@ -433,7 +439,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  std::vector<Coeff_t> gamma(this->Ls);
+  Vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(1.0,gamma,b,c);
 }
@ -441,13 +447,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  std::vector<Coeff_t> gamma(this->Ls);
+  Vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 //Zolo
 template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;

@ -568,12 +574,12 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    dee[Ls-1] += delta_d;
  }  

-  int inv=1;
-  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
-  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
+  //  int inv=1;
+  //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
+  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
 }

-
+#if 0
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
 						 Vector<iSinglet<Simd> > & Matp,
@ -628,35 +634,32 @@ void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
  Matm.resize(Ls*LLs);

  for(int s2=0;s2<Ls;s2++){
-  for(int s1=0;s1<LLs;s1++){
-    int istride = LLs;
-    int ostride = 1;
-    Simd Vp;
-    Simd Vm;
-    scalar_type *sp = (scalar_type *)&Vp;
-    scalar_type *sm = (scalar_type *)&Vm;
-    for(int l=0;l<Nsimd;l++){
-      if ( switcheroo<Coeff_t>::iscomplex() ) {
-	sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	sm[l] = PminusMat(l*istride+s1*ostride,s2);
-      } else { 
-      // if real
-	scalar_type tmp;
-	tmp = PplusMat (l*istride+s1*ostride,s2);
-	sp[l] = scalar_type(tmp.real(),tmp.real());
-	tmp = PminusMat(l*istride+s1*ostride,s2);
-	sm[l] = scalar_type(tmp.real(),tmp.real());
+    for(int s1=0;s1<LLs;s1++){
+      int istride = LLs;
+      int ostride = 1;
+      Simd Vp;
+      Simd Vm;
+      scalar_type *sp = (scalar_type *)&Vp;
+      scalar_type *sm = (scalar_type *)&Vm;
+      for(int l=0;l<Nsimd;l++){
+	if ( switcheroo<Coeff_t>::iscomplex() ) {
+	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
+	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
+	} else { 
+	  // if real
+	  scalar_type tmp;
+	  tmp = PplusMat (l*istride+s1*ostride,s2);
+	  sp[l] = scalar_type(tmp.real(),tmp.real());
+	  tmp = PminusMat(l*istride+s1*ostride,s2);
+	  sm[l] = scalar_type(tmp.real(),tmp.real());
+	}
      }
-    }
-    Matp[LLs*s2+s1] = Vp;
-    Matm[LLs*s2+s1] = Vm;
-  }}
+      Matp[LLs*s2+s1] = Vp;
+      Matm[LLs*s2+s1] = Vm;
+    }}
 }
-
-
-  FermOpTemplateInstantiate(CayleyFermion5D);
-  GparityFermOpTemplateInstantiate(CayleyFermion5D);
-
-}}
+#endif
+
+NAMESPACE_END(Grid);


--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@ -0,0 +1,235 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+NAMESPACE_BEGIN(Grid);
+
+// Pminus forwards
+// Pplus  backwards..
+//
+// Fifth-dimension hopping term, applied independently to each block of Ls
+// consecutive outer sites:
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[(s+1)%Ls])
+//                           + lower[s]*spProj5p(psi[(s+Ls-1)%Ls])
+// chi_i takes the checkerboard of psi_i; phi_i must already share it.
+// lower/diag/upper are indexed by s and must hold at least Ls entries.
+// Each call increments M5Dcalls and adds its elapsed usecond() to M5Dtime.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
+			       const FermionField &phi_i, 
+			       FermionField &chi_i,
+			       Vector<Coeff_t> &lower,
+			       Vector<Coeff_t> &diag,
+			       Vector<Coeff_t> &upper)
+{
+  
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  int Ls =this->Ls;
+
+  // Hoist raw pointers to the coefficient arrays so the accelerator kernel
+  // body indexes plain pointers rather than the Vector objects themselves,
+  // matching what MooeeInv/MooeeInvDag do for lee/dee/uee.
+  auto pdiag  = & diag [0];
+  auto pupper = & upper[0];
+  auto plower = & lower[0];
+
+  // 10 = 3 complex mult + 2 complex add
+  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  // One work item per block of Ls outer sites.
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss= sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1, tmp2;
+    for(int s=0;s<Ls;s++){
+      // Periodic neighbours in s within this block.
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      spProj5m(tmp1,psi(idx_u));
+      spProj5p(tmp2,psi(idx_l));
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// Dagger counterpart of M5D: identical structure with the two chiral
+// projectors exchanged,
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[(s+1)%Ls])
+//                           + lower[s]*spProj5m(psi[(s+Ls-1)%Ls])
+// chi_i takes the checkerboard of psi_i; phi_i must already share it.
+// Calls are accounted in the same M5Dcalls / M5Dtime counters as M5D.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
+			      const FermionField &phi_i, 
+			      FermionField &chi_i,
+			      Vector<Coeff_t> &lower,
+			      Vector<Coeff_t> &diag,
+			      Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  int Ls=this->Ls;
+
+  // Hoist raw pointers to the coefficient arrays so the accelerator kernel
+  // body indexes plain pointers rather than the Vector objects themselves,
+  // matching what MooeeInv/MooeeInvDag do for lee/dee/uee.
+  auto pdiag  = & diag [0];
+  auto pupper = & upper[0];
+  auto plower = & lower[0];
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  // One work item per block of Ls outer sites.
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss=sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2;
+    for(int s=0;s<Ls;s++){
+      // Periodic neighbours in s within this block.
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      spProj5p(tmp1,psi(idx_u));
+      spProj5m(tmp2,psi(idx_l));
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// Site-local inverse in the fifth dimension, evaluated as four successive
+// in-place sweeps over each block of Ls outer sites, using the factor arrays
+// lee, leem, dee, ueem and uee.  chi_i takes the checkerboard of psi_i.
+// Every call bumps MooeeInvCalls and accumulates elapsed time in MooeeInvTime.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+
+  // Raw pointers to the factor arrays, taken up front for use in the kernel.
+  auto lee_p  = & lee [0];
+  auto dee_p  = & dee [0];
+  auto uee_p  = & uee [0];
+  auto leem_p = & leem[0];
+  auto ueem_p = & ueem[0];
+
+  int Ls=this->Ls;
+  GridBase *grid=psi_i.Grid();
+
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+  // One work item per block of Ls outer sites.
+  uint64_t nblock = grid->oSites()/Ls;
+  accelerator_for(blk,nblock,Simd::Nsimd(),{
+    uint64_t base=blk*Ls;
+    using spinor = decltype(coalescedRead(psi[0]));
+    spinor proj;
+
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+
+    // Sweep 1: apply (L^{\prime})^{-1}, walking upwards in s.
+    coalescedWrite(chi[base],psi(base));
+    for(int s=0;s<Ls-1;s++){
+      spProj5p(proj,chi(base+s));
+      coalescedWrite(chi[base+s+1] , psi(base+s+1)-lee_p[s]*proj);
+    }
+
+    // Sweep 2: L_m^{-1}; only the last slice is updated, accumulating
+    // -leem[s]*spProj5m(chi[s]) over all earlier slices.
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(proj,chi(base+s));
+      coalescedWrite(chi[base+Ls-1], chi(base+Ls-1) - leem_p[s]*proj);
+    }
+
+    // Sweep 3: U_m^{-1} D^{-1}; the last slice is rescaled only after the
+    // others have read it.
+    for (int s=0;s<Ls-1;s++){
+      spProj5p(proj,chi(base+Ls-1));
+      coalescedWrite(chi[base+s], (1.0/dee_p[s])*chi(base+s)-(ueem_p[s]/dee_p[Ls-1])*proj);
+    }
+    coalescedWrite(chi[base+Ls-1], (1.0/dee_p[Ls-1])*chi(base+Ls-1));
+
+    // Sweep 4: apply U^{-1}, walking downwards in s.
+    for (int s=Ls-2;s>=0;s--){
+      spProj5m(proj,chi(base+s+1));
+      coalescedWrite(chi[base+s], chi(base+s) - uee_p[s]*proj);
+    }
+  });
+
+  MooeeInvTime+=usecond();
+
+}
+
+// Hermitian-conjugate counterpart of MooeeInv: the same four in-place sweeps
+// per block of Ls outer sites, with the roles of the lower/upper factor arrays
+// exchanged, the two chiral projectors exchanged, and every coefficient
+// complex-conjugated.  chi_i takes the checkerboard of psi_i.
+// Calls are accounted in MooeeInvCalls / MooeeInvTime.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  int Ls=this->Ls;
+
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  // Raw pointers to the factor arrays, taken up front for use in the kernel.
+  auto plee  = & lee [0];
+  auto pdee  = & dee [0];
+  auto puee  = & uee [0];
+  auto pleem = & leem[0];
+  auto pueem = & ueem[0];
+
+  // Output view must carry the input's checkerboard (the previous form
+  // compared psi with itself and so could never fire).
+  assert(chi.Checkerboard() == psi.Checkerboard());
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+
+  // One work item per block of Ls outer sites.
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss=sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp;
+
+    // Apply (U^{\prime})^{-dagger}
+    coalescedWrite(chi[ss],psi(ss));
+    for (int s=1;s<Ls;s++){
+      spProj5m(tmp,chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s)-conjugate(puee[s-1])*tmp);
+    }
+    // U_m^{-\dagger} 
+    for (int s=0;s<Ls-1;s++){
+      spProj5p(tmp,chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp);
+    }
+
+    // L_m^{-\dagger} D^{-dagger}
+    // The last slice is rescaled only after the other slices have read it.
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(tmp,chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], conjugate(1.0/pdee[s])*chi(ss+s)-conjugate(pleem[s]/pdee[Ls-1])*tmp);
+    }	
+    coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*chi(ss+Ls-1));
+  
+    // Apply L^{-dagger}
+    for (int s=Ls-2;s>=0;s--){
+      spProj5p(tmp,chi(ss+s+1));
+      coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp);
+    }
+  });
+  MooeeInvTime+=usecond();
+
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
@ -0,0 +1,831 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+// Dense-matrix MooeeInvDag: forwards to MooeeInternal with DaggerYes and
+// InverseYes.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
+{
+  // NOTE(review): presumably a compile-time guard so this body only
+  // instantiates for Ls-vectorised implementations -- confirm against the
+  // definitions of EnableIf and EnableBool.
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
+}
+  
+// Dense-matrix MooeeInv: forwards to MooeeInternal with DaggerNo and
+// InverseYes.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  // NOTE(review): presumably a compile-time guard so this body only
+  // instantiates for Ls-vectorised implementations -- confirm against the
+  // definitions of EnableIf and EnableBool.
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
+}
+// M5D for the layout in which the s index is spread across SIMD lanes:
+// the grid holds LLs = _rdimensions[0] slices per block and Ls/LLs must equal
+// Simd::Nsimd().  For every block of LLs outer sites it writes
+//   chi = diag*phi + lower*(spin components 0,1 of the s-1 neighbour)
+//                  + upper*(spin components 2,3 of the s+1 neighbour)
+// component by component.  Requires Nc==3 (asserted).
+// chi_i takes the checkerboard of psi_i; phi_i must already share it.
+// Each call increments M5Dcalls and adds its elapsed time to M5Dtime.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
+			   const FermionField &phi_i, 
+			   FermionField &chi_i,
+			   Vector<Coeff_t> &lower,
+			   Vector<Coeff_t> &diag,
+			   Vector<Coeff_t> &upper)
+{
+  // NOTE(review): presumably a compile-time guard restricting this body to
+  // Ls-vectorised implementations -- confirm against EnableIf/EnableBool.
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  const int nsimd= Simd::Nsimd();
+
+  // Per-slice coefficient vectors, one SIMD word per reduced s index.
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  // Repack the coefficients: lane i of word o receives the coefficient for
+  // s = o + i*LLs.
+  for(int o=0;o<LLs;o++){ // outer
+    for(int i=0;i<nsimd;i++){ //inner
+      int s  = o+i*LLs;
+      int ss = o*nsimd+i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  // The hand-unrolled path below hard-codes three colour components.
+  assert(Nc==3);
+
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
+  // The #if 0 branch is a disabled variant written with
+  // spProj5/rotate/spRecon5; only the #else branch is compiled.
+#if 0
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5m(hp,psi[ss+vp]);
+      spProj5p(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	
+      hp=0.5*hp;
+      hm=0.5*hm;
+
+      spRecon5m(fp,hp);
+      spRecon5p(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v];
+      chi[ss+v] = chi[ss+v]     +u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+#else
+    for(int v=0;v<LLs;v++){
+      
+      vprefetch(psi[ss+v+LLs]);
+
+      // Neighbouring reduced-s indices, wrapping within the block.
+      int vp= (v==LLs-1) ? 0     : v+1;
+      int vm= (v==0    ) ? LLs-1 : v-1;
+	
+      // Spin components 2,3 of the upper neighbour ...
+      Simd hp_00 = psi[ss+vp]()(2)(0); 
+      Simd hp_01 = psi[ss+vp]()(2)(1); 
+      Simd hp_02 = psi[ss+vp]()(2)(2); 
+      Simd hp_10 = psi[ss+vp]()(3)(0); 
+      Simd hp_11 = psi[ss+vp]()(3)(1); 
+      Simd hp_12 = psi[ss+vp]()(3)(2); 
+	
+      // ... and spin components 0,1 of the lower neighbour.
+      Simd hm_00 = psi[ss+vm]()(0)(0); 
+      Simd hm_01 = psi[ss+vm]()(0)(1); 
+      Simd hm_02 = psi[ss+vm]()(0)(2); 
+      Simd hm_10 = psi[ss+vm]()(1)(0); 
+      Simd hm_11 = psi[ss+vm]()(1)(1); 
+      Simd hm_12 = psi[ss+vm]()(1)(2); 
+
+      // vp<=v holds only when the upper neighbour wrapped round (v==LLs-1).
+      // NOTE(review): tRotate<2> presumably shifts by one complex lane so the
+      // wrapped word lines up with the next s in each lane -- confirm.
+      if ( vp<=v ) {
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+      // vm>=v holds only when the lower neighbour wrapped round (v==0).
+      if ( vm>=v ) {
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Can force these to real arithmetic and save 2x.
+      // Spin 0,1 combine with the lower neighbour, spin 2,3 with the upper.
+      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
+      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
+      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+      vstream(chi[ss+v]()(0)(0),p_00);
+      vstream(chi[ss+v]()(0)(1),p_01);
+      vstream(chi[ss+v]()(0)(2),p_02);
+      vstream(chi[ss+v]()(1)(0),p_10);
+      vstream(chi[ss+v]()(1)(1),p_11);
+      vstream(chi[ss+v]()(1)(2),p_12);
+      vstream(chi[ss+v]()(2)(0),p_20);
+      vstream(chi[ss+v]()(2)(1),p_21);
+      vstream(chi[ss+v]()(2)(2),p_22);
+      vstream(chi[ss+v]()(3)(0),p_30);
+      vstream(chi[ss+v]()(3)(1),p_31);
+      vstream(chi[ss+v]()(3)(2),p_32);
+
+    }
+#endif
+  });
+  M5Dtime+=usecond();
+}
+
+// Dagger counterpart of the lane-vectorised M5D: same repacking and rotation
+// scheme, but here spin components 0,1 are taken from the s+1 neighbour and
+// combined with upper, and spin components 2,3 from the s-1 neighbour and
+// combined with lower.  Requires Ls/LLs == Simd::Nsimd() (asserted).
+// chi_i takes the checkerboard of psi_i; phi_i must already share it.
+// Calls are accounted in the same M5Dcalls / M5Dtime counters as M5D.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
+			      const FermionField &phi_i, 
+			      FermionField &chi_i,
+			      Vector<Coeff_t> &lower,
+			      Vector<Coeff_t> &diag,
+			      Vector<Coeff_t> &upper)
+{
+  // NOTE(review): presumably a compile-time guard restricting this body to
+  // Ls-vectorised implementations -- confirm against EnableIf/EnableBool.
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi=psi_i.View();
+  auto phi=phi_i.View();
+  auto chi=chi_i.View();
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  int nsimd= Simd::Nsimd();
+
+  // Per-slice coefficient vectors, one SIMD word per reduced s index.
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  // Repack the coefficients: lane i of word o receives the coefficient for
+  // s = o + i*LLs.
+  for(int o=0;o<LLs;o++){ // outer
+    for(int i=0;i<nsimd;i++){ //inner
+      int s  = o+i*LLs;
+      int ss = o*nsimd+i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+  M5Dcalls++;
+  M5Dtime-=usecond();
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
+  // The #if 0 branch is a disabled variant written with
+  // spProj5/rotate/spRecon5; only the #else branch is compiled.
+#if 0
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5p(hp,psi[ss+vp]);
+      spProj5m(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+      
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5p(fp,hp);
+      spRecon5m(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+#else
+    for(int v=0;v<LLs;v++){
+
+      vprefetch(psi[ss+v+LLs]);
+
+      // Neighbouring reduced-s indices, wrapping within the block.
+      int vp= (v==LLs-1) ? 0     : v+1;
+      int vm= (v==0    ) ? LLs-1 : v-1;
+	
+      // Spin components 0,1 of the upper neighbour ...
+      Simd hp_00 = psi[ss+vp]()(0)(0); 
+      Simd hp_01 = psi[ss+vp]()(0)(1); 
+      Simd hp_02 = psi[ss+vp]()(0)(2); 
+      Simd hp_10 = psi[ss+vp]()(1)(0); 
+      Simd hp_11 = psi[ss+vp]()(1)(1); 
+      Simd hp_12 = psi[ss+vp]()(1)(2); 
+	
+      // ... and spin components 2,3 of the lower neighbour.
+      Simd hm_00 = psi[ss+vm]()(2)(0); 
+      Simd hm_01 = psi[ss+vm]()(2)(1); 
+      Simd hm_02 = psi[ss+vm]()(2)(2); 
+      Simd hm_10 = psi[ss+vm]()(3)(0); 
+      Simd hm_11 = psi[ss+vm]()(3)(1); 
+      Simd hm_12 = psi[ss+vm]()(3)(2); 
+
+      // vp<=v holds only when the upper neighbour wrapped round (v==LLs-1).
+      // NOTE(review): tRotate<2> presumably shifts by one complex lane so the
+      // wrapped word lines up with the next s in each lane -- confirm.
+      if ( vp<=v ) {
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+      // vm>=v holds only when the lower neighbour wrapped round (v==0).
+      if ( vm>=v ) {
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Spin 0,1 combine with the upper neighbour, spin 2,3 with the lower.
+      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
+      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
+      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+
+      vstream(chi[ss+v]()(0)(0),p_00);
+      vstream(chi[ss+v]()(0)(1),p_01);
+      vstream(chi[ss+v]()(0)(2),p_02);
+      vstream(chi[ss+v]()(1)(0),p_10);
+      vstream(chi[ss+v]()(1)(1),p_11);
+      vstream(chi[ss+v]()(1)(2),p_12);
+      vstream(chi[ss+v]()(2)(0),p_20);
+      vstream(chi[ss+v]()(2)(1),p_21);
+      vstream(chi[ss+v]()(2)(2),p_22);
+      vstream(chi[ss+v]()(3)(0),p_30);
+      vstream(chi[ss+v]()(3)(1),p_31);
+      vstream(chi[ss+v]()(3)(2),p_32);
+    }
+#endif
+  });
+  M5Dtime+=usecond();
+}
+
+
+#ifdef AVX512 
+#include <simd/Intel512common.h>
+#include <simd/Intel512avx.h>
+#include <simd/Intel512single.h>
+#endif 
+
+// Dense s-direction matrix multiply for one block ("site") of LLs outer sites.
+// For each output slice s1 it accumulates, over every input slice s2 and SIMD
+// lane l, Matp[LLs*(s2+l*LLs)+s1] times lane l of psi spin components 0,1
+// (and Matm times spin components 2,3), then streams the result to
+// chi[s1+LLs*site].  MooeeInternal calls this when the coefficient type is
+// not complex.
+//   psi_i, chi_i : input and output fields
+//   LLs          : number of slices per block
+//   site         : block index; the block starts at outer site LLs*site
+//   Matp, Matm   : matrices applied to the upper / lower spin pair
+// Without AVX512 a generic C++ path is compiled; with AVX512 an inline
+// assembly path keeps the twelve accumulators in zmm registers.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField &chi_i,
+					int LLs, int site,
+					Vector<iSinglet<Simd> > &Matp,
+					Vector<iSinglet<Simd> > &Matm)
+{
+  // NOTE(review): presumably a compile-time guard restricting this body to
+  // Ls-vectorised implementations -- confirm against EnableIf/EnableBool.
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+#ifndef AVX512
+  {
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+	  // s is the matrix row selected by slice s2 and lane l.
+	  int s=s2+l*LLs;
+	  int lex=s2+LLs*site;
+	
+	  // Reset the accumulators at the start of each output slice.
+	  if ( s2==0 && l==0) {
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+	
+	  // Broadcast lane l of the input across the whole SIMD word.
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	    }}
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	    }}
+
+	  // NOTE(review): real_madd presumably computes a*b+c treating the
+	  // matrix entry as real -- confirm against its definition.
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
+	      SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
+	    }}
+
+	}}
+      {
+	// Write the finished output slice: P half to spin 0,1, M half to spin 2,3.
+	int lex = s1+LLs*site;
+	for(int sp=0;sp<2;sp++){
+	  for(int co=0;co<Nc;co++){
+	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+
+  }
+#else
+  {
+    // pointers
+    //  MASK_REGS;
+    // Register names for the twelve accumulators (Chi_*) and the twelve
+    // broadcast temporaries (BCAST*); all are #undef'd after the loops.
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+#define BCAST0   %%zmm13
+#define BCAST1   %%zmm14
+#define BCAST2   %%zmm15
+#define BCAST3   %%zmm16
+#define BCAST4   %%zmm17
+#define BCAST5   %%zmm18
+#define BCAST6   %%zmm19
+#define BCAST7   %%zmm20
+#define BCAST8   %%zmm21
+#define BCAST9   %%zmm22
+#define BCAST10  %%zmm23
+#define BCAST11  %%zmm24
+
+    // Byte stride between the matrix entries used by successive lanes
+    // (LLs*LLs entries apart, matching LLs*(s2+l*LLs)+s1 in the generic path).
+    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	int lex=s2+LLs*site;
+	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t)&psi[lex];
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	  // First term of an output slice uses the multiply forms (VMULMEM);
+	  // every later term uses the multiply-add forms (VMADDMEM).
+	  if ( (s2+l)==0 ) {
+	    asm (
+		 VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
+		 VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
+		 VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
+		 VBCASTCDUP(0,%2,BCAST0)   
+		 VBCASTCDUP(1,%2,BCAST1)   
+		 VBCASTCDUP(2,%2,BCAST2)   
+		 VBCASTCDUP(3,%2,BCAST3)   
+		 VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
+		 VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
+		 VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
+		 VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
+		 VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
+		 VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
+		 VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
+		 VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
+		 VMULMEM (0,%1,BCAST8,Chi_22)         
+		 VMULMEM (0,%1,BCAST9,Chi_30)
+		 VMULMEM (0,%1,BCAST10,Chi_31)       
+		 VMULMEM (0,%1,BCAST11,Chi_32)
+		 : : "r" (a0), "r" (a1), "r" (a2)  );
+	  } else { 
+	    asm (
+		 VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
+		 VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
+		 VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
+		 VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
+		 VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
+		 VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
+		 VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
+		 VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
+		 VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
+		 VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
+		 VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
+		 VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
+		 : : "r" (a0), "r" (a1), "r" (a2)  );
+	  }
+	  // Advance to the next lane: next matrix entry, next scalar of psi.
+	  a0 = a0+incr;
+	  a1 = a1+incr;
+	a2 = a2+sizeof(typename Simd::scalar_type);
+	}}
+      {
+	// Store the twelve accumulators to the output slice.
+	int lexa = s1+LLs*site;
+	asm (
+	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
+	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
+	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
+	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+      }
+    }
+  }
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+#endif
+};
+
+// Z-mobius version
+//
+// Complex-coefficient counterpart of MooeeInternalAsm; MooeeInternal selects
+// it when switcheroo<Coeff_t>::iscomplex() is true.  For one block ("site") of
+// LLs outer sites and each output slice s1 it accumulates, over every input
+// slice s2 and SIMD lane l, Matp[LLs*(s2+l*LLs)+s1] times lane l of psi spin
+// components 0,1 (and Matm times spin components 2,3) using full complex
+// multiplication, then streams the result to chi[s1+LLs*site].
+//   psi_i, chi_i : input and output fields
+//   LLs          : number of slices per block
+//   site         : block index; the block starts at outer site LLs*site
+//   Matp, Matm   : matrices applied to the upper / lower spin pair
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField &chi_i,
+					 int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
+{
+  EnableIf<Impl::LsVectorised,int> sfinae=0;
+#ifndef AVX512
+  {
+    auto psi = psi_i.View();
+    auto chi = chi_i.View();
+
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+	  // s is the matrix row selected by slice s2 and lane l.
+	  int s=s2+l*LLs;
+	  int lex=s2+LLs*site;
+	
+	  // Reset the accumulators at the start of each output slice.
+	  if ( s2==0 && l==0) {
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+	
+	  // Broadcast lane l of the input across the whole SIMD word.
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	    }}
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	    }}
+
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
+	      SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
+	    }}
+
+
+	}}
+      {
+	// Write the finished output slice: P half to spin 0,1, M half to spin 2,3.
+	int lex = s1+LLs*site;
+	for(int sp=0;sp<2;sp++){
+	  for(int co=0;co<Nc;co++){
+	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+
+  }
+#else
+  {
+    auto psi = psi_i.View();
+    auto chi = chi_i.View();
+    // pointers
+    //  MASK_REGS;
+    // Two spellings of the accumulator registers: Chi_* (single %) for the
+    // asm statements below that have no operand list, and pChi_* (%%) for the
+    // store statement that does have operands.
+#define Chi_00 %zmm0
+#define Chi_01 %zmm1
+#define Chi_02 %zmm2
+#define Chi_10 %zmm3
+#define Chi_11 %zmm4
+#define Chi_12 %zmm5
+#define Chi_20 %zmm6
+#define Chi_21 %zmm7
+#define Chi_22 %zmm8
+#define Chi_30 %zmm9
+#define Chi_31 %zmm10
+#define Chi_32 %zmm11
+#define pChi_00 %%zmm0
+#define pChi_01 %%zmm1
+#define pChi_02 %%zmm2
+#define pChi_10 %%zmm3
+#define pChi_11 %%zmm4
+#define pChi_12 %%zmm5
+#define pChi_20 %%zmm6
+#define pChi_21 %%zmm7
+#define pChi_22 %%zmm8
+#define pChi_30 %%zmm9
+#define pChi_31 %%zmm10
+#define pChi_32 %%zmm11
+
+#define BCAST_00   %zmm12
+#define  SHUF_00   %zmm13
+#define BCAST_01   %zmm14
+#define  SHUF_01   %zmm15
+#define BCAST_02   %zmm16
+#define  SHUF_02   %zmm17
+#define BCAST_10   %zmm18
+#define  SHUF_10   %zmm19
+#define BCAST_11   %zmm20
+#define  SHUF_11   %zmm21
+#define BCAST_12   %zmm22
+#define  SHUF_12   %zmm23
+
+#define Mp  %zmm24
+#define Mps %zmm25
+#define Mm  %zmm26
+#define Mms %zmm27
+#define N 8
+    // Byte stride between the matrix entries used by successive lanes
+    // (LLs*LLs entries apart, matching LLs*(s2+l*LLs)+s1 in the generic path).
+    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	int lex=s2+LLs*site;
+	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t)&psi[lex];
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	  // First term of an output slice starts the accumulators with
+	  // VMULIDUP; every later term accumulates with VMADDSUBIDUP.
+	  if ( (s2+l)==0 ) {
+	    LOAD64(%r8,a0);
+	    LOAD64(%r9,a1);
+	    LOAD64(%r10,a2);
+	    asm (
+		 VLOAD(0,%r8,Mp)// i r
+		 VLOAD(0,%r9,Mm)
+		 VSHUF(Mp,Mps)  // r i 
+		 VSHUF(Mm,Mms)
+		 VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
+		 VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
+
+		 VMULIDUP(0*N,%r10,Mps,Chi_00)
+		 VMULIDUP(1*N,%r10,Mps,Chi_01)
+		 VMULIDUP(2*N,%r10,Mps,Chi_02)
+		 VMULIDUP(3*N,%r10,Mps,Chi_10)
+		 VMULIDUP(4*N,%r10,Mps,Chi_11)
+		 VMULIDUP(5*N,%r10,Mps,Chi_12)
+
+		 VMULIDUP(6*N ,%r10,Mms,Chi_20)
+		 VMULIDUP(7*N ,%r10,Mms,Chi_21)
+		 VMULIDUP(8*N ,%r10,Mms,Chi_22)
+		 VMULIDUP(9*N ,%r10,Mms,Chi_30)
+		 VMULIDUP(10*N,%r10,Mms,Chi_31)
+		 VMULIDUP(11*N,%r10,Mms,Chi_32)
+
+		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
+		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
+		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+		 VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
+		 VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
+		 VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
+		 VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
+		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+		 );
+	  } else { 
+	    LOAD64(%r8,a0);
+	    LOAD64(%r9,a1);
+	    LOAD64(%r10,a2);
+	    asm (
+		 VLOAD(0,%r8,Mp)
+		 VSHUF(Mp,Mps)
+
+		 VLOAD(0,%r9,Mm)
+		 VSHUF(Mm,Mms)
+
+		 VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
+		 VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
+		 VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
+		 VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
+		 VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
+		 VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
+
+		 VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
+		 VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
+		 VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
+		 VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
+		 VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
+		 VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
+
+		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
+		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
+		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+		 VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
+		 VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
+		 VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
+		 VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
+		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+		 );
+	  }
+	  // Advance to the next lane: next matrix entry, next scalar of psi.
+	  a0 = a0+incr;
+	  a1 = a1+incr;
+	a2 = a2+sizeof(typename Simd::scalar_type);
+	}}
+      {
+	int lexa = s1+LLs*site;
+	/*
+	  SiteSpinor tmp;
+	  asm (
+	  VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	  VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	  VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	  VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	  : : "r" ((uint64_t)&tmp) : "memory" );
+	*/
+
+	// Store the twelve accumulators to the output slice.
+	asm (
+	     VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	     VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	     VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	     VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+	//      if ( 1 || (site==0) ) { 
+	//	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
+	//      }
+      }
+    }
+  }
+  // Undefine every helper macro introduced above so that none of them
+  // (in particular the very generic N, Mp and Mm) leaks into later code.
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef pChi_00
+#undef pChi_01
+#undef pChi_02
+#undef pChi_10
+#undef pChi_11
+#undef pChi_12
+#undef pChi_20
+#undef pChi_21
+#undef pChi_22
+#undef pChi_30
+#undef pChi_31
+#undef pChi_32
+
+#undef BCAST_00
+#undef SHUF_00
+#undef BCAST_01
+#undef SHUF_01
+#undef BCAST_02
+#undef SHUF_02
+#undef BCAST_10
+#undef SHUF_10
+#undef BCAST_11
+#undef SHUF_11
+#undef BCAST_12
+#undef SHUF_12
+
+#undef Mp
+#undef Mps
+#undef Mm
+#undef Mms
+#undef N
+
+#endif
+};
+
+
+// Apply a dense s-direction matrix to psi, block by block, writing chi.
+//   dag, inv : select the matrix pair.  With inv set, the stored
+//              MatpInv/MatmInv (dag clear) or MatpInvDag/MatmInvDag (dag set)
+//              are used; with inv clear the pair is built on the fly by
+//              MooeeInternalCompute(dag,inv,...).
+// chi takes the checkerboard of psi.  The selected matrix must hold Ls*LLs
+// entries (asserted), where LLs is the grid's _rdimensions[0].
+// Dispatches each block to MooeeInternalZAsm when the coefficient type is
+// complex and to MooeeInternalAsm otherwise.  All calls, inverse or not, are
+// accounted in MooeeInvCalls / MooeeInvTime.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  EnableIf<Impl::LsVectorised,int> sfinae=0;
+  chi.Checkerboard()=psi.Checkerboard();
+
+  int Ls=this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs;
+
+  // Scratch storage, filled only on the non-inverse path.
+  Vector<iSinglet<Simd> >  Matp;
+  Vector<iSinglet<Simd> >  Matm;
+
+  // Matrix pair actually applied.  Named without a leading underscore
+  // (underscore + capital letter is reserved) and null-initialised so an
+  // unassigned pointer can never be dereferenced silently.
+  Vector<iSinglet<Simd> >  *pMatp = nullptr;
+  Vector<iSinglet<Simd> >  *pMatm = nullptr;
+
+  if ( !inv ) {
+    MooeeInternalCompute(dag,inv,Matp,Matm);
+    pMatp = &Matp;
+    pMatm = &Matm;
+  } else if ( dag ) {
+    pMatp = &MatpInvDag;
+    pMatm = &MatmInvDag;
+  } else {
+    pMatp = &MatpInv;
+    pMatm = &MatmInv;
+  }
+  assert(pMatp->size()==Ls*LLs);
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+  if ( switcheroo<Coeff_t>::iscomplex() ) {
+    thread_loop( (auto site=0;site<vol;site++),{
+      MooeeInternalZAsm(psi,chi,LLs,site,*pMatp,*pMatm);
+    });
+  } else { 
+    thread_loop( (auto site=0;site<vol;site++),{
+      MooeeInternalAsm(psi,chi,LLs,site,*pMatp,*pMatm);
+    });
+  }
+  MooeeInvTime+=usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@ -0,0 +1,321 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Tanh setup: forwards to SetCoefficientsZolotarev with upper bound 1/scale.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
+{
+  const RealD zolo_hi = 1.0/scale;
+  SetCoefficientsZolotarev(zolo_hi,zdata);
+}
+// Fill the coefficient tables (Beta, cc, cc_d, sqrt_cc, Aee, See) and the scalars
+// R, ZoloHiInv and dw_diag from the supplied approximation data, then log them.
+//   zolo_hi : its reciprocal is stored as ZoloHiInv
+//   zdata   : supplies beta[0..Ls-1]; zdata->db must equal Ls
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
+{
+  // How to check Ls matches?? (zdata->n, da, db, dn, dd can be printed when debugging)
+  const int Ls = this->Ls;
+  assert(zdata->db==Ls);// Beta has Ls coeffs
+
+  R=(1+this->mass)/(1-this->mass);
+
+  Beta.resize(Ls);
+  cc.resize(Ls);
+  cc_d.resize(Ls);
+  sqrt_cc.resize(Ls);
+
+  for(int s=0; s<Ls; s++){
+    Beta[s] = zdata->beta[s];
+    cc[s]   = 1.0/Beta[s];
+    cc_d[s] = std::sqrt(cc[s]);
+  }
+  cc_d[Ls-1]=1.0; // last entry is overridden
+
+  for(int s=0; s<Ls-1; s++){
+    sqrt_cc[s]= std::sqrt(cc[s]*cc[s+1]);
+  }
+  sqrt_cc[Ls-2]=std::sqrt(cc[Ls-2]); // overrides the value written by the loop above
+
+  ZoloHiInv =1.0/zolo_hi;
+  dw_diag = (4.0-this->M5)*ZoloHiInv;
+
+  See.resize(Ls);
+  Aee.resize(Ls);
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    Aee[s] = sign * Beta[s] * dw_diag;
+  }
+  Aee[Ls-1] += R;
+
+  // See[s] = Aee[s] - 1/See[s-1], seeded with See[0] = Aee[0]
+  See[0] = Aee[0];
+  for(int s=1; s<Ls; s++){
+    See[s] = Aee[s] - 1.0/See[s-1];
+  }
+
+  for(int s=0; s<Ls; s++){
+    std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
+  }
+}
+
+
+
+// Apply the full operator: chi = M psi. Returns norm2(chi).
+// DW is applied once to psi; the s-loop then combines it slice by slice with the
+// neighbouring slices of psi.
+template<class Impl>
+RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
+{
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+
+  FermionField D(psi.Grid());
+  this->DW(psi,D,DaggerNo);
+
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    if ( s==0 ) {
+      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
+    } else if ( s==last ){
+      // Last slice: no cc factor and no sign; adds the mass term R*g5*psi.
+      RealD massRatio=(1.0+mass)/(1.0-mass);
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
+      ag5xpby_ssp(chi,massRatio,psi,1.0,chi,s,s);
+    } else {
+      // Interior slice: couples to both s+1 and s-1.
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
+      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+    }
+  }
+  return norm2(chi);
+}
+// Daggered operator. The matrix is already hermitian: (g5 Dw) = Dw dag g5 = (g5 Dw)dag
+// and the remaining part is symmetric, so "dag" is ignored and this is just M.
+template<class Impl>
+RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
+{
+  return M(psi,chi);
+}
+// Single-direction hopping term: DhopDir followed by a per-slice g5 multiplication
+// and rescaling of chi in place.
+template<class Impl>
+void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp)
+{
+  const int Ls = this->Ls;
+
+  this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    if ( s<(Ls-1) ){
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the sign.
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
+    }
+  }
+}
+// Checkerboard-changing part: 4d Dslash (EO or OE depending on the parity of psi)
+// followed by a per-slice g5 multiplication and rescaling of chi in place.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+{
+  const int Ls = this->Ls;
+
+  // Apply 4d dslash. Dslash on diagonal. g5 Dslash is hermitian
+  if ( psi.Checkerboard() == Odd ) {
+    this->DhopEO(psi,chi,DaggerNo);
+  } else {
+    this->DhopOE(psi,chi,DaggerNo);
+  }
+
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    if ( s<(Ls-1) ){
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the sign.
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
+    }
+  }
+}
+// Daggered checkerboard-changing part: implemented as a plain call to Meooe.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+{
+  Meooe(psi,chi);
+}
+// Checkerboard-diagonal part: same slice structure as M, with dw_diag*psi taking the
+// place of the Dw term.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+{
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    if ( s==0 ) {
+      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
+    } else if ( s==last ){
+      // Drop the CC here.
+      double massRatio=(1+mass)/(1-mass);
+      ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
+      ag5xpby_ssp(chi,massRatio,psi,1.0,chi,s,s);
+    } else {
+      // Interior slice: couples to both s+1 and s-1.
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
+      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+    }
+  }
+}
+
+// Daggered checkerboard-diagonal part: implemented as a plain call to Mooee.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+{
+  Mooee(psi,chi);
+}
+// Inverse of the checkerboard-diagonal part, applied as three sweeps over the
+// fifth dimension: Linv (ascending s), Dinv (each slice), Uinv (descending s).
+// chi is built up in place, so the sweep order matters.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+
+  // Apply Linv
+  axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0);
+  for(int s=1; s<Ls; s++){
+    axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
+  }
+
+  // Apply Dinv
+  for(int s=0; s<Ls; s++){
+    ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
+  }
+
+  // Apply Uinv = (Linv)^T
+  axpby_ssp(chi,1.0/cc_d[last],chi,0.0,chi,last,last);
+  for(int s=last-1; s>=0; s--){
+    axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
+  }
+}
+// Daggered inverse of the checkerboard-diagonal part: implemented as a plain call to MooeeInv.
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  MooeeInv(psi,chi);
+}
+
+// force terms; five routines; default to Dhop on diagonal
+// Force term for the full operator: builds D = (per-slice coefficient) * g5 * U and
+// passes it to DhopDeriv. The dag argument is not read; DaggerNo is always used.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls = this->Ls;
+
+  FermionField D(V.Grid());
+
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    if ( s<(Ls-1) ){
+      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the sign.
+      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDeriv(mat,D,V,DaggerNo);
+}
+// Force term, OE variant: builds D = (per-slice coefficient) * g5 * U and passes it
+// to DhopDerivOE. The dag argument is not read; DaggerNo is always used.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls = this->Ls;
+
+  FermionField D(V.Grid());
+
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    if ( s<(Ls-1) ){
+      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the sign.
+      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDerivOE(mat,D,V,DaggerNo);
+}
+// Force term, EO variant: builds D = (per-slice coefficient) * g5 * U and passes it
+// to DhopDerivEO. The dag argument is not read; DaggerNo is always used.
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls = this->Ls;
+
+  FermionField D(V.Grid());
+
+  for(int s=0; s<Ls; s++){
+    const int sign = (s&0x1) ? -1 : 1; // +1,-1,+1,... starting at s=0
+    if ( s<(Ls-1) ){
+      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      // Last slice carries neither the cc factor nor the sign.
+      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDerivEO(mat,D,V,DaggerNo);
+}
+    
+// Constructors
+// Forwards the gauge field, grids, M5 and implementation parameters to the
+// WilsonFermion5D base, stores the mass, and asserts that Ls is odd.
+template<class Impl>
+ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(GaugeField            &_Umu,
+							     GridCartesian         &FiveDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
+							     RealD _mass,RealD M5,const ImplParams &p)
+  : WilsonFermion5D<Impl>(_Umu,
+			  FiveDimGrid, FiveDimRedBlackGrid,
+			  FourDimGrid, FourDimRedBlackGrid,M5,p),
+    mass(_mass)
+{
+  int Ls = this->Ls;
+  assert((Ls&0x1)==1); // Odd Ls required
+}
+
+    // Extract the 4d solution: copies slice Ls-1 of the 5d field into exported4d
+    // after checking both fields live on the expected grids.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      const int lastSlice = this->Ls-1;
+      conformable(solution5d.Grid(),this->FermionGrid());
+      conformable(exported4d.Grid(),this->GaugeGrid());
+      ExtractSlice(exported4d, solution5d, lastSlice, lastSlice);
+    }
+    // Build the 5d source: the 4d input is placed on slice Ls-1 of a zeroed 5d
+    // field, multiplied by gamma5, and passed through Dminus into imported5d.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      const int lastSlice = this->Ls-1;
+      conformable(imported5d.Grid(),this->FermionGrid());
+      conformable(input4d.Grid()   ,this->GaugeGrid());
+
+      FermionField source5d(this->FermionGrid());
+      source5d=Zero();
+      InsertSlice(input4d, source5d, lastSlice, lastSlice);
+      source5d=Gamma(Gamma::Algebra::Gamma5)*source5d;
+      this->Dminus(source5d,imported5d);
+    }
+
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@ -0,0 +1,227 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME -- make a version of these routines with site loop outermost for cache reuse.
+// Pminus forwards
+// Pplus  backwards..
+// Fifth-dimension hopping kernel:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_-(psi[s+1]) + lower[s]*P_+(psi[s-1])
+// with s+1 and s-1 taken modulo Ls. Updates the M5D call counter and timer.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
+				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  int Ls = this->Ls;
+  GridBase* grid = psi_i.Grid();
+  auto phi = phi_i.View();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  auto nloop=grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    auto base=sss*Ls; // first index of this group of Ls consecutive sites
+    for(int s=0; s<Ls; s++){
+      spinor fromUp;
+      spinor fromDn;
+      uint64_t up = base+((s+1)%Ls);
+      uint64_t dn = base+((s+Ls-1)%Ls);
+      spProj5m(fromUp, psi(up));
+      spProj5p(fromDn, psi(dn));
+      coalescedWrite(chi[base+s], diag[s]*phi(base+s) + upper[s]*fromUp + lower[s]*fromDn);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Daggered fifth-dimension hopping kernel:
+//   chi[s] = diag[s]*phi[s] + upper[s]*P_+(psi[s+1]) + lower[s]*P_-(psi[s-1])
+// with s+1 and s-1 taken modulo Ls. Updates the M5D call counter and timer.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
+					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  int Ls = this->Ls;
+
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  auto nloop=grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    auto base=sss*Ls; // first index of this group of Ls consecutive sites
+    for(int s=0; s<Ls; s++){
+      spinor fromUp;
+      spinor fromDn;
+      uint64_t up = base+((s+1)%Ls);
+      uint64_t dn = base+((s+Ls-1)%Ls);
+      spProj5p(fromUp, psi(up));
+      spProj5m(fromDn, psi(dn));
+      coalescedWrite(chi[base+s], diag[s]*phi(base+s) + upper[s]*fromUp + lower[s]*fromDn);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Inverse of the checkerboard-diagonal operator, applied in place on chi as a
+// sequence of sweeps over the fifth dimension using the stored lee/leem/dee/ueem/uee
+// factors. Each sweep reads what the previous one wrote, so the order is essential.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi=psi_i.View();
+  auto chi=chi_i.View();
+  int Ls = this->Ls;
+
+  // Raw pointers to the coefficient tables for use inside the kernel.
+  auto l_ptr  = & this->lee[0];
+  auto d_ptr  = & this->dee[0];
+  auto u_ptr  = & this->uee[0];
+
+  auto lm_ptr = & this->leem[0];
+  auto um_ptr = & this->ueem[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+  uint64_t nloop=grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    auto base=sss*Ls; // first index of this group of Ls consecutive sites
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor projA,projB;
+
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+    // Apply (L^{\prime})^{-1}
+    coalescedWrite(chi[base],psi(base)); // chi[0]=psi[0]
+    for(int s=1; s<Ls; s++){
+      spProj5p(projA, chi(base+s-1));
+      coalescedWrite(chi[base+s], psi(base+s) - l_ptr[s-1]*projA);
+    }
+
+    // L_m^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(projA, chi(base+s));
+      coalescedWrite(chi[base+Ls-1], chi(base+Ls-1) - lm_ptr[s]*projA);
+    }
+
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(projA, chi(base+Ls-1));
+      coalescedWrite(chi[base+s], (1.0/d_ptr[s])*chi(base+s) - (um_ptr[s]/d_ptr[Ls])*projA);
+    }
+    // projA still holds P_+ of the last slice from the loop above.
+    spProj5m(projB, chi(base+Ls-1));
+    coalescedWrite(chi[base+Ls-1],(1.0/d_ptr[Ls])*projA + (1.0/d_ptr[Ls-1])*projB);
+
+    // Apply U^{-1}
+    for(int s=Ls-2; s>=0; s--){
+      spProj5m(projA, chi(base+s+1));
+      coalescedWrite(chi[base+s], chi(base+s) - u_ptr[s]*projA);
+    }
+  });
+  this->MooeeInvTime += usecond();
+}
+
+// Daggered inverse of the checkerboard-diagonal operator, applied in place on chi.
+// Conjugated copies of the stored uee/dee/lee/ueem/leem factors are built first;
+// the kernel then mirrors MooeeInv with the roles of the upper and lower factors and
+// of the P_+ / P_- projectors exchanged. Each sweep reads what the previous one
+// wrote, so the order is essential.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  int Ls = this->Ls;
+
+  // Was psi == psi (always true); compare the output view against the input view.
+  assert(chi.Checkerboard() == psi.Checkerboard());
+
+  Vector<Coeff_t> ueec(Ls);
+  Vector<Coeff_t> deec(Ls+1);
+  Vector<Coeff_t> leec(Ls);
+  Vector<Coeff_t> ueemc(Ls);
+  Vector<Coeff_t> leemc(Ls);
+
+  // All tables except deec have Ls entries; deec has one extra, filled after the loop.
+  for(int s=0; s<Ls; s++){
+    ueec[s]  = conjugate(this->uee[s]);
+    deec[s]  = conjugate(this->dee[s]);
+    leec[s]  = conjugate(this->lee[s]);
+    ueemc[s] = conjugate(this->ueem[s]);
+    leemc[s] = conjugate(this->leem[s]);
+  }
+  deec[Ls] = conjugate(this->dee[Ls]);
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+  auto nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2;
+    auto ss=sss*Ls; // first index of this group of Ls consecutive sites
+
+    // Apply (U^{\prime})^{-dagger}
+    coalescedWrite(chi[ss], psi(ss));
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp1, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - ueec[s-1]*tmp1);
+    }
+
+    // U_m^{-\dagger}
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - ueemc[s]*tmp1);
+    }
+
+    // L_m^{-\dagger} D^{-dagger}
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp1, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s] ,(1.0/deec[s])*chi(ss+s) - (leemc[s]/deec[Ls-1])*tmp1);
+    }
+    // tmp1 still holds P_- of the last slice from the loop above.
+    spProj5p(tmp2, chi(ss+Ls-1));
+    coalescedWrite(chi[ss+Ls-1], (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2);
+
+    // Apply L^{-dagger}
+    for(int s=Ls-2; s>=0; s--){
+      spProj5p(tmp1, chi(ss+s+1));
+      coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
@ -0,0 +1,321 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#pragma once
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Forwards all arguments to AbstractEOFAFermion (with the fixed trailing values
+// 1.0 and 0.0), then sets the coefficients from a higham(1.0, Ls) approximation,
+// which is freed again before returning.
+template<class Impl>
+DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(GaugeField            &_Umu,
+						   GridCartesian         &FiveDimGrid,
+						   GridRedBlackCartesian &FiveDimRedBlackGrid,
+						   GridCartesian         &FourDimGrid,
+						   GridRedBlackCartesian &FourDimRedBlackGrid,
+						   RealD _mq1, RealD _mq2, RealD _mq3,
+						   RealD _shift, int _pm, RealD _M5, const ImplParams &p)
+  : AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+			      FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
+			      _shift, _pm, _M5, 1.0, 0.0, p)
+{
+  const RealD eps = 1.0;
+  Approx::zolotarev_data *approx = Approx::higham(eps,this->Ls);
+  assert(approx->n == this->Ls);
+
+  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
+  this->SetCoefficientsTanh(approx, 1.0, 0.0);
+
+  Approx::zolotarev_free(approx);
+}
+
+/***************************************************************
+ * Additional EOFA operators only called outside the inverter.
+ * Since speed is not essential, simple axpby-style
+ * implementations should be fine.
+ ***************************************************************/
+// Zero Din, then fill a single fifth-dimension slice from one slice of psi via
+// axpby_ssp; the slice pair is (Ls-1,0), (0,Ls-1) or (0,0) depending on (sign,dag).
+// Any other (sign,dag) combination leaves Din zero.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+{
+  const int Ls = this->Ls;
+  const bool plus  = (sign == 1);
+  const bool minus = (sign == -1);
+
+  Din = Zero();
+  if(dag == 0){
+    if(plus)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
+    else if(minus) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
+  } else if(dag == 1){
+    if(plus)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
+    else if(minus) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
+  }
+}
+
+// Dtilde is the identity for DWF: chi is a copy of psi.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
+{
+  chi = psi;
+}
+
+// DtildeInv is the identity for DWF: chi is a copy of psi.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
+{
+  chi = psi;
+}
+
+/*****************************************************************************************************/
+
+// Full operator: chi = M psi, returning norm2(chi).
+// Steps, in order: Meooe5D, DW, add psi, then the two-argument M5D on (psi, chi).
+template<class Impl>
+RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+{
+  FermionField Din(psi.Grid());
+
+  this->Meooe5D(psi, Din);         // Din = Meooe5D psi
+  this->DW(Din, chi, DaggerNo);    // chi = DW Din
+  axpby(chi, 1.0, 1.0, chi, psi);  // chi = chi + psi
+  this->M5D(psi, chi);
+
+  return norm2(chi);
+}
+
+// Daggered full operator: chi = Mdag psi, returning norm2(chi).
+// Steps, in order: DW (daggered), MeooeDag5D, two-argument M5Ddag on (psi, chi), add psi.
+template<class Impl>
+RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+{
+  FermionField Din(psi.Grid());
+
+  this->DW(psi, Din, DaggerYes);   // Din = DW^dag psi
+  this->MeooeDag5D(Din, chi);      // chi = MeooeDag5D Din
+  this->M5Ddag(psi, chi);
+  axpby(chi, 1.0, 1.0, chi, psi);  // chi = chi + psi
+
+  return norm2(chi);
+}
+
+/********************************************************************
+ * Performance critical fermion operators called inside the inverter
+ ********************************************************************/
+
+// Two-argument M5D: builds the diag/upper/lower coefficient vectors (including the
+// EOFA shift on the boundary entries) and calls the six-argument kernel with chi
+// as both the phi input and the output.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+{
+  const int   Ls    = this->Ls;
+  const int   pm    = this->pm;
+  const RealD shift = this->shift;
+  const RealD mq1   = this->mq1;
+  const RealD mq2   = this->mq2;
+  const RealD mq3   = this->mq3;
+
+  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+  Coeff_t shiftp(0.0);
+  Coeff_t shiftm(0.0);
+  const bool shifted = (shift != 0.0);
+  if(shifted && (pm == 1)){ shiftp = shift*(mq3-mq2); }
+  if(shifted && (pm != 1)){ shiftm = -shift*(mq3-mq2); }
+
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = mq1 + shiftm;
+  lower[0]    = mq1 + shiftp;
+
+#if(0)
+  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
+  for(int i=0; i<diag.size(); ++i){
+    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
+  }
+  for(int i=0; i<upper.size(); ++i){
+    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
+  }
+  for(int i=0; i<lower.size(); ++i){
+    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
+  }
+#endif
+
+  this->M5D(psi, chi, chi, lower, diag, upper);
+}
+
+// Two-argument M5Ddag: same coefficient setup as the two-argument M5D but with the
+// shiftp/shiftm boundary terms exchanged between upper and lower; calls the
+// six-argument daggered kernel with chi as both the phi input and the output.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+{
+  const int   Ls    = this->Ls;
+  const int   pm    = this->pm;
+  const RealD shift = this->shift;
+  const RealD mq1   = this->mq1;
+  const RealD mq2   = this->mq2;
+  const RealD mq3   = this->mq3;
+
+  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+  Coeff_t shiftp(0.0);
+  Coeff_t shiftm(0.0);
+  const bool shifted = (shift != 0.0);
+  if(shifted && (pm == 1)){ shiftp = shift*(mq3-mq2); }
+  if(shifted && (pm != 1)){ shiftm = -shift*(mq3-mq2); }
+
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = mq1 + shiftp;
+  lower[0]    = mq1 + shiftm;
+
+  this->M5Ddag(psi, chi, chi, lower, diag, upper);
+}
+
+// half checkerboard operations
+// Checkerboard-diagonal operator: diag = bee, off-diagonals = -cee with the boundary
+// entries upper[Ls-1] = dm and lower[0] = dp, applied through the M5D kernel.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+{
+  const int Ls = this->Ls;
+
+  Vector<Coeff_t> diag = this->bee;
+
+  // Both off-diagonals start out as -cee; only their boundary entries differ.
+  Vector<Coeff_t> upper(Ls);
+  for(int s=0; s<Ls; s++){
+    upper[s] = -this->cee[s];
+  }
+  Vector<Coeff_t> lower(upper);
+
+  upper[Ls-1] = this->dm;
+  lower[0]    = this->dp;
+
+  this->M5D(psi, psi, chi, lower, diag, upper);
+}
+
+// Daggered checkerboard-diagonal operator: diag = bee, off-diagonals = -cee with the
+// boundary entries upper[Ls-1] = dp and lower[0] = dm, applied through M5Ddag.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+{
+  const int Ls = this->Ls;
+
+  Vector<Coeff_t> diag = this->bee;
+
+  // Both off-diagonals start out as -cee; only their boundary entries differ.
+  Vector<Coeff_t> upper(Ls);
+  for(int s=0; s<Ls; s++){
+    upper[s] = -this->cee[s];
+  }
+  Vector<Coeff_t> lower(upper);
+
+  upper[Ls-1] = this->dp;
+  lower[0]    = this->dm;
+
+  this->M5Ddag(psi, psi, chi, lower, diag, upper);
+}
+
+/****************************************************************************************/
+
+//Zolo
+// Fill the Cayley-form coefficient tables (bs, cs, aee, aeo, bee, beo, cee, ceo), the
+// EOFA boundary terms dp/dm, and the LDU factors (dee, lee, leem, uee, ueem).
+// NOTE(review): zolo_hi, gamma, b and c are not read anywhere in this body.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
+{
+  const int   Ls    = this->Ls;
+  const int   pm    = this->pm;
+  const RealD mq1   = this->mq1;
+  const RealD mq2   = this->mq2;
+  const RealD mq3   = this->mq3;
+  const RealD shift = this->shift;
+
+  ////////////////////////////////////////////////////////
+  // Constants for the preconditioned matrix Cayley form
+  ////////////////////////////////////////////////////////
+  this->bs.resize(Ls);
+  this->cs.resize(Ls);
+  this->aee.resize(Ls);
+  this->aeo.resize(Ls);
+  this->bee.resize(Ls);
+  this->beo.resize(Ls);
+  this->cee.resize(Ls);
+  this->ceo.resize(Ls);
+
+  for(int s=0; s<Ls; ++s){
+    this->bee[s] = 4.0 - this->M5 + 1.0;
+    this->cee[s] = 1.0;
+    this->aee[s] = this->cee[s];
+    this->bs[s] = this->beo[s] = 1.0;
+    this->cs[s] = this->ceo[s] = 0.0;
+  }
+
+  //////////////////////////////////////////
+  // EOFA shift terms
+  //////////////////////////////////////////
+  switch(pm){
+  case 1:   // shift enters dp only
+    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
+    this->dm = mq1*this->cee[Ls-1];
+    break;
+  case -1:  // shift enters dm only
+    this->dp = mq1*this->cee[0];
+    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
+    break;
+  default:  // no shift
+    this->dp = mq1*this->cee[0];
+    this->dm = mq1*this->cee[Ls-1];
+    break;
+  }
+
+  //////////////////////////////////////////
+  // LDU decomposition of eeoo
+  //////////////////////////////////////////
+  this->dee.resize(Ls+1);
+  this->lee.resize(Ls);
+  this->leem.resize(Ls);
+  this->uee.resize(Ls);
+  this->ueem.resize(Ls);
+
+  for(int i=0; i<Ls; ++i){
+
+    if(i == Ls-1){
+      // Last index: the off-diagonal factors are zero; dee[Ls-1] is set further down.
+      this->lee[i]  = 0.0;
+      this->leem[i] = 0.0;
+      this->uee[i]  = 0.0;
+      this->ueem[i] = 0.0;
+      continue;
+    }
+
+    this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
+    this->dee[i] = this->bee[i];
+    this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
+
+    this->leem[i] = this->dm/this->bee[i];
+    for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
+
+    this->ueem[i] = this->dp / this->bee[0];
+    for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
+  }
+
+  {
+    // delta_d = (1/bee[0]) * prod_{j=1}^{Ls-2} cee[j]/bee[j]
+    Coeff_t delta_d = 1.0 / this->bee[0];
+    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
+    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
+    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
+  }
+}
+
+// Recompute Cayley-form coefficients for different shift
+// Stores new_shift, then reruns SetCoefficientsTanh on a fresh higham(1.0, Ls)
+// approximation. The approximation data is released afterwards, as the constructor
+// does after the same call; previously it was leaked on every refresh.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+{
+  this->shift = new_shift;
+  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
+  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
+  Approx::zolotarev_free(zdata);
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@ -24,22 +24,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
 #include <Grid/perfmon/PerfCount.h>

-namespace Grid {
-namespace QCD {
-  
-// S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> 
-ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
-const std::vector<int> 
-ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+#pragma once

-  // 5d lattice for DWF.
+NAMESPACE_BEGIN(Grid);
+
+// 5d lattice for DWF.
 template<class Impl>
 ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
 							     GridRedBlackCartesian &FiveDimRedBlackGrid,
@ -53,9 +48,9 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements),
-  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements,p),
+  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
  mass(_mass),
  c1(_c1),
  c2(_c2),
@ -108,8 +103,8 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

    for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]=1);
-      assert(FourDimRedBlackGrid._simd_layout[d]=1);
+      assert(FourDimGrid._simd_layout[d]==1);
+      assert(FourDimRedBlackGrid._simd_layout[d]==1);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
    }

@ -226,24 +221,27 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi

  Compressor compressor;
  Stencil.HaloExchange(in,compressor);
-
-  parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
+  auto Umu_v   = Umu.View();
+  auto UUUmu_v = UUUmu.View();
+  auto in_v    = in.View();
+  auto out_v   = out.View();
+  thread_for( ss,Umu.Grid()->oSites(),{
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
-      Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sF, sU, in, out, dir, disp);
+      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
    }
-  }
+  });
 };

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
-            DoubledGaugeField & U,
-            DoubledGaugeField & UUU,
-            GaugeField &mat,
-            const FermionField &A,
-            const FermionField &B,
-            int dag)
+						     DoubledGaugeField & U,
+						     DoubledGaugeField & UUU,
+						     GaugeField &mat,
+						     const FermionField &A,
+						     const FermionField &B,
+						     int dag)
 {
  // No force terms in multi-rhs solver staggered
  assert(0);
@ -251,18 +249,18 @@ void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
-				      const FermionField &A,
-				      const FermionField &B,
-				      int dag)
+						 const FermionField &A,
+						 const FermionField &B,
+						 int dag)
 {
  assert(0);
 }

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
-					const FermionField &A,
-					const FermionField &B,
-					int dag)
+						   const FermionField &A,
+						   const FermionField &B,
+						   int dag)
 {
  assert(0);
 }
@ -270,9 +268,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,

 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
-					const FermionField &A,
-					const FermionField &B,
-					int dag)
+						   const FermionField &A,
+						   const FermionField &B,
+						   int dag)
 {
  assert(0);
 }
@ -301,8 +299,8 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &

  Compressor compressor; 

-  int LLs = in._grid->_rdimensions[0];
-  int len =  U._grid->oSites();
+  int LLs = in.Grid()->_rdimensions[0];
+  int len =  U.Grid()->oSites();

  DhopFaceTime-=usecond();
  st.Prepare();
@ -328,7 +326,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
      double start = usecond();
      nthreads -= ncomms;
      int ttid  = tid - ncomms;
-      int n     = U._grid->oSites(); // 4d vol
+      int n     = U.Grid()->oSites(); // 4d vol
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int myblock, myn;
@ -341,17 +339,22 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
      }

      // do the compute
+      auto   U_v  =   U.View();
+      auto UUU_v  = UUU.View();
+      auto  in_v  =  in.View();
+      auto out_v  = out.View();
+
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
-          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
        }
      }
        ptime = usecond() - start;
@ -372,18 +375,23 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
+
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
+    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
-    }
+      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
+    });
  } else {
    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
+    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
-    }
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
+    });
  }
  DhopComputeTime2+=usecond();
 #else
@ -398,7 +406,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 						    const FermionField &in, FermionField &out,int dag)
 {
  Compressor compressor;
-  int LLs = in._grid->_rdimensions[0];
+  int LLs = in.Grid()->_rdimensions[0];



@ -410,16 +418,20 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
  if (dag == DaggerYes) {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
-    }
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
+    });
  } else {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
-    }
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
+    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
@ -432,50 +444,17 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 }
 /*CHANGE END*/

-/* ORG
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-						    DoubledGaugeField & U,DoubledGaugeField & UUU,
-						    const FermionField &in, FermionField &out,int dag)
-{
-  Compressor compressor;
-  int LLs = in._grid->_rdimensions[0];
-
-
-
-  DhopTotalTime -= usecond();
-  DhopCommTime -= usecond();
-  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
-  
-  DhopComputeTime -= usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  if (dag == DaggerYes) {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
-    }
-  } else {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU=ss;
-	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
-    }
-  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-}
-*/


 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check

-  assert(in.checkerboard==Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard()==Even);
+  out.Checkerboard() = Odd;

  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
 }
@ -483,11 +462,11 @@ template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check

-  assert(in.checkerboard==Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard()==Odd);
+  out.Checkerboard() = Even;

  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
 }
@ -495,10 +474,10 @@ template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
-  conformable(in._grid,FermionGrid()); // verifies full grid
-  conformable(in._grid,out._grid);
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());

-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }
@ -506,7 +485,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Report(void) 
 {
-  std::vector<int> latt = GridDefaultLatt();          
+  Coordinate latt = GridDefaultLatt();          
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _FourDimGrid->_Nprocessors;
  RealD NN = _FourDimGrid->NodeCount();
@ -564,21 +543,21 @@ void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField
 }
 template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
@ -586,7 +565,7 @@ void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionFiel
 }
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
@ -595,27 +574,27 @@ void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionF

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(mass);
  out = scal * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  out = (1.0 / (mass)) * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
-                                      FermionField &out) {
-  out.checkerboard = in.checkerboard;
+						   FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in, out);
 }

@ -624,31 +603,28 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
 ////////////////////////////////////////////////////////
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-                                                         PropagatorField &q_in_2,
-                                                         PropagatorField &q_out,
-                                                         Current curr_type,
-                                                         unsigned int mu)
+								PropagatorField &q_in_2,
+								PropagatorField &q_out,
+								Current curr_type,
+								unsigned int mu)
 {
-    assert(0);
+  assert(0);
 }

 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
-                                              PropagatorField &q_out,
-                                              Current curr_type,
-                                              unsigned int mu,
-                                              unsigned int tmin, 
+void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+							   PropagatorField &q_out,
+							   Current curr_type,
+							   unsigned int mu, 
+							   unsigned int tmin,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-    assert(0);
+  assert(0);

 }
-
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
-FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
  
-}}
+NAMESPACE_END(Grid);



--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@ -28,40 +28,35 @@ directory
 /*  END LEGAL */
 #include <Grid/Grid.h>

-namespace Grid {
-namespace QCD {
+#pragma once 

-const std::vector<int> 
-ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> 
-ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+NAMESPACE_BEGIN(Grid);

 /////////////////////////////////
 // Constructor and gauge import
 /////////////////////////////////

-
 template <class Impl>
 ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
 							 RealD _mass,
 							 RealD _c1, RealD _c2,RealD _u0,
 							 const ImplParams &p)
-    : Kernels(p),
-      _grid(&Fgrid),
-      _cbgrid(&Hgrid),
-      Stencil(&Fgrid, npoint, Even, directions, displacements),
-      StencilEven(&Hgrid, npoint, Even, directions, displacements),  // source is Even
-      StencilOdd(&Hgrid, npoint, Odd, directions, displacements),  // source is Odd
-      mass(_mass),
-      Lebesgue(_grid),
-      LebesgueEvenOdd(_cbgrid),
-      Umu(&Fgrid),
-      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid),
-      UUUmu(&Fgrid),
-      UUUmuEven(&Hgrid),
-      UUUmuOdd(&Hgrid) ,
-      _tmp(&Hgrid)
+  : Kernels(p),
+    _grid(&Fgrid),
+    _cbgrid(&Hgrid),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
+    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
+    mass(_mass),
+    Lebesgue(_grid),
+    LebesgueEvenOdd(_cbgrid),
+    Umu(&Fgrid),
+    UmuEven(&Hgrid),
+    UmuOdd(&Hgrid),
+    UUUmu(&Fgrid),
+    UUUmuEven(&Hgrid),
+    UUUmuOdd(&Hgrid) ,
+    _tmp(&Hgrid)
 {
  int vol4;
  int LLs=1;
@ -85,17 +80,17 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
  ImportGauge(_Uthin,_Ufat);
 }

-  ////////////////////////////////////////////////////////////
-  // Momentum space propagator should be 
-  // https://arxiv.org/pdf/hep-lat/9712010.pdf
-  //
-  // mom space action.
-  //   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
-  //
-  // must track through staggered flavour/spin reduction in literature to 
-  // turn to free propagator for the one component chi field, a la page 4/5
-  // of above link to implmement fourier based solver.
-  ////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+// Momentum space propagator should be 
+// https://arxiv.org/pdf/hep-lat/9712010.pdf
+//
+// mom space action.
+//   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
+//
+// must track through staggered flavour/spin reduction in literature to 
+// turn to free propagator for the one component chi field, a la page 4/5
+// of above link to implement Fourier based solver.
+////////////////////////////////////////////////////////////
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
 {
@ -177,21 +172,21 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const

 template <class Impl>
 RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, mass, in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
@ -199,7 +194,7 @@ void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField
 }
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
@ -208,27 +203,27 @@ void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionFie

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(mass);
  out = scal * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  out = (1.0 / (mass)) * in;
 }

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
-                                      FermionField &out) {
-  out.checkerboard = in.checkerboard;
+						 FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in, out);
 }

@ -244,8 +239,8 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge

  Compressor compressor;

-  FermionField Btilde(B._grid);
-  FermionField Atilde(B._grid);
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());
  Atilde = A;

  st.HaloExchange(B, compressor);
@ -255,10 +250,13 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
    ////////////////////////
    // Call the single hop
    ////////////////////////
-    PARALLEL_FOR_LOOP
-    for (int sss = 0; sss < B._grid->oSites(); sss++) {
-      Kernels::DhopDir(st, U, UUU, st.CommBuf(), sss, sss, B, Btilde, mu,1);
-    }
+    auto U_v   = U.View();
+    auto UUU_v = UUU.View();
+    auto B_v   = B.View();
+    auto Btilde_v   = Btilde.View();
+    thread_for(sss,B.Grid()->oSites(),{
+      Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
+    });

    // Force in three link terms
    //
@ -288,11 +286,11 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {

-  conformable(U._grid, _grid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _grid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  mat.checkerboard = U.checkerboard;
+  mat.Checkerboard() = U.Checkerboard();

  DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
 }
@ -300,13 +298,13 @@ void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionFie
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {

-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  assert(V.checkerboard == Even);
-  assert(U.checkerboard == Odd);
-  mat.checkerboard = Odd;
+  assert(V.Checkerboard() == Even);
+  assert(U.Checkerboard() == Odd);
+  mat.Checkerboard() = Odd;

  DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
 }
@ -314,48 +312,51 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {

-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  assert(V.checkerboard == Odd);
-  assert(U.checkerboard == Even);
-  mat.checkerboard = Even;
+  assert(V.Checkerboard() == Odd);
+  assert(U.Checkerboard() == Even);
+  mat.Checkerboard() = Even;

  DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
+{
  DhopCalls+=2;
-  conformable(in._grid, _grid);  // verifies full grid
-  conformable(in._grid, out._grid);
+  conformable(in.Grid(), _grid);  // verifies full grid
+  conformable(in.Grid(), out.Grid());

-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
+{
  DhopCalls+=1;
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard() == Even);
+  out.Checkerboard() = Odd;

  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
+{
  DhopCalls+=1;
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard() == Odd);
+  out.Checkerboard() = Even;

  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
 }
@ -370,11 +371,13 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel

  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
-
-  PARALLEL_FOR_LOOP
-  for (int sss = 0; sss < in._grid->oSites(); sss++) {
-    Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sss, sss, in, out, dir, disp);
-  }
+  auto Umu_v   =   Umu.View();
+  auto UUUmu_v = UUUmu.View();
+  auto in_v    =  in.View();
+  auto out_v   = out.View();
+  thread_for( sss, in.Grid()->oSites(),{
+    Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
+  });
 };

 template <class Impl>
@ -400,7 +403,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 {
 #ifdef GRID_OMP
  Compressor compressor; 
-  int len =  U._grid->oSites();
+  int len =  U.Grid()->oSites();
  const int LLs =  1;

  DhopTotalTime   -= usecond();
@ -439,17 +442,21 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
      }

      // do the compute
+      auto U_v   = U.View();
+      auto UUU_v = UUU.View();
+      auto in_v  = in.View();
+      auto out_v = out.View();
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); 
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
-          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
        }
      }
    } else {
@ -464,17 +471,23 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  DhopFaceTime    -= usecond();

  DhopComputeTime2    -= usecond();
-  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
-    }
-  } else {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
+  {
+    auto U_v   = U.View();
+    auto UUU_v = UUU.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    if (dag == DaggerYes) {
+      int sz=st.surface_list.size();
+      thread_for(ss,sz,{
+	int sU = st.surface_list[ss];
+	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      });
+    } else {
+      int sz=st.surface_list.size();
+      thread_for(ss,sz,{
+	int sU = st.surface_list[ss];
+	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      });
    }
  }
  DhopComputeTime2    += usecond();
@ -500,15 +513,19 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();

+  auto U_v   =   U.View();
+  auto UUU_v = UUU.View();
+  auto in_v  =  in.View();
+  auto out_v = out.View();
  DhopComputeTime -= usecond();
  if (dag == DaggerYes) {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
-    }
+    thread_for(sss, in.Grid()->oSites(),{
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    });
  } else {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
-    }
+    thread_for(sss, in.Grid()->oSites(),{
+      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
@ -520,7 +537,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 template<class Impl>
 void ImprovedStaggeredFermion<Impl>::Report(void) 
 {
-  std::vector<int> latt = GridDefaultLatt();          
+  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
@ -574,31 +591,25 @@ void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
 ////////////////////////////////////////////////////////
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-                                                        PropagatorField &q_in_2,
-                                                        PropagatorField &q_out,
-                                                        Current curr_type,
-                                                        unsigned int mu)
+							      PropagatorField &q_in_2,
+							      PropagatorField &q_out,
+							      Current curr_type,
+							      unsigned int mu)
 {
-    assert(0);
+  assert(0);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
-                                              PropagatorField &q_out,
-                                              Current curr_type,
-                                              unsigned int mu,
-                                              unsigned int tmin, 
+void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu, 
+                                                         unsigned int tmin,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-    assert(0);
+  assert(0);

 }

-
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
-
-  //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-  //TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-
-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@ -0,0 +1,453 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+ 
+// Fifth-dimension stencil without shift term.  For every s in [0,Ls):
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
+// where s+1 and s-1 wrap around modulo Ls.  Each accelerator_for iteration
+// handles one block of Ls consecutive outer sites starting at ss = sss*Ls.
+// Increments M5Dcalls and accumulates the elapsed usecond() time in M5Dtime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+{
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1;
+    spinor tmp2;
+    for(int s=0; s<Ls; s++){
+      // Cyclic neighbours of slice s inside this block.
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      spProj5m(tmp1, psi(idx_u));
+      spProj5p(tmp2, psi(idx_l));
+      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Fifth-dimension stencil fused with the shift term.  For every s in [0,Ls):
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
+//          + shift_coeffs[s]*P(psi[shift_s])
+// where s+1 and s-1 wrap modulo Ls, shift_s = Ls-1 and P = spProj5p when pm == 1,
+// and shift_s = 0 and P = spProj5m otherwise.
+// Increments M5Dcalls and accumulates the elapsed usecond() time in M5Dtime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					Vector<Coeff_t> &shift_coeffs)
+{
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  // Local copy of pm so the kernel below does not go through `this`.
+  auto pm  = this->pm;
+  int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1;
+    spinor tmp2;
+    spinor tmp;
+    for(int s=0; s<Ls; s++){
+      // Cyclic neighbours of slice s inside this block.
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      spProj5m(tmp1, psi(idx_u));
+      spProj5p(tmp2, psi(idx_l));
+
+      // Projected source slice of the shift term (the same slice for every s).
+      if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); }
+      else       { spProj5m(tmp, psi(ss+shift_s)); }
+
+      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Daggered fifth-dimension stencil without shift term.  Same structure as M5D
+// but with the two projectors exchanged.  For every s in [0,Ls):
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
+// where s+1 and s-1 wrap around modulo Ls.
+// Increments M5Dcalls and accumulates the elapsed usecond() time in M5Dtime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+{
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(), {
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1, tmp2;
+
+    for(int s=0; s<Ls; s++){
+      // Cyclic neighbours of slice s inside this block.
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      spProj5p(tmp1, psi(idx_u));
+      spProj5m(tmp2, psi(idx_l));
+      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Daggered fifth-dimension stencil fused with the daggered shift term.
+// Every slice s receives
+//   diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
+// (s+1 and s-1 wrap modulo Ls), and in addition slice shift_s accumulates
+//   sum_s shift_coeffs[s]*P(psi[s])
+// with shift_s = Ls-1 and P = spProj5p when pm == 1, shift_s = 0 and P = spProj5m otherwise.
+// Increments M5Dcalls and accumulates the elapsed usecond() time in M5Dtime.
+//
+// NOTE(review): chi[ss+Ls-1] is zeroed before phi(ss+Ls-1) is read.  If phi_i and
+// chi_i are the same field (the two-argument M5Ddag passes chi for both), that
+// slice of phi is overwritten before use -- confirm this is intended.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					   Vector<Coeff_t> &shift_coeffs)
+{
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // Local copy of pm so the kernel below does not go through `this`.
+  auto pm = this->pm;
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1, tmp2, tmp;
+    // Clear the last slice: when shift_s == Ls-1 it collects shift contributions
+    // from s < Ls-1 before its own stencil term is added at s == Ls-1.
+    tmp1=Zero();
+    coalescedWrite(chi[ss+Ls-1],tmp1);
+
+    for(int s=0; s<Ls; s++){
+
+      // Cyclic neighbours of slice s inside this block.
+      uint64_t idx_u = ss+((s+1)%Ls);
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
+
+      spProj5p(tmp1, psi(idx_u));
+      spProj5m(tmp2, psi(idx_l));
+
+      // The last slice adds to what is already in chi; all others overwrite.
+      if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      else          coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      if(pm == 1){ spProj5p(tmp, psi(ss+s)); }
+      else       { spProj5m(tmp, psi(ss+s)); }
+
+      // Accumulate the shift contribution of slice s into slice shift_s.
+      coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp);
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Applies the inverse of Mooee along the fifth dimension, chi = Mooee^{-1} psi,
+// as a sequence of in-place sweeps over each block of Ls slices using the
+// member coefficient arrays lee, leem, dee, ueem and uee.
+// When shift != 0 the work is delegated entirely to MooeeInv_shift.
+// Increments MooeeInvCalls and accumulates the elapsed usecond() time in MooeeInvTime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &chi_i)
+{
+  // Dispatch before any set-up (same order as MooeeInvDag): MooeeInv_shift sets
+  // the checkerboard and opens its own views, so nothing is needed here first.
+  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
+
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  // Raw pointers to the coefficient arrays, captured by the kernel below.
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp;
+
+    // Apply (L^{\prime})^{-1}
+    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp);
+    }
+
+    // L_m^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp);
+    }
+
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp);
+    }
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+
+    // Apply U^{-1}
+    for(int s=Ls-2; s>=0; s--){
+      spProj5m(tmp, chi(ss+s+1));
+      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Shifted variant of MooeeInv: runs the same sweeps as MooeeInv and, in the
+// same pass, adds MooeeInv_shift_norm[s] * P( sum_j MooeeInv_shift_lc[j]*psi[j] )
+// to every slice s, where P is spProj5p when pm == 1 and spProj5m otherwise.
+// Increments MooeeInvCalls and accumulates the elapsed usecond() time in MooeeInvTime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionField &chi_i)
+{
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  // Local copies / raw pointers captured by the kernel below.
+  auto pm = this->pm;
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
+  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2,tmp2_spProj;
+
+    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
+    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
+    tmp2 = pMooeeInv_shift_lc[0]*psi(ss);
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp1, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
+      tmp2 = tmp2 + pMooeeInv_shift_lc[s]*psi(ss+s);
+    }
+    // Project the accumulated linear combination once; it is reused for every slice.
+    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
+    else       { spProj5m(tmp2_spProj, tmp2); }
+
+    // L_m^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
+    }
+
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp1, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp1);
+    }
+    // chi[ss+Ls-1] = (1.0/pdee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
+    // tmp1 is taken from the last slice BEFORE the shift term is added to it,
+    // so the U^{-1} sweep below starts from the unshifted value.
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+    spProj5m(tmp1, chi(ss+Ls-1));
+    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInv_shift_norm[Ls-1]*tmp2_spProj);
+
+    // Apply U^{-1} and add shift term
+    for(int s=Ls-2; s>=0; s--){
+      // Order matters: back-substitute with tmp1 from slice s+1, refresh tmp1
+      // from the unshifted slice s, and only then add the shift term to slice s.
+      coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1);
+      spProj5m(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Applies the daggered inverse of Mooee along the fifth dimension.  Same sweep
+// structure as MooeeInv with the roles of the upper/lower coefficient arrays
+// (uee<->lee, ueem<->leem) and of the two projectors exchanged.
+// When shift != 0 the work is delegated entirely to MooeeInvDag_shift.
+// Increments MooeeInvCalls and accumulates the elapsed usecond() time in MooeeInvTime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionField &chi_i)
+{
+  // Dispatch to the shifted implementation before doing any set-up.
+  if(this->shift != 0.0){ MooeeInvDag_shift(psi_i,chi_i); return; }
+
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  // Raw pointers to the coefficient arrays, captured by the kernel below.
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp;
+
+    // Apply (U^{\prime})^{-dag}
+    coalescedWrite(chi[ss], psi(ss));
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp, chi(ss+s-1));
+      coalescedWrite(chi[ss+s], psi(ss+s) - puee[s-1]*tmp);
+    }
+    
+    // U_m^{-\dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp);
+    }
+
+    // L_m^{-\dag} D^{-dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp);
+    }
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+
+    // Apply L^{-dag}
+    for(int s=Ls-2; s>=0; s--){
+      spProj5p(tmp, chi(ss+s+1));
+      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Shifted variant of MooeeInvDag: runs the same sweeps as MooeeInvDag and, in
+// the same pass, adds MooeeInvDag_shift_norm[s] * P( sum_j MooeeInvDag_shift_lc[j]*psi[j] )
+// to every slice s, where P is spProj5p when pm == 1 and spProj5m otherwise.
+// Increments MooeeInvCalls and accumulates the elapsed usecond() time in MooeeInvTime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, FermionField &chi_i)
+{
+  // The output takes the checkerboard of the input.
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  int Ls = this->Ls;
+
+  // Local copies / raw pointers captured by the kernel below.
+  auto pm = this->pm;
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
+  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  // Number of Ls-long blocks in the outer-site range.
+  int nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,Simd::Nsimd(),{
+
+    uint64_t ss = sss*Ls;
+
+    typedef decltype(coalescedRead(psi[0])) spinor;
+    spinor tmp1,tmp2,tmp2_spProj;
+
+    // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
+    coalescedWrite(chi[ss], psi(ss));
+    tmp2 = pMooeeInvDag_shift_lc[0]*psi(ss);
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp1, chi(ss+s-1));
+      coalescedWrite(chi[ss+s],psi(ss+s) - puee[s-1]*tmp1);
+      tmp2 = tmp2 + pMooeeInvDag_shift_lc[s]*psi(ss+s);
+    }
+
+    // Project the accumulated linear combination once; it is reused for every slice.
+    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
+    else       { spProj5m(tmp2_spProj, tmp2);}
+
+    // U_m^{-\dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp1);
+    }
+
+    // L_m^{-\dag} D^{-dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp1, chi(ss+Ls-1));
+      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp1);
+    }
+    // tmp1 is taken from the last slice BEFORE the shift term is added to it,
+    // so the L^{-dag} sweep below starts from the unshifted value.
+    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
+    spProj5p(tmp1, chi(ss+Ls-1));
+    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInvDag_shift_norm[Ls-1]*tmp2_spProj);
+
+    // Apply L^{-dag}
+    for(int s=Ls-2; s>=0; s--){
+      // Order matters: back-substitute with tmp1 from slice s+1, refresh tmp1
+      // from the unshifted slice s, and only then add the shift term to slice s.
+      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1);
+      spProj5p(tmp1, chi(ss+s));
+      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj);
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
@ -0,0 +1,407 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Constructor: forwards every argument to AbstractEOFAFermion, sets the tanh
+// coefficients from an Approx::higham() table of order Ls, and prepares the
+// five shift-coefficient arrays (computed when _shift != 0, zero-filled otherwise).
+template<class Impl>
+MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
+  GaugeField            &_Umu,
+  GridCartesian         &FiveDimGrid,
+  GridRedBlackCartesian &FiveDimRedBlackGrid,
+  GridCartesian         &FourDimGrid,
+  GridRedBlackCartesian &FourDimRedBlackGrid,
+  RealD _mq1, RealD _mq2, RealD _mq3,
+  RealD _shift, int _pm, RealD _M5,
+  RealD _b, RealD _c, const ImplParams &p)
+  : AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+                              FourDimGrid, FourDimRedBlackGrid,
+                              _mq1, _mq2, _mq3, _shift, _pm, _M5, _b, _c, p)
+{
+  int Ls = this->Ls;
+
+  // Approximation table; it must come back with exactly Ls entries.
+  RealD eps = 1.0;
+  Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
+  assert(zdata->n == this->Ls);
+
+  std::cout << GridLogMessage
+            << "MobiusEOFAFermion (b=" << _b << ",c=" << _c << ") with Ls=" << Ls
+            << std::endl;
+
+  this->SetCoefficientsTanh(zdata, _b, _c);
+
+  std::cout << GridLogMessage
+            << "EOFA parameters: (mq1=" << _mq1 << ",mq2=" << _mq2 << ",mq3=" << _mq3
+            << ",shift=" << _shift << ",pm=" << _pm << ")"
+            << std::endl;
+
+  // The table is only needed by SetCoefficientsTanh.
+  Approx::zolotarev_free(zdata);
+
+  if(_shift == 0.0){
+    // No shift: the coefficient arrays only need the right length, filled with zeros.
+    Mooee_shift.resize(Ls, 0.0);
+    MooeeInv_shift_lc.resize(Ls, 0.0);
+    MooeeInv_shift_norm.resize(Ls, 0.0);
+    MooeeInvDag_shift_lc.resize(Ls, 0.0);
+    MooeeInvDag_shift_norm.resize(Ls, 0.0);
+  } else {
+    SetCoefficientsPrecondShiftOps();
+  }
+}
+
+/****************************************************************
+ * Additional EOFA operators only called outside the inverter.  
+ * Since speed is not essential, simple axpby-style
+ * implementations should be fine.
+ ***************************************************************/
+// Builds Din from psi with axpby_ssp and alpha-dependent weights
+//   sign == +1 : w(s) = 2 (1-alpha)^{Ls-s-1} / (1+alpha)^{Ls-s}
+//   sign == -1 : w(s) = 2 (1-alpha)^{s}      / (1+alpha)^{s+1}
+// dag == 0 writes w(s)*psi[0] into slice s of Din; dag == 1 accumulates
+// w(s)*psi[s] into slice 0 of Din.  Any other (sign, dag) leaves Din zero.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+{
+  int Ls = this->Ls;
+  RealD alpha = this->alpha;
+
+  Din = Zero();
+
+  // Only the four (sign, dag) combinations below do any work.
+  const bool sign_ok = (sign == 1) || (sign == -1);
+  const bool dag_ok  = (dag  == 0) || (dag  == 1);
+  if(!(sign_ok && dag_ok)) return;
+
+  for(int s=0; s<Ls; ++s){
+    if(sign == 1){
+      const auto w = 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s);
+      if(dag == 0) { axpby_ssp(Din, 0.0, psi, w, psi, s, 0); } // \Omega_{+}
+      else         { axpby_ssp(Din, 1.0, Din, w, psi, 0, s); } // \Omega_{+}^{\dagger}
+    } else {
+      const auto w = 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1);
+      if(dag == 0) { axpby_ssp(Din, 0.0, psi, w, psi, s, 0); } // \Omega_{-}
+      else         { axpby_ssp(Din, 1.0, Din, w, psi, 0, s); } // \Omega_{-}^{\dagger}
+    }
+  }
+}
+
+// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
+// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
+// Writes chi slice by slice from psi with b = (1+alpha)/2 and c = (1-alpha)/2:
+// an axpby_ssp_pminus call combines b*psi[s] with the s+1 neighbour, then an
+// axpby_ssp_pplus call adds the s-1 neighbour.  Interior links carry -c; the
+// two links that close the s-direction (Ls-1 -> 0 and 0 -> Ls-1) carry mq1*c.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
+{
+  int Ls    = this->Ls;
+  RealD b   = 0.5 * ( 1.0 + this->alpha );
+  RealD c   = 0.5 * ( 1.0 - this->alpha );
+  RealD mq1 = this->mq1;
+
+  for(int s=0; s<Ls; ++s){
+    // s == 0 takes precedence over s == Ls-1, as in an if / else-if chain.
+    const bool first = (s == 0);
+    const bool last  = !first && (s == (Ls-1));
+
+    // Neighbour at s+1; wraps to slice 0 on the last slice.
+    if(last) { axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);   }
+    else     { axpby_ssp_pminus(chi, b, psi, -c,    psi, s, s+1); }
+
+    // Neighbour at s-1; wraps to slice Ls-1 on the first slice.
+    if(first){ axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1); }
+    else     { axpby_ssp_pplus (chi, 1.0, chi, -c,    psi, s, s-1);  }
+  }
+}
+
+// Applies the closed-form inverse of Dtilde:
+//   chi[s] = sum_{sp} ( DtInv_p(s,sp) * P_+ + DtInv_m(s,sp) * P_- ) psi[sp]
+// with the coefficients computed below from alpha, mq1 and Ls.
+// The result is accumulated in a scratch field and copied to chi at the end,
+// so psi and chi may refer to the same field.
+// NOTE(review): the scratch field lives on FermionGrid(), so psi (and chi) are
+// presumably expected on that grid too -- confirm against callers.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+  RealD m = this->mq1;
+  RealD c = 0.5 * this->alpha;
+  RealD d = 0.5;
+
+  RealD DtInv_p(0.0), DtInv_m(0.0);
+  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
+
+  // Start from an explicit zero so every term can simply be accumulated.
+  // (Scaling an uninitialised field by 0.0 is not a safe way to clear it.)
+  FermionField tmp(this->FermionGrid());
+  tmp = Zero();
+
+  for(int s=0; s<Ls; ++s){
+    for(int sp=0; sp<Ls; ++sp){
+
+      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
+      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
+      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
+      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
+
+      // Both chiral pieces add onto tmp[s]; neither call may overwrite the other.
+      axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
+      axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
+
+    }}
+
+  // Publish the result; previously it was computed but never stored in chi.
+  chi = tmp;
+}
+
+/*****************************************************************************************************/
+
+// Applies the unpreconditioned operator to psi, leaving the result in chi,
+// and returns norm2(chi).  The call order matters: chi is updated in place.
+template<class Impl>
+RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+{
+  // Scratch field on psi's grid for the Meooe5D output.
+  FermionField scratch(psi.Grid());
+  this->Meooe5D(psi, scratch);
+
+  // chi = DW(scratch), then psi is combined in with unit weights.
+  this->DW(scratch, chi, DaggerNo);
+  axpby(chi, 1.0, 1.0, chi, psi);
+
+  // Two-argument M5D updates chi in place.
+  this->M5D(psi, chi);
+
+  return norm2(chi);
+}
+
+// Applies the daggered unpreconditioned operator to psi, leaving the result in
+// chi, and returns norm2(chi).  The call order matters: chi is updated in place.
+template<class Impl>
+RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+{
+  // Scratch field on psi's grid for the daggered DW output.
+  FermionField scratch(psi.Grid());
+  this->DW(psi, scratch, DaggerYes);
+
+  this->MeooeDag5D(scratch, chi);
+
+  // Two-argument M5Ddag updates chi in place; psi is then combined in with unit weights.
+  this->M5Ddag(psi, chi);
+  axpby(chi, 1.0, 1.0, chi, psi);
+
+  return norm2(chi);
+}
+
+/********************************************************************
+ * Performance critical fermion operators called inside the inverter
+ ********************************************************************/
+
+// Two-argument M5D: builds the stencil coefficients (unit diagonal, -1
+// off-diagonals, mq1 on the two wrap-around links) and applies the stencil to
+// psi with chi acting as both the diagonal input and the output.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  Vector<Coeff_t> diag (Ls, 1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = this->mq1;
+  lower[0]    = this->mq1;
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5D(psi, chi, chi, lower, diag, upper);
+  }
+}
+
+// Two-argument M5Ddag: builds the stencil coefficients (unit diagonal, -1
+// off-diagonals, mq1 on the two wrap-around links) and applies the daggered
+// stencil to psi with chi acting as both the diagonal input and the output.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  Vector<Coeff_t> diag (Ls, 1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = this->mq1;
+  lower[0]    = this->mq1;
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5Ddag(psi, chi, chi, lower, diag, upper);
+  }
+}
+
+// half checkerboard operations
+// Applies Mooee to psi: diagonal bee[s], off-diagonals -cee[s], with the two
+// wrap-around entries (upper[Ls-1], lower[0]) additionally scaled by -mq1.
+// psi is used as both the neighbour and the diagonal input of the stencil.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  // coefficients of Mooee
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
+  for(int s=0; s<Ls; s++){
+    upper[s] = -this->cee[s];
+  }
+  for(int s=0; s<Ls; s++){
+    lower[s] = -this->cee[s];
+  }
+
+  // Wrap-around links pick up the extra factor -mq1.
+  upper[Ls-1] *= -this->mq1;
+  lower[0]    *= -this->mq1;
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5D(psi, psi, chi, lower, diag, upper);
+  }
+}
+
+// Applies the daggered Mooee to psi: diagonal bee[s]; upper[s] = -cee[s+1] and
+// lower[s] = -cee[s-1] in the interior, with mq1*cee[Ls-1] / mq1*cee[0] on the
+// two wrap-around entries.  psi is used as both the neighbour and the diagonal
+// input of the stencil.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  // coefficients of MooeeDag
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
+  for(int s=0; s<Ls; s++){
+    // s == 0 takes precedence over s == Ls-1, as in an if / else-if chain.
+    const bool first = (s == 0);
+    const bool last  = !first && (s == (Ls-1));
+
+    if(last) { upper[s] = this->mq1*this->cee[0]; }
+    else     { upper[s] = -this->cee[s+1]; }
+
+    if(first){ lower[s] = this->mq1*this->cee[Ls-1]; }
+    else     { lower[s] = -this->cee[s-1]; }
+  }
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5Ddag(psi, psi, chi, lower, diag, upper);
+  }
+}
+
+/****************************************************************************************/
+
+// Computes coefficients for applying Cayley preconditioned shift operators
+//  (Mooee + \Delta) --> Mooee_shift
+//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
+//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
+// For the latter two cases, the operation takes the form
+//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
+//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
+// Fills the five member arrays Mooee_shift, MooeeInv_shift_lc,
+// MooeeInv_shift_norm, MooeeInvDag_shift_lc and MooeeInvDag_shift_norm (each
+// resized to Ls) from pm, alpha, k, mq1, shift and the bee/cee arrays.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
+{
+  int   Ls    = this->Ls;
+  int   pm    = this->pm;
+  RealD alpha = this->alpha;
+  RealD k     = this->k;
+  RealD mq1   = this->mq1;
+  RealD shift = this->shift;
+
+  // Initialize
+  Mooee_shift.resize(Ls);
+  MooeeInv_shift_lc.resize(Ls);
+  MooeeInv_shift_norm.resize(Ls);
+  MooeeInvDag_shift_lc.resize(Ls);
+  MooeeInvDag_shift_norm.resize(Ls);
+
+  // Construct Mooee_shift
+  // The entries are filled in forward order for pm == 1 and reversed otherwise.
+  int idx(0);
+  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
+    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
+  for(int s=0; s<Ls; ++s){
+    idx = (pm == 1) ? (s) : (Ls-1-s);
+    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
+  }
+
+  // Tridiagonal solve for MooeeInvDag_shift_lc
+  {
+    Coeff_t m(0.0);
+    Vector<Coeff_t> d = Mooee_shift;
+    Vector<Coeff_t> u(Ls,0.0);
+    Vector<Coeff_t> y(Ls,0.0);
+    Vector<Coeff_t> q(Ls,0.0);
+    if(pm == 1){ u[0] = 1.0; }
+    else{ u[Ls-1] = 1.0; }
+
+    // Tridiagonal matrix algorithm + Sherman-Morrison formula
+    //
+    // We solve
+    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
+    // where Mooee' is the tridiagonal part of Mooee_{+}, and
+    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
+    // so that the outer-product u \otimes v gives the (0,Ls-1)
+    // entry of Mooee_{+}.
+    //
+    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
+    // and then construct the solution to the original system
+    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
+    // Forward elimination is only performed for pm == 1.
+    if(pm == 1){
+      for(int s=1; s<Ls; ++s){
+	m = -this->cee[s] / this->bee[s-1];
+	d[s] -= m*d[s-1];
+	u[s] -= m*u[s-1];
+      }
+    }
+    // Back substitution: a plain diagonal divide for pm == 1, a coupled sweep otherwise.
+    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
+    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
+    for(int s=Ls-2; s>=0; --s){
+      if(pm == 1){
+	y[s] = d[s] / this->bee[s];
+	q[s] = u[s] / this->bee[s];
+      } else {
+	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
+	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
+      }
+    }
+
+    // Construct MooeeInvDag_shift_lc
+    for(int s=0; s<Ls; ++s){
+      if(pm == 1){
+	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
+	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
+      } else {
+	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
+	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
+      }
+    }
+
+    // Compute remaining coefficients
+    // N is reused here as the common normalisation of the two *_norm arrays.
+    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
+    for(int s=0; s<Ls; ++s){
+
+      // MooeeInv_shift_lc
+      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
+      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
+
+      // MooeeInv_shift_norm
+      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
+	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
+
+      // MooeeInvDag_shift_norm
+      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
+     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
+      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
+	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
+    }
+  }
+}
+
+// Recompute coefficients for a different value of shift constant
+// Stores new_shift in this->shift and rebuilds the shift-coefficient arrays:
+// recomputed via SetCoefficientsPrecondShiftOps() when new_shift != 0,
+// otherwise reset to Ls zeros.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+{
+  this->shift = new_shift;
+  if(new_shift != 0.0){
+    SetCoefficientsPrecondShiftOps();
+  } else {
+    int Ls = this->Ls;
+    // assign(), not resize(): resize(Ls, 0.0) on a vector that already holds Ls
+    // elements changes nothing, which would leave the coefficients of a
+    // previous non-zero shift in place.
+    // (Assumes Vector<> has the std::vector interface, as the existing
+    // resize(n, value) calls suggest.)
+    Mooee_shift.assign(Ls, Coeff_t(0.0));
+    MooeeInv_shift_lc.assign(Ls, Coeff_t(0.0));
+    MooeeInv_shift_norm.assign(Ls, Coeff_t(0.0));
+    MooeeInvDag_shift_lc.assign(Ls, Coeff_t(0.0));
+    MooeeInvDag_shift_norm.assign(Ls, Coeff_t(0.0));
+  }
+}
+
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@ -0,0 +1,450 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Hopping term restricted to a single direction/displacement (dir,disp).
+// chi first receives DhopDir(psi) and is then rescaled in place, one fifth-
+// dimension slice at a time, through ag5xpby_ssp with a zero second coefficient:
+//   slices 2b   : coefficient -scale
+//   slices 2b+1 : coefficient +scale          (b = 0 .. (Ls-1)/2 - 1)
+//   slice  Ls-1 : coefficient p[nblock]*scale/amax
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  // The same code serves both dag and undag; a common helper routine would be tidier.
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+
+  this->DhopDir(psi,chi,dir,disp);
+
+  // Walk the slice pairs (s,s+1)
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s  ,s  );
+    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1);
+  }
+
+  // Unpaired final slice
+  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
+}
+// Checkerboarded hopping term behind Meooe and MeooeDag.
+// Input on the Odd checkerboard goes through DhopEO, anything else through
+// DhopOE; chi is then rescaled in place slice by slice (-scale / +scale on
+// each slice pair, p[nblock]*scale/amax on slice Ls-1).
+// NOTE(review): dag is accepted but never read -- both Dhop calls pass DaggerNo.
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+
+  if ( psi.Checkerboard() != Odd ) {
+    this->DhopOE(psi,chi,DaggerNo);
+  } else {
+    this->DhopEO(psi,chi,DaggerNo);
+  }
+
+  // Walk the slice pairs (s,s+1)
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s  ,s  );
+    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1);
+  }
+
+  // Unpaired final slice
+  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
+}
+
+// Site-diagonal part of the operator, shared by Mooee and MooeeDag.
+//   psi : input field
+//   chi : output field (every slice 0..Ls-1 is written)
+//   dag : non-zero flips the sign of the couplings between slice Ls-1 and the
+//         odd slices; everything else is identical for dag and undag.
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+  const int sign   = dag ? (-1) : 1;
+
+  for(int b=0;b<nblock;b++){
+    const int   lo = 2*b;   // first slice of this 2x2 block
+    const int   hi = lo+1;  // second slice of this 2x2 block
+    const RealD pp = p[nblock-1-b];
+    const RealD qq = q[nblock-1-b];
+    const RealD offdiag = amax*sqrt(qq)*scale;
+    const RealD lastcol = sqrt(amax*pp)*scale*sign;
+
+    // 2x2 block aligned at lo; the Dw site diagonal is multiplied by G5 so Hw
+    ag5xpby_ssp(chi,-dw_diag*scale,psi,offdiag,psi,lo,hi);
+    ag5xpby_ssp(chi, dw_diag*scale,psi,offdiag,psi,hi,lo);
+
+    // Coupling of the second slice to slice Ls-1
+    axpby_ssp  (chi, 1.0, chi,lastcol,psi,hi,Ls-1);
+  }
+
+  // Last row.  It must come after the loop above only in the sense that it
+  // first overwrites chi[Ls-1] and then accumulates into it.
+  {
+    const RealD R = (1+mass)/(1-mass);
+
+    // R g5 psi[Ls-1] + p[nblock] H
+    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
+
+    for(int b=0;b<nblock;b++){
+      const int   hi      = 2*b+1;
+      const RealD pp      = p[nblock-1-b];
+      const RealD lastrow = -sqrt(amax*pp)*scale*sign;
+      axpby_ssp(chi,1.0,chi,lastrow,psi,Ls-1,hi);
+    }
+  }
+}
+
+// Inverse of the site-diagonal operator, shared by MooeeInv and MooeeInvDag.
+// Applied as three successive sweeps, Linv, Dinv and Uinv.
+//   psi : input field
+//   chi : output field; it also carries the result of the Linv sweep
+//   dag : non-zero flips the sign of the Linv/Uinv couplings to slice Ls-1
+// tmp holds the result of the Dinv sweep and is read back by Uinv.
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  const int sign   = dag ? (-1) : 1;
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+
+  FermionField tmp(psi.Grid());
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //Linv
+  ///////////////////////////////////////////////////////////////////////////////////////
+  axpy(chi,0.0,psi,psi); // Identity piece
+
+  for(int b=0;b<nblock;b++){
+    const int   s     = 2*b;
+    const RealD pp    = p[nblock-1-b];
+    const RealD qq    = q[nblock-1-b];
+    const RealD denom = dw_diag*dw_diag + amax*amax* qq; // shared by both coefficients
+    const RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / denom;
+    const RealD coeff2=sign*sqrt(amax*pp)*dw_diag / denom; // Implicit g5 here
+    axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s);
+    axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
+  // Compute Seeinv (coeff of gamma5)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  const RealD R=(1+mass)/(1-mass);
+  RealD Seeinv = R + p[nblock]*dw_diag/amax;
+  for(int b=0;b<nblock;b++){
+    Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
+  }
+  Seeinv = 1.0/Seeinv;
+
+  for(int b=0;b<nblock;b++){
+    const int   s     = 2*b;
+    const RealD qq    = q[nblock-1-b]; // p is not needed in this sweep
+    const RealD denom = dw_diag*dw_diag + amax*amax* qq;
+    const RealD coeff1=dw_diag / denom; // Implicit g5 here
+    const RealD coeff2=amax*sqrt(qq) / denom;
+    ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
+    ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
+  }
+  ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Uinv
+  ///////////////////////////////////////////////////////////////////////////////////////
+  for(int b=0;b<nblock;b++){
+    const int   s     = 2*b;
+    const RealD pp    = p[nblock-1-b];
+    const RealD qq    = q[nblock-1-b];
+    const RealD denom = dw_diag*dw_diag + amax*amax* qq;
+    const RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / denom;
+    const RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / denom; // Implicit g5 here
+    axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1);
+    axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
+  }
+  axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
+}
+
+// Full operator shared by M and Mdag.
+//   psi : input field
+//   chi : output field (every slice 0..Ls-1 is written)
+//   dag : non-zero flips the sign of the couplings between slice Ls-1 and the
+//         odd slices; DW itself is always applied with DaggerNo.
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+  const int sign   = dag ? (-1) : 1;
+
+  FermionField D(psi.Grid());
+
+  // For partial frac Hw case (b5=c5=1) chroma quirkily computes
+  //
+  // Conventions for partfrac appear to be a mess.
+  // Tony's Nara lectures have
+  //
+  // BlockDiag(  H/p_i  1             | 1       )
+  //          (  1      p_i H / q_i^2 | 0       )
+  //           ---------------------------------
+  //           ( -1      0                | R  +p0 H  )
+  //
+  //Chroma     ( -2H    2sqrt(q_i)    |   0         )
+  //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
+  //           ---------------------------------
+  //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
+  //
+  // Edwards/Joo/Kennedy/Wenger
+  //
+  // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
+  // incorporate the approx scale factor. This is obtained by propagating the
+  // scale on "H" out to the off diagonal elements as follows:
+  //
+  // BlockDiag(  H/p_i  1             | 1       )
+  //          (  1      p_i H / q_i^2 | 0       )
+  //           ---------------------------------
+  //          ( -1      0                | R  + p_0 H  )
+  //
+  // becomes:
+  // BlockDiag(  H/ sp_i  1               | 1             )
+  //          (  1      sp_i H / s^2q_i^2 | 0             )
+  //           ---------------------------------
+  //           ( -1      0                | R + p_0/s H   )
+  //
+  //
+  // This is implemented in Chroma by
+  //           p0' = p0/approxMax
+  //           p_i' = p_i*approxMax
+  //           q_i' = q_i*approxMax*approxMax
+  //
+  // After the equivalence transform is applied the matrix becomes
+  //
+  //Chroma     ( -2H    sqrt(q'_i)    |   0         )
+  //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
+  //           ---------------------------------
+  //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
+  //
+  //     =     ( -2H    sqrt(q_i)amax    |   0              )
+  //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
+  //           ---------------------------------
+  //           ( 0     -sqrt(p_i*amax)   |  2 R gamma_5 + p0/amax 2H
+  //
+
+  this->DW(psi,D,DaggerNo);
+
+  for(int b=0;b<nblock;b++){
+    const int    lo = 2*b;   // first slice of this 2x2 block
+    const int    hi = lo+1;  // second slice of this 2x2 block
+    const double pp = p[nblock-1-b];
+    const double qq = q[nblock-1-b];
+    const double offdiag = amax*sqrt(qq)*scale;
+    const double lastcol = sqrt(amax*pp)*scale*sign;
+
+    // 2x2 block aligned at lo; multiplies Dw by G5 so Hw
+    ag5xpby_ssp(chi,-1.0*scale,D,offdiag,psi,lo,hi);
+    ag5xpby_ssp(chi, 1.0*scale,D,offdiag,psi,hi,lo);
+
+    // Pick up last column
+    axpby_ssp  (chi, 1.0, chi,lastcol,psi,hi,Ls-1);
+  }
+
+  // Last row: chi[Ls-1] is overwritten first and then accumulated into.
+  {
+    const double R = (1+this->mass)/(1-this->mass);
+
+    // R g5 psi[Ls-1] + p[nblock]/amax H
+    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
+
+    for(int b=0;b<nblock;b++){
+      const int    hi      = 2*b+1;
+      const double pp      = p[nblock-1-b];
+      const double lastrow = -sqrt(amax*pp)*scale*sign;
+      axpby_ssp(chi,1.0,chi,lastrow,psi,Ls-1,hi);
+    }
+  }
+}
+
+// Apply M_internal without dagger: out is overwritten, norm2(out) is returned.
+template<class Impl>
+RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
+{
+  this->M_internal(in,out,DaggerNo);
+  const RealD out_norm2 = norm2(out);
+  return out_norm2;
+}
+// Apply M_internal with dagger: out is overwritten, norm2(out) is returned.
+template<class Impl>
+RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
+{
+  this->M_internal(in,out,DaggerYes);
+  const RealD out_norm2 = norm2(out);
+  return out_norm2;
+}
+
+// Forwards to Meooe_internal with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
+{
+  this->Meooe_internal(in,out,DaggerNo);
+}
+// Forwards to Meooe_internal with DaggerYes.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
+{
+  this->Meooe_internal(in,out,DaggerYes);
+}
+// Forwards to Mooee_internal with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
+{
+  this->Mooee_internal(in,out,DaggerNo);
+}
+// Forwards to Mooee_internal with DaggerYes.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
+{
+  this->Mooee_internal(in,out,DaggerYes);
+}
+
+// Forwards to MooeeInv_internal with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
+{
+  this->MooeeInv_internal(in,out,DaggerNo);
+}
+// Forwards to MooeeInv_internal with DaggerYes.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
+{
+  this->MooeeInv_internal(in,out,DaggerYes);
+}
+
+
+// force terms; five routines; default to Dhop on diagonal
+// Force term for the full operator.
+// D is built from U by the per-slice rescaling (-scale / +scale on each slice
+// pair, p[nblock]*scale/amax on slice Ls-1) and (D,V) is handed to DhopDeriv.
+// NOTE(review): dag is not read; DhopDeriv is always called with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+
+  FermionField D(V.Grid());
+
+  // Walk the slice pairs (s,s+1)
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(D,-scale,U,0.0,U,s  ,s  );
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
+  }
+
+  // Unpaired final slice
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+
+  this->DhopDeriv(mat,D,V,DaggerNo);
+}
+// Force term, OE variant.
+// D is built from U by the per-slice rescaling (-scale / +scale on each slice
+// pair, p[nblock]*scale/amax on slice Ls-1) and (D,V) is handed to DhopDerivOE.
+// NOTE(review): dag is not read; DhopDerivOE is always called with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+
+  FermionField D(V.Grid());
+
+  // Walk the slice pairs (s,s+1)
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(D,-scale,U,0.0,U,s  ,s  );
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
+  }
+
+  // Unpaired final slice
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+
+  this->DhopDerivOE(mat,D,V,DaggerNo);
+}
+// Force term, EO variant.
+// D is built from U by the per-slice rescaling (-scale / +scale on each slice
+// pair, p[nblock]*scale/amax on slice Ls-1) and (D,V) is handed to DhopDerivEO.
+// NOTE(review): dag is not read; DhopDerivEO is always called with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+
+  FermionField D(V.Grid());
+
+  // Walk the slice pairs (s,s+1)
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(D,-scale,U,0.0,U,s  ,s  );
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
+  }
+
+  // Unpaired final slice
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+
+  this->DhopDerivEO(mat,D,V,DaggerNo);
+}
+
+// Thin front end to SetCoefficientsZolotarev: the upper bound passed on is
+// 1.0/scale.  Note that the parameter `scale` hides the member of the same
+// name inside this function.
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
+  const RealD zolo_hi = 1.0/scale;
+  this->SetCoefficientsZolotarev(zolo_hi,zdata);
+}
+// Load the partial-fraction coefficients from a zolotarev_data record.
+//   zolo_hi : stored as amax
+//   zdata   : alpha[0..da) is copied into p, ap[0..dd) into q with flipped sign
+// Also refreshes R (from mass), dw_diag (from M5) and scale (2.0 when
+// part_frac_chroma_convention is set, 1.0 otherwise).
+// Asserts Ls == 2*zdata->da - 1.
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
+
+  // Debug aid when the degrees do not match:
+  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  const int Ls = this->Ls;
+
+  assert(Ls == (2*zdata->da -1) );
+
+  // Part frac: these are members, not locals
+  R       = (1+mass)/(1-mass);
+  dw_diag = (4.0-this->M5);
+
+  // Numerator coefficients
+  const auto np = zdata->da;
+  p.resize(np);
+  for(int i=0;i<np;i++){
+    p[i] = zdata->alpha[i];
+  }
+
+  // Denominator coefficients, stored with the opposite sign
+  const auto nq = zdata->dd;
+  q.resize(nq);
+  for(int i=0;i<nq;i++){
+    q[i] = -zdata->ap[i];
+  }
+
+  // Chroma conventions carry an extra overall factor of two
+  if ( part_frac_chroma_convention ) {
+    scale = 2.0;
+  } else {
+    scale = 1.0;
+  }
+
+  amax=zolo_hi;
+}
+
+    // Pull the 4d solution out of a 5d solution field.
+    // Both fields are first checked with conformable() against FermionGrid()
+    // and GaugeGrid() respectively; the extraction is a single ExtractSlice call.
+    // NOTE(review): both trailing ExtractSlice arguments are Ls-1 -- confirm the
+    // last one against ExtractSlice's parameter list (slice index vs dimension).
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      const int last = this->Ls - 1;
+
+      conformable(solution5d.Grid(),this->FermionGrid());
+      conformable(exported4d.Grid(),this->GaugeGrid());
+
+      ExtractSlice(exported4d, solution5d, last, last);
+    }
+    // Build the 5d source from a 4d input field:
+    //   1. zero a 5d scratch field and InsertSlice the 4d input into it,
+    //   2. multiply the scratch field by Gamma5,
+    //   3. apply Dminus, writing the result to imported5d.
+    // NOTE(review): both trailing InsertSlice arguments are Ls-1 -- confirm the
+    // last one against InsertSlice's parameter list (slice index vs dimension).
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      const int last = this->Ls - 1;
+
+      conformable(imported5d.Grid(),this->FermionGrid());
+      conformable(input4d.Grid()   ,this->GaugeGrid());
+
+      FermionField scratch(this->FermionGrid());
+      scratch = Zero();
+      InsertSlice(input4d, scratch, last, last);
+
+      scratch = Gamma(Gamma::Algebra::Gamma5)*scratch;
+      this->Dminus(scratch,imported5d);
+    }
+
+// Constructors
+// Forwards the gauge field, the four grids, M5 and the implementation
+// parameters to WilsonFermion5D, stores the mass, and installs default
+// coefficients obtained from Approx::higham(1.0, Ls-1).
+// Ls must be odd (asserted).
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+                                                         GridCartesian         &FiveDimGrid,
+                                                         GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                         GridCartesian         &FourDimGrid,
+                                                         GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                         RealD _mass,RealD M5,
+                                                         const ImplParams &p) :
+  WilsonFermion5D<Impl>(_Umu,
+                        FiveDimGrid, FiveDimRedBlackGrid,
+                        FourDimGrid, FourDimRedBlackGrid,M5,p),
+  mass(_mass)
+{
+  const int Ls = this->Ls;
+
+  assert((Ls & 0x1) != 0); // Odd Ls required
+
+  const int nrational = Ls-1;
+  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);
+
+  // NB: chroma uses a cast to "float" for the zolotarev range(!?).
+  // this creates a real difference in the operator which I do not like but we can replicate here
+  // to demonstrate compatibility
+  //      RealD eps = (zolo_lo / zolo_hi);
+  //      zdata = bfm_zolotarev(eps,nrational,0);
+
+  this->SetCoefficientsTanh(zdata,1.0);
+
+  // The coefficients have been copied out, so the record can be released
+  Approx::zolotarev_free(zdata);
+}
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@ -26,6 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+#pragma once
+
 #include <Grid/Grid.h>

 #ifdef AVX512
@ -586,11 +588,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VADD(UChi_00,UChi_10,UChi_00)				\
  VADD(UChi_01,UChi_11,UChi_01)				\
  VADD(UChi_02,UChi_12,UChi_02)	);			\
-  asm (									\
-       VSTORE(0,%0,pUChi_00)						\
-       VSTORE(1,%0,pUChi_01)						\
-       VSTORE(2,%0,pUChi_02)						\
-       : : "r" (out) : "memory" );
+  asm (							\
+  VSTORE(0,%0,pUChi_00)					\
+  VSTORE(1,%0,pUChi_01)					\
+  VSTORE(2,%0,pUChi_02)					\
+  : : "r" (out) : "memory" );

 // FIXME is sign right in the VSUB ?
 #define nREDUCEa(out)					\
@ -613,20 +615,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
      permute##dir(Chi_1,Chi_1);\
      permute##dir(Chi_2,Chi_2);

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-					 DoubledGaugeField &U, DoubledGaugeField &UUU,
-					 SiteSpinor *buf, int LLs, int sU, 
-					 const FermionField &in, FermionField &out,int dag) 
+					 DoubledGaugeFieldView &U,
+					 DoubledGaugeFieldView &UUU,
+					 SiteSpinor *buf, int LLs,
+					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  assert(0);
 };


-//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in._odata[o] ; } else { out =(uint64_t) &buf[o]; }
+//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in[o] ; } else { out =(uint64_t) &buf[o]; }

 #define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }

@ -673,22 +675,23 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  CONDITIONAL_MOVE(l3,o3,addr3);					\
  PF_CHI(addr3);							\
  									\
-  gauge0 =(uint64_t)&UU._odata[sU]( X );				\
-  gauge1 =(uint64_t)&UU._odata[sU]( Y );				\
-  gauge2 =(uint64_t)&UU._odata[sU]( Z );				\
-  gauge3 =(uint64_t)&UU._odata[sU]( T ); 
+  gauge0 =(uint64_t)&UU[sU]( X );				\
+  gauge1 =(uint64_t)&UU[sU]( Y );				\
+  gauge2 =(uint64_t)&UU[sU]( Z );				\
+  gauge3 =(uint64_t)&UU[sU]( T ); 
  
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeField &U, DoubledGaugeField &UUU,
-								    SiteSpinor *buf, int LLs, int sU, 
-								    const FermionField &in, FermionField &out,int dag) 
+								    DoubledGaugeFieldView &U,
+								    DoubledGaugeFieldView &UUU,
+								    SiteSpinor *buf, int LLs,
+								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@ -719,7 +722,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
    LOAD_CHI(addr0,addr1,addr2,addr3);
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);

-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) {
      nREDUCE(addr0);
    } else { 
@ -734,14 +737,15 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl

 #include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeField &U, DoubledGaugeField &UUU,
-								    SiteSpinor *buf, int LLs, int sU, 
-								    const FermionField &in, FermionField &out,int dag) 
+								    DoubledGaugeFieldView &U,
+								    DoubledGaugeFieldView &UUU,
+								    SiteSpinor *buf, int LLs,
+								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@ -771,7 +775,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
    LOAD_CHI(addr0,addr1,addr2,addr3);
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);

-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) {
      nREDUCE(addr0);
    } else { 
@ -818,14 +822,15 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl

 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeField &U, DoubledGaugeField &UUU,
-							       SiteSpinor *buf, int LLs, int sU, 
-							       const FermionField &in, FermionField &out,int dag) 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
+							       SiteSpinor *buf, int LLs,
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@ -872,7 +877,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
    PERMUTE23;
    MULT_ADD_XYZT(gauge2,gauge3);  

-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) { 
      nREDUCEa(addr0);
    } else { 
@ -886,14 +891,15 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,

 #include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeField &U, DoubledGaugeField &UUU,
-							       SiteSpinor *buf, int LLs, int sU, 
-							       const FermionField &in, FermionField &out,int dag) 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
+							       SiteSpinor *buf, int LLs,
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in._odata[0];
+  const SiteSpinor *in_p; in_p = &in[0];

  int o0,o1,o2,o3; // offsets
  int l0,l1,l2,l3; // local 
@ -940,7 +946,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
    PERMUTE23;
    MULT_ADD_XYZT(gauge2,gauge3);  
    
-    addr0 = (uint64_t) &out._odata[sF];
+    addr0 = (uint64_t) &out[sF];
    if ( dag ) {
      nREDUCEa(addr0);
    } else { 
@ -952,17 +958,5 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
 #endif
 }

-#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL)			    \
-  template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo,	\
-				  DoubledGaugeField &U,			\
-				  DoubledGaugeField &UUU,		\
-				  SiteSpinor *buf, int LLs,		\
-				  int sU, const FermionField &in, FermionField &out,int dag);
-
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
-KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
-
-}}
+NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@ -28,6 +28,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #include <Grid/Grid.h>

+#pragma once
+
+NAMESPACE_BEGIN(Grid);

 #define LOAD_CHI(b)		\
  const SiteSpinor & ref (b[offset]);	\
@ -38,7 +41,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
-  auto & ref(U._odata[sU](A));			\
+  auto & ref(U[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));      \
   Impl::loadLinkElement(U_10,ref()(1,0));      \
   Impl::loadLinkElement(U_20,ref()(2,0));      \
@ -59,7 +62,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi ## _2 += U_22*Chi_2;

 #define MULT_ADD(U,A,UChi)			\
-  auto & ref(U._odata[sU](A));			\
+  auto & ref(U[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));      \
   Impl::loadLinkElement(U_10,ref()(1,0));      \
   Impl::loadLinkElement(U_20,ref()(2,0));      \
@ -92,7 +95,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(in._odata);					\
+    LOAD_CHI(in);					\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
@ -120,14 +123,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(in._odata);				\
+    LOAD_CHI(in);				\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
    LOAD_CHI(buf);					\
  }							\
-  if (SE->_is_local || st.same_node[Dir] ) {		\
+  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
  }

@ -135,22 +138,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
    { LOAD_CHI(buf);	  }					\
    { MULT_ADD(U,Dir,even); }					\
  }								

-namespace Grid {
-namespace QCD {
-

 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
-					  DoubledGaugeField &U,DoubledGaugeField &UUU,
+					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 					  SiteSpinor *buf, int LLs, int sU, 
-					  const FermionField &in, FermionField &out,int dag) 
+					  const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -213,16 +212,16 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out._odata[sF],result);
+    vstream(out[sF],result);
  }
 }


 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out,int dag) 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -249,7 +248,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
  Simd U_22; 

  SiteSpinor result;
-  int offset,local,perm, ptype;
+  int offset, ptype, local, perm;

  StencilEntry *SE;
  int skew;
@ -257,8 +256,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
  for(int s=0;s<LLs;s++){
    int sF=s+LLs*sU;

-    even_0 = zero;    even_1 = zero;    even_2 = zero;
-     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();

    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@ -289,16 +288,16 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out._odata[sF],result);
+    vstream(out[sF],result);
  }
 }


 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out,int dag) 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -325,7 +324,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  Simd U_22; 

  SiteSpinor result;
-  int offset,local,perm, ptype;
+  int offset, ptype, local;

  StencilEntry *SE;
  int skew;
@ -333,8 +332,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  for(int s=0;s<LLs;s++){
    int sF=s+LLs*sU;

-    even_0 = zero;    even_1 = zero;    even_2 = zero;
-     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@ -366,34 +365,29 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      out._odata[sF] = out._odata[sF] + result;
+      out[sF] = out[sF] + result;
    }
  }
 }

-
 #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionField &in, FermionField &out, int dag); \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionField &in, FermionField &out, int dag); \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionField &in, FermionField &out, int dag); \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \

-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
+#undef LOAD_CHI
+
+NAMESPACE_END(Grid);


-}
-}
-
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@ -28,40 +28,38 @@ directory
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>

-namespace Grid {
-namespace QCD {
+#pragma once

-int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
+NAMESPACE_BEGIN(Grid);

 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
-      permute(chi,  in._odata[SE->_offset], ptype);		\
+      permute(chi,  in[SE->_offset], ptype);		\
    } else {							\
-      chi_p = &in._odata[SE->_offset];				\
+      chi_p = &in[SE->_offset];				\
    }								\
  } else {							\
    chi_p = &buf[SE->_offset];					\
  }								\
-  multLink(Uchi, U._odata[sU], *chi_p, Dir);			
+  multLink(Uchi, U[sU], *chi_p, Dir);			

 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
-      permute(chi,  in._odata[SE->_offset], ptype);		\
+      permute(chi,  in[SE->_offset], ptype);		\
    } else {							\
-      chi_p = &in._odata[SE->_offset];				\
+      chi_p = &in[SE->_offset];				\
    }								\
  } else if ( st.same_node[Dir] ) {				\
    chi_p = &buf[SE->_offset];					\
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
+    multLink(Uchi, U[sU], *chi_p, Dir);			\
  }

 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
@ -69,7 +67,7 @@ int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
+    multLink(Uchi, U[sU], *chi_p, Dir);			\
  }

 template <class Impl>
@ -81,9 +79,9 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out, int dag) {
+					     const FermionFieldView &in, FermionFieldView &out, int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
@ -114,7 +112,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    if ( dag ) { 
      Uchi = - Uchi;
    } 
-    vstream(out._odata[sF], Uchi);
+    vstream(out[sF], Uchi);
  }
 };

@ -123,9 +121,9 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  ///////////////////////////////////////////////////
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeField &U, DoubledGaugeField &UUU,
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int LLs, int sU, 
-						const FermionField &in, FermionField &out,int dag) {
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
@ -136,7 +134,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  for(int s=0;s<LLs;s++){
    int sF=LLs*sU+s;
    skew = 0;
-    Uchi=zero;
+    Uchi=Zero();
    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
@ -157,7 +155,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    if ( dag ) {
      Uchi = - Uchi;
    }
-    vstream(out._odata[sF], Uchi);
+    vstream(out[sF], Uchi);
  }
 };

@ -167,11 +165,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  ///////////////////////////////////////////////////
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeField &U, DoubledGaugeField &UUU,
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int LLs, int sU,
-						const FermionField &in, FermionField &out,int dag) {
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
-  SiteSpinor chi;
+  //  SiteSpinor chi;  // disabled: presumably unused by the exterior-only legs (they read from buf) -- confirm
  SiteSpinor Uchi;
  StencilEntry *SE;
  int ptype;
@ -181,7 +179,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
  for(int s=0;s<LLs;s++){
    int sF=LLs*sU+s;
    skew = 0;
-    Uchi=zero;
+    Uchi=Zero();
    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
@ -202,9 +200,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &

    if ( nmu ) { 
      if ( dag ) { 
-	out._odata[sF] = out._odata[sF] - Uchi;
+	out[sF] = out[sF] - Uchi;
      } else { 
-	out._odata[sF] = out._odata[sF] + Uchi;
+	out[sF] = out[sF] + Uchi;
      }
    }
  }
@ -215,9 +213,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 ////////////////////////////////////////////////////////////////////////////////////

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
+void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					 SiteSpinor *buf, int LLs, int sU,
-					 const FermionField &in, FermionField &out,
+					 const FermionFieldView &in, FermionFieldView &out,
 					 int interior,int exterior)
 {
  int dag=1;
@ -225,9 +223,9 @@ void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, Dou
 };

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs, int sU,
-				      const FermionField &in, FermionField &out,
+				      const FermionFieldView &in, FermionFieldView &out,
 				      int interior,int exterior)
 {
  int dag=0;
@ -235,9 +233,9 @@ void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double
 };

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs,
-				      int sU, const FermionField &in, FermionField &out,
+				      int sU, const FermionFieldView &in, FermionFieldView &out,
 				      int dag,int interior,int exterior) 
 {
  switch(Opt) {
@ -277,8 +275,8 @@ void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double
 };

 template <class Impl>
-void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,  DoubledGaugeField &UUU, SiteSpinor *buf, int sF,
-				      int sU, const FermionField &in, FermionField &out, int dir, int disp) 
+void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
+					    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
 {
  // Disp should be either +1,-1,+3,-3
  // What about "dag" ?
@ -287,8 +285,6 @@ void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,  Do
  assert(0);
 }

-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+NAMESPACE_END(Grid);

-}}

--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@ -26,23 +26,21 @@
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
-//#include <Grid/Eigen/Dense>
-#include <Grid/qcd/spin/Dirac.h>

-namespace Grid
-{
-namespace QCD
-{
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
+
+NAMESPACE_BEGIN(Grid);

 // *NOT* EO
 template <class Impl>
 RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
-  FermionField temp(out._grid);
+  FermionField temp(out.Grid());

  // Wilson term
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  this->Dhop(in, out, DaggerNo);

  // Clover term
@ -55,10 +53,10 @@ RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 template <class Impl>
 RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
-  FermionField temp(out._grid);
+  FermionField temp(out.Grid());

  // Wilson term
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  this->Dhop(in, out, DaggerYes);

  // Clover term
@ -72,7 +70,7 @@ template <class Impl>
 void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu._grid;
+  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);

  // Compute the field strength terms mu>nu
@ -93,27 +91,29 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  CloverTerm += fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;

-  int lvol = _Umu._grid->lSites();
+  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;

  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);

-  std::vector<int> lcoor;
-  typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
+  Coordinate lcoor;
+  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();

  for (int site = 0; site < lvol; site++)
  {
    grid->LocalIndexToLocalCoor(site, lcoor);
    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = zero;
+    Qxinv = Zero();
    //if (csw!=0){
    for (int j = 0; j < Ns; j++)
      for (int k = 0; k < Ns; k++)
        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
+          for (int b = 0; b < DimRep; b++){
+	    auto zz =  Qx()(j, k)(a, b);
+            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
+	  }
    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;

    EigenInvCloverOp = EigenCloverOp.inverse();
@ -169,15 +169,15 @@ void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField
 template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  CloverFieldType *Clover;
-  assert(in.checkerboard == Odd || in.checkerboard == Even);
+  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);

  if (dag)
  {
-    if (in._grid->_isCheckerBoarded)
+    if (in.Grid()->_isCheckerBoarded)
    {
-      if (in.checkerboard == Odd)
+      if (in.Checkerboard() == Odd)
      {
        Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
      }
@ -195,10 +195,10 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
  }
  else
  {
-    if (in._grid->_isCheckerBoarded)
+    if (in.Grid()->_isCheckerBoarded)
    {

-      if (in.checkerboard == Odd)
+      if (in.Checkerboard() == Odd)
      {
        //  std::cout << "Calling clover term Odd" << std::endl;
        Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
@ -209,7 +209,7 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
      out = *Clover * in;
-      //  std::cout << GridLogMessage << "*Clover.checkerboard "  << (*Clover).checkerboard << std::endl;
+      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
@ -235,9 +235,4 @@ void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U,
  assert(0); // not implemented yet
 }

-FermOpTemplateInstantiate(WilsonCloverFermion);
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -36,13 +36,8 @@ Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>
 #include <Grid/perfmon/PerfCount.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);
  
-// S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
-const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
-
  // 5d lattice for DWF.
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
@ -56,9 +51,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p),
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
@ -105,8 +100,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

    for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]=1);
-      assert(FourDimRedBlackGrid._simd_layout[d]=1);
+      assert(FourDimGrid._simd_layout[d]==1);
+      assert(FourDimRedBlackGrid._simd_layout[d]==1);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
    }

@ -141,7 +136,7 @@ void WilsonFermion5D<Impl>::Report(void)
  RealD NP     = _FourDimGrid->_Nprocessors;
  RealD NN     = _FourDimGrid->NodeCount();
  RealD volume = Ls;  
-  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+  Coordinate latt = _FourDimGrid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];

  if ( DhopCalls > 0 ) {
@ -221,7 +216,7 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
-  GaugeField HUmu(_Umu._grid);
+  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
@ -235,51 +230,43 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  //  assert( (disp==1)||(disp==-1) );
  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;

-  Compressor compressor(DaggerNo);
-  Stencil.HaloExchange(in,compressor);
-  
  int skip = (disp==1) ? 0 : 1;
-
  int dirdisp = dir+skip*4;
  int gamma   = dir+(1-skip)*4;

-  assert(dirdisp<=7);
-  assert(dirdisp>=0);
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+  
+  uint64_t Nsite = Umu.Grid()->oSites();
+  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);

-  parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
-    for(int s=0;s<Ls;s++){
-      int sU=ss;
-      int sF = s+Ls*sU; 
-      Kernels::DhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
-    }
-  }
 };

 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
-            DoubledGaugeField & U,
-            GaugeField &mat,
-            const FermionField &A,
-            const FermionField &B,
-            int dag)
+					  DoubledGaugeField & U,
+					  GaugeField &mat,
+					  const FermionField &A,
+					  const FermionField &B,
+					  int dag)
 {
  DerivCalls++;
  assert((dag==DaggerNo) ||(dag==DaggerYes));

-  conformable(st._grid,A._grid);
-  conformable(st._grid,B._grid);
+  conformable(st.Grid(),A.Grid());
+  conformable(st.Grid(),B.Grid());

  Compressor compressor(dag);
  
-  FermionField Btilde(B._grid);
-  FermionField Atilde(B._grid);
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());

  DerivCommTime-=usecond();
  st.HaloExchange(B,compressor);
  DerivCommTime+=usecond();

  Atilde=A;
-  int LLs = B._grid->_rdimensions[0];
+  int LLs = B.Grid()->_rdimensions[0];


  DerivComputeTime-=usecond();
@ -295,21 +282,11 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    ////////////////////////

    DerivDhopComputeTime -= usecond();
-    parallel_for (int sss = 0; sss < U._grid->oSites(); sss++) {
-      for (int s = 0; s < Ls; s++) {
-        int sU = sss;
-        int sF = s + Ls * sU;

-        assert(sF < B._grid->oSites());
-        assert(sU < U._grid->oSites());
+    int Usites = U.Grid()->oSites();

-        Kernels::DhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
+    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);

-        ////////////////////////////
-        // spin trace outer product
-        ////////////////////////////
-      }
-    }
    ////////////////////////////
    // spin trace outer product
    ////////////////////////////
@ -325,12 +302,13 @@ void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
                                      const FermionField &B,
                                      int dag)
 {
-  conformable(A._grid,FermionGrid());  
-  conformable(A._grid,B._grid);
+  conformable(A.Grid(),FermionGrid());  
+  conformable(A.Grid(),B.Grid());

-  //conformable(GaugeGrid(),mat._grid);// this is not general! leaving as a comment
+  //conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment

-  mat.checkerboard = A.checkerboard;
+  mat.Checkerboard() = A.Checkerboard();
+  //  superseded member-access form of the line above: mat.checkerboard = A.checkerboard;

  DerivInternal(Stencil,Umu,mat,A,B,dag);
 }
@ -341,12 +319,12 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
                                        const FermionField &B,
                                        int dag)
 {
-  conformable(A._grid,FermionRedBlackGrid());
-  conformable(A._grid,B._grid);
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());

-  assert(B.checkerboard==Odd);
-  assert(A.checkerboard==Even);
-  mat.checkerboard = Even;
+  assert(B.Checkerboard()==Odd);
+  assert(A.Checkerboard()==Even);
+  mat.Checkerboard() = Even;

  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
 }
@ -358,12 +336,12 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
                                        const FermionField &B,
                                        int dag)
 {
-  conformable(A._grid,FermionRedBlackGrid());
-  conformable(A._grid,B._grid);
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());

-  assert(B.checkerboard==Even);
-  assert(A.checkerboard==Odd);
-  mat.checkerboard = Odd;
+  assert(B.Checkerboard()==Even);
+  assert(A.Checkerboard()==Odd);
+  mat.Checkerboard() = Odd;

  DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
 }
@ -374,11 +352,9 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
-#endif
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
@ -389,131 +365,84 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
-#ifdef GRID_OMP
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-
  Compressor compressor(dag);

-  int LLs = in._grid->_rdimensions[0];
-  int len =  U._grid->oSites();
+  int LLs = in.Grid()->_rdimensions[0];
+  int len =  U.Grid()->oSites();

+  /////////////////////////////
+  // Start comms. Open question: should the intranode and extranode gathers be differentiated?
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  double ctime=0;
-  double ptime=0;
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);

-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  { 
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid = tid - ncomms;
-      int n = U._grid->oSites();
-      int chunk = n / nthreads;
-      int rem = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-	myblock = ttid * chunk + ttid;
-	myn = chunk+1;
-      } else {
-	myblock = ttid*chunk + rem;
-	myn = chunk;
-      }
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
      
-      // do the compute
-      if (dag == DaggerYes) {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
-	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
-	}
-      } else {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
-	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
-	}
-      }
-	ptime = usecond() - start;
-    } else {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
+  DhopComputeTime-=usecond();
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
+  DhopComputeTime+=usecond();

-  // First to enter, last to leave timing
-  st.CollateThreads();
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  DhopCommTime   +=usecond();

+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
-    }
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
-#else 
-  assert(0);
-#endif
 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-					 DoubledGaugeField & U,
-					 const FermionField &in, FermionField &out,int dag)
+						    DoubledGaugeField & U,
+						    const FermionField &in, 
+						    FermionField &out,int dag)
 {
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  Compressor compressor(dag);

-  int LLs = in._grid->_rdimensions[0];
+  int LLs = in.Grid()->_rdimensions[0];
  
  DhopCommTime-=usecond();
  st.HaloExchangeOpt(in,compressor);
  DhopCommTime+=usecond();
  
  DhopComputeTime-=usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-
+  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU = ss;
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU = ss;
-      int sF = LLs * sU;
-      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 }
@ -523,11 +452,11 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check

-  assert(in.checkerboard==Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard()==Even);
+  out.Checkerboard() = Odd;

  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 }
@ -535,11 +464,11 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
-  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check

-  assert(in.checkerboard==Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard()==Odd);
+  out.Checkerboard() = Even;

  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 }
@ -547,17 +476,17 @@ template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
-  conformable(in._grid,FermionGrid()); // verifies full grid
-  conformable(in._grid,out._grid);
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());

-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
 {
-  out.checkerboard=in.checkerboard;
+  out.Checkerboard()=in.Checkerboard();
  Dhop(in,out,dag); // -0.5 is included
  axpy(out,4.0-M5,in,out);
 }
@ -569,7 +498,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  GridBase *_grid = _FourDimGrid;
  GridBase *_5dgrid = _FiveDimGrid;

-  conformable(_5dgrid,out._grid);
+  conformable(_5dgrid,out.Grid());

  FermionField   PRsource(_5dgrid);
  FermionField   PLsource(_5dgrid);
@ -580,7 +509,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  FermionField   bufL_4d(_grid);
  FermionField   bufR_4d(_grid);

-  unsigned int Ls = in._grid->_rdimensions[0];
+  unsigned int Ls = in.Grid()->_rdimensions[0];
  
  typedef typename FermionField::vector_type vector_type;
  typedef typename FermionField::scalar_type ScalComplex;
@ -596,12 +525,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const

  Gamma g5(Gamma::Algebra::Gamma5);

-  std::vector<int> latt_size   = _grid->_fdimensions;
+  Coordinate latt_size   = _grid->_fdimensions;

-  LatComplex    sk(_grid);  sk = zero;
-  LatComplex    sk2(_grid); sk2= zero;
-  LatComplex    W(_grid); W= zero;
-  LatComplex    a(_grid); a= zero;
+  LatComplex    sk(_grid);  sk = Zero();
+  LatComplex    sk2(_grid); sk2= Zero();
+  LatComplex    W(_grid); W= Zero();
+  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
  LatComplex 	cosha(_grid);
  LatComplex 	kmu(_grid);
@ -643,9 +572,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const

  // FIXME Need a Lattice acosh
  for(int idx=0;idx<_grid->lSites();idx++){
-    std::vector<int> lcoor(Nd);
+    Coordinate lcoor(Nd);
    Tcomplex cc;
-    RealD sgn;
+    //    RealD sgn;
    _grid->LocalIndexToLocalCoor(idx,lcoor);
    peekLocalSite(cc,cosha,lcoor);
    assert((double)real(cc)>=1.0);
@ -678,8 +607,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  //calculate GR, GL
  for(unsigned int ss=1;ss<=Ls;ss++)
  {
-    bufR_4d = zero;
-    bufL_4d = zero;
+    bufR_4d = Zero();
+    bufL_4d = Zero();
    for(unsigned int tt=1;tt<=Ls;tt++)
    {
      //possible sign if W<0
@ -688,7 +617,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const

      unsigned int f = (ss > tt) ? ss-tt : tt-ss; //f = abs(ss-tt)
      //GR
-      buf1_4d = zero;
+      buf1_4d = Zero();
      ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
      //G(s,t)
      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
@ -702,7 +631,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;

      //GL
-      buf2_4d = zero;
+      buf2_4d = Zero();
      ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
      //G(s,t)
      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
@ -722,13 +651,13 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
 //calculate propagator
  for(unsigned int ss=1;ss<=Ls;ss++)
  {
-    bufR_4d = zero;
-    bufL_4d = zero;
+    bufR_4d = Zero();
+    bufL_4d = Zero();

    //(i*gamma_mu*sin(p_mu) - W)*(GL*P- source)
-    buf1_4d = zero;
+    buf1_4d = Zero();
    ExtractSlice(buf1_4d, GL, (ss-1), 0);
-    buf2_4d = zero;
+    buf2_4d = Zero();
    for(int mu=0;mu<Nd;mu++) {
      LatticeCoordinate(kmu,mu);
      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
@ -738,9 +667,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
    bufL_4d = buf2_4d - W * buf1_4d;

    //(i*gamma_mu*sin(p_mu) - W)*(GR*P+ source)
-    buf1_4d = zero;
+    buf1_4d = Zero();
    ExtractSlice(buf1_4d, GR, (ss-1), 0);
-    buf2_4d = zero;
+    buf2_4d = Zero();
    for(int mu=0;mu<Nd;mu++) {
      LatticeCoordinate(kmu,mu);
      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
@ -781,7 +710,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
 {
  // what type LatticeComplex 
  GridBase *_grid = _FourDimGrid;
-  conformable(_grid,out._grid);
+  conformable(_grid,out.Grid());
  
  typedef typename FermionField::vector_type vector_type;
  typedef typename FermionField::scalar_type ScalComplex;
@ -795,17 +724,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
    Gamma::Algebra::GammaT
  };

-  std::vector<int> latt_size   = _grid->_fdimensions;
+  Coordinate latt_size   = _grid->_fdimensions;

  
-  FermionField   num  (_grid); num  = zero;
+  FermionField   num  (_grid); num  = Zero();

-  LatComplex    sk(_grid);  sk = zero;
-  LatComplex    sk2(_grid); sk2= zero;
-  LatComplex    W(_grid); W= zero;
-  LatComplex    a(_grid); a= zero;
+  LatComplex    sk(_grid);  sk = Zero();
+  LatComplex    sk2(_grid); sk2= Zero();
+  LatComplex    W(_grid); W= Zero();
+  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
-  LatComplex denom(_grid); denom= zero;
+  LatComplex denom(_grid); denom= Zero();
  LatComplex cosha(_grid); 
  LatComplex kmu(_grid); 
  LatComplex Wea(_grid); 
@ -838,9 +767,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe

  // FIXME Need a Lattice acosh
  for(int idx=0;idx<_grid->lSites();idx++){
-    std::vector<int> lcoor(Nd);
+    Coordinate lcoor(Nd);
    Tcomplex cc;
-    RealD sgn;
+    //    RealD sgn;
    _grid->LocalIndexToLocalCoor(idx,lcoor);
    peekLocalSite(cc,cosha,lcoor);
    assert((double)real(cc)>=1.0);
@ -868,7 +797,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    };

    GridBase *_grid = _FourDimGrid;
-    conformable(_grid,out._grid);
+    conformable(_grid,out.Grid());

    typedef typename FermionField::vector_type vector_type;
    typedef typename FermionField::scalar_type ScalComplex;
@ -876,18 +805,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    typedef Lattice<iSinglet<vector_type> > LatComplex;


-    std::vector<int> latt_size   = _grid->_fdimensions;
+    Coordinate latt_size   = _grid->_fdimensions;

-    LatComplex    sk(_grid);  sk = zero;
-    LatComplex    sk2(_grid); sk2= zero;
+    LatComplex    sk(_grid);  sk = Zero();
+    LatComplex    sk2(_grid); sk2= Zero();

-    LatComplex    w_k(_grid); w_k= zero;
-    LatComplex    b_k(_grid); b_k= zero;
+    LatComplex    w_k(_grid); w_k= Zero();
+    LatComplex    b_k(_grid); b_k= Zero();

    LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);

-    FermionField   num  (_grid); num  = zero;
-    LatComplex denom(_grid); denom= zero;
+    FermionField   num  (_grid); num  = Zero();
+    LatComplex denom(_grid); denom= Zero();
    LatComplex kmu(_grid); 
    ScalComplex ci(0.0,1.0);

@ -928,7 +857,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
 // Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
 #define REVERSE_LS(qSite, qSiteRev, Nsimd) \
 { \
-    std::vector<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
+    ExtractBuffer<typename SitePropagator::scalar_object> qSiteVec(Nsimd);	\
    extract(qSite, qSiteVec); \
    for (int i = 0; i < Nsimd / 2; ++i) \
    { \
@ -946,31 +875,35 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
 template<class vobj> 
 Lattice<vobj> spProj5p(const Lattice<vobj> & in)
 {
-  GridBase *grid=in._grid;
+  GridBase *grid=in.Grid();
  Gamma G5(Gamma::Algebra::Gamma5);
  Lattice<vobj> ret(grid);
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    ret._odata[ss] = in._odata[ss] + G5*in._odata[ss];
-  }
+  auto ret_v = ret.View();
+  auto in_v  =  in.View();
+  thread_for(ss,grid->oSites(),{
+    ret_v[ss] = in_v[ss] + G5*in_v[ss];
+  });
  return ret;
 }
 template<class vobj> 
 Lattice<vobj> spProj5m(const Lattice<vobj> & in)
 {
  Gamma G5(Gamma::Algebra::Gamma5);
-  GridBase *grid=in._grid;
+  GridBase *grid=in.Grid();
  Lattice<vobj> ret(grid);
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    ret._odata[ss] = in._odata[ss] - G5*in._odata[ss];
-  }
+  auto ret_v = ret.View();
+  auto in_v  =  in.View();
+  thread_for(ss,grid->oSites(),{
+    ret_v[ss] = in_v[ss] - G5*in_v[ss];
+  });
  return ret;
 }

 template <class Impl>
 void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
 {
-  conformable(GaugeGrid(), J5q._grid);
-  conformable(q_in._grid, FermionGrid());
+  conformable(GaugeGrid(), J5q.Grid());
+  conformable(q_in.Grid(), FermionGrid());

  // 4d field
  int Ls = this->Ls;
@ -990,8 +923,8 @@ void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
 template <class Impl>
 void WilsonFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
 {
-  conformable(GaugeGrid(), J5q._grid);
-  conformable(q_in._grid, FermionGrid());
+  conformable(GaugeGrid(), J5q.Grid());
+  conformable(q_in.Grid(), FermionGrid());

  // 4d field
  int Ls = this->Ls;
@ -1015,20 +948,26 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                     Current curr_type,
                                                     unsigned int mu)
 {
-    conformable(q_in_1._grid, FermionGrid());
-    conformable(q_in_1._grid, q_in_2._grid);
-    conformable(_FourDimGrid, q_out._grid);
+    conformable(q_in_1.Grid(), FermionGrid());
+    conformable(q_in_1.Grid(), q_in_2.Grid());
+    conformable(_FourDimGrid, q_out.Grid());

    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
-    unsigned int LLs = q_in_1._grid->_rdimensions[0];
-    q_out = zero;
+    unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
+    q_out = Zero();

    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
    tmp1 = Cshift(q_in_1, mu + 1, 1);
    tmp2 = Cshift(q_in_2, mu + 1, 1);
-    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    {
+    auto q_in_1_v = q_in_1.View();
+    auto q_in_2_v = q_in_2.View();
+    auto tmp1_v   = tmp1.View();
+    auto tmp2_v   = tmp2.View();
+    auto q_out_v  = q_out.View();
+    auto Umu_v    = Umu.View();
+    thread_for(sU, Umu.Grid()->oSites(),{
+
        unsigned int sF1 = sU * LLs;
        unsigned int sF2 = (sU + 1) * LLs - 1;

@ -1042,26 +981,26 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
            // sites correctly.
            if (Impl::LsVectorised)
            {
-                REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
-                REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
+                REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
+                REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
            }
            else
            {
-                qSite2   = q_in_2._odata[sF2];
-                qmuSite2 = tmp2._odata[sF2];
+                qSite2   = q_in_2_v[sF2];
+                qmuSite2 = tmp2_v[sF2];
            }
-            Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1], 
+            Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1], 
                                                     qSite2, 
-                                                     q_out._odata[sU],
-                                                     Umu, sU, mu, axial_sign);
-            Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
+                                                     q_out_v[sU],
+                                                     Umu_v, sU, mu, axial_sign);
+            Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
                                                     qmuSite2,
-                                                     q_out._odata[sU],
-                                                     Umu, sU, mu, axial_sign);
+                                                     q_out_v[sU],
+                                                     Umu_v, sU, mu, axial_sign);
            sF1++;
            sF2--;
        }
-    }
+    });
 }


@ -1074,18 +1013,21 @@ void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                unsigned int tmax,
 						ComplexField &lattice_cmplx)
 {
-    conformable(q_in._grid, FermionGrid());
-    conformable(q_in._grid, q_out._grid);
+    conformable(q_in.Grid(), FermionGrid());
+    conformable(q_in.Grid(), q_out.Grid());
    PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
    unsigned int tshift = (mu == Tp) ? 1 : 0;
-    unsigned int LLs = q_in._grid->_rdimensions[0];
+    unsigned int LLs = q_in.Grid()->_rdimensions[0];
    unsigned int LLt    = GridDefaultLatt()[Tp];

-    q_out = zero;
+    q_out = Zero();
    LatticeInteger coords(_FourDimGrid);
    LatticeCoordinate(coords, Tp);
-
-
+    
+    auto q_out_v = q_out.View();
+    auto tmp2_v  = tmp2.View();
+    auto coords_v= coords.View();
+    auto Umu_v   = Umu.View();
    for (unsigned int s = 0; s < LLs; ++s)
    {
        bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
@ -1098,59 +1040,51 @@ void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
        tmp = Cshift(tmp2, mu, 1);	 //q(x+mu,s)
        tmp2 = tmp*lattice_cmplx;	 //q(x+mu,s)*A(x)	

-    	parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    	{
+    	thread_for(sU, Umu.Grid()->oSites(),{
            // Compute the sequential conserved current insertion only if our simd
            // object contains a timeslice we need.
-            vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
-                	         (coords._odata[sU] <= tmax));
-            Integer timeSlices = Reduce(t_mask);
+            vPredicate t_mask;
+	    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
+            Integer timeSlices = Reduce(t_mask());

            if (timeSlices > 0)
            {
 		unsigned int sF = sU * LLs + s;
-                Kernels::SeqConservedCurrentSiteFwd(tmp2._odata[sU], 
-                                              q_out._odata[sF], Umu, sU,
-                                              mu, t_mask, switch_sgn);
+                Kernels::SeqConservedCurrentSiteFwd(tmp2_v[sU], 
+						    q_out_v[sF], Umu_v, sU,
+						    mu, t_mask, switch_sgn);
            }

-        }
+        });

        //backward direction: Need q(x - mu, s)*A(x-mu)
        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s)
        tmp = lattice_cmplx*tmp2;	 //q(x,s)*A(x)
        tmp2 = Cshift(tmp, mu, -1);	 //q(x-mu,s)*A(x-mu,s)

-    	parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    	thread_for(sU, Umu.Grid()->oSites(),
    	{
-            vInteger  t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
-                   	  	    (coords._odata[sU] <= (tmax + tshift)));
+	  vPredicate t_mask;
+	  t_mask()= ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift)));

-	    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-	    unsigned int t0 = 0;
-	    if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
-
-            Integer timeSlices = Reduce(t_mask);
-
-            if (timeSlices > 0)
-            {
-		unsigned int sF = sU * LLs + s; 
-        	Kernels::SeqConservedCurrentSiteBwd(tmp2._odata[sU], 
-                                             q_out._odata[sF], Umu, sU,
-                                             mu, t_mask, axial_sign);
-            }
-	}
+	  //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
+	  unsigned int t0 = 0;
+	  if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
+	  
+	  Integer timeSlices = Reduce(t_mask());
+	  
+	  if (timeSlices > 0) {
+	    unsigned int sF = sU * LLs + s; 
+	    Kernels::SeqConservedCurrentSiteBwd(tmp2_v[sU], 
+						q_out_v[sF], Umu_v, sU,
+						mu, t_mask, axial_sign);
+	  }
+	});
    }
 }
-
-
-
-
-
-FermOpTemplateInstantiate(WilsonFermion5D);
-GparityFermOpTemplateInstantiate(WilsonFermion5D);
  
-}}
+NAMESPACE_END(Grid);
+



--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -1,4 +1,3 @@
-
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid
@ -29,16 +28,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>

-namespace Grid {
-namespace QCD {
-
-const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
-int WilsonFermionStatic::HandOptDslash;
+NAMESPACE_BEGIN(Grid);

 /////////////////////////////////
 // Constructor and gauge import
@ -49,18 +43,19 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                                   GridRedBlackCartesian &Hgrid, RealD _mass,
                                   const ImplParams &p,
                                   const WilsonAnisotropyCoefficients &anis)
-    : Kernels(p),
-      _grid(&Fgrid),
-      _cbgrid(&Hgrid),
-      Stencil(&Fgrid, npoint, Even, directions, displacements),
-      StencilEven(&Hgrid, npoint, Even, directions,displacements),  // source is Even
-      StencilOdd(&Hgrid, npoint, Odd, directions,displacements),  // source is Odd
-      mass(_mass),
-      Lebesgue(_grid),
-      LebesgueEvenOdd(_cbgrid),
-      Umu(&Fgrid),
-      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid),
+  : 
+    Kernels(p),
+    _grid(&Fgrid),
+    _cbgrid(&Hgrid),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
+    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
+    mass(_mass),
+    Lebesgue(_grid),
+    LebesgueEvenOdd(_cbgrid),
+    Umu(&Fgrid),
+    UmuEven(&Hgrid),
+    UmuOdd(&Hgrid),
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
@ -76,8 +71,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 }

 template <class Impl>
-void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
-  GaugeField HUmu(_Umu._grid);
+void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) 
+{
+  GaugeField HUmu(_Umu.Grid());

  //Here multiply the anisotropy coefficients
  if (anisotropyCoeff.isAnisotropic)
@ -107,21 +103,21 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {

 template <class Impl>
 RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, diag_mass, in, out);
 }

 template <class Impl>
 RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, diag_mass, in, out);
 }

 template <class Impl>
 void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
@ -130,7 +126,7 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {

 template <class Impl>
 void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.checkerboard == Odd) {
+  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
    DhopOE(in, out, DaggerYes);
@ -139,26 +135,26 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
  
 template <class Impl>
 void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(diag_mass);
  out = scal * in;
 }

 template <class Impl>
 void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }

 template<class Impl>
 void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  out = (1.0/(diag_mass))*in;
 }
  
 template<class Impl>
 void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in,out);
 }
 template<class Impl>
@ -169,7 +165,7 @@ void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const Fermi
  typedef Lattice<iSinglet<vector_type> > LatComplex;
  
  // what type LatticeComplex 
-  conformable(_grid,out._grid);
+  conformable(_grid,out.Grid());
  
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@ -178,13 +174,13 @@ void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const Fermi
    Gamma::Algebra::GammaT
  };
  
-  std::vector<int> latt_size   = _grid->_fdimensions;
+  Coordinate latt_size   = _grid->_fdimensions;
  
-  FermionField   num  (_grid); num  = zero;
-  LatComplex    wilson(_grid); wilson= zero;
+  FermionField   num  (_grid); num  = Zero();
+  LatComplex    wilson(_grid); wilson= Zero();
  LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);
  
-  LatComplex denom(_grid); denom= zero;
+  LatComplex denom(_grid); denom= Zero();
  LatComplex kmu(_grid); 
  ScalComplex ci(0.0,1.0);
  // momphase = n * 2pi / L
@ -229,9 +225,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,

  Compressor compressor(dag);

-  FermionField Btilde(B._grid);
-  FermionField Atilde(B._grid);
-  Atilde = A;//redundant
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());
+  Atilde = A;

  st.HaloExchange(B, compressor);

@ -242,12 +238,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;

-    ////////////////////////
-    // Call the single hop
-    ////////////////////////
-    parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
-      Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma);
-    }
+    int Ls=1;
+    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);

    //////////////////////////////////////////////////
    // spin trace outer product
@ -258,70 +250,70 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,

 template <class Impl>
 void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U._grid, _grid);
-  conformable(U._grid, V._grid);
-  conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _grid);
+  conformable(U.Grid(), V.Grid());
+  conformable(U.Grid(), mat.Grid());

-  mat.checkerboard = U.checkerboard;
+  mat.Checkerboard() = U.Checkerboard();

  DerivInternal(Stencil, Umu, mat, U, V, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  //conformable(U._grid, mat._grid); not general, leaving as a comment (Guido)
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
  // Motivation: look at the SchurDiff operator
  
-  assert(V.checkerboard == Even);
-  assert(U.checkerboard == Odd);
-  mat.checkerboard = Odd;
+  assert(V.Checkerboard() == Even);
+  assert(U.Checkerboard() == Odd);
+  mat.Checkerboard() = Odd;

  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U._grid, _cbgrid);
-  conformable(U._grid, V._grid);
-  //conformable(U._grid, mat._grid);
+  conformable(U.Grid(), _cbgrid);
+  conformable(U.Grid(), V.Grid());
+  //conformable(U.Grid(), mat.Grid());

-  assert(V.checkerboard == Odd);
-  assert(U.checkerboard == Even);
-  mat.checkerboard = Even;
+  assert(V.Checkerboard() == Odd);
+  assert(U.Checkerboard() == Even);
+  mat.Checkerboard() = Even;

  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
-  conformable(in._grid, _grid);  // verifies full grid
-  conformable(in._grid, out._grid);
+  conformable(in.Grid(), _grid);  // verifies full grid
+  conformable(in.Grid(), out.Grid());

-  out.checkerboard = in.checkerboard;
+  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Even);
-  out.checkerboard = Odd;
+  assert(in.Checkerboard() == Even);
+  out.Checkerboard() = Odd;

  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
 }

 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
-  conformable(in._grid, _cbgrid);    // verifies half grid
-  conformable(in._grid, out._grid);  // drops the cb check
+  conformable(in.Grid(), _cbgrid);    // verifies half grid
+  conformable(in.Grid(), out.Grid());  // drops the cb check

-  assert(in.checkerboard == Odd);
-  out.checkerboard = Even;
+  assert(in.Checkerboard() == Odd);
+  out.Checkerboard() = Even;

  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
 }
@ -332,7 +324,8 @@ void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int di
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  int skip = (disp == 1) ? 0 : 1;
  int dirdisp = dir + skip * 4;
  int gamma = dir + (1 - skip) * 4;
@ -341,16 +334,16 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
 };

 template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) {
+void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+{
  Compressor compressor(dag);

  Stencil.HaloExchange(in, compressor);
+  int Ls=1;
+  int Nsite=in.oSites();
+  Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma);
+};

-  parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-    Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
-  }
-} 
-/*Change starts*/
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
@ -367,71 +360,51 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,

 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-                                       DoubledGaugeField &U,
-                                       const FermionField &in,
-                                       FermionField &out, int dag) {
+						      DoubledGaugeField &U,
+						      const FermionField &in,
+						      FermionField &out, int dag) {
  assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
-  Compressor compressor;
-  int len =  U._grid->oSites();
-  const int LLs =  1;
-
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-     if (dag == DaggerYes) {
-
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-         Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-       }
-     } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-         Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-       }
-    } //else
-
-    } else {
-      st.CommunicateThreaded();
-    }

  Compressor compressor(dag);
+  int len =  U.Grid()->oSites();

+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommunicateBegin(requests);
+
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  st.CommsMergeSHM(compressor);
+
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
-  }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } 

-  }  //pragma
-#else
-  assert(0);
-#endif
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  st.CommsMerge(compressor);
+
+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
+  }
 };


@ -444,14 +417,11 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
  Compressor compressor(dag);
  st.HaloExchange(in, compressor);

+  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-    }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
 };
 /*Change ends */
@ -468,28 +438,33 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
-    Gamma g5(Gamma::Algebra::Gamma5);
-    conformable(_grid, q_in_1._grid);
-    conformable(_grid, q_in_2._grid);
-    conformable(_grid, q_out._grid);
-    PropagatorField tmp1(_grid), tmp2(_grid);
-    q_out = zero;
+  Gamma g5(Gamma::Algebra::Gamma5);
+  conformable(_grid, q_in_1.Grid());
+  conformable(_grid, q_in_2.Grid());
+  conformable(_grid, q_out.Grid());
+  PropagatorField tmp1(_grid), tmp2(_grid);
+  q_out = Zero();

-    // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
-    // Inefficient comms method but not performance critical.
-    tmp1 = Cshift(q_in_1, mu, 1);
-    tmp2 = Cshift(q_in_2, mu, 1);
-    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    {
-        Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
-                                                 q_in_2._odata[sU],
-                                                 q_out._odata[sU],
-                                                 Umu, sU, mu);
-        Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
-                                                 tmp2._odata[sU],
-                                                 q_out._odata[sU],
-                                                 Umu, sU, mu);
-    }
+  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
+  // Inefficient comms method but not performance critical.
+  tmp1 = Cshift(q_in_1, mu, 1);
+  tmp2 = Cshift(q_in_2, mu, 1);
+  auto tmp1_v  =  tmp1.View();
+  auto tmp2_v  =  tmp2.View();
+  auto q_in_1_v=q_in_1.View();
+  auto q_in_2_v=q_in_2.View();
+  auto q_out_v = q_out.View();
+  auto Umu_v   =   Umu.View();
+  thread_for(sU, Umu.Grid()->oSites(),{
+      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
+					       q_in_2_v[sU],
+					       q_out_v[sU],
+					       Umu_v, sU, mu);
+      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
+					       tmp2_v[sU],
+					       q_out_v[sU],
+					       Umu_v, sU, mu);
+  });
 }


@ -502,61 +477,61 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-    conformable(_grid, q_in._grid);
-    conformable(_grid, q_out._grid);
-    PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
-    unsigned int tshift = (mu == Tp) ? 1 : 0;
-    unsigned int LLt    = GridDefaultLatt()[Tp];
+  conformable(_grid, q_in.Grid());
+  conformable(_grid, q_out.Grid());

-    q_out = zero;
-    LatticeInteger coords(_grid);
-    LatticeCoordinate(coords, Tp);
+  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
+  Complex i(0.0,1.0);
+  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
+  unsigned int tshift = (mu == Tp) ? 1 : 0;
+  unsigned int LLt    = GridDefaultLatt()[Tp];

-    // Need q(x + mu) and q(x - mu).
-    tmp = Cshift(q_in, mu, 1);
-    tmpFwd = tmp*lattice_cmplx;
-    tmp = lattice_cmplx*q_in;
-    tmpBwd = Cshift(tmp, mu, -1);
+  q_out = Zero();
+  LatticeInteger coords(_grid);
+  LatticeCoordinate(coords, Tp);

-    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
-    {
-        // Compute the sequential conserved current insertion only if our simd
-        // object contains a timeslice we need.
-        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
-                             (coords._odata[sU] <= tmax));
-        Integer timeSlices = Reduce(t_mask);
+  // Need q(x + mu) and q(x - mu).
+  tmp    = Cshift(q_in, mu, 1);
+  tmpFwd = tmp*lattice_cmplx;
+  tmp    = lattice_cmplx*q_in;
+  tmpBwd = Cshift(tmp, mu, -1);

-        if (timeSlices > 0)
-        {
-            Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU], 
-                                                q_out._odata[sU], 
-                                                Umu, sU, mu, t_mask);
-        }
+  auto coords_v = coords.View();
+  auto tmpFwd_v = tmpFwd.View();
+  auto tmpBwd_v = tmpBwd.View();
+  auto Umu_v    = Umu.View();
+  auto q_out_v  = q_out.View();

-        // Repeat for backward direction.
-        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
-                      (coords._odata[sU] <= (tmax + tshift)));
+  thread_for(sU, Umu.Grid()->oSites(), {

-	//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-	unsigned int t0 = 0;
-	if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+    // Compute the sequential conserved current insertion only if our simd
+    // object contains a timeslice we need.
+    vPredicate t_mask;
+    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
+    Integer timeSlices = Reduce(t_mask());

-        timeSlices = Reduce(t_mask);
-
-        if (timeSlices > 0)
-        {
-            Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU], 
-                                                q_out._odata[sU], 
-                                                Umu, sU, mu, t_mask);
-        }
+    if (timeSlices > 0) {
+      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
+					  q_out_v[sU], 
+					  Umu_v, sU, mu, t_mask);
    }

+    // Repeat for backward direction.
+    t_mask()     = ((coords_v[sU] >= (tmin + tshift)) && 
+		    (coords_v[sU] <= (tmax + tshift)));
+    
+    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
+    unsigned int t0 = 0;
+    if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
+    
+    timeSlices = Reduce(t_mask());

+    if (timeSlices > 0) {
+      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
+					  q_out_v[sU], 
+					  Umu_v, sU, mu, t_mask);
+    }
+  });
 }

-FermOpTemplateInstantiate(WilsonFermion);
-AdjointFermOpTemplateInstantiate(WilsonFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonFermion);
-GparityFermOpTemplateInstantiate(WilsonFermion);
-}
-}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
@ -0,0 +1,716 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#if defined(AVX512) 
+    ///////////////////////////////////////////////////////////
+    // If we are AVX512 specialise the single precision routine
+    ///////////////////////////////////////////////////////////
+#include <simd/Intel512wilson.h>
+#include <simd/Intel512single.h>
+
+/// Switch off the 5d vectorised code optimisations
+#undef DWFVEC5D
+
+static Vector<vComplexF> signsF; // single-precision sign table; filled by setupSigns() through signInitF below
+
+  template<typename vtype>    // vtype: element type of the sign table (instantiated here with vComplexF and vComplexD)
+  int setupSigns(Vector<vtype>& signs ){ // overwrites `signs` with a two-entry table; always returns 1
+    Vector<vtype> bother(2);
+    signs = bother; // after this assignment `signs` holds exactly two elements
+    vrsign(signs[0]); // entry 0 is initialised by vrsign (presumably the real-part sign pattern; helper defined elsewhere, TODO confirm)
+    visign(signs[1]); // entry 1 is initialised by visign (presumably the imaginary-part sign pattern; helper defined elsewhere, TODO confirm)
+    return 1; // the return value exists so the call can be used as a static initialiser
+  }
+
+  static int signInitF = setupSigns(signsF); // calling setupSigns in an initialiser makes the table get filled during static initialisation
+
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];  
+  
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+      
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#endif  // VEC 5D
+
+#undef COMPLEX_SIGNS
+#undef MAYBEPERM
+#undef MULT_2SPIN
+	
+
+
+///////////////////////////////////////////////////////////
+// If we are AVX512 specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#include <simd/Intel512double.h>
+    
+static Vector<vComplexD> signsD; // double-precision sign table; the COMPLEX_SIGNS macro defined below points `isigns` at its first entry
+static int signInitD = setupSigns(signsD); // fills signsD during static initialisation; the int only stores setupSigns' return value
+    
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];  
+
+
+#define INTERIOR_AND_EXTERIOR    
+#undef  INTERIOR
+#undef  EXTERIOR
+  
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+      
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+      
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#endif  // VEC 5D
+
+#undef COMPLEX_SIGNS
+#undef MAYBEPERM
+#undef MULT_2SPIN
+
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef UChi_00
+#undef UChi_01
+#undef UChi_02
+#undef UChi_10
+#undef UChi_11
+#undef UChi_12
+#undef UChi_20
+#undef UChi_21
+#undef UChi_22
+#undef UChi_30
+#undef UChi_31
+#undef UChi_32
+
+#undef Psi_00
+#undef Psi_01
+#undef Psi_02
+#undef Psi_10
+#undef Psi_11
+#undef Psi_12
+#undef Psi_20
+#undef Psi_21
+#undef Psi_22
+#undef Psi_30
+#undef Psi_31
+#undef Psi_32
+
+#undef Phi_00
+#undef Phi_01
+#undef Phi_02
+#undef Phi_10
+#undef Phi_11
+#undef Phi_12
+#undef Phi_20
+#undef Phi_21
+#undef Phi_22
+#undef Phi_30
+#undef Phi_31
+#undef Phi_32
+
+
+#endif //AVX512
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h
@ -130,16 +130,18 @@
  int local,perm, ptype;
  uint64_t base;
  uint64_t basep;
-  const uint64_t plocal =(uint64_t) & in._odata[0];
+  const uint64_t plocal =(uint64_t) & in[0];

  COMPLEX_SIGNS(isigns);
  MASK_REGS;
-  int nmax=U._grid->oSites();
+  int nmax=U.oSites();
  for(int site=0;site<Ns;site++) {
 #ifndef EXTERIOR
-    int sU =lo.Reorder(ssU);
+    //    int sU =lo.Reorder(ssU);
+    int sU =ssU;
    int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
-    int sUn=lo.Reorder(ssn);
+    //    int sUn=lo.Reorder(ssn);
+    int sUn=ssn;
    LOCK_GAUGE(0);
 #else
    int sU =ssU;
@ -166,7 +168,7 @@
      if (nmu==0) break;
      //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
 #endif
-      base = (uint64_t) &out._odata[ss];
+      base = (uint64_t) &out[ss];
      basep= st.GetPFInfo(nent,plocal); nent++;
      RESULT(base,basep);
    }
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.ab
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.ab
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.abc
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h.abc
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
@ -0,0 +1,86 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////
+// Default to no assembler implementation
+// Will specialise to architecture-specific assembler kernels (e.g. AVX512, QPX)
+///////////////////////////////////////////////////////////
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic fallback for AsmDhopSite: used only when no explicit specialisation exists for Impl.
+  assert(0); // No assembler kernel for this Impl; no argument is read or written. NOTE(review): becomes a silent no-op if NDEBUG is defined.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic fallback for AsmDhopSiteDag: used only when no explicit specialisation exists for Impl.
+  assert(0); // No assembler kernel for this Impl; no argument is read or written. NOTE(review): becomes a silent no-op if NDEBUG is defined.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic fallback for AsmDhopSiteInt: used only when no explicit specialisation exists for Impl.
+  assert(0); // No assembler kernel for this Impl; no argument is read or written. NOTE(review): becomes a silent no-op if NDEBUG is defined.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic fallback for AsmDhopSiteDagInt: used only when no explicit specialisation exists for Impl.
+  assert(0); // No assembler kernel for this Impl; no argument is read or written. NOTE(review): becomes a silent no-op if NDEBUG is defined.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic fallback for AsmDhopSiteExt: used only when no explicit specialisation exists for Impl.
+  assert(0); // No assembler kernel for this Impl; no argument is read or written. NOTE(review): becomes a silent no-op if NDEBUG is defined.
+}
+
+template<class Impl> void 
+WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+{ // Generic fallback for AsmDhopSiteDagExt: used only when no explicit specialisation exists for Impl.
+  assert(0); // No assembler kernel for this Impl; no argument is read or written. NOTE(review): becomes a silent no-op if NDEBUG is defined.
+}
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h
@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-
+#pragma once

 #if defined(QPX) 

@ -52,18 +52,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSite definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
      
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSiteDag definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
 				    
 #undef MAYBEPERM
 #undef MULT_2SPIN
@ -75,18 +75,18 @@ WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,Do
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSite definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
 				    
 /////////////////////////////////////////////////////////////////
 // Ls vectorised, dag Kernel, single
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSiteDag definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
 #undef MAYBEPERM
 #undef MULT_2SPIN
 	
@ -104,9 +104,9 @@ WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrde
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSite definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
 /////////////////////////////////////////////////////////////////
      

@ -115,9 +115,9 @@ WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doubl
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSiteDag definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
 /////////////////////////////////////////////////////////////////

 #undef MAYBEPERM
@ -129,9 +129,9 @@ WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,Do
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSite definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
 /////////////////////////////////////////////////////////////////
 				    
 /////////////////////////////////////////////////////////////////
@ -139,9 +139,9 @@ WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder &
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) // View types, matching the generic WilsonKernels<Impl>::AsmDhopSiteDag definition
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> // the included header supplies the function body
 /////////////////////////////////////////////////////////////////
 	
 #undef MAYBEPERM
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
+#pragma once
+
 #include <Grid/qcd/action/fermion/FermionCore.h>

 #define REGISTER
@ -45,7 +48,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Chimu_32=ref(F)(3)(2)

 #define LOAD_CHIMU(DIR,F,PERM)						\
-  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
+  { const SiteSpinor & ref (in[offset]); LOAD_CHIMU_BODY(F); }

 #define LOAD_CHI_BODY(F)				\
    Chi_00 = ref(F)(0)(0);\
@ -92,9 +95,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  g = F;								\
  direction = st._directions[DIR];				\
  distance = st._distances[DIR];				\
-  sl = st._grid->_simd_layout[direction];			\
+  sl = st._simd_layout[direction];			        \
  inplace_twist = 0;						\
-  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
+  if(SE->_around_the_world && st.parameters.twists[DIR % 4]){		\
    if(sl == 1){							\
      g = (F+1) % 2;							\
    }else{								\
@ -103,7 +106,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  }  

 #define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
-  { const SiteSpinor &ref(in._odata[offset]);				\
+  { const SiteSpinor &ref(in[offset]);				\
    LOAD_CHI_SETUP(DIR,F);						\
    if(!inplace_twist){							\
      LOAD_CHIMU_BODY(g);						\
@ -201,10 +204,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 #define MULT_2SPIN(A,F)					\
-  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
+  {auto & ref(U[sU](A)); MULT_2SPIN_BODY; }

 #define MULT_2SPIN_GPARITY(A,F)				\
-  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
+  {auto & ref(U[sU](F)(A)); MULT_2SPIN_BODY; }


 #define PERMUTE_DIR(dir)			\
@ -468,8 +471,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
+  perm   = SE->_permute;				\
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
    LOAD_CHI_IMPL(DIR,F,PERM);			\
    MULT_2SPIN_IMPL(DIR,F);			\
@ -479,7 +481,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT(ss,F)			\
  {						\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    vstream(ref(F)(0)(0),result_00);		\
    vstream(ref(F)(0)(1),result_01);		\
    vstream(ref(F)(0)(2),result_02);		\
@ -496,7 +498,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT_EXT(ss,F)			\
  if (nmu){					\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    ref(F)(0)(0)+=result_00;		\
    ref(F)(0)(1)+=result_01;		\
    ref(F)(0)(2)+=result_02;		\
@ -545,18 +547,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Simd U_21;

 #define ZERO_RESULT				\
-  result_00=zero;				\
-  result_01=zero;				\
-  result_02=zero;				\
-  result_10=zero;				\
-  result_11=zero;				\
-  result_12=zero;				\
-  result_20=zero;				\
-  result_21=zero;				\
-  result_22=zero;				\
-  result_30=zero;				\
-  result_31=zero;				\
-  result_32=zero;			
+  result_00=Zero();				\
+  result_01=Zero();				\
+  result_02=Zero();				\
+  result_10=Zero();				\
+  result_11=Zero();				\
+  result_12=Zero();				\
+  result_20=Zero();				\
+  result_21=Zero();				\
+  result_22=Zero();				\
+  result_30=Zero();				\
+  result_31=Zero();				\
+  result_32=Zero();			

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@ -571,21 +573,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12

-namespace Grid {
-namespace QCD {
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
+NAMESPACE_BEGIN(Grid);

 #define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@ -598,21 +586,6 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
 #define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@ -624,22 +597,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
 #define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@ -652,21 +609,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
 #define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
  ZERO_RESULT;							\
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
@ -678,23 +620,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
  HAND_RESULT(ss,F)
-  
-  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;

 #define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
@ -708,22 +633,6 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT_EXT(ss,F)

-  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  int nmu=0;
-
 #define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@ -736,13 +645,10 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT_EXT(ss,F)

-  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
 #define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-				    int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@ -756,9 +662,9 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-					    int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@ -772,9 +678,9 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@ -788,9 +694,9 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@ -805,8 +711,8 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
  }									\
 									\
  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
+  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@ -814,16 +720,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 									\
    HAND_DECLARATIONS(ignore);						\
 									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
    StencilEntry *SE;							\
    int nmu=0;								\
    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
    nmu = 0;								\
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@ -832,47 +738,11 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
    HAND_DECLARATIONS(ignore);						\
 									\
    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
    int nmu=0;								\
    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
    nmu = 0;								\
    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }

-
-HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
-
-
-
-
-
-
-
-
-
-
-  
-////////////// Wilson ; uses this implementation /////////////////////
-
-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); 
-
-INSTANTIATE_THEM(GparityWilsonImplF);
-INSTANTIATE_THEM(GparityWilsonImplD);
-INSTANTIATE_THEM(GparityWilsonImplFH);
-INSTANTIATE_THEM(GparityWilsonImplDF);
-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@ -26,12 +26,58 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
+#pragma once
+
 #include <Grid/qcd/action/fermion/FermionCore.h>

+
+#undef LOAD_CHIMU   // clear macro names that this header defines again below; presumably needed because WilsonKernelsHandGparityImplementation.h defines same-named macros with different parameter lists -- TODO confirm include order
+#undef LOAD_CHI 
+#undef MULT_2SPIN
+#undef PERMUTE_DIR
+#undef XP_PROJ   // spin-projection macro names
+#undef YP_PROJ  
+#undef ZP_PROJ  
+#undef TP_PROJ  
+#undef XM_PROJ  
+#undef YM_PROJ  
+#undef ZM_PROJ  
+#undef TM_PROJ  
+#undef XP_RECON  // reconstruction macro names
+#undef XP_RECON_ACCUM 
+#undef XM_RECON 
+#undef XM_RECON_ACCUM 
+#undef YP_RECON_ACCUM 
+#undef YM_RECON_ACCUM 
+#undef ZP_RECON_ACCUM 
+#undef ZM_RECON_ACCUM 
+#undef TP_RECON_ACCUM 
+#undef TM_RECON_ACCUM 
+#undef ZERO_RESULT				 
+#undef Chimu_00 // Chimu_xx are re-#defined further down as aliases of Chi_xx / UChi_xx
+#undef Chimu_01
+#undef Chimu_02
+#undef Chimu_10
+#undef Chimu_11
+#undef Chimu_12
+#undef Chimu_20
+#undef Chimu_21
+#undef Chimu_22
+#undef Chimu_30
+#undef Chimu_31
+#undef Chimu_32
+#undef HAND_STENCIL_LEG // stencil-leg and result-writing macro names
+#undef HAND_STENCIL_LEG_INT
+#undef HAND_STENCIL_LEG_EXT
+#undef HAND_RESULT
+#undef HAND_RESULT_INT
+#undef HAND_RESULT_EXT
+
 #define REGISTER

 #define LOAD_CHIMU \
-  {const SiteSpinor & ref (in._odata[offset]);	\
+  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
    Chimu_02=ref()(0)(2);\
@ -56,7 +102,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 // To splat or not to splat depends on the implementation
 #define MULT_2SPIN(A)\
-  {auto & ref(U._odata[sU](A));			\
+  {auto & ref(U[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));	\
   Impl::loadLinkElement(U_10,ref()(1,0));	\
   Impl::loadLinkElement(U_20,ref()(2,0));	\
@ -355,7 +401,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT(ss)				\
  {						\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    vstream(ref()(0)(0),result_00);		\
    vstream(ref()(0)(1),result_01);		\
    vstream(ref()(0)(2),result_02);		\
@ -372,7 +418,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define HAND_RESULT_EXT(ss)			\
  if (nmu){					\
-    SiteSpinor & ref (out._odata[ss]);		\
+    SiteSpinor & ref (out[ss]);		\
    ref()(0)(0)+=result_00;		\
    ref()(0)(1)+=result_01;		\
    ref()(0)(2)+=result_02;		\
@ -421,18 +467,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Simd U_21;

 #define ZERO_RESULT				\
-  result_00=zero;				\
-  result_01=zero;				\
-  result_02=zero;				\
-  result_10=zero;				\
-  result_11=zero;				\
-  result_12=zero;				\
-  result_20=zero;				\
-  result_21=zero;				\
-  result_22=zero;				\
-  result_30=zero;				\
-  result_31=zero;				\
-  result_32=zero;			
+  result_00=Zero();				\
+  result_01=Zero();				\
+  result_02=Zero();				\
+  result_10=Zero();				\
+  result_11=Zero();				\
+  result_12=Zero();				\
+  result_20=Zero();				\
+  result_21=Zero();				\
+  result_22=Zero();				\
+  result_30=Zero();				\
+  result_31=Zero();				\
+  result_32=Zero();			

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@ -447,12 +493,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
+WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
@ -475,8 +520,8 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
 }

 template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -498,8 +543,8 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
 }

 template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
+WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
@ -522,8 +567,8 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 }

 template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -545,8 +590,8 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
 }

 template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
+WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
@ -554,7 +599,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa

  HAND_DECLARATIONS(ignore);

-  int offset,local,perm, ptype;
+  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@ -570,8 +615,8 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 }

 template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
+void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -579,7 +624,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
  HAND_DECLARATIONS(ignore);

  StencilEntry *SE;
-  int offset,local,perm, ptype;
+  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@ -595,37 +640,45 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D

 ////////////// Wilson ; uses this implementation /////////////////////

-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); 
-
-INSTANTIATE_THEM(WilsonImplF);
-INSTANTIATE_THEM(WilsonImplD);
-INSTANTIATE_THEM(ZWilsonImplF);
-INSTANTIATE_THEM(ZWilsonImplD);
-INSTANTIATE_THEM(DomainWallVec5dImplF);
-INSTANTIATE_THEM(DomainWallVec5dImplD);
-INSTANTIATE_THEM(ZDomainWallVec5dImplF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplD);
-INSTANTIATE_THEM(WilsonImplFH);
-INSTANTIATE_THEM(WilsonImplDF);
-INSTANTIATE_THEM(ZWilsonImplFH);
-INSTANTIATE_THEM(ZWilsonImplDF);
-INSTANTIATE_THEM(DomainWallVec5dImplFH);
-INSTANTIATE_THEM(DomainWallVec5dImplDF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
-INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
-
-}}
+NAMESPACE_END(Grid);
+#undef LOAD_CHIMU  
+#undef LOAD_CHI 
+#undef MULT_2SPIN
+#undef PERMUTE_DIR
+#undef XP_PROJ  
+#undef YP_PROJ  
+#undef ZP_PROJ  
+#undef TP_PROJ  
+#undef XM_PROJ  
+#undef YM_PROJ  
+#undef ZM_PROJ  
+#undef TM_PROJ  
+#undef XP_RECON 
+#undef XP_RECON_ACCUM 
+#undef XM_RECON 
+#undef XM_RECON_ACCUM 
+#undef YP_RECON_ACCUM 
+#undef YM_RECON_ACCUM 
+#undef ZP_RECON_ACCUM 
+#undef ZM_RECON_ACCUM 
+#undef TP_RECON_ACCUM 
+#undef TM_RECON_ACCUM 
+#undef ZERO_RESULT				 
+#undef Chimu_00
+#undef Chimu_01
+#undef Chimu_02
+#undef Chimu_10
+#undef Chimu_11
+#undef Chimu_12
+#undef Chimu_20
+#undef Chimu_21
+#undef Chimu_22
+#undef Chimu_30
+#undef Chimu_31
+#undef Chimu_32
+#undef HAND_STENCIL_LEG
+#undef HAND_STENCIL_LEG_INT
+#undef HAND_STENCIL_LEG_EXT
+#undef HAND_RESULT
+#undef HAND_RESULT_INT
+#undef HAND_RESULT_EXT
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -0,0 +1,551 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+////////////////////////////////////////////
+// Generic implementation; move to different file?
+////////////////////////////////////////////
+
+// Copy one stencil entry from `mem` into `chip`.
+// When compiling device code (__CUDA_ARCH__ defined) both sides are punned
+// to uint4 so the copy is expressed as one 128-bit wide assignment; the
+// static_assert guards the size assumption. Otherwise a plain struct
+// assignment is used.
+accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
+{
+#ifdef __CUDA_ARCH__
+  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
+  const uint4 *src = reinterpret_cast<const uint4 *>(mem);   // force 128 bit loads
+  uint4       *dst = reinterpret_cast<uint4 *>(&chip);
+  *dst = *src;
+#else
+  chip = *mem;
+#endif
+}
+  
+// One leg of the all-directions kernel.
+// Looks up the stencil entry for direction Dir at site sF. A local entry is
+// read from `in` (with the permute flagged by the entry) and projected with
+// spProj; a non-local entry is read from `buf`. The half spinor is then
+// multiplied by the gauge link via Impl::multLink and folded into `result`
+// with Recon.
+// Expects SE, ptype, chi, Uchi, result, lane, st, U, in, buf, sF and sU to be
+// in scope at the expansion site.
+#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);						\
+  } else {							\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+  }								\
+  synchronise();						\
+  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+  Recon(result, Uchi);
+  
+// One leg of the interior kernel.
+// Same data path as GENERIC_STENCIL_LEG, except that the read from `buf` and
+// the multLink/Recon step only happen when the entry is local or
+// st.same_node[Dir] is set; other entries contribute nothing here.
+// Expects SE, ptype, chi, Uchi, result, lane, st, U, in, buf, sF and sU to be
+// in scope at the expansion site.
+#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);						\
+  } else if ( st.same_node[Dir] ) {				\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+  }								\
+  synchronise();						\
+  if (SE->_is_local || st.same_node[Dir] ) {			\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+    Recon(result, Uchi);					\
+  }								\
+  synchronise();						
+
+// One leg of the exterior kernel.
+// Acts only on entries that are neither local nor flagged in
+// st.same_node[Dir]: the half spinor is read from `buf`, multiplied by the
+// gauge link and folded into `result`, and `nmu` is incremented so the caller
+// can tell whether anything was accumulated. spProj is not used by this
+// variant. `chi` is declared locally inside the branch.
+// Expects SE, ptype, Uchi, result, nmu, lane, st, U, buf, sF and sU in scope.
+#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+    auto chi = coalescedRead(buf[SE->_offset],lane);		\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+    Recon(result, Uchi);					\
+    nmu++;							\
+  }								\
+  synchronise();						
+
+// One candidate leg of the single-direction kernel.
+// Does nothing unless the enclosing `gamma` equals Dir. It uses the stencil
+// entry `SE` already fetched by the caller (no GetEntry here), and passes the
+// caller's lower-case `dir` variable - not the macro argument Dir - to
+// Impl::multLink.
+// Expects gamma, dir, SE, ptype, chi, Uchi, result, lane, st, U, in, buf and
+// sU to be in scope at the expansion site.
+#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
+  if (gamma == Dir) {						\
+    if (SE->_is_local ) {					\
+      int perm= SE->_permute;					\
+      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+      spProj(chi,tmp);						\
+    } else {							\
+      chi = coalescedRead(buf[SE->_offset],lane);		\
+    }								\
+    synchronise();						\
+    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
+    Recon(result, Uchi);					\
+    synchronise();						\
+  }
+
+  ////////////////////////////////////////////////////////////////////
+  // All legs kernels ; comms then compute
+  ////////////////////////////////////////////////////////////////////
+// Dagger hopping term for one site, all eight legs in a single pass.
+// Direction Xp..Tm is paired with the projector of the same name. The first
+// leg uses spReconXp (result is not zeroed beforehand); the remaining seven
+// use the accumulating reconstructs. The finished spinor is stored in out[sF].
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
+					     SiteHalfSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane  = SIMTlane(Nsimd);
+
+  // These names are fixed: GENERIC_STENCIL_LEG refers to them directly.
+  StencilEntry  *SE;
+  int            ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor     result;
+
+  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
+  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF],result,lane);
+}
+
+// Hopping term for one site, all eight legs in a single pass.
+// Relative to GenericDhopSiteDag the direction/projector pairing is swapped:
+// the minus directions use the *p projectors and the plus directions use the
+// *m projectors. The first leg uses spReconXp (result is not zeroed
+// beforehand); the rest accumulate. The spinor is stored in out[sF].
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
+					  SiteHalfSpinor *buf, int sF,
+					  int sU, const FermionFieldView &in, FermionFieldView &out) 
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane  = SIMTlane(Nsimd);
+
+  // These names are fixed: GENERIC_STENCIL_LEG refers to them directly.
+  StencilEntry  *SE;
+  int            ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor     result;
+
+  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
+  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF], result,lane);
+}
+  ////////////////////////////////////////////////////////////////////
+  // Interior kernels
+  ////////////////////////////////////////////////////////////////////
+// Interior part of the dagger hopping term for one site.
+// `result` starts from Zero() and every leg accumulates; each leg only
+// contributes when its stencil entry is local or flagged same_node (see
+// GENERIC_STENCIL_LEG_INT). out[sF] is overwritten with the partial sum.
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
+						SiteHalfSpinor *buf, int sF,
+						int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane  = SIMTlane(Nsimd);
+
+  // These names are fixed: GENERIC_STENCIL_LEG_INT refers to them directly.
+  StencilEntry  *SE;
+  int            ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor     result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF], result,lane);
+}
+
+// Interior part of the hopping term for one site.
+// Same scheme as GenericDhopSiteDagInt, with the direction/projector pairing
+// swapped (minus directions use the *p projectors, plus directions the *m
+// ones). `result` starts from Zero() and out[sF] is overwritten.
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
+							 SiteHalfSpinor *buf, int sF,
+							 int sU, const FermionFieldView &in, FermionFieldView &out) 
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane  = SIMTlane(Nsimd);
+
+  // These names are fixed: GENERIC_STENCIL_LEG_INT refers to them directly.
+  StencilEntry  *SE;
+  int            ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor     result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
+
+  coalescedWrite(out[sF], result,lane);
+}
+////////////////////////////////////////////////////////////////////
+// Exterior kernels
+////////////////////////////////////////////////////////////////////
+// Exterior part of the dagger hopping term for one site.
+// Only legs whose stencil entry is neither local nor same_node contribute
+// (see GENERIC_STENCIL_LEG_EXT). If at least one leg fired (nmu != 0) the
+// partial sum is added to the value already held in out[sF]; otherwise
+// out[sF] is left untouched.
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
+						SiteHalfSpinor *buf, int sF,
+						int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane  = SIMTlane(Nsimd);
+
+  // These names are fixed: GENERIC_STENCIL_LEG_EXT refers to them directly.
+  StencilEntry  *SE;
+  int            ptype;
+  int            nmu=0;
+  calcHalfSpinor Uchi;
+  calcSpinor     result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
+
+  if ( nmu != 0 ) {
+    auto out_t = coalescedRead(out[sF],lane);
+    out_t = out_t + result;
+    coalescedWrite(out[sF],out_t,lane);
+  }
+}
+
+// Exterior part of the hopping term for one site.
+// Same scheme as GenericDhopSiteDagExt, with the direction/projector pairing
+// swapped (minus directions use the *p projectors, plus directions the *m
+// ones). The partial sum is added to out[sF] only when some leg fired.
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
+					     SiteHalfSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out) 
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane  = SIMTlane(Nsimd);
+
+  // These names are fixed: GENERIC_STENCIL_LEG_EXT refers to them directly.
+  StencilEntry  *SE;
+  int            ptype;
+  int            nmu=0;
+  calcHalfSpinor Uchi;
+  calcSpinor     result;
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
+
+  if ( nmu != 0 ) {
+    auto out_t = coalescedRead(out[sF],lane);
+    out_t = out_t + result;
+    coalescedWrite(out[sF],out_t,lane);
+  }
+}
+
+// Single-direction hopping term for one site.
+// Fetches the stencil entry for `dir` once, then runs only the leg whose
+// projector matches `gamma` and stores the reconstructed spinor in out[sF].
+// The Xp case is written out by hand and, unlike the GENERIC_DHOPDIR_LEG
+// expansions used for the other seven, contains no synchronise() calls.
+// NOTE(review): `result` is only assigned inside the matching leg, so a
+// `gamma` outside Xp..Tm would store whatever a default-constructed
+// calcSpinor holds - confirm callers always pass one of the eight values.
+template <class Impl>
+void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
+				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
+{
+  using calcHalfSpinor = decltype(coalescedRead(buf[0]));
+  using calcSpinor     = decltype(coalescedRead(in[0]));
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane  = SIMTlane(Nsimd);
+
+  // These names are fixed: GENERIC_DHOPDIR_LEG refers to them directly.
+  StencilEntry  *SE;
+  int            ptype;
+  calcHalfSpinor chi;
+  calcHalfSpinor Uchi;
+  calcSpinor     result;
+
+  SE = st.GetEntry(ptype, dir, sF);
+
+  if (gamma == Xp) {
+    if (SE->_is_local ) {
+      int perm= SE->_permute;
+      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);
+      spProjXp(chi,tmp);
+    } else {
+      chi = coalescedRead(buf[SE->_offset],lane);
+    }
+    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);
+    spReconXp(result, Uchi);
+  }
+
+  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
+  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
+  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
+  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
+  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
+  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
+  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
+
+  coalescedWrite(out[sF], result,lane);
+}
+
+// Apply the single-direction hopping term DhopDirK to every site.
+// Iterates ss over Nsite gauge sites with accelerator_for and, for each,
+// over the Ls slices s, using fermion index sF = s + Ls*ss and gauge index
+// sU = ss. `dirdisp` must lie in [0,7] (asserted).
+// Note: the `buf` argument is not used here; the comms buffer handed to
+// DhopDirK is taken from st.CommBuf().
+template <class Impl>
+void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
+					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) 
+{
+  assert(dirdisp<=7);
+  assert(dirdisp>=0);
+
+  auto U_v   = U.View();
+  auto in_v  = in.View();
+  auto out_v = out.View();
+  auto st_v  = st.View();
+
+  accelerator_for(ss,Nsite,Simd::Nsimd(),{
+    int sU=ss;
+    for(int s=0;s<Ls;s++){
+      int sF = s+Ls*sU;
+      DhopDirK(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp,gamma);
+    }
+  });
+}
+
+// Launch helpers used by DhopKernel / DhopDagKernel. All three expect
+// st_v, U_v, buf, in_v, out_v, Nsite and Ls to be in scope.
+//
+// KERNEL_CALLNB(A): runs WilsonKernels<Impl>::A over NN = Nsite*Ls fermion
+// sites with accelerator_forNB, using sF = ss and sU = ss/Ls. It declares a
+// local `NN`, so it can be expanded at most once per scope.
+#define KERNEL_CALLNB(A) \
+  const uint64_t    NN = Nsite*Ls;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+  });
+
+// KERNEL_CALL(A): KERNEL_CALLNB(A) followed by accelerator_barrier().
+#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); 
+
+// ASM_CALL(A): runs WilsonKernels<Impl>::A once per gauge site with
+// thread_for (sU = ss, sF = ss*Ls), passing Ls and 1 as two extra arguments
+// ahead of the field views.
+#define ASM_CALL(A)							\
+  thread_for( ss, Nsite, {						\
+    int sU = ss;							\
+    int sF = ss*Ls;							\
+    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
+  });
+
+// Dispatch the hopping term to the kernel selected by `Opt`.
+//
+//   interior && exterior : full kernel (all legs), blocking launch
+//   interior only        : *Int kernel; the generic and hand-unrolled
+//                          variants are launched with KERNEL_CALLNB, i.e.
+//                          without the trailing accelerator_barrier()
+//   exterior only        : *Ext kernel, blocking launch
+//
+// The hand-unrolled and inline-asm variants are compiled out when GRID_NVCC
+// is defined. Any (Opt, interior, exterior) combination that is not handled
+// falls through to the final assert.
+//
+// The inline-asm branches no longer print a progress character to stdout on
+// every call; that was leftover debug output and is absent from
+// DhopDagKernel.
+template <class Impl>
+void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+				     int Ls, int Nsite, const FermionField &in, FermionField &out,
+				     int interior,int exterior) 
+{
+    // Views referenced by name inside the KERNEL_CALL* / ASM_CALL macros.
+    auto U_v   =   U.View();
+    auto in_v  =  in.View();
+    auto out_v = out.View();
+    auto st_v  =  st.View();
+
+   if( interior && exterior ) { 
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
+#ifndef GRID_NVCC
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
+#endif
+   } else if( interior ) {
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
+#ifndef GRID_NVCC
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
+#endif
+   } else if( exterior ) { 
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
+#ifndef GRID_NVCC
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
+#endif
+   }
+   assert(0 && " Kernel optimisation case not covered ");
+  }
+  // Dispatch the dagger hopping term to the kernel selected by `Opt`.
+  // interior && exterior picks the full kernel, interior alone the *DagInt
+  // kernel and exterior alone the *DagExt kernel; every generic and
+  // hand-unrolled launch here uses the blocking KERNEL_CALL form. The
+  // hand-unrolled and inline-asm variants are compiled out under GRID_NVCC.
+  // Unhandled combinations fall through to the final assert.
+  template <class Impl>
+  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+					  int Ls, int Nsite, const FermionField &in, FermionField &out,
+					  int interior,int exterior) 
+  {
+    // Views referenced by name inside the KERNEL_CALL / ASM_CALL macros.
+    auto U_v   = U.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    auto st_v  = st.View();
+
+    if ( interior && exterior ) {
+      if (Opt == WilsonKernelsStatic::OptGeneric) {
+	KERNEL_CALL(GenericDhopSiteDag);
+	return;
+      }
+#ifndef GRID_NVCC
+      if (Opt == WilsonKernelsStatic::OptHandUnroll) {
+	KERNEL_CALL(HandDhopSiteDag);
+	return;
+      }
+      if (Opt == WilsonKernelsStatic::OptInlineAsm) {
+	ASM_CALL(AsmDhopSiteDag);
+	return;
+      }
+#endif
+    } else if ( interior ) {
+      if (Opt == WilsonKernelsStatic::OptGeneric) {
+	KERNEL_CALL(GenericDhopSiteDagInt);
+	return;
+      }
+#ifndef GRID_NVCC
+      if (Opt == WilsonKernelsStatic::OptHandUnroll) {
+	KERNEL_CALL(HandDhopSiteDagInt);
+	return;
+      }
+      if (Opt == WilsonKernelsStatic::OptInlineAsm) {
+	ASM_CALL(AsmDhopSiteDagInt);
+	return;
+      }
+#endif
+    } else if ( exterior ) {
+      if (Opt == WilsonKernelsStatic::OptGeneric) {
+	KERNEL_CALL(GenericDhopSiteDagExt);
+	return;
+      }
+#ifndef GRID_NVCC
+      if (Opt == WilsonKernelsStatic::OptHandUnroll) {
+	KERNEL_CALL(HandDhopSiteDagExt);
+	return;
+      }
+      if (Opt == WilsonKernelsStatic::OptInlineAsm) {
+	ASM_CALL(AsmDhopSiteDagExt);
+	return;
+      }
+#endif
+    }
+    assert(0 && " Kernel optimisation case not covered ");
+  }
+
+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially. Common to both 4D and 5D.
+ ******************************************************************************/
+// N.B. Functions below assume a -1/2 factor within U.
+// Both macros expand `expr` twice, so pass a plain value rather than an
+// expression with side effects. `mu` indexes Gamma::gmu.
+#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
+#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteFwd
+ * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_1 shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+							  const SitePropagator &q_in_2,
+							  SitePropagator &q_out,
+							  DoubledGaugeFieldView &U,
+							  unsigned int sU,
+							  unsigned int mu,
+							  bool switch_sign)
+{
+  Gamma g5(Gamma::Algebra::Gamma5);
+
+  // Link in direction mu at gauge site sU applied to q_in_1.
+  SitePropagator Uq;
+  Impl::multLink(Uq, U[sU], q_in_1, mu);
+
+  // g5 * adj(q_in_2) * g5 contracted with the forward current insertion.
+  SitePropagator contrib;
+  contrib = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(Uq, mu);
+
+  // Forward term: added by default, subtracted when switch_sign is set.
+  if (!switch_sign) {
+    q_out += contrib;
+  } else {
+    q_out -= contrib;
+  }
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteBwd
+ * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_2 shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+							  const SitePropagator &q_in_2,
+							  SitePropagator &q_out,
+							  DoubledGaugeFieldView &U,
+							  unsigned int sU,
+							  unsigned int mu,
+							  bool switch_sign)
+{
+  Gamma g5(Gamma::Algebra::Gamma5);
+
+  // Link index mu + Nd at gauge site sU applied to q_in_1.
+  SitePropagator Uq;
+  Impl::multLink(Uq, U[sU], q_in_1, mu + Nd);
+
+  // g5 * adj(q_in_2) * g5 contracted with the backward current insertion.
+  SitePropagator contrib;
+  contrib = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(Uq, mu);
+
+  // Backward term: subtracted by default, added when switch_sign is set.
+  if (!switch_sign) {
+    q_out -= contrib;
+  } else {
+    q_out += contrib;
+  }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteFwd
+ * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeFieldView &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vPredicate t_mask,
+                                                     bool switch_sign)
+{
+  // Link in direction mu at gauge site sU applied to q_in, then the forward
+  // current insertion.
+  SitePropagator term;
+  Impl::multLink(term, U[sU], q_in, mu);
+  term = WilsonCurrentFwd(term, mu);
+
+  // Keep only the entries selected by t_mask; everything else becomes zero.
+  term = predicatedWhere(t_mask, term, 0.*term);
+
+  // Forward term: added by default, subtracted when switch_sign is set.
+  if (!switch_sign) {
+    q_out += term;
+  } else {
+    q_out -= term;
+  }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteBwd
+ * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in -ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeFieldView &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vPredicate t_mask,
+                                                     bool switch_sign)
+{
+  // Link index mu + Nd at gauge site sU applied to q_in, then the backward
+  // current insertion.
+  SitePropagator term;
+  Impl::multLink(term, U[sU], q_in, mu + Nd);
+  term = WilsonCurrentBwd(term, mu);
+
+  // Keep only the entries selected by t_mask; everything else becomes zero.
+  term = predicatedWhere(t_mask, term, 0.*term);
+
+  // Backward term: subtracted by default, added when switch_sign is set.
+  if (!switch_sign) {
+    q_out -= term;
+  } else {
+    q_out += term;
+  }
+}
+
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
@ -0,0 +1,97 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * BF sequence
+ *
+ void bfmbase<Float>::MooeeInv(Fermion_t psi, 
+ Fermion_t chi, 
+ int dag, int cb)
+
+ double m    = this->mass;
+ double tm   = this->twistedmass;
+ double mtil = 4.0+this->mass;
+
+ double sq = mtil*mtil + tm*tm;
+
+ double a = mtil/sq;
+ double b = -tm /sq;
+ if(dag) b=-b;
+ axpibg5x(chi,psi,a,b);
+
+ void bfmbase<Float>::Mooee(Fermion_t psi, 
+ Fermion_t chi, 
+ int dag,int cb)
+ double a = 4.0+this->mass;
+ double b = this->twistedmass;
+ if(dag) b=-b;
+ axpibg5x(chi,psi,a,b);
+*/
+
+// Site-diagonal term: calls axpibg5x with a = 4 + mass and b = mu.
+// (Presumably out = a*in + i*b*gamma5*in, going by the helper's name - confirm.)
+template<class Impl>
+void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();  // output takes the input's checkerboard
+  RealD diag  = 4.0+this->mass;
+  RealD twist = this->mu;
+  axpibg5x(out,in,diag,twist);
+}
+// Site-diagonal term, dagger variant: same as Mooee but with b = -mu.
+template<class Impl>
+void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();  // output takes the input's checkerboard
+  RealD diag  = 4.0+this->mass;
+  RealD twist = -this->mu;
+  axpibg5x(out,in,diag,twist);
+}
+// Inverse of the site-diagonal term: with mtil = 4 + mass and
+// sq = mtil^2 + mu^2, calls axpibg5x with a = mtil/sq and b = -mu/sq.
+template<class Impl>
+void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+  RealD tm   = this->mu;
+  RealD mtil = 4.0+this->mass;
+  RealD sq   = mtil*mtil+tm*tm;
+
+  RealD a = mtil/sq;
+  RealD b = -tm /sq;
+  axpibg5x(out,in,a,b);
+}
+// Inverse of the site-diagonal term, dagger variant: same as MooeeInv but
+// with b = +mu/sq.
+template<class Impl>
+void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+  RealD tm   = this->mu;
+  RealD mtil = 4.0+this->mass;
+  RealD sq   = mtil*mtil+tm*tm;
+
+  RealD a = mtil/sq;
+  RealD b = tm /sq;
+  axpibg5x(out,in,a,b);
+}
+
+NAMESPACE_END(Grid);
--- a/Show More
+++ b/Show More