Hadrons: moving Hadrons to root directory, build system improvements

2025-10-19 07:24:44 +01:00 · 2018-08-28 15:00:40 +01:00
parent 5f206df775
commit fb7d021b9d
499 changed files with 429 additions and 846 deletions
--- a/Grid/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@@ -0,0 +1,50 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/Actions.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_QCD_ACTION_H
+#define GRID_QCD_ACTION_H
+
+////////////////////////////////////////////
+// Abstract base interface
+////////////////////////////////////////////
+#include <Grid/qcd/action/ActionCore.h>
+////////////////////////////////////////////////////////////////////////
+// Fermion actions; prevent coupling fermion.cc files to other headers
+////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/Fermion.h>
+////////////////////////////////////////
+// Pseudo fermion combinations for HMC
+////////////////////////////////////////
+#include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
+
+#endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -0,0 +1,56 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/ActionBase.h
+
+Copyright (C) 2015-2016
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef ACTION_BASE_H
+#define ACTION_BASE_H
+
+namespace Grid {
+namespace QCD {
+
+// Abstract interface for an action defined on a field of type GaugeField.
+// Every member function except the destructor is pure virtual, so concrete
+// actions must implement all of them.
+template <class GaugeField >
+class Action 
+{
+
+ public:
+  // Smearing flag; false unless a user of the class sets it.
+  bool is_smeared = false;
+  // Heatbath?
+  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
+  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
+  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
+  virtual std::string action_name()    = 0;                             // return the action name
+  virtual std::string LogParameters()  = 0;                             // prints action parameters
+  // Virtual so that derived actions can be destroyed through an Action*.
+  virtual ~Action(){}
+};
+
+}
+}
+
+#endif // ACTION_BASE_H
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -0,0 +1,61 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/ActionCore.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_ACTION_CORE
+#define QCD_ACTION_CORE
+
+#include <Grid/qcd/action/ActionBase.h>
+#include <Grid/qcd/action/ActionSet.h>
+#include <Grid/qcd/action/ActionParams.h>
+
+////////////////////////////////////////////
+// Gauge Actions
+////////////////////////////////////////////
+#include <Grid/qcd/action/gauge/Gauge.h>
+
+////////////////////////////////////////////
+// Fermion prereqs
+////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+////////////////////////////////////////////
+// Scalar Actions
+////////////////////////////////////////////
+#include <Grid/qcd/action/scalar/Scalar.h>
+
+////////////////////////////////////////////
+// Utility functions
+////////////////////////////////////////////
+#include <Grid/qcd/utils/Metric.h>
+#include <Grid/qcd/utils/CovariantLaplacian.h>
+
+
+
+
+#endif
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -0,0 +1,92 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/ActionParams.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_QCD_ACTION_PARAMS_H
+#define GRID_QCD_ACTION_PARAMS_H
+
+namespace Grid {
+namespace QCD {
+
+  // These can move into a params header and be given MacroMagic serialisation
+  // Parameter bundle: a comms/compute overlap flag and a vector of Nd
+  // integer twists.
+  struct GparityWilsonImplParams {
+    bool overlapCommsCompute;   // false by default
+    std::vector<int> twists;    // Nd entries, all zero by default
+    // Members are constructed in declaration order whatever the order of the
+    // initialiser list, so the list is written in that same order.
+    GparityWilsonImplParams() : overlapCommsCompute(false), twists(Nd, 0) {}
+  };
+  
+  // Parameter bundle: a comms/compute overlap flag and a vector of complex
+  // boundary phases.
+  struct WilsonImplParams {
+    bool overlapCommsCompute;               // false in both constructors
+    std::vector<Complex> boundary_phases;
+
+    // Default: Nd boundary phases, each equal to 1.0.
+    // Initialisers are listed in declaration order (the order in which the
+    // members are actually constructed).
+    WilsonImplParams() : overlapCommsCompute(false), boundary_phases(Nd, 1.0) {}
+
+    // Use the caller's phases; phi is copied into boundary_phases.
+    // Taken by const reference so that only that one copy is made.
+    WilsonImplParams(const std::vector<Complex>& phi)
+      : overlapCommsCompute(false), boundary_phases(phi) {}
+  };
+
+  // Parameter struct with no data members; only a default constructor.
+  struct StaggeredImplParams {
+    StaggeredImplParams() {}
+  };
+  
+  // Serializable parameter set with six members declared through
+  // GRID_SERIALIZABLE_CLASS_MEMBERS: lo, hi, MaxIter, tolerance, degree and
+  // precision.
+  // NOTE(review): lo/hi look like the bounds of a rational approximation
+  // interval and degree/precision its order and working precision -- confirm
+  // with the code that consumes these parameters.
+  struct OneFlavourRationalParams : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams,
+                                    RealD, lo,
+                                    RealD, hi,
+                                    int,   MaxIter,
+                                    RealD, tolerance,
+                                    int,   degree,
+                                    int,   precision);
+
+    // MaxIter and tolerance, vectors??
+
+    // Every argument has a default and is copied verbatim into the member
+    // of the corresponding name.
+    OneFlavourRationalParams(RealD _lo        = 0.0,
+                             RealD _hi        = 1.0,
+                             int   _maxit     = 1000,
+                             RealD tol        = 1.0e-8,
+                             int   _degree    = 10,
+                             int   _precision = 64)
+      : lo(_lo),
+        hi(_hi),
+        MaxIter(_maxit),
+        tolerance(tol),
+        degree(_degree),
+        precision(_precision)
+    {}
+  };
+  
+  
+}
+}
+
+
+
+
+#endif
--- a/Grid/qcd/action/ActionSet.h
+++ b/Grid/qcd/action/ActionSet.h
@@ -0,0 +1,116 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/ActionSet.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef ACTION_SET_H
+#define ACTION_SET_H
+
+namespace Grid {
+
+// Should drop this namespace here
+namespace QCD {
+
+//////////////////////////////////
+// Indexing of tuple types
+//////////////////////////////////
+
+// Index<T, std::tuple<...>>::value is the zero-based position of the first
+// element type equal to T in the tuple's type list.  If T does not occur,
+// the recursion ends on the primary template, which is only declared, so
+// the program fails to compile.
+template <typename T, typename Tuple>
+struct Index;
+
+// Base case: T is the first element type.
+template <typename T, typename... Tail>
+struct Index<T, std::tuple<T, Tail...> > {
+  static const std::size_t value = 0;
+};
+
+// Recursive case: the first element type differs from T; skip it and add one.
+template <typename T, typename Head, typename... Tail>
+struct Index<T, std::tuple<Head, Tail...> > {
+  static const std::size_t value = Index<T, std::tuple<Tail...> >::value + 1;
+};
+
+
+////////////////////////////////////////////
+// Action Level
+// Action collection 
+// in a integration level
+// (for multilevel integration schemes)
+////////////////////////////////////////////
+
+// One integration level: the actions collected at that level together with
+// the level's multiplier.
+//   Field : field type of the fundamental-representation actions
+//   Repr  : representation policy; AccessTypes<Action, Repr> supplies the
+//           tuple of per-representation action vectors
+template <class Field, class Repr = NoHirep >
+struct ActionLevel {
+ public:
+  unsigned int multiplier;  // asserted to be >= 1 on construction
+
+  // Fundamental repr actions separated because of the smearing
+  typedef Action<Field>* ActPtr;
+
+  // construct a tuple of vectors of the actions for the corresponding higher
+  // representation fields
+  typedef typename AccessTypes<Action, Repr>::VectorCollection action_collection;
+  typedef typename  AccessTypes<Action, Repr>::FieldTypeCollection action_hirep_types;
+
+  action_collection actions_hirep;
+  // Alias of element 0 of this object's own actions_hirep tuple.
+  std::vector<ActPtr>& actions;
+
+  // Initialisers are written in declaration order; actions_hirep is default
+  // constructed before `actions` is bound to its first element.
+  explicit ActionLevel(unsigned int mul = 1) :
+  multiplier(mul), actions(std::get<0>(actions_hirep)) {
+    // initialize the hirep vectors to zero.
+    // apply(this->resize, actions_hirep, 0); //need a working resize
+    assert(mul >= 1);
+  }
+
+  // Copy constructor.  A memberwise copy would bind `actions` to the tuple
+  // of the source object, so a copy held in a container (ActionSet is a
+  // std::vector of ActionLevel) would alias, and could outlive, the
+  // original.  Rebind the alias to the tuple owned by the new object.
+  ActionLevel(const ActionLevel& other) :
+  multiplier(other.multiplier),
+  actions_hirep(other.actions_hirep),
+  actions(std::get<0>(actions_hirep)) {}
+
+  // Append ptr to the vector whose position in actions_hirep equals the
+  // position of GenField in action_hirep_types.  Only the pointer is stored.
+  template < class GenField >
+  void push_back(Action<GenField>* ptr) {
+    // insert only in the correct vector
+    std::get< Index < GenField, action_hirep_types>::value >(actions_hirep).push_back(ptr);
+  }
+
+  // Call resize(n) on the object ap points to.
+  template <class VecPtr>
+  static void resize(VecPtr ap, unsigned int n) {
+    ap->resize(n);
+  }
+
+  // Loop on tuple for a callable function.
+  // Starting at tuple position I (default 1, i.e. element 0 is skipped),
+  // calls fn(std::get<I>(actions_hirep), std::get<I>(R.rep), arguments...)
+  // for each remaining position.  This overload ends the recursion.
+  template <std::size_t I = 1, typename Callable, typename ...Args>
+  inline typename std::enable_if<I == std::tuple_size<action_collection>::value, void>::type apply(Callable, Repr& R,Args&...) const {}
+
+  // Recursive step: handle position I, then continue with I + 1.
+  template <std::size_t I = 1, typename Callable, typename ...Args>
+  inline typename std::enable_if<I < std::tuple_size<action_collection>::value, void>::type apply(Callable fn, Repr& R, Args&... arguments) const {
+    fn(std::get<I>(actions_hirep), std::get<I>(R.rep), arguments...);
+    apply<I + 1>(fn, R, arguments...);
+  }
+
+};
+
+// Define the ActionSet
+template <class GaugeField, class R>
+using ActionSet = std::vector<ActionLevel<GaugeField, R> >;
+
+} // QCD
+} // Grid
+
+#endif  // ACTION_SET_H
--- a/Grid/qcd/action/fermion/AbstractEOFAFermion.h
+++ b/Grid/qcd/action/fermion/AbstractEOFAFermion.h
@@ -0,0 +1,100 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef  GRID_QCD_ABSTRACT_EOFA_FERMION_H
+#define  GRID_QCD_ABSTRACT_EOFA_FERMION_H
+
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+namespace Grid {
+namespace QCD {
+
+  // DJM: Abstract base class for EOFA fermion types.
+  // Defines layout of additional EOFA-specific parameters and operators.
+  // Use to construct EOFA pseudofermion actions that are agnostic to
+  // Shamir / Mobius / etc., and ensure that no one can construct EOFA
+  // pseudofermion action with non-EOFA fermion type.
+  template<class Impl>
+  class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+    public:
+      // Parameters of the fermion operator
+      //   D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
+      RealD mq1;
+      RealD mq2;
+      RealD mq3;
+      RealD shift;
+      int pm;
+
+      RealD alpha; // Mobius scale, set to b + c by the constructor
+      RealD k;     // EOFA normalization constant, computed by the constructor
+
+      virtual void Instantiatable(void) = 0;
+
+      // EOFA-specific operations; pure virtual, so each derived class has
+      // to supply them.
+      virtual void  Omega    (const FermionField& in, FermionField& out, int sign, int dag) = 0;
+      virtual void  Dtilde   (const FermionField& in, FermionField& out) = 0;
+      virtual void  DtildeInv(const FermionField& in, FermionField& out) = 0;
+
+      // Derivatives live in the base class: for EOFA both DWF and Mobius
+      // just need d(Dw)/dU, so each routine forwards to the matching
+      // DhopDeriv* call.
+      virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag)
+      {
+        this->DhopDeriv(mat, U, V, dag);
+      }
+      virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag)
+      {
+        this->DhopDerivOE(mat, U, V, dag);
+      }
+      virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag)
+      {
+        this->DhopDerivEO(mat, U, V, dag);
+      }
+
+      // Recompute 5D coefficients for a different value of the shift
+      // constant (needed for the heatbath loop over poles).
+      virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
+
+      // Constructor: passes the grids, _mq1, _M5 and p to CayleyFermion5D,
+      // stores the EOFA parameters and derives alpha and k from them.
+      AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
+        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
+        : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
+          _mq1, _M5, p),
+          mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
+      {
+        const int Ls = this->Ls;
+        alpha = _b + _c;
+
+        // k = alpha (mq3-mq2) (alpha+1)^{2Ls}
+        //       / [ (alpha+1)^Ls + mq2 (alpha-1)^Ls ]
+        //       / [ (alpha+1)^Ls + mq3 (alpha-1)^Ls ]
+        const RealD numerator = alpha * (_mq3-_mq2) * std::pow(alpha+1.0,2*Ls);
+        const RealD denom_mq2 = std::pow(alpha+1.0,Ls) + _mq2*std::pow(alpha-1.0,Ls);
+        const RealD denom_mq3 = std::pow(alpha+1.0,Ls) + _mq3*std::pow(alpha-1.0,Ls);
+        k = numerator / denom_mq2 / denom_mq3;
+      }
+  };
+}}
+
+#endif
--- a/Grid/qcd/action/fermion/CayleyFermion5D.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc
@@ -0,0 +1,638 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+namespace Grid {
+namespace QCD {
+
+ // Constructor: hands the gauge field, the four grids, _M5 and the
+ // implementation parameters to WilsonFermion5D and records _mass in `mass`.
+ template<class Impl>
+ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField            &_Umu,
+                                        GridCartesian         &FiveDimGrid,
+                                        GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                        GridCartesian         &FourDimGrid,
+                                        GridRedBlackCartesian &FourDimRedBlackGrid,
+                                        RealD _mass,RealD _M5,const ImplParams &p)
+   : WilsonFermion5D<Impl>(_Umu,
+                           FiveDimGrid, FiveDimRedBlackGrid,
+                           FourDimGrid, FourDimRedBlackGrid,
+                           _M5, p),
+     mass(_mass)
+ {
+ }
+
+///////////////////////////////////////////////////////////////
+// Physical surface field utilities
+///////////////////////////////////////////////////////////////
+// Produce the 4d field exported4d from the 5d field solution5d.
+// Slice 0 of a scratch 5d field is assembled from slices 0 and Ls-1 of
+// solution5d with the pminus/pplus helpers, then extracted into exported4d.
+// NOTE(review): presumably the helpers compute z_s = a*x_s + b*P_{-/+} y_s',
+// which would give P_- psi(0) + P_+ psi(Ls-1) -- confirm against their
+// definitions.
+template<class Impl>
+void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+{
+  const int last = this->Ls - 1;
+  FermionField work(this->FermionGrid());
+  work = solution5d;
+  conformable(solution5d._grid,this->FermionGrid());
+  conformable(exported4d._grid,this->GaugeGrid());
+  axpby_ssp_pminus(work, 0., solution5d, 1., solution5d, 0, 0);
+  axpby_ssp_pplus (work, 1., work      , 1., solution5d, 0, last);
+  ExtractSlice(exported4d, work, 0, 0);
+}
+// Counterpart of ExportPhysicalFermionSolution with the two projector
+// helpers exchanged: pplus is applied to slice 0 and pminus to slice Ls-1 of
+// solution5d; slice 0 of the scratch field is then extracted into exported4d.
+template<class Impl>
+void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solution5d,FermionField &exported4d)
+{
+  const int last = this->Ls - 1;
+  FermionField work(this->FermionGrid());
+  work = solution5d;
+  conformable(solution5d._grid,this->FermionGrid());
+  conformable(exported4d._grid,this->GaugeGrid());
+  axpby_ssp_pplus (work, 0., solution5d, 1., solution5d, 0, 0);
+  axpby_ssp_pminus(work, 1., work      , 1., solution5d, 0, last);
+  ExtractSlice(exported4d, work, 0, 0);
+}
+// Build a 5d field from the 4d field input4d: a zeroed scratch field gets
+// input4d inserted on slices 0 and Ls-1, the pplus helper is applied in
+// place on slice 0 and the pminus helper on slice Ls-1, and the result is
+// copied to imported5d.
+template<class Impl>
+void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,FermionField &imported5d)
+{
+  const int last = this->Ls - 1;
+  FermionField work(this->FermionGrid());
+  conformable(imported5d._grid,this->FermionGrid());
+  conformable(input4d._grid   ,this->GaugeGrid());
+  work = zero;
+  InsertSlice(input4d, work, 0   , 0);
+  InsertSlice(input4d, work, last, 0);
+  axpby_ssp_pplus (work, 0., work, 1., work, 0, 0);
+  axpby_ssp_pminus(work, 0., work, 1., work, last, last);
+  imported5d=work;
+}
+
+// Same construction as ImportUnphysicalFermion (input4d placed on slices 0
+// and Ls-1 of a zeroed scratch field, pplus on slice 0, pminus on slice
+// Ls-1), except that the scratch field is passed through Dminus to produce
+// imported5d instead of being copied.
+template<class Impl>  
+void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+{
+  const int last = this->Ls - 1;
+  FermionField work(this->FermionGrid());
+  conformable(imported5d._grid,this->FermionGrid());
+  conformable(input4d._grid   ,this->GaugeGrid());
+  work = zero;
+  InsertSlice(input4d, work, 0   , 0);
+  InsertSlice(input4d, work, last, 0);
+  axpby_ssp_pplus (work, 0., work, 1., work, 0, 0);
+  axpby_ssp_pminus(work, 0., work, 1., work, last, last);
+  Dminus(work,imported5d);
+}
+// chi = (1 - c[s] D_W) psi, built slice by slice from one application of
+// D_W to the whole of psi.
+template<class Impl>  
+void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
+{
+  const int Ls=this->Ls;
+
+  FermionField Dw_psi(this->FermionGrid());
+  this->DW(psi,Dw_psi,DaggerNo);
+
+  for(int s=0;s<Ls;++s){
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],Dw_psi,s,s);
+  }
+}
+// Daggered counterpart of Dminus: D_W is applied with DaggerYes and each
+// slice uses the conjugated coefficient,
+//   chi = (1 - conj(c[s]) D_W^dag) psi.
+template<class Impl>  
+void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
+{
+  const int Ls=this->Ls;
+
+  FermionField Dwdag_psi(this->FermionGrid());
+  this->DW(psi,Dwdag_psi,DaggerYes);
+
+  for(int s=0;s<Ls;++s){
+    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),Dwdag_psi,s,s);
+  }
+}
+
+// Print the base-class report, then, for each of the M5D and MooeeInv
+// counters that is non-zero, the call count, the mean time per call and the
+// derived mflops/s figures (total and per rank).
+template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
+{
+  this->Report();
+
+  // 5d volume: Ls times the product of the default lattice extents.
+  std::vector<int> dims = GridDefaultLatt();
+  RealD volume = this->Ls;
+  for(int mu=0;mu<Nd;mu++) {
+    volume=volume*dims[mu];
+  }
+  RealD ranks = this->_FourDimGrid->_Nprocessors;
+
+  if ( M5Dcalls > 0 ) {
+    // Flops = 6.0*(Nc*Ns) *Ls*vol
+    RealD m5d_mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
+
+    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << m5d_mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << m5d_mflops/ranks << std::endl;
+  }
+
+  if ( MooeeInvCalls > 0 ) {
+    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
+    RealD inv_mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+
+    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << inv_mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << inv_mflops/ranks << std::endl;
+  }
+}
+// Reset the base-class counters, then the M5D and MooeeInv flop, call and
+// time counters of this class.
+template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
+{
+  this->ZeroCounters();
+
+  // M5D statistics
+  M5Dcalls=0;
+  M5Dtime=0;
+  M5Dflops=0;
+
+  // MooeeInv statistics
+  MooeeInvCalls=0;
+  MooeeInvTime=0;
+  MooeeInvFlops=0;
+}
+
+// Two-argument M5D: calls the coefficient-vector overload with a unit
+// diagonal and -1 on both off-diagonals, except that the last upper entry
+// and the first lower entry are set to mass.  chi is passed both as the
+// second field argument and as the output.
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
+{
+  const int Ls=this->Ls;
+  std::vector<Coeff_t> diag (Ls, 1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1]=mass;
+  lower[0]   =mass;
+  M5D(psi,chi,chi,lower,diag,upper);
+}
+// Apply the coefficient-vector M5D to psi with diagonal bs and both
+// off-diagonals cs, where the last upper entry and the first lower entry
+// are multiplied by -mass.  The result is written to Din.
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
+{
+  const int Ls=this->Ls;
+  std::vector<Coeff_t> diag (bs);
+  std::vector<Coeff_t> upper(cs);
+  std::vector<Coeff_t> lower(cs);
+  // mass-dependent boundary entries
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,Din,lower,diag,upper);
+}
+// FIXME Redundant with the above routine; check this and eliminate
+// Apply the coefficient-vector M5D to psi with diagonal beo and both
+// off-diagonals -ceo, where the last upper entry and the first lower entry
+// are further multiplied by -mass.  The result is written to chi.
+template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
+{
+  const int Ls=this->Ls;
+  std::vector<Coeff_t> diag (beo);
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
+  for(int s=0;s<Ls;++s) {
+    const Coeff_t off_diag = -ceo[s];
+    upper[s]=off_diag;
+    lower[s]=off_diag;
+  }
+  // mass-dependent boundary entries
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}
+// Apply the coefficient-vector M5D to psi with diagonal bee and both
+// off-diagonals -cee, where the last upper entry and the first lower entry
+// are further multiplied by -mass.  The result is written to chi.
+template<class Impl>
+void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+{
+  const int Ls=this->Ls;
+  std::vector<Coeff_t> diag (bee);
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
+  for(int s=0;s<Ls;++s) {
+    const Coeff_t off_diag = -cee[s];
+    upper[s]=off_diag;
+    lower[s]=off_diag;
+  }
+  // mass-dependent boundary entries
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}
+// Daggered Mooee: assembles the off-diagonals from the neighbouring cee
+// entries (upper[s] from cee[s+1], lower[s] from cee[s-1], with the two
+// boundary entries carrying the mass), conjugates all three coefficient
+// vectors and calls the coefficient-vector M5Ddag.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+{
+  const int Ls=this->Ls;
+  std::vector<Coeff_t> diag (bee);
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
+
+  for (int s=0;s<Ls;++s){
+    // Assemble row s of the 5d matrix
+    if ( s==0 ) {
+      upper[s] = -cee[s+1] ;
+      lower[s] = mass*cee[Ls-1];
+    } else if ( s==(Ls-1)) {
+      upper[s] = mass*cee[0];
+      lower[s] = -cee[s-1];
+    } else {
+      upper[s]=-cee[s+1];
+      lower[s]=-cee[s-1];
+    }
+    // ... and conjugate it; entry s of each vector is final at this point
+    diag[s] =conjugate(diag[s]);
+    upper[s]=conjugate(upper[s]);
+    lower[s]=conjugate(lower[s]);
+  }
+  M5Ddag(psi,psi,chi,lower,diag,upper);
+}
+
+// Two-argument M5Ddag: calls the coefficient-vector overload with a unit
+// diagonal and -1 on both off-diagonals, where the last upper entry and the
+// first lower entry are multiplied by -mass.  chi is passed both as the
+// second field argument and as the output.
+template<class Impl>
+void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
+{
+  const int Ls=this->Ls;
+  std::vector<Coeff_t> lower(Ls,-1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> diag (Ls, 1.0);
+  // mass-dependent boundary entries
+  lower[0]   =-mass*lower[0];
+  upper[Ls-1]=-mass*upper[Ls-1];
+  M5Ddag(psi,chi,chi,lower,diag,upper);
+}
+
+// Daggered Meooe5D: assembles the off-diagonals from the neighbouring cs
+// entries (upper[s] from cs[s+1], lower[s] from cs[s-1], with the two
+// boundary entries multiplied by -mass), conjugates the three coefficient
+// vectors and calls the coefficient-vector M5Ddag, writing to Din.
+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
+{
+  const int Ls=this->Ls;
+  std::vector<Coeff_t> diag (bs);
+  std::vector<Coeff_t> upper(cs);
+  std::vector<Coeff_t> lower(cs);
+
+  // Assemble the shifted off-diagonals
+  for (int s=0;s<Ls;++s){
+    if ( s== 0 ) {
+      upper[s] = cs[s+1];
+      lower[s] =-mass*cs[Ls-1];
+    } else if ( s==(Ls-1) ) {
+      upper[s] =-mass*cs[0];
+      lower[s] = cs[s-1];
+    } else {
+      upper[s] = cs[s+1];
+      lower[s] = cs[s-1];
+    }
+  }
+  // Conjugate the terms
+  for (int s=0;s<Ls;++s){
+    upper[s] = conjugate(upper[s]);
+    lower[s] = conjugate(lower[s]);
+    diag[s]  = conjugate(diag[s]);
+  }
+  M5Ddag(psi,psi,Din,lower,diag,upper);
+}
+
+// Apply the operator to psi, writing the result to chi.
+// Steps: Din = Meooe5D(psi); chi = DW(Din); chi = chi + psi; then the
+// two-argument M5D(psi, chi) is called on the result.
+// Returns norm2(chi).
+template<class Impl>
+RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+{
+  FermionField Din(psi._grid);
+  
+  // Assemble Din
+  Meooe5D(psi,Din);
+  
+  this->DW(Din,chi,DaggerNo);
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby(chi,1.0,1.0,chi,psi); 
+  
+  M5D(psi,chi);
+  return(norm2(chi));
+}
+
+// Apply the adjoint operator to psi, writing the result to chi.
+// Steps: DW with DaggerYes on psi, MeooeDag5D on that result into chi, the
+// two-argument M5Ddag(psi, chi), and finally chi = chi + psi.
+// Returns norm2(chi).
+//
+// Under adjoint
+//   D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
+//   D2- P+     D2+            P-D1-^dag D2+dag
+template<class Impl>
+RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
+{
+  // Apply Dw
+  FermionField Dwdag_psi(psi._grid);
+  this->DW(psi,Dwdag_psi,DaggerYes);
+
+  MeooeDag5D(Dwdag_psi,chi);
+
+  M5Ddag(psi,chi);
+
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby (chi,1.0,1.0,chi,psi);
+
+  return norm2(chi);
+}
+
+// half checkerboard operations
+// Checkerboarded hopping term: Meooe5D(psi) is stored in the scratch field
+// this->tmp(), then DhopEO is applied to it if psi is on the Odd
+// checkerboard and DhopOE otherwise.  The result is written to chi.
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+{
+  Meooe5D(psi,this->tmp()); 
+
+  if ( psi.checkerboard == Odd ) {
+    this->DhopEO(this->tmp(),chi,DaggerNo);
+  } else {
+    this->DhopOE(this->tmp(),chi,DaggerNo);
+  }
+}
+
+// Daggered checkerboarded hopping term: the 4d dslash is applied with
+// DaggerYes into the scratch field this->tmp() (DhopEO if psi is on the Odd
+// checkerboard, DhopOE otherwise), then MeooeDag5D maps it to chi.
+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+{
+  // Apply 4d dslash
+  if ( psi.checkerboard != Odd ) {
+    this->DhopOE(psi,this->tmp(),DaggerYes);
+  } else {
+    this->DhopEO(psi,this->tmp(),DaggerYes);
+  }
+
+  MeooeDag5D(this->tmp(),chi);
+}
+
+// Single-direction piece: Meo5D(psi) is stored in the scratch field
+// this->tmp(), then DhopDir is applied to it for the given dir and disp.
+// The result is written to chi.
+template<class Impl>
+void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp)
+{
+  Meo5D(psi,this->tmp());
+
+  // Apply 4d dslash fragment
+  this->DhopDir(this->tmp(),chi,dir,disp);
+}
+// force terms; five routines; default to Dhop on diagonal
+// Force term for the full operator.  Meooe5D is applied to V (dag ==
+// DaggerNo) or to U (otherwise), and the result replaces that argument in
+// the call to DhopDeriv, which writes to mat.
+template<class Impl>
+void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField D5in(V._grid);
+
+  if ( dag != DaggerNo ) {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,D5in);
+    this->DhopDeriv(mat,D5in,V,dag);
+    return;
+  }
+
+  //      U d/du [D_w D5] V = U d/du DW D5 V
+  Meooe5D(V,D5in);
+  this->DhopDeriv(mat,U,D5in,dag);
+}
+// Force term, OE variant.  Meooe5D is applied to V (dag == DaggerNo) or to
+// U (otherwise), and the result replaces that argument in the call to
+// DhopDerivOE, which writes to mat.
+template<class Impl>
+void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField D5in(V._grid);
+
+  if ( dag != DaggerNo ) {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,D5in);
+    this->DhopDerivOE(mat,D5in,V,dag);
+    return;
+  }
+
+  //      U d/du [D_w D5] V = U d/du DW D5 V
+  Meooe5D(V,D5in);
+  this->DhopDerivOE(mat,U,D5in,dag);
+}
+// Force term, EO variant.  Meooe5D is applied to V (dag == DaggerNo) or to
+// U (otherwise), and the result replaces that argument in the call to
+// DhopDerivEO, which writes to mat.
+template<class Impl>
+void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField D5in(V._grid);
+
+  if ( dag != DaggerNo ) {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,D5in);
+    this->DhopDerivEO(mat,D5in,V,dag);
+    return;
+  }
+
+  //      U d/du [D_w D5] V = U d/du DW D5 V
+  Meooe5D(V,D5in);
+  this->DhopDerivEO(mat,U,D5in,dag);
+}
+  
+// Tanh
+// Copy the first Ls entries of zdata->gamma into a Coeff_t vector and pass
+// it, with b and c, to SetCoefficientsInternal using zolo_hi = 1.0.
+// zdata is dereferenced without a null check.
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  const int Ls = this->Ls;
+  std::vector<Coeff_t> gamma(Ls);
+  for(int s=0;s<Ls;++s) {
+    gamma[s] = zdata->gamma[s];
+  }
+  SetCoefficientsInternal(1.0,gamma,b,c);
+}
+//Zolo
+// Copy the first Ls entries of zdata->gamma into a Coeff_t vector and pass
+// it, with zolo_hi, b and c, to SetCoefficientsInternal.
+// zdata is dereferenced without a null check.
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  const int Ls = this->Ls;
+  std::vector<Coeff_t> gamma(Ls);
+  for(int s=0;s<Ls;++s) {
+    gamma[s] = zdata->gamma[s];
+  }
+  SetCoefficientsInternal(zolo_hi,gamma,b,c);
+}
+// Common coefficient setup behind SetCoefficientsTanh (scale 1.0) and SetCoefficientsZolotarev (scale zolo_hi)
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
+{
+  int Ls=this->Ls;
+
+  ///////////////////////////////////////////////////////////
+  // The Cayley coeffs (unprec)
+  ///////////////////////////////////////////////////////////
+  assert(gamma.size()==Ls);
+
+  omega.resize(Ls);
+  bs.resize(Ls);
+  cs.resize(Ls);
+  as.resize(Ls);
+  
+  // 
+  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
+  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
+  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
+  //
+  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
+  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
+  //
+  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
+  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
+  //
+  // So 
+  //
+  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
+  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  //
+  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
+  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  // 
+  // Per slice: as=1, omega=gamma*zolo_hi (asserted non-zero), then bs/cs from the formulae above
+  double bpc = b+c;
+  double bmc = b-c;
+  for(int i=0; i < Ls; i++){
+    as[i] = 1.0;
+    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    assert(omega[i]!=Coeff_t(0.0));
+    bs[i] = 0.5*(bpc/omega[i] + bmc);
+    cs[i] = 0.5*(bpc/omega[i] - bmc);
+  }
+
+  ////////////////////////////////////////////////////////
+  // Constants for the preconditioned matrix Cayley form
+  ////////////////////////////////////////////////////////
+  bee.resize(Ls);
+  cee.resize(Ls);
+  beo.resize(Ls);
+  ceo.resize(Ls);
+  // bee and cee combine bs/cs with (4.0-M5); beo and ceo are as*bs and -as*cs
+  for(int i=0;i<Ls;i++){
+    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
+    assert(bee[i]!=Coeff_t(0.0));
+    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
+    beo[i]=as[i]*bs[i];
+    ceo[i]=-as[i]*cs[i];
+  }
+  aee.resize(Ls); // aee/aeo are filled below as plain copies of cee/ceo
+  aeo.resize(Ls);
+  for(int i=0;i<Ls;i++){
+    aee[i]=cee[i];
+    aeo[i]=ceo[i];
+  }
+  
+  //////////////////////////////////////////
+  // LDU decomposition of eeoo
+  //////////////////////////////////////////
+  dee.resize(Ls);
+  lee.resize(Ls);
+  leem.resize(Ls);
+  uee.resize(Ls);
+  ueem.resize(Ls);
+  
+  for(int i=0;i<Ls;i++){
+    
+    dee[i] = bee[i];
+    
+    if ( i < Ls-1 ) {
+
+      assert(bee[i]!=Coeff_t(0.0));
+      assert(bee[0]!=Coeff_t(0.0));
+      
+      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
+      
+      leem[i]=mass*cee[Ls-1]/bee[0];
+      for(int j=0;j<i;j++) {
+	assert(bee[j+1]!=Coeff_t(0.0));
+	leem[i]*= aee[j]/bee[j+1];
+      }
+      
+      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
+      
+      ueem[i]=mass;
+      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
+      ueem[i]*= aee[0]/bee[0];
+      
+    } else { 
+      lee[i] =0.0;
+      leem[i]=0.0;
+      uee[i] =0.0;
+      ueem[i]=0.0;
+    }
+  }
+  // Add the mass-dependent product mass*cee[Ls-1]*prod_{j<Ls-1}(cee[j]/bee[j]) to dee[Ls-1]
+  { 
+    Coeff_t delta_d=mass*cee[Ls-1];
+    for(int j=0;j<Ls-1;j++) {
+      assert(bee[j] != Coeff_t(0.0));
+      delta_d *= cee[j]/bee[j];
+    }
+    dee[Ls-1] += delta_d;
+  }  
+  // Fill MatpInv/MatmInv (dag=0) and MatpInvDag/MatmInvDag (dag=1), both with inv=1
+  int inv=1;
+  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
+  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
+}
+// Builds the Ls x Ls Pplus/Pminus coefficient matrices (inverted when inv!=0, adjointed when dag!=0)
+// and packs them into Matp/Matm as SIMD words; returns early, leaving Matp/Matm untouched, when LLs==Ls.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
+						 Vector<iSinglet<Simd> > & Matp,
+						 Vector<iSinglet<Simd> > & Matm)
+{
+  int Ls=this->Ls;
+
+  GridBase *grid = this->FermionRedBlackGrid();
+  int LLs = grid->_rdimensions[0];
+
+  if ( LLs == Ls ) {
+    return; // Not vectorised in 5th direction
+  }
+
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+  // Diagonal of both matrices: bee[s]
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  // Pminus super-diagonal: -cee[s]
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  // Pplus sub-diagonal: -cee[s+1]; the two corner entries below carry the mass factor
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
+  
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+  // Pack: entry [LLs*s2+s1] holds, in SIMD lane l, the matrix element (l*LLs+s1, s2)
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd=Simd::Nsimd();
+  Matp.resize(Ls*LLs);
+  Matm.resize(Ls*LLs);
+
+  for(int s2=0;s2<Ls;s2++){
+  for(int s1=0;s1<LLs;s1++){
+    int istride = LLs;
+    int ostride = 1;
+    Simd Vp;
+    Simd Vm;
+    scalar_type *sp = (scalar_type *)&Vp;
+    scalar_type *sm = (scalar_type *)&Vm;
+    for(int l=0;l<Nsimd;l++){
+      if ( switcheroo<Coeff_t>::iscomplex() ) {
+	sp[l] = PplusMat (l*istride+s1*ostride,s2);
+	sm[l] = PminusMat(l*istride+s1*ostride,s2);
+      } else { 
+      // real Coeff_t: only the real part is kept, written into both components of the scalar
+	scalar_type tmp;
+	tmp = PplusMat (l*istride+s1*ostride,s2);
+	sp[l] = scalar_type(tmp.real(),tmp.real());
+	tmp = PminusMat(l*istride+s1*ostride,s2);
+	sm[l] = scalar_type(tmp.real(),tmp.real());
+      }
+    }
+    Matp[LLs*s2+s1] = Vp;
+    Matm[LLs*s2+s1] = Vm;
+  }}
+}
+
+
+  FermOpTemplateInstantiate(CayleyFermion5D);
+  GparityFermOpTemplateInstantiate(CayleyFermion5D);
+
+}}
+
+
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -0,0 +1,209 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_CAYLEY_FERMION_H
+#define  GRID_QCD_CAYLEY_FERMION_H
+
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+     // Real-versus-complex dispatch keyed on the coefficient type T.
+     // Primary template (any T without a specialisation below): reports "not complex"
+     // and multiplies through real_mult.
+     template<typename T> struct switcheroo   {
+       static inline int iscomplex()  { return 0; }
+       template<typename vec>
+       static inline vec mult(vec a, vec b) { return real_mult(a,b); }
+     };
+
+     // ComplexD coefficients: reports "complex" and multiplies with the plain product a*b.
+     template<> struct switcheroo<ComplexD> {
+       static inline int iscomplex()  { return 1; }
+       template<typename vec>
+       static inline vec mult(vec a, vec b) { return a*b; }
+     };
+
+     // ComplexF coefficients: same behaviour as the ComplexD specialisation.
+     template<> struct switcheroo<ComplexF> {
+       static inline int iscomplex()  { return 1; }
+       template<typename vec>
+       static inline vec mult(vec a, vec b) { return a*b; }
+     };
+     // (end of switcheroo family)
+
+
+    template<class Impl>
+    class CayleyFermion5D : public WilsonFermion5D<Impl>
+    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+    public:
+      // Cayley-form 5d operator layered on WilsonFermion5D; the per-slice coefficient vectors are the members below
+      // override multiply
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+
+      // half checkerboard operations
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
+
+      virtual void   M5D   (const FermionField &psi, FermionField &chi);
+      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
+
+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
+      virtual void   Dminus(const FermionField &psi, FermionField &chi);
+      virtual void   DminusDag(const FermionField &psi, FermionField &chi);
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
+      virtual void ImportUnphysicalFermion(const FermionField &solution5d, FermionField &exported4d);
+      // NOTE(review): ImportUnphysicalFermion reuses the Export parameter names (solution5d, exported4d) - confirm intended
+      /////////////////////////////////////////////////////
+      // Instantiate different versions depending on Impl
+      /////////////////////////////////////////////////////
+      void M5D(const FermionField &psi,
+	       const FermionField &phi,
+	       FermionField &chi,
+	       std::vector<Coeff_t> &lower,
+	       std::vector<Coeff_t> &diag,
+	       std::vector<Coeff_t> &upper);
+
+      void M5Ddag(const FermionField &psi,
+		  const FermionField &phi,
+		  FermionField &chi,
+		  std::vector<Coeff_t> &lower,
+		  std::vector<Coeff_t> &diag,
+		  std::vector<Coeff_t> &upper);
+
+      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
+      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
+
+      void MooeeInternalAsm(const FermionField &in, FermionField &out,
+			    int LLs, int site,
+			    Vector<iSinglet<Simd> > &Matp,
+			    Vector<iSinglet<Simd> > &Matm);
+      void MooeeInternalZAsm(const FermionField &in, FermionField &out,
+			    int LLs, int site,
+			    Vector<iSinglet<Simd> > &Matp,
+			    Vector<iSinglet<Simd> > &Matm);
+
+      // Pure virtual: this class is abstract and cannot be instantiated directly
+      virtual void   Instantiatable(void)=0;
+
+      // force terms; three derivative routines are declared here (original note: five routines; default to Dhop on diagonal)
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+      // Efficient support for multigrid coarsening
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+
+      void   Meooe5D       (const FermionField &in, FermionField &out);
+      void   MeooeDag5D    (const FermionField &in, FermionField &out);
+
+      //    protected:  (commented out, so mass and the coefficient members below remain public)
+      RealD mass;
+
+      // Cayley form Moebius (tanh and zolotarev)
+      std::vector<Coeff_t> omega;
+      std::vector<Coeff_t> bs;    // S dependent coeffs
+      std::vector<Coeff_t> cs;
+      std::vector<Coeff_t> as;
+      // For preconditioning Cayley form
+      std::vector<Coeff_t> bee;
+      std::vector<Coeff_t> cee;
+      std::vector<Coeff_t> aee;
+      std::vector<Coeff_t> beo;
+      std::vector<Coeff_t> ceo;
+      std::vector<Coeff_t> aeo;
+      // LDU factorisation of the eeoo matrix
+      std::vector<Coeff_t> lee;
+      std::vector<Coeff_t> leem;
+      std::vector<Coeff_t> uee;
+      std::vector<Coeff_t> ueem;
+      std::vector<Coeff_t> dee;
+
+      // Matrices of 5d ee inverse params
+      Vector<iSinglet<Simd> >  MatpInv;
+      Vector<iSinglet<Simd> >  MatmInv;
+      Vector<iSinglet<Simd> >  MatpInvDag;
+      Vector<iSinglet<Simd> >  MatmInvDag;
+
+      // Constructors
+      CayleyFermion5D(GaugeField &_Umu,
+		      GridCartesian         &FiveDimGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridCartesian         &FourDimGrid,
+		      GridRedBlackCartesian &FourDimRedBlackGrid,
+		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
+
+
+      // Bookkeeping: M5Dcalls/M5Dtime and MooeeInvCalls/MooeeInvTime are updated in CayleyFermion5Dcache.cc
+     void CayleyReport(void);
+     void CayleyZeroCounters(void);
+
+     double M5Dflops;
+     double M5Dcalls;
+     double M5Dtime;
+
+     double MooeeInvFlops;
+     double MooeeInvCalls;
+     double MooeeInvTime;
+
+    protected:
+      virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
+      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
+      virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
+    };
+
+  }
+}
+#define INSTANTIATE_DPERP(A)\
+template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
+template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
+template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
+template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
+// INSTANTIATE_DPERP(A) explicitly instantiates M5D, M5Ddag, MooeeInv and MooeeInvDag for A; the switches below gate which .cc file uses it
+#undef  CAYLEY_DPERP_DENSE
+#define  CAYLEY_DPERP_CACHE
+#undef  CAYLEY_DPERP_LINALG
+#define CAYLEY_DPERP_VEC
+
+#endif
--- a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -0,0 +1,249 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // Pminus forwards
+  // Pplus  backwards..
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
+{
+  // Applies, slice by slice within each block of Ls consecutive outer sites,
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
+  // where s+1 and s-1 wrap around inside the block.
+  //   psi   : field supplying the projected s-neighbours
+  //   phi   : field supplying the diagonal term; must share psi's checkerboard
+  //   chi   : output; its checkerboard is set to psi's
+  //   lower, diag, upper : per-slice coefficients, indexed by s
+  // Also bumps M5Dcalls and accumulates the elapsed usecond() time into M5Dtime.
+  const int Ls = this->Ls;
+  GridBase *grid = psi._grid;
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard = psi.checkerboard;
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime -= usecond();
+
+  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    // Scratch spinor: one copy per block instead of one per slice, matching M5Ddag.
+    auto tmp = psi._odata[0];
+    for(int s=0;s<Ls;s++){
+      // Wrapped neighbour indices replace three copies of the same update; computing
+      // them this way also keeps both reads inside the block when Ls==1.
+      const int s_up = (s==Ls-1) ? 0    : s+1;
+      const int s_dn = (s==0)    ? Ls-1 : s-1;
+
+      spProj5m(tmp,psi._odata[ss+s_up]);
+      chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+      spProj5p(tmp,psi._odata[ss+s_dn]);
+      chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+    }
+  }
+  M5Dtime+=usecond();
+}
+
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
+{
+  // Daggered counterpart of M5D: within each block of Ls consecutive outer sites,
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
+  // i.e. the two projectors are exchanged relative to M5D; s+1 and s-1 wrap inside the block.
+  //   psi   : field supplying the projected s-neighbours
+  //   phi   : field supplying the diagonal term; must share psi's checkerboard
+  //   chi   : output; its checkerboard is set to psi's
+  //   lower, diag, upper : per-slice coefficients, indexed by s (used as passed in)
+  // Also bumps M5Dcalls and accumulates the elapsed usecond() time into M5Dtime.
+  const int Ls = this->Ls;
+  GridBase *grid = psi._grid;
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard = psi.checkerboard;
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime -= usecond();
+
+  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    // Scratch spinor reused by every slice of the block.
+    auto tmp = psi._odata[0];
+    for(int s=0;s<Ls;s++){
+      // Wrapped neighbour indices replace three copies of the same update; computing
+      // them this way also keeps both reads inside the block when Ls==1.
+      const int s_up = (s==Ls-1) ? 0    : s+1;
+      const int s_dn = (s==0)    ? Ls-1 : s-1;
+
+      spProj5p(tmp,psi._odata[ss+s_up]);
+      chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+      spProj5m(tmp,psi._odata[ss+s_dn]);
+      chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+    }
+  }
+  M5Dtime+=usecond();
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  GridBase *grid=psi._grid;
+  int Ls=this->Ls;
+  // Applies the inverse of the 5d ee/oo block to psi through the LDU factors lee, leem, dee, ueem, uee
+  chi.checkerboard=psi.checkerboard;
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    auto tmp = psi._odata[0];
+    // tmp is scratch for the projected neighbour; chi is updated in place, so each sweep reads the previous sweep's output
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+    // Apply (L^{\prime})^{-1}
+    chi[ss]=psi[ss]; // chi[0]=psi[0]
+    for(int s=1;s<Ls;s++){
+                            spProj5p(tmp,chi[ss+s-1]);  
+      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
+    }
+    // L_m^{-1} 
+    for (int s=0;s<Ls-1;s++){ // chi[Ls-1] -= leem[s] * spProj5m(chi[s]), accumulated over s<Ls-1
+                                   spProj5m(tmp,chi[ss+s]);    
+      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
+    }
+    // U_m^{-1} D^{-1}
+    for (int s=0;s<Ls-1;s++){
+      // chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1]) * spProj5p(chi[Ls-1])
+                                                spProj5p(tmp,chi[ss+Ls-1]); 
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
+      
+    // Apply U^{-1}
+    for (int s=Ls-2;s>=0;s--){
+                            spProj5m(tmp,chi[ss+s+1]);  
+      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
+    }
+  }
+
+  MooeeInvTime+=usecond();
+
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  // Daggered inverse of the 5d ee/oo block: the same four in-place sweeps as MooeeInv, run with
+  // conjugate()'d factors, the L and U factors exchanged and spProj5p/spProj5m swapped.
+  GridBase *grid=psi._grid;
+  const int Ls=this->Ls;
+  // psi is the only input field and chi is pure output, so there is no second checkerboard
+  // to validate here; chi simply inherits psi's.
+  chi.checkerboard=psi.checkerboard;
+
+  // Conjugated copies of the factors, built once outside the site loop.
+  std::vector<Coeff_t> ueec(Ls);
+  std::vector<Coeff_t> deec(Ls);
+  std::vector<Coeff_t> leec(Ls);
+  std::vector<Coeff_t> ueemc(Ls);
+  std::vector<Coeff_t> leemc(Ls);
+  for(int s=0;s<Ls;s++){
+    ueec[s] = conjugate(uee[s]);
+    deec[s] = conjugate(dee[s]);
+    leec[s] = conjugate(lee[s]);
+    ueemc[s]= conjugate(ueem[s]);
+    leemc[s]= conjugate(leem[s]);
+  }
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    auto tmp = psi._odata[0]; // scratch for the projected neighbour
+
+    // Apply (U^{\prime})^{-dagger}
+    chi[ss]=psi[ss];
+    for (int s=1;s<Ls;s++){
+      spProj5m(tmp,chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s]-ueec[s-1]*tmp;
+    }
+    // U_m^{-\dagger}
+    for (int s=0;s<Ls-1;s++){
+      spProj5p(tmp,chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp;
+    }
+
+    // L_m^{-\dagger} D^{-dagger}
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(tmp,chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/deec[s])*chi[ss+s]-(leemc[s]/deec[Ls-1])*tmp;
+    }
+    chi[ss+Ls-1]= (1.0/deec[Ls-1])*chi[ss+Ls-1];
+
+    // Apply L^{-dagger}
+    for (int s=Ls-2;s>=0;s--){
+      spProj5p(tmp,chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - leec[s]*tmp;
+    }
+  }
+  MooeeInvTime+=usecond();
+}
+
+#ifdef CAYLEY_DPERP_CACHE
+  INSTANTIATE_DPERP(WilsonImplF);
+  INSTANTIATE_DPERP(WilsonImplD);
+  INSTANTIATE_DPERP(GparityWilsonImplF);
+  INSTANTIATE_DPERP(GparityWilsonImplD);
+  INSTANTIATE_DPERP(ZWilsonImplF);
+  INSTANTIATE_DPERP(ZWilsonImplD);
+
+  INSTANTIATE_DPERP(WilsonImplFH);
+  INSTANTIATE_DPERP(WilsonImplDF);
+  INSTANTIATE_DPERP(GparityWilsonImplFH);
+  INSTANTIATE_DPERP(GparityWilsonImplDF);
+  INSTANTIATE_DPERP(ZWilsonImplFH);
+  INSTANTIATE_DPERP(ZWilsonImplDF);
+#endif
+
+}}
--- a/Grid/qcd/action/fermion/CayleyFermion5Ddense.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Ddense.cc
@@ -0,0 +1,156 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+namespace Grid {
+namespace QCD {
+  /*
+   * Dense matrix versions of routines
+   */
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{ // Thin wrapper: forwards to MooeeInternal with dag=DaggerYes and inv=InverseYes
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
+}
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{ // Thin wrapper: forwards to MooeeInternal with dag=DaggerNo and inv=InverseYes
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  int Ls=this->Ls;
+  int LLs = psi._grid->_rdimensions[0];
+  int vol = psi._grid->oSites()/LLs;
+  // Dense path: builds Ls x Ls Pplus/Pminus matrices from bee, cee and mass, then applies them site by site
+  chi.checkerboard=psi.checkerboard;
+  
+  assert(Ls==LLs); // this routine only handles the non-vectorised s-direction (see the loop comment below)
+  // NOTE(review): real MatrixXd here, whereas MooeeInternalCompute uses MatrixXcd - confirm complex Coeff_t is not expected on this path
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+  
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple
+  // Serial loop over the vol = oSites()/LLs sites; zero matrix entries are skipped
+  for(auto site=0;site<vol;site++){
+    
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+    
+    for(int s1=0;s1<Ls;s1++){
+      SiteChi =zero;
+      for(int s2=0;s2<Ls;s2++){
+	int lex2 = s2+Ls*site;
+	// spProj5p half of psi[s2], weighted by PplusMat(s1,s2), accumulated via accumRecon5p
+	if ( PplusMat(s1,s2) != 0.0 ) {
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi,PplusMat (s1,s2)*SitePplus);
+	}
+	// spProj5m half of psi[s2], weighted by PminusMat(s1,s2), accumulated via accumRecon5m
+	if ( PminusMat(s1,s2) != 0.0 ) {
+	  spProj5m(SitePminus,psi[lex2]);
+	  accumRecon5m(SiteChi,PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      chi[s1+Ls*site] = SiteChi*0.5;
+    }
+  }
+}
+
+#ifdef CAYLEY_DPERP_DENSE
+INSTANTIATE_DPERP(GparityWilsonImplF);
+INSTANTIATE_DPERP(GparityWilsonImplD);
+INSTANTIATE_DPERP(WilsonImplF);
+INSTANTIATE_DPERP(WilsonImplD);
+INSTANTIATE_DPERP(ZWilsonImplF);
+INSTANTIATE_DPERP(ZWilsonImplD);
+
+template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+INSTANTIATE_DPERP(GparityWilsonImplFH);
+INSTANTIATE_DPERP(GparityWilsonImplDF);
+INSTANTIATE_DPERP(WilsonImplFH);
+INSTANTIATE_DPERP(WilsonImplDF);
+INSTANTIATE_DPERP(ZWilsonImplFH);
+INSTANTIATE_DPERP(ZWilsonImplDF);
+
+template void CayleyFermion5D<GparityWilsonImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<GparityWilsonImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZWilsonImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZWilsonImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+#endif
+
+}}
--- a/Grid/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dssp.cc
@@ -0,0 +1,164 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+  // Pminus forwards
+  // Pplus  backwards
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
+{
+  // For every slice s: chi[s] is built from diag[s]*phi[s], upper[s] times the pminus-projected
+  // psi[s+1], and lower[s] times the pplus-projected psi[s-1], with s+1 and s-1 wrapping in [0,Ls).
+  // Presumably axpby_ssp_pX(z,a,x,b,y,s,sp) sets z[s] = a*x[s] + b*P_X y[sp] -- TODO confirm.
+  Coeff_t one(1.0);
+  int Ls=this->Ls;
+
+  for(int s=0;s<Ls;s++){
+    // Wrapped neighbours: one code path instead of three identical branches, and no
+    // reference to slice s+1==Ls when Ls==1.
+    const int s_up = (s==Ls-1) ? 0    : s+1;
+    const int s_dn = (s==0)    ? Ls-1 : s-1;
+    axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s_up);
+    axpby_ssp_pplus (chi,one,chi,lower[s],psi,s,s_dn);
+  }
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
+{
+  // Daggered form: for every slice s, chi[s] is built from diag[s]*phi[s], upper[s] times the
+  // pplus-projected psi[s+1], and lower[s] times the pminus-projected psi[s-1] (wrapping in [0,Ls)).
+  // Presumably axpby_ssp_pX(z,a,x,b,y,s,sp) sets z[s] = a*x[s] + b*P_X y[sp] -- TODO confirm.
+  Coeff_t one(1.0);
+  int Ls=this->Ls;
+
+  for(int s=0;s<Ls;s++){
+    // Wrapped neighbours: one code path instead of three identical branches, and no
+    // reference to slice s+1==Ls when Ls==1.
+    const int s_up = (s==Ls-1) ? 0    : s+1;
+    const int s_dn = (s==0)    ? Ls-1 : s-1;
+    axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s_up);
+    axpby_ssp_pminus(chi,one,chi,lower[s],psi,s,s_dn);
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{ // Inverse of the 5d ee/oo block, applied slice-wise with the LDU factors lee, leem, dee, ueem, uee
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.checkerboard=psi.checkerboard;
+  int Ls=this->Ls;
+  // Apply (L^{\prime})^{-1}
+  axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0]
+  for (int s=1;s<Ls;s++){
+    axpby_ssp_pplus(chi,one,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+  }
+  // L_m^{-1} 
+  for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+    axpby_ssp_pminus(chi,one,chi,-leem[s],chi,Ls-1,s);
+  }
+  // U_m^{-1} D^{-1}
+  for (int s=0;s<Ls-1;s++){
+    // slice s: coefficient 1/dee[s] on chi[s], coefficient -ueem[s]/dee[Ls-1] on the pplus term from slice Ls-1
+    axpby_ssp_pplus(chi,one/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
+  }	
+  axpby_ssp(chi,one/dee[Ls-1],chi,czero,chi,Ls-1,Ls-1); // last slice scaled by 1/dee[Ls-1] (original note: "Modest avoidable")
+  // chi is updated in place throughout, so the sweeps above and below must run in this order
+  // Apply U^{-1}
+  for (int s=Ls-2;s>=0;s--){
+    axpby_ssp_pminus (chi,one,chi,-uee[s],chi,s,s+1);  // walks s from Ls-2 down to 0, reading slice s+1
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{ // Daggered inverse of the 5d ee/oo block: same sweep structure as MooeeInv with conjugate()'d factors
+  Coeff_t one(1.0);
+  Coeff_t czero(0.0);
+  chi.checkerboard=psi.checkerboard;
+  int Ls=this->Ls;
+  // Apply (U^{\prime})^{-dagger}
+  axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0]
+  for (int s=1;s<Ls;s++){
+    axpby_ssp_pminus(chi,one,psi,-conjugate(uee[s-1]),chi,s,s-1);
+  }
+  // U_m^{-\dagger} 
+  for (int s=0;s<Ls-1;s++){
+    axpby_ssp_pplus(chi,one,chi,-conjugate(ueem[s]),chi,Ls-1,s);
+  }
+  // L_m^{-\dagger} D^{-dagger}
+  for (int s=0;s<Ls-1;s++){
+    axpby_ssp_pminus(chi,one/conjugate(dee[s]),chi,-conjugate(leem[s]/dee[Ls-1]),chi,s,Ls-1);
+  }	
+  axpby_ssp(chi,one/conjugate(dee[Ls-1]),chi,czero,chi,Ls-1,Ls-1); // last slice scaled by 1/conjugate(dee[Ls-1]) (original note: "Modest avoidable")
+  // chi is updated in place throughout, so the sweeps above and below must run in this order
+  // Apply L^{-dagger}
+  for (int s=Ls-2;s>=0;s--){
+    axpby_ssp_pplus (chi,one,chi,-conjugate(lee[s]),chi,s,s+1);  // walks s from Ls-2 down to 0, reading slice s+1
+  }
+}
+
+
+#ifdef CAYLEY_DPERP_LINALG
+  INSTANTIATE_DPERP(WilsonImplF);
+  INSTANTIATE_DPERP(WilsonImplD);
+  INSTANTIATE_DPERP(GparityWilsonImplF);
+  INSTANTIATE_DPERP(GparityWilsonImplD);
+  INSTANTIATE_DPERP(ZWilsonImplF);
+  INSTANTIATE_DPERP(ZWilsonImplD);
+
+  INSTANTIATE_DPERP(WilsonImplFH);
+  INSTANTIATE_DPERP(WilsonImplDF);
+  INSTANTIATE_DPERP(GparityWilsonImplFH);
+  INSTANTIATE_DPERP(GparityWilsonImplDF);
+  INSTANTIATE_DPERP(ZWilsonImplFH);
+  INSTANTIATE_DPERP(ZWilsonImplDF);
+#endif
+
+}
+}
--- a/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -0,0 +1,828 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+namespace Grid {
+namespace QCD {  
+  /*
+   * Dense matrix versions of routines
+   */
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  // Dense-matrix path: forward to MooeeInternal with both the dagger and the
+  // inverse flag set.
+  const int dag = DaggerYes;
+  const int inv = InverseYes;
+  this->MooeeInternal(psi,chi,dag,inv);
+}
+  
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  // Dense-matrix path: forward to MooeeInternal with the inverse flag set and
+  // the dagger flag clear.
+  const int dag = DaggerNo;
+  const int inv = InverseYes;
+  this->MooeeInternal(psi,chi,dag,inv);
+}
+// M5D for grids whose dimension 0 (the s direction) is split into LLs outer
+// sites times Nsimd lanes.  For each block of LLs consecutive outer sites and
+// each position v in the block the live (#else) branch writes, per colour:
+//   chi(v) spin 0,1 = diag*phi(v) spin 0,1 + lower*psi(v-1) spin 0,1
+//   chi(v) spin 2,3 = diag*phi(v) spin 2,3 + upper*psi(v+1) spin 2,3
+// where v-1 / v+1 wrap cyclically inside the block and the wrapped neighbour
+// has its SIMD lanes rotated.
+//   psi, phi : inputs, must share a checkerboard
+//   chi      : output, takes psi's checkerboard
+//   lower, diag, upper : coefficients indexed by the full s coordinate
+// Also bumps the M5Dcalls counter and accumulates wall time into M5Dtime.
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  const int nsimd= Simd::Nsimd();
+
+  // One SIMD word of coefficients per position in the block
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.checkerboard == psi.checkerboard);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  // Repack the per-s coefficients: lane i of block position o holds s = o + i*LLs
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs;
+    int ss = o*nsimd+i;
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+
+  M5Dcalls++;
+  M5Dtime-=usecond();
+
+  // The live branch below is unrolled by hand over exactly three colours
+  assert(Nc==3);
+
+  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+#if 0
+      // Disabled variant written with the generic spProj/spRecon helpers
+      alignas(64) SiteHalfSpinor hp;
+      alignas(64) SiteHalfSpinor hm;
+      alignas(64) SiteSpinor fp;
+      alignas(64) SiteSpinor fm;
+
+      for(int v=0;v<LLs;v++){
+
+	int vp=(v+1)%LLs;
+	int vm=(v+LLs-1)%LLs;
+
+	spProj5m(hp,psi[ss+vp]);
+	spProj5p(hm,psi[ss+vm]);
+
+	if ( vp<=v ) rotate(hp,hp,1);
+	if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	
+	hp=0.5*hp;
+        hm=0.5*hm;
+
+	spRecon5m(fp,hp);
+	spRecon5p(fm,hm);
+
+	chi[ss+v] = d[v]*phi[ss+v];
+	chi[ss+v] = chi[ss+v]     +u[v]*fp;
+	chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+      }
+#else
+      for(int v=0;v<LLs;v++){
+
+	// Prefetch the matching element of the next block.
+	// NOTE(review): on the final block ss+v+LLs is >= oSites(); confirm this is harmless.
+	vprefetch(psi[ss+v+LLs]);
+
+	// Cyclic neighbours inside the block
+	int vp= (v==LLs-1) ? 0     : v+1;
+	int vm= (v==0    ) ? LLs-1 : v-1;
+	
+	// Spin components 2,3 of the v+1 neighbour
+	Simd hp_00 = psi[ss+vp]()(2)(0); 
+	Simd hp_01 = psi[ss+vp]()(2)(1); 
+	Simd hp_02 = psi[ss+vp]()(2)(2); 
+	Simd hp_10 = psi[ss+vp]()(3)(0); 
+	Simd hp_11 = psi[ss+vp]()(3)(1); 
+	Simd hp_12 = psi[ss+vp]()(3)(2); 
+	
+	// Spin components 0,1 of the v-1 neighbour
+	Simd hm_00 = psi[ss+vm]()(0)(0); 
+	Simd hm_01 = psi[ss+vm]()(0)(1); 
+	Simd hm_02 = psi[ss+vm]()(0)(2); 
+	Simd hm_10 = psi[ss+vm]()(1)(0); 
+	Simd hm_11 = psi[ss+vm]()(1)(1); 
+	Simd hm_12 = psi[ss+vm]()(1)(2); 
+
+	// Wrapped past the end of the block: rotate the lanes of the v+1 neighbour.
+	// NOTE(review): tRotate<2> presumably shifts by one complex lane (two real words) -- confirm.
+	if ( vp<=v ) {
+	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+	}
+	// Wrapped past the start of the block: rotate the v-1 neighbour the other way
+	if ( vm>=v ) {
+	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+	}
+
+	// Can force these to real arithmetic and save 2x.
+	// Spin 0,1 take the lower term, spin 2,3 take the upper term
+	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
+	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
+	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+	// Write all twelve spin-colour components of the result
+	vstream(chi[ss+v]()(0)(0),p_00);
+	vstream(chi[ss+v]()(0)(1),p_01);
+	vstream(chi[ss+v]()(0)(2),p_02);
+	vstream(chi[ss+v]()(1)(0),p_10);
+	vstream(chi[ss+v]()(1)(1),p_11);
+	vstream(chi[ss+v]()(1)(2),p_12);
+	vstream(chi[ss+v]()(2)(0),p_20);
+	vstream(chi[ss+v]()(2)(1),p_21);
+	vstream(chi[ss+v]()(2)(2),p_22);
+	vstream(chi[ss+v]()(3)(0),p_30);
+	vstream(chi[ss+v]()(3)(1),p_31);
+	vstream(chi[ss+v]()(3)(2),p_32);
+
+      }
+#endif
+  }
+  M5Dtime+=usecond();
+}
+
+// M5Ddag for grids whose dimension 0 (the s direction) is split into LLs outer
+// sites times Nsimd lanes.  Relative to M5D the spin halves are swapped; for
+// each block of LLs consecutive outer sites and each position v the live
+// (#else) branch writes, per colour:
+//   chi(v) spin 0,1 = diag*phi(v) spin 0,1 + upper*psi(v+1) spin 0,1
+//   chi(v) spin 2,3 = diag*phi(v) spin 2,3 + lower*psi(v-1) spin 2,3
+// where v-1 / v+1 wrap cyclically inside the block and the wrapped neighbour
+// has its SIMD lanes rotated.
+//   psi, phi : inputs, must share a checkerboard
+//   chi      : output, takes psi's checkerboard
+//   lower, diag, upper : coefficients indexed by the full s coordinate
+// Also bumps the M5Dcalls counter and accumulates wall time into M5Dtime.
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  const int nsimd= Simd::Nsimd();
+
+  // One SIMD word of coefficients per position in the block
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.checkerboard == psi.checkerboard);
+  // The live branch below is unrolled by hand over exactly three colours,
+  // so guard it the same way M5D does.
+  assert(Nc==3);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  // Repack the per-s coefficients: lane i of block position o holds s = o + i*LLs
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs;
+    int ss = o*nsimd+i;
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+  M5Dcalls++;
+  M5Dtime-=usecond();
+  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+#if 0
+    // Disabled variant written with the generic spProj/spRecon helpers
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5p(hp,psi[ss+vp]);
+      spProj5m(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+      
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5p(fp,hp);
+      spRecon5m(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+#else
+      for(int v=0;v<LLs;v++){
+
+	// Prefetch the matching element of the next block.
+	// NOTE(review): on the final block ss+v+LLs is >= oSites(); confirm this is harmless.
+	vprefetch(psi[ss+v+LLs]);
+
+	// Cyclic neighbours inside the block
+	int vp= (v==LLs-1) ? 0     : v+1;
+	int vm= (v==0    ) ? LLs-1 : v-1;
+	
+	// Spin components 0,1 of the v+1 neighbour
+	Simd hp_00 = psi[ss+vp]()(0)(0); 
+	Simd hp_01 = psi[ss+vp]()(0)(1); 
+	Simd hp_02 = psi[ss+vp]()(0)(2); 
+	Simd hp_10 = psi[ss+vp]()(1)(0); 
+	Simd hp_11 = psi[ss+vp]()(1)(1); 
+	Simd hp_12 = psi[ss+vp]()(1)(2); 
+	
+	// Spin components 2,3 of the v-1 neighbour
+	Simd hm_00 = psi[ss+vm]()(2)(0); 
+	Simd hm_01 = psi[ss+vm]()(2)(1); 
+	Simd hm_02 = psi[ss+vm]()(2)(2); 
+	Simd hm_10 = psi[ss+vm]()(3)(0); 
+	Simd hm_11 = psi[ss+vm]()(3)(1); 
+	Simd hm_12 = psi[ss+vm]()(3)(2); 
+
+	// Wrapped past the end of the block: rotate the lanes of the v+1 neighbour.
+	// NOTE(review): tRotate<2> presumably shifts by one complex lane (two real words) -- confirm.
+	if ( vp<=v ) {
+	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+	}
+	// Wrapped past the start of the block: rotate the v-1 neighbour the other way
+	if ( vm>=v ) {
+	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+	}
+
+	// Spin 0,1 take the upper term, spin 2,3 take the lower term
+	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
+	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
+	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+
+	// Write all twelve spin-colour components of the result
+	vstream(chi[ss+v]()(0)(0),p_00);
+	vstream(chi[ss+v]()(0)(1),p_01);
+	vstream(chi[ss+v]()(0)(2),p_02);
+	vstream(chi[ss+v]()(1)(0),p_10);
+	vstream(chi[ss+v]()(1)(1),p_11);
+	vstream(chi[ss+v]()(1)(2),p_12);
+	vstream(chi[ss+v]()(2)(0),p_20);
+	vstream(chi[ss+v]()(2)(1),p_21);
+	vstream(chi[ss+v]()(2)(2),p_22);
+	vstream(chi[ss+v]()(3)(0),p_30);
+	vstream(chi[ss+v]()(3)(1),p_31);
+	vstream(chi[ss+v]()(3)(2),p_32);
+      }
+#endif
+  }
+  M5Dtime+=usecond();
+}
+
+
+#ifdef AVX512 
+#include <simd/Intel512common.h>
+#include <simd/Intel512avx.h>
+#include <simd/Intel512single.h>
+#endif 
+
+// Applies the dense s-space matrices Matp / Matm to one 4d site (real-coefficient path).
+// For every output position s1 in the block it accumulates over all source
+// positions s2 and SIMD lanes l:
+//   chi[s1 + LLs*site] spin 0,1 += Matp[LLs*s + s1] * broadcast_l( psi[s2 + LLs*site] spin 0,1 )
+//   chi[s1 + LLs*site] spin 2,3 += Matm[LLs*s + s1] * broadcast_l( psi[s2 + LLs*site] spin 2,3 )
+// with s = s2 + l*LLs.
+//   LLs  : number of outer sites per block in the s direction
+//   site : index of the 4d site (block) being processed
+// Without AVX512 a generic C++ loop is used; with AVX512 the same sums are
+// written as inline assembly.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
+					     int LLs, int site,
+					     Vector<iSinglet<Simd> > &Matp,
+					     Vector<iSinglet<Simd> > &Matm)
+{
+#ifndef AVX512
+  {
+  SiteHalfSpinor BcastP;
+  SiteHalfSpinor BcastM;
+  SiteHalfSpinor SiteChiP;
+  SiteHalfSpinor SiteChiM;
+
+  // Ls*Ls * 2 * 12 * vol flops
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+        int s=s2+l*LLs;
+	int lex=s2+LLs*site;
+	
+	// First term of the sum for this s1: reset the accumulators
+	if ( s2==0 && l==0) {
+	  SiteChiP=zero;
+	  SiteChiM=zero;
+	}
+	
+	// Broadcast lane l of the source: spin 0,1 into BcastP, spin 2,3 into BcastM
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	}}
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	}}
+
+	// NOTE(review): real_madd presumably treats the matrix entry as real -- confirm
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
+	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
+	}}
+
+    }}
+    {
+      // Sum for s1 complete: store both spin halves of the result
+      int lex = s1+LLs*site;
+      for(int sp=0;sp<2;sp++){
+      for(int co=0;co<Nc;co++){
+	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+      }}
+    }
+  }
+
+  }
+#else
+  {
+  // pointers
+    //  MASK_REGS;
+  // Register aliases: twelve accumulators (spin x colour) and twelve broadcast temporaries.
+  // NOTE(review): the accumulate asm statements declare no clobbers and the
+  // accumulators are expected to survive in zmm1..zmm12 from one asm statement
+  // to the next -- confirm the compiler cannot touch these registers in between.
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+#define BCAST0   %%zmm13
+#define BCAST1   %%zmm14
+#define BCAST2   %%zmm15
+#define BCAST3   %%zmm16
+#define BCAST4   %%zmm17
+#define BCAST5   %%zmm18
+#define BCAST6   %%zmm19
+#define BCAST7   %%zmm20
+#define BCAST8   %%zmm21
+#define BCAST9   %%zmm22
+#define BCAST10  %%zmm23
+#define BCAST11  %%zmm24
+
+  // Byte stride between the matrix entries of successive lanes: LLs*LLs elements,
+  // i.e. the same step as going from s to s+LLs in the index LLs*s+s1
+  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      int lex=s2+LLs*site;
+      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+      uint64_t a2 = (uint64_t)&psi[lex];
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	// First term (s2==0 and l==0): multiply to initialise the accumulators;
+	// every later term multiply-adds into them
+	if ( (s2+l)==0 ) {
+	  asm (
+  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
+  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
+  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
+		   VBCASTCDUP(0,%2,BCAST0)   
+		   VBCASTCDUP(1,%2,BCAST1)   
+		   VBCASTCDUP(2,%2,BCAST2)   
+		   VBCASTCDUP(3,%2,BCAST3)   
+		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
+		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
+		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
+		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
+		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
+		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
+		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
+		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
+		   VMULMEM (0,%1,BCAST8,Chi_22)         
+		   VMULMEM (0,%1,BCAST9,Chi_30)
+		   VMULMEM (0,%1,BCAST10,Chi_31)       
+		   VMULMEM (0,%1,BCAST11,Chi_32)
+		   : : "r" (a0), "r" (a1), "r" (a2)  );
+	} else { 
+	  asm (
+		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
+		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
+		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
+		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
+		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
+		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
+		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
+		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
+		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
+		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
+		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
+		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
+		   : : "r" (a0), "r" (a1), "r" (a2)  );
+	}
+	// Next lane: step both matrix pointers by incr and the source pointer by one scalar
+	a0 = a0+incr;
+	a1 = a1+incr;
+	a2 = a2+sizeof(typename Simd::scalar_type);
+      }}
+    {
+      // Sum for s1 complete: store the twelve accumulators to chi
+      int lexa = s1+LLs*site;
+      asm (
+	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
+	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
+	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
+	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
+	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+    }
+  }
+  }
+  // Drop the register aliases so they do not leak past this function
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+#endif
+};
+
+  // Z-mobius version
+// Applies the dense s-space matrices Matp / Matm to one 4d site (complex-coefficient path).
+// For every output position s1 in the block it accumulates over all source
+// positions s2 and SIMD lanes l:
+//   chi[s1 + LLs*site] spin 0,1 += Matp[LLs*s + s1] * broadcast_l( psi[s2 + LLs*site] spin 0,1 )
+//   chi[s1 + LLs*site] spin 2,3 += Matm[LLs*s + s1] * broadcast_l( psi[s2 + LLs*site] spin 2,3 )
+// with s = s2 + l*LLs.
+//   LLs  : number of outer sites per block in the s direction
+//   site : index of the 4d site (block) being processed
+// Without AVX512 a generic C++ loop is used; with AVX512 the same sums are
+// written as inline assembly.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
+					     int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
+{
+#ifndef AVX512
+  {
+  SiteHalfSpinor BcastP;
+  SiteHalfSpinor BcastM;
+  SiteHalfSpinor SiteChiP;
+  SiteHalfSpinor SiteChiM;
+
+  // Ls*Ls * 2 * 12 * vol flops
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+        int s=s2+l*LLs;
+	int lex=s2+LLs*site;
+	
+	// First term of the sum for this s1: reset the accumulators
+	if ( s2==0 && l==0) {
+	  SiteChiP=zero;
+	  SiteChiM=zero;
+	}
+	
+	// Broadcast lane l of the source: spin 0,1 into BcastP, spin 2,3 into BcastM
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	}}
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	}}
+
+	// Plain multiply and add here, where MooeeInternalAsm uses real_madd
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
+	  SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
+	}}
+
+
+    }}
+    {
+      // Sum for s1 complete: store both spin halves of the result
+      int lex = s1+LLs*site;
+      for(int sp=0;sp<2;sp++){
+      for(int co=0;co<Nc;co++){
+	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+      }}
+    }
+  }
+
+  }
+#else
+  {
+  // pointers
+  //  MASK_REGS;
+  // Register aliases.  Chi_* (single %) are used in the operand-less asm
+  // statements; pChi_* (double %%) name the same registers for the asm
+  // statement that takes an operand.
+  // NOTE(review): the accumulators are expected to survive in zmm0..zmm11 from
+  // one asm statement to the next and no clobbers are declared -- confirm the
+  // compiler cannot touch these registers in between.
+#define Chi_00 %zmm0
+#define Chi_01 %zmm1
+#define Chi_02 %zmm2
+#define Chi_10 %zmm3
+#define Chi_11 %zmm4
+#define Chi_12 %zmm5
+#define Chi_20 %zmm6
+#define Chi_21 %zmm7
+#define Chi_22 %zmm8
+#define Chi_30 %zmm9
+#define Chi_31 %zmm10
+#define Chi_32 %zmm11
+#define pChi_00 %%zmm0
+#define pChi_01 %%zmm1
+#define pChi_02 %%zmm2
+#define pChi_10 %%zmm3
+#define pChi_11 %%zmm4
+#define pChi_12 %%zmm5
+#define pChi_20 %%zmm6
+#define pChi_21 %%zmm7
+#define pChi_22 %%zmm8
+#define pChi_30 %%zmm9
+#define pChi_31 %%zmm10
+#define pChi_32 %%zmm11
+
+#define BCAST_00   %zmm12
+#define  SHUF_00   %zmm13
+#define BCAST_01   %zmm14
+#define  SHUF_01   %zmm15
+#define BCAST_02   %zmm16
+#define  SHUF_02   %zmm17
+#define BCAST_10   %zmm18
+#define  SHUF_10   %zmm19
+#define BCAST_11   %zmm20
+#define  SHUF_11   %zmm21
+#define BCAST_12   %zmm22
+#define  SHUF_12   %zmm23
+
+#define Mp  %zmm24
+#define Mps %zmm25
+#define Mm  %zmm26
+#define Mms %zmm27
+#define N 8
+  // Byte stride between the matrix entries of successive lanes: LLs*LLs elements,
+  // i.e. the same step as going from s to s+LLs in the index LLs*s+s1
+  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      int lex=s2+LLs*site;
+      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+      uint64_t a2 = (uint64_t)&psi[lex];
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	// First term (s2==0 and l==0): multiply to initialise the accumulators;
+	// every later term multiply-adds into them
+	if ( (s2+l)==0 ) {
+	  LOAD64(%r8,a0);
+	  LOAD64(%r9,a1);
+	  LOAD64(%r10,a2);
+	  asm (
+	       VLOAD(0,%r8,Mp)// i r
+	       VLOAD(0,%r9,Mm)
+	       VSHUF(Mp,Mps)  // r i 
+	       VSHUF(Mm,Mms)
+	       VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
+	       VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
+
+	       VMULIDUP(0*N,%r10,Mps,Chi_00)
+	       VMULIDUP(1*N,%r10,Mps,Chi_01)
+	       VMULIDUP(2*N,%r10,Mps,Chi_02)
+	       VMULIDUP(3*N,%r10,Mps,Chi_10)
+	       VMULIDUP(4*N,%r10,Mps,Chi_11)
+	       VMULIDUP(5*N,%r10,Mps,Chi_12)
+
+	       VMULIDUP(6*N ,%r10,Mms,Chi_20)
+	       VMULIDUP(7*N ,%r10,Mms,Chi_21)
+	       VMULIDUP(8*N ,%r10,Mms,Chi_22)
+	       VMULIDUP(9*N ,%r10,Mms,Chi_30)
+	       VMULIDUP(10*N,%r10,Mms,Chi_31)
+	       VMULIDUP(11*N,%r10,Mms,Chi_32)
+
+	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
+	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
+	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+	       VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
+	       VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
+	       VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
+	       VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
+	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+	       );
+	} else { 
+	  LOAD64(%r8,a0);
+	  LOAD64(%r9,a1);
+	  LOAD64(%r10,a2);
+	  asm (
+	       VLOAD(0,%r8,Mp)
+	       VSHUF(Mp,Mps)
+
+	       VLOAD(0,%r9,Mm)
+	       VSHUF(Mm,Mms)
+
+	       VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
+	       VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
+	       VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
+	       VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
+	       VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
+	       VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
+
+	       VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
+	       VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
+	       VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
+	       VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
+	       VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
+	       VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
+
+	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
+	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
+	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+	       VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
+	       VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
+	       VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
+	       VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
+	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+	       );
+	}
+	// Next lane: step both matrix pointers by incr and the source pointer by one scalar
+	a0 = a0+incr;
+	a1 = a1+incr;
+	a2 = a2+sizeof(typename Simd::scalar_type);
+      }}
+    {
+      int lexa = s1+LLs*site;
+      /*
+      SiteSpinor tmp;
+      asm (
+	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	       : : "r" ((uint64_t)&tmp) : "memory" );
+      */
+
+      // Sum for s1 complete: store the twelve accumulators to chi
+      asm (
+	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+      //      if ( 1 || (site==0) ) { 
+      //	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
+      //      }
+    }
+  }
+  }
+  // Undefine every helper macro defined above so that none of them (in
+  // particular the very generic N, Mp and Mm) leaks into the rest of the
+  // translation unit.
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef pChi_00
+#undef pChi_01
+#undef pChi_02
+#undef pChi_10
+#undef pChi_11
+#undef pChi_12
+#undef pChi_20
+#undef pChi_21
+#undef pChi_22
+#undef pChi_30
+#undef pChi_31
+#undef pChi_32
+
+#undef BCAST_00
+#undef SHUF_00
+#undef BCAST_01
+#undef SHUF_01
+#undef BCAST_02
+#undef SHUF_02
+#undef BCAST_10
+#undef SHUF_10
+#undef BCAST_11
+#undef SHUF_11
+#undef BCAST_12
+#undef SHUF_12
+
+#undef Mp
+#undef Mps
+#undef Mm
+#undef Mms
+#undef N
+
+#endif
+};
+
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  // Picks the pair of dense s-space matrices matching (dag,inv) and applies
+  // them to every 4d site, writing the result into chi.
+  const int Ls    = this->Ls;
+  const int LLs   = psi._grid->_rdimensions[0];
+  const int vol   = psi._grid->oSites()/LLs;   // number of blocks of LLs outer sites
+
+  chi.checkerboard = psi.checkerboard;
+
+  // Scratch matrices, only filled for the non-inverse case
+  Vector<iSinglet<Simd> >  Matp;
+  Vector<iSinglet<Simd> >  Matm;
+  Vector<iSinglet<Simd> >  *plus  = nullptr;
+  Vector<iSinglet<Simd> >  *minus = nullptr;
+
+  if ( !inv ) {
+    // No stored matrices for the forward operator: build them for this call
+    MooeeInternalCompute(dag,inv,Matp,Matm);
+    plus  = &Matp;
+    minus = &Matm;
+  } else if ( dag ) {
+    // Inverse, daggered: use the stored members
+    plus  = &MatpInvDag;
+    minus = &MatmInvDag;
+  } else {
+    // Inverse, not daggered: use the stored members
+    plus  = &MatpInv;
+    minus = &MatmInv;
+  }
+  assert(plus->size()==Ls*LLs);
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+  // Complex coefficients go through the Z kernel, real ones through the plain kernel
+  if ( switcheroo<Coeff_t>::iscomplex() ) {
+    parallel_for(auto site=0;site<vol;site++){
+      MooeeInternalZAsm(psi,chi,LLs,site,*plus,*minus);
+    }
+  } else {
+    parallel_for(auto site=0;site<vol;site++){
+      MooeeInternalAsm(psi,chi,LLs,site,*plus,*minus);
+    }
+  }
+
+  MooeeInvTime+=usecond();
+}
+
+// Instantiations for the eight Vec5d implementations.
+// NOTE(review): INSTANTIATE_DPERP is defined outside this file; presumably it
+// expands to explicit instantiations of the Dperp routines -- confirm.
+INSTANTIATE_DPERP(DomainWallVec5dImplD);
+INSTANTIATE_DPERP(DomainWallVec5dImplF);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
+
+INSTANTIATE_DPERP(DomainWallVec5dImplDF);
+INSTANTIATE_DPERP(DomainWallVec5dImplFH);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplDF);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplFH);
+
+// MooeeInternal is instantiated explicitly for the same eight implementations.
+template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+template void CayleyFermion5D<DomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+
+
+}}
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -0,0 +1,323 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
+
+namespace Grid {
+  namespace QCD {
+
+    // Forwards to SetCoefficientsZolotarev, passing 1.0/scale as zolo_hi.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
+    {
+      SetCoefficientsZolotarev(1.0/scale,zdata);
+    }
+    // Fills the continued-fraction coefficient tables from zdata->beta.
+    //   zolo_hi : stored inverted in ZoloHiInv and folded into dw_diag
+    //   zdata   : must carry exactly Ls beta coefficients (zdata->db == Ls)
+    // Populates Beta, cc (= 1/Beta), cc_d, sqrt_cc, Aee and the recursion See,
+    // and sets R from the mass. Logs one line per slice when done.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
+    {
+      // How to check Ls matches??
+      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+      int Ls = this->Ls;
+      assert(zdata->db==Ls);// Beta has Ls coeffs
+      // sqrt_cc[Ls-2] is written below; with Ls==1 (which the constructor's
+      // odd-Ls check lets through) that would be an out-of-range write.
+      assert(Ls>=2);
+
+      R=(1+this->mass)/(1-this->mass);
+
+      Beta.resize(Ls);
+      cc.resize(Ls);
+      cc_d.resize(Ls);
+      sqrt_cc.resize(Ls);
+      for(int i=0; i < Ls ; i++){
+	Beta[i] = zdata -> beta[i];
+	cc[i] = 1.0/Beta[i];
+	cc_d[i]=sqrt(cc[i]);
+      }
+    
+      // The last slice carries no cc rescaling.
+      cc_d[Ls-1]=1.0;
+      for(int i=0; i < Ls-1 ; i++){
+	sqrt_cc[i]= sqrt(cc[i]*cc[i+1]);
+      }    
+      // Overwrite the entry that couples to the last slice: only cc[Ls-2] enters.
+      sqrt_cc[Ls-2]=sqrt(cc[Ls-2]);
+
+
+      ZoloHiInv =1.0/zolo_hi;
+      dw_diag = (4.0-this->M5)*ZoloHiInv;
+    
+      See.resize(Ls);
+      Aee.resize(Ls);
+      int sign=1; // alternates +1/-1 with s
+      for(int s=0;s<Ls;s++){
+	Aee[s] = sign * Beta[s] * dw_diag;
+	sign   = - sign;
+      }
+      Aee[Ls-1] += R;
+    
+      // See[s] = Aee[s] - 1/See[s-1], seeded with See[0] = Aee[0].
+      See[0] = Aee[0];
+      for(int s=1;s<Ls;s++){
+	See[s] = Aee[s] - 1.0/See[s-1];
+      }
+      for(int s=0;s<Ls;s++){
+	std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
+      }
+    }
+
+
+
+    // Applies the operator to psi, writing the result into chi, and returns
+    // norm2(chi). DW is applied once to psi (into D); chi is then assembled one
+    // s-slice at a time. The *_ssp helpers are declared elsewhere; the trailing
+    // two integer arguments are presumably slice indices -- TODO confirm.
+    template<class Impl>
+    RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
+    {
+      int Ls = this->Ls;
+
+      FermionField D(psi._grid);
+
+      this->DW(psi,D,DaggerNo); 
+
+      int sign=1; // flips on every slice
+      for(int s=0;s<Ls;s++){
+	if ( s==0 ) {
+	  ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
+	} else if ( s==(Ls-1) ){
+	  // Last slice: no cc factor and no sign; R is recomputed locally from mass.
+	  RealD R=(1.0+mass)/(1.0-mass);
+	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
+	  ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
+	} else {
+	  // Interior slice: first call uses s+1, second call accumulates s-1 into chi.
+	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
+  	  axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+	}
+	sign=-sign; 
+      }
+      return norm2(chi);
+    }
+    // Adjoint application: delegates to M and returns its result.
+    template<class Impl>
+    RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
+    {
+      // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
+      // The rest of matrix is symmetric.
+      // Can ignore "dag"
+      return M(psi,chi);
+    }
+    // Single-direction hop (dir,disp) from psi into chi, followed by an
+    // in-place per-slice rescaling of chi through ag5xpby_ssp.
+    template<class Impl>
+    void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+      const int Ls = this->Ls;
+
+      // Dslash on diagonal. g5 Dslash is hermitian
+      this->DhopDir(psi,chi,dir,disp);
+
+      // The sign alternates with s; the final slice drops both cc and the sign.
+      for(int s=0;s<Ls;s++){
+        const int    sign  = (s&0x1) ? -1 : 1;
+        const double coeff = ( s==(Ls-1) ) ? Beta[s]*ZoloHiInv
+                                           : cc[s]*Beta[s]*sign*ZoloHiInv;
+        ag5xpby_ssp(chi,coeff,chi,0.0,chi,s,s);
+      }
+    }
+    // Checkerboard-changing term: hops psi into chi with DhopEO or DhopOE
+    // depending on psi.checkerboard, then rescales chi slice by slice.
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+    {
+      const int Ls = this->Ls;
+
+      // Apply 4d dslash
+      // Dslash on diagonal. g5 Dslash is hermitian
+      if ( psi.checkerboard == Odd ) {
+        this->DhopEO(psi,chi,DaggerNo);
+      } else {
+        this->DhopOE(psi,chi,DaggerNo);
+      }
+
+      // The sign alternates with s; the final slice drops both cc and the sign.
+      for(int s=0;s<Ls;s++){
+        const int    sign  = (s&0x1) ? -1 : 1;
+        const double coeff = ( s==(Ls-1) ) ? Beta[s]*ZoloHiInv
+                                           : cc[s]*Beta[s]*sign*ZoloHiInv;
+        ag5xpby_ssp(chi,coeff,chi,0.0,chi,s,s);
+      }
+    }
+    // Adjoint of Meooe: implemented by calling Meooe unchanged.
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+    {
+      this->Meooe(psi,chi);
+    }
+    // Checkerboard-diagonal term. Same slice structure as M, but the first
+    // field argument is psi itself scaled by dw_diag (no Dslash is applied).
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+    {
+      int Ls = this->Ls;
+
+      int sign=1; // flips on every slice
+      for(int s=0;s<Ls;s++){
+	if ( s==0 ) {
+	  ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
+	} else if ( s==(Ls-1) ){
+	  // Drop the CC here.
+	  double R=(1+mass)/(1-mass);
+	  ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
+	  ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
+	} else {
+	  // Interior slice: first call uses s+1, second call accumulates s-1 into chi.
+	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
+	  axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+	}
+	sign=-sign; 
+      }
+    }
+
+    // Adjoint of Mooee: implemented by calling Mooee unchanged.
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+    {
+      this->Mooee(psi,chi);
+    }
+    // Applies the inverse of the checkerboard-diagonal term in three sweeps
+    // over the s-slices (forward, diagonal, backward), using the cc_d and See
+    // tables filled by SetCoefficientsZolotarev. chi is updated in place after
+    // the first sweep, so the statement order must be kept.
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+    {
+      int Ls = this->Ls;
+
+      // Apply Linv
+      axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); 
+      for(int s=1;s<Ls;s++){
+	axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
+      }
+      // Apply Dinv
+      for(int s=0;s<Ls;s++){
+	ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
+      }
+      // Apply Uinv = (Linv)^T
+      axpby_ssp(chi,1.0/cc_d[Ls-1],chi,0.0,chi,Ls-1,Ls-1);
+      for(int s=Ls-2;s>=0;s--){
+	axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
+      }
+    }
+    // Adjoint of MooeeInv: implemented by calling MooeeInv unchanged.
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+    {
+      this->MooeeInv(psi,chi);
+    }
+
+  // force terms; five routines; default to Dhop on diagonal
+    // Force term: D is built from U by the per-slice ag5xpby_ssp rescaling,
+    // then (D,V) is passed to DhopDeriv with DaggerNo. The `dag` argument is
+    // not consulted.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+    {
+      const int Ls = this->Ls;
+
+      FermionField D(V._grid);
+
+      // The sign alternates with s; the final slice drops both cc and the sign.
+      for(int s=0;s<Ls;s++){
+        const int    sign  = (s&0x1) ? -1 : 1;
+        const double coeff = ( s==(Ls-1) ) ? Beta[s]*ZoloHiInv
+                                           : cc[s]*Beta[s]*sign*ZoloHiInv;
+        ag5xpby_ssp(D,coeff,U,0.0,U,s,s);
+      }
+      this->DhopDeriv(mat,D,V,DaggerNo);
+    }
+    // Force term: D is built from U by the per-slice ag5xpby_ssp rescaling,
+    // then (D,V) is passed to DhopDerivOE with DaggerNo. The `dag` argument is
+    // not consulted.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+    {
+      const int Ls = this->Ls;
+
+      FermionField D(V._grid);
+
+      // The sign alternates with s; the final slice drops both cc and the sign.
+      for(int s=0;s<Ls;s++){
+        const int    sign  = (s&0x1) ? -1 : 1;
+        const double coeff = ( s==(Ls-1) ) ? Beta[s]*ZoloHiInv
+                                           : cc[s]*Beta[s]*sign*ZoloHiInv;
+        ag5xpby_ssp(D,coeff,U,0.0,U,s,s);
+      }
+      this->DhopDerivOE(mat,D,V,DaggerNo);
+    }
+  // Force term: D is built from U by the per-slice ag5xpby_ssp rescaling,
+  // then (D,V) is passed to DhopDerivEO with DaggerNo. The `dag` argument is
+  // not consulted.
+  template<class Impl>
+  void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    const int Ls = this->Ls;
+
+    FermionField D(V._grid);
+
+    // The sign alternates with s; the final slice drops both cc and the sign.
+    for(int s=0;s<Ls;s++){
+      const int    sign  = (s&0x1) ? -1 : 1;
+      const double coeff = ( s==(Ls-1) ) ? Beta[s]*ZoloHiInv
+                                         : cc[s]*Beta[s]*sign*ZoloHiInv;
+      ag5xpby_ssp(D,coeff,U,0.0,U,s,s);
+    }
+    this->DhopDerivEO(mat,D,V,DaggerNo);
+  }
+    
+    // Constructors
+    // Forwards the gauge field, grids, M5 and p to WilsonFermion5D and stores
+    // _mass. Asserts that Ls is odd; coefficient tables are NOT filled here.
+    template<class Impl>
+    ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
+							   GaugeField &_Umu,
+							   GridCartesian         &FiveDimGrid,
+							   GridRedBlackCartesian &FiveDimRedBlackGrid,
+							   GridCartesian         &FourDimGrid,
+							   GridRedBlackCartesian &FourDimRedBlackGrid,
+							   RealD _mass,RealD M5,const ImplParams &p) :
+      WilsonFermion5D<Impl>(_Umu,
+			    FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid,M5,p),
+      mass(_mass)
+    {
+      int Ls = this->Ls;
+      assert((Ls&0x1)==1); // Odd Ls required
+    }
+
+    // Extracts slice Ls-1 of the 5d solution into exported4d, after checking
+    // both fields against FermionGrid() and GaugeGrid() respectively.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      int Ls = this->Ls;
+      conformable(solution5d._grid,this->FermionGrid());
+      conformable(exported4d._grid,this->GaugeGrid());
+      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+    }
+    // Builds a 5d source from a 4d one: input4d is inserted at slice Ls-1 of a
+    // zeroed temporary, the temporary is multiplied by Gamma5, and Dminus maps
+    // it into imported5d.
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      int Ls = this->Ls;
+      conformable(imported5d._grid,this->FermionGrid());
+      conformable(input4d._grid   ,this->GaugeGrid());
+      FermionField tmp(this->FermionGrid());
+      tmp=zero;
+      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
+      this->Dminus(tmp,imported5d);
+    }
+
+    // Macro defined outside this excerpt; presumably emits the explicit
+    // instantiations of this class for the supported Impl types -- TODO confirm.
+    FermOpTemplateInstantiate(ContinuedFractionFermion5D);
+
+  }
+}
+
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -0,0 +1,107 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_CONTINUED_FRACTION_H
+#define  GRID_QCD_CONTINUED_FRACTION_H
+
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Continued-fraction fermion operator layered on WilsonFermion5D.
+    // The class stays abstract (Instantiatable() is pure virtual); the
+    // coefficient tables below are filled by the protected SetCoefficients*
+    // helpers.
+    template<class Impl>
+    class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
+    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+    public:
+
+      // override multiply
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+
+      // half checkerboard operations
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+
+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+      // Pure virtual: keeps this class abstract.
+      virtual void   Instantiatable(void) =0;
+
+      // Efficient support for multigrid coarsening
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+
+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
+      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
+      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+
+      // Constructors
+      ContinuedFractionFermion5D(GaugeField &_Umu,
+				 GridCartesian         &FiveDimGrid,
+				 GridRedBlackCartesian &FiveDimRedBlackGrid,
+				 GridCartesian         &FourDimGrid,
+				 GridRedBlackCartesian &FourDimRedBlackGrid,
+				 RealD _mass,RealD M5,const ImplParams &p= ImplParams());
+
+    protected:
+
+      // Coefficient setup; SetCoefficientsTanh forwards to the Zolotarev
+      // variant with zolo_hi = 1/scale.
+      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
+      void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
+
+      // Cont frac
+      RealD dw_diag;               // (4 - M5) / zolo_hi
+      RealD mass;
+      RealD R;                     // (1 + mass) / (1 - mass)
+      RealD ZoloHiInv;             // 1 / zolo_hi
+      std::vector<double> Beta;    // copied from zdata->beta
+      std::vector<double> cc;      // 1 / Beta
+      std::vector<double> cc_d;    // sqrt(cc), with the last entry forced to 1
+      std::vector<double> sqrt_cc; // sqrt(cc[i]*cc[i+1]); entry Ls-2 is sqrt(cc[Ls-2])
+      std::vector<double> See;     // See[s] = Aee[s] - 1/See[s-1]
+      std::vector<double> Aee;     // alternating-sign Beta*dw_diag, plus R on the last slice
+
+    };
+
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
@@ -0,0 +1,438 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+    // Forwards all arguments to AbstractEOFAFermion (with the two extra
+    // constants 1.0 and 0.0), then builds the coefficient tables from a
+    // higham(eps=1, Ls) approximation, which is freed again before returning.
+    template<class Impl>
+    DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
+      GaugeField            &_Umu,
+      GridCartesian         &FiveDimGrid,
+      GridRedBlackCartesian &FiveDimRedBlackGrid,
+      GridCartesian         &FourDimGrid,
+      GridRedBlackCartesian &FourDimRedBlackGrid,
+      RealD _mq1, RealD _mq2, RealD _mq3,
+      RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
+    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
+        _shift, _pm, _M5, 1.0, 0.0, p)
+    {
+        RealD eps = 1.0;
+        Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
+        assert(zdata->n == this->Ls);
+
+        std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
+        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
+
+        // zdata is only needed during coefficient setup.
+        Approx::zolotarev_free(zdata);
+    }
+
+    /***************************************************************
+     * Additional EOFA operators only called outside the inverter.
+     * Since speed is not essential, simple axpby-style
+     * implementations should be fine.
+     ***************************************************************/
+    // Zeroes Din, then issues a single axpby_ssp call whose two trailing
+    // index arguments are selected by (sign, dag). Any combination outside
+    // sign in {+1,-1} and dag in {0,1} leaves Din at zero.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+    {
+        const int Ls = this->Ls;
+
+        Din = zero;
+
+        if(dag == 0){
+            if(sign == 1)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
+            else if(sign == -1) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
+        } else if(dag == 1){
+            if(sign == 1)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
+            else if(sign == -1) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
+        }
+    }
+
+    // This is just the identity for DWF: chi is assigned a copy of psi.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
+
+    // This is just the identity for DWF: chi is assigned a copy of psi.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
+
+    /*****************************************************************************************************/
+
+    // Applies the operator to psi, writing chi, and returns norm2(chi).
+    // Sequence: Meooe5D(psi) -> DW -> add psi -> M5D(psi, chi).
+    template<class Impl>
+    RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+    {
+        FermionField Din(psi._grid);
+
+        this->Meooe5D(psi, Din);
+        this->DW(Din, chi, DaggerNo);
+        axpby(chi, 1.0, 1.0, chi, psi);
+        // The two-argument M5D passes chi as both an input and the output.
+        this->M5D(psi, chi);
+        return(norm2(chi));
+    }
+
+    // Applies the adjoint operator to psi, writing chi, and returns norm2(chi).
+    // Sequence: DW(dagger) -> MeooeDag5D -> M5Ddag(psi, chi) -> add psi.
+    template<class Impl>
+    RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+    {
+        FermionField Din(psi._grid);
+
+        this->DW(psi, Din, DaggerYes);
+        this->MeooeDag5D(Din, chi);
+        // The two-argument M5Ddag passes chi as both an input and the output.
+        this->M5Ddag(psi, chi);
+        axpby(chi, 1.0, 1.0, chi, psi);
+        return(norm2(chi));
+    }
+
+    /********************************************************************
+     * Performance critical fermion operators called inside the inverter
+     ********************************************************************/
+
+    // Two-argument M5D: builds the diag/upper/lower coefficient vectors and
+    // forwards to the six-argument M5D with chi passed as both the third
+    // argument and the output. diag is all 1, off-diagonals are -1 except for
+    // upper[Ls-1] and lower[0], which carry mq1 plus the shift term.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+    {
+        int   Ls    = this->Ls;
+        int   pm    = this->pm;
+        RealD shift = this->shift;
+        RealD mq1   = this->mq1;
+        RealD mq2   = this->mq2;
+        RealD mq3   = this->mq3;
+
+        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+        // Only one of shiftp/shiftm is non-zero: shiftp when pm == 1, shiftm otherwise.
+        Coeff_t shiftp(0.0), shiftm(0.0);
+        if(shift != 0.0){
+          if(pm == 1){ shiftp = shift*(mq3-mq2); }
+          else{ shiftm = -shift*(mq3-mq2); }
+        }
+
+        std::vector<Coeff_t> diag(Ls,1.0);
+        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
+        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
+
+        // Disabled debug dump of the three coefficient vectors.
+        #if(0)
+            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
+            for(int i=0; i<diag.size(); ++i){
+                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
+            }
+            for(int i=0; i<upper.size(); ++i){
+                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
+            }
+            for(int i=0; i<lower.size(); ++i){
+                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
+            }
+        #endif
+
+        this->M5D(psi, chi, chi, lower, diag, upper);
+    }
+
+    // Two-argument M5Ddag: same construction as the two-argument M5D, but with
+    // shiftp and shiftm swapped between upper[Ls-1] and lower[0]; forwards to
+    // the six-argument M5Ddag with chi as both the third argument and output.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+    {
+        int   Ls    = this->Ls;
+        int   pm    = this->pm;
+        RealD shift = this->shift;
+        RealD mq1   = this->mq1;
+        RealD mq2   = this->mq2;
+        RealD mq3   = this->mq3;
+
+        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+        // Only one of shiftp/shiftm is non-zero: shiftp when pm == 1, shiftm otherwise.
+        Coeff_t shiftp(0.0), shiftm(0.0);
+        if(shift != 0.0){
+          if(pm == 1){ shiftp = shift*(mq3-mq2); }
+          else{ shiftm = -shift*(mq3-mq2); }
+        }
+
+        std::vector<Coeff_t> diag(Ls,1.0);
+        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
+        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
+
+        // Disabled debug dump of the three coefficient vectors.
+        #if(0)
+            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
+            for(int i=0; i<diag.size(); ++i){
+                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
+            }
+            for(int i=0; i<upper.size(); ++i){
+                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
+            }
+            for(int i=0; i<lower.size(); ++i){
+                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
+            }
+        #endif
+
+        this->M5Ddag(psi, chi, chi, lower, diag, upper);
+    }
+
+    // half checkerboard operations
+    // Checkerboard-diagonal term: forwards to the six-argument M5D with psi
+    // as both field inputs. Diagonal = bee, off-diagonals = -cee, except the
+    // two end entries which take dm (upper) and dp (lower).
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+    {
+        const int Ls = this->Ls;
+
+        std::vector<Coeff_t> above(Ls);
+        std::vector<Coeff_t> below(Ls);
+        std::vector<Coeff_t> centre = this->bee;
+
+        for(int s=0; s<Ls; ++s){
+            above[s] = -this->cee[s];
+            below[s] = -this->cee[s];
+        }
+
+        // End entries replace the generic -cee values.
+        below[0]    = this->dp;
+        above[Ls-1] = this->dm;
+
+        this->M5D(psi, psi, chi, below, centre, above);
+    }
+
+    // Adjoint checkerboard-diagonal term: forwards to the six-argument M5Ddag
+    // with psi as both field inputs. Same vectors as Mooee, but dp and dm
+    // trade places (dp on the upper end, dm on the lower end).
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+    {
+        const int Ls = this->Ls;
+
+        std::vector<Coeff_t> above(Ls);
+        std::vector<Coeff_t> below(Ls);
+        std::vector<Coeff_t> centre = this->bee;
+
+        for(int s=0; s<Ls; ++s){
+            above[s] = -this->cee[s];
+            below[s] = -this->cee[s];
+        }
+
+        // End entries replace the generic -cee values.
+        below[0]    = this->dm;
+        above[Ls-1] = this->dp;
+
+        this->M5Ddag(psi, psi, chi, below, centre, above);
+    }
+
+    /****************************************************************************************/
+
+    //Zolo
+    // Fills every coefficient table used by this operator and precomputes the
+    // inverse Mooee matrices. The parameters zolo_hi, gamma, b and c are not
+    // referenced in this body: all values are fixed constants or derived from
+    // M5, mq1, mq2, mq3, shift and pm.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
+    {
+        int   Ls    = this->Ls;
+        int   pm    = this->pm;
+        RealD mq1   = this->mq1;
+        RealD mq2   = this->mq2;
+        RealD mq3   = this->mq3;
+        RealD shift = this->shift;
+
+        ////////////////////////////////////////////////////////
+        // Constants for the preconditioned matrix Cayley form
+        ////////////////////////////////////////////////////////
+        this->bs.resize(Ls);
+        this->cs.resize(Ls);
+        this->aee.resize(Ls);
+        this->aeo.resize(Ls);
+        this->bee.resize(Ls);
+        this->beo.resize(Ls);
+        this->cee.resize(Ls);
+        this->ceo.resize(Ls);
+
+        for(int i=0; i<Ls; ++i){
+          this->bee[i] = 4.0 - this->M5 + 1.0;
+          this->cee[i] = 1.0;
+        }
+
+        // NOTE(review): aeo is resized above but never assigned in this function.
+        for(int i=0; i<Ls; ++i){
+          this->aee[i] = this->cee[i];
+          this->bs[i] = this->beo[i] = 1.0;
+          this->cs[i] = this->ceo[i] = 0.0;
+        }
+
+        //////////////////////////////////////////
+        // EOFA shift terms
+        //////////////////////////////////////////
+        // The shift contribution goes into dp when pm == 1, into dm when
+        // pm == -1, and is omitted for any other pm.
+        if(pm == 1){
+          this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
+          this->dm = mq1*this->cee[Ls-1];
+        } else if(this->pm == -1) {
+          this->dp = mq1*this->cee[0];
+          this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
+        } else {
+          this->dp = mq1*this->cee[0];
+          this->dm = mq1*this->cee[Ls-1];
+        }
+
+        //////////////////////////////////////////
+        // LDU decomposition of eeoo
+        //////////////////////////////////////////
+        // dee holds Ls+1 entries: indices Ls-1 and Ls are both set in the
+        // scoped block after the loop.
+        this->dee.resize(Ls+1);
+        this->lee.resize(Ls);
+        this->leem.resize(Ls);
+        this->uee.resize(Ls);
+        this->ueem.resize(Ls);
+
+        for(int i=0; i<Ls; ++i){
+
+          if(i < Ls-1){
+
+            this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
+
+            // leem[i] = dm/bee[i] * prod_{j<i} aee[j]/bee[j]
+            this->leem[i] = this->dm/this->bee[i];
+            for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
+
+            this->dee[i] = this->bee[i];
+
+            this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
+
+            // ueem[i] = dp/bee[0] * prod_{1<=j<=i} cee[j]/bee[j]
+            this->ueem[i] = this->dp / this->bee[0];
+            for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
+
+          } else {
+
+            // Last index: no off-diagonal entries.
+            this->lee[i]  = 0.0;
+            this->leem[i] = 0.0;
+            this->uee[i]  = 0.0;
+            this->ueem[i] = 0.0;
+
+          }
+        }
+
+        {
+          // delta_d = 1/bee[0] * prod_{1<=j<Ls-1} cee[j]/bee[j]
+          Coeff_t delta_d = 1.0 / this->bee[0];
+          for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
+          this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
+          this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
+        }
+
+        // Precompute the inverse matrices, without (dag=0) and with (dag=1) adjoint.
+        int inv = 1;
+        this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
+        this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
+    }
+
+    // Recompute Cayley-form coefficients for different shift.
+    // Stores new_shift, then rebuilds the coefficient tables from a fresh
+    // higham(1.0, Ls) approximation, exactly as the constructor does.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+    {
+        this->shift = new_shift;
+        Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
+        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
+        // Release the approximation data; the constructor does the same after
+        // its own SetCoefficientsTanh call, and without this every refresh
+        // would leak one zolotarev_data allocation.
+        Approx::zolotarev_free(zdata);
+    }
+
+    // Builds the two Ls x Ls matrices Pplus/Pminus from bee, cee, dp and dm,
+    // optionally inverts (inv) and/or takes the adjoint (dag), and packs the
+    // result into the SIMD vectors Matp/Matm (Ls*LLs entries each).
+    // Returns immediately, leaving Matp/Matm untouched, when LLs == Ls.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
+        Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+    {
+        int Ls = this->Ls;
+
+        GridBase* grid = this->FermionRedBlackGrid();
+        int LLs = grid->_rdimensions[0];
+
+        if(LLs == Ls){ return; } // Not vectorised in 5th direction
+
+        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+
+        // Both matrices share the diagonal bee.
+        for(int s=0; s<Ls; s++){
+            Pplus(s,s)  = this->bee[s];
+            Pminus(s,s) = this->bee[s];
+        }
+
+        // Pminus: super-diagonal -cee[s].
+        for(int s=0; s<Ls-1; s++){
+            Pminus(s,s+1) = -this->cee[s];
+        }
+
+        // Pplus: sub-diagonal -cee[s+1].
+        for(int s=0; s<Ls-1; s++){
+            Pplus(s+1,s) = -this->cee[s+1];
+        }
+
+        // Corner entries carry the dp / dm terms.
+        Pplus (0,Ls-1) = this->dp;
+        Pminus(Ls-1,0) = this->dm;
+
+        Eigen::MatrixXcd PplusMat ;
+        Eigen::MatrixXcd PminusMat;
+
+        // Disabled debug dump of both matrices.
+        #if(0)
+            std::cout << GridLogMessage << "Pplus:" << std::endl;
+            for(int s=0; s<Ls; ++s){
+                for(int ss=0; ss<Ls; ++ss){
+                    std::cout << Pplus(s,ss) << "\t";
+                }
+                std::cout << std::endl;
+            }
+            std::cout << GridLogMessage << "Pminus:" << std::endl;
+            for(int s=0; s<Ls; ++s){
+                for(int ss=0; ss<Ls; ++ss){
+                    std::cout << Pminus(s,ss) << "\t";
+                }
+                std::cout << std::endl;
+            }
+        #endif
+
+        if(inv) {
+            PplusMat  = Pplus.inverse();
+            PminusMat = Pminus.inverse();
+        } else {
+            PplusMat  = Pplus;
+            PminusMat = Pminus;
+        }
+
+        if(dag){
+            PplusMat.adjointInPlace();
+            PminusMat.adjointInPlace();
+        }
+
+        typedef typename SiteHalfSpinor::scalar_type scalar_type;
+        const int Nsimd = Simd::Nsimd();
+        Matp.resize(Ls*LLs);
+        Matm.resize(Ls*LLs);
+
+        // Entry [LLs*s2+s1] holds, in SIMD lane l, matrix element (l*LLs + s1, s2).
+        // NOTE(review): the row index reaches (Nsimd-1)*LLs + LLs-1, so this
+        // presumably relies on Nsimd*LLs == Ls -- confirm against the grid setup.
+        for(int s2=0; s2<Ls; s2++){
+        for(int s1=0; s1<LLs; s1++){
+            int istride = LLs;
+            int ostride = 1;
+            Simd Vp;
+            Simd Vm;
+            // Vp/Vm are filled lane by lane through scalar pointers.
+            scalar_type *sp = (scalar_type*) &Vp;
+            scalar_type *sm = (scalar_type*) &Vm;
+            for(int l=0; l<Nsimd; l++){
+                if(switcheroo<Coeff_t>::iscomplex()) {
+                    sp[l] = PplusMat (l*istride+s1*ostride,s2);
+                    sm[l] = PminusMat(l*istride+s1*ostride,s2);
+                } else {
+                    // if real
+                    // The real part is written to both components of the lane.
+                    scalar_type tmp;
+                    tmp = PplusMat (l*istride+s1*ostride,s2);
+                    sp[l] = scalar_type(tmp.real(),tmp.real());
+                    tmp = PminusMat(l*istride+s1*ostride,s2);
+                    sm[l] = scalar_type(tmp.real(),tmp.real());
+                }
+            }
+            Matp[LLs*s2+s1] = Vp;
+            Matm[LLs*s2+s1] = Vm;
+        }}
+    }
+
+    // Both macros are defined outside this excerpt; presumably they emit the
+    // explicit instantiations of this class for the standard and G-parity
+    // Impl types respectively -- TODO confirm.
+    FermOpTemplateInstantiate(DomainWallEOFAFermion);
+    GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
+
+}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
@@ -0,0 +1,115 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
+#define  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
+
+#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+  // Domain wall EOFA fermion operator, specialising AbstractEOFAFermion<Impl>.
+  //
+  // Only declarations live here. The fifth-dimension kernels (the six-argument
+  // M5D / M5Ddag overloads, MooeeInv, MooeeInvDag) are defined in several
+  // alternative DomainWallEOFAFermion*.cc files; the DOMAIN_WALL_EOFA_DPERP_*
+  // macros at the end of this header decide which file emits the explicit
+  // instantiations.
+  template<class Impl>
+  class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
+  {
+    public:
+      // Pulls the Impl-dependent type names into this class
+      // (FermionField, Coeff_t, Simd, ... are used unqualified below).
+      INHERIT_IMPL_TYPES(Impl);
+
+    public:
+      // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
+      // for red-black preconditioned Shamir EOFA.
+      // In the dense implementation dp is stored at Pplus(0,Ls-1) and
+      // dm at Pminus(Ls-1,0).
+      Coeff_t dm;
+      Coeff_t dp;
+
+      // Empty body. NOTE(review): presumably overrides a pure virtual hook in a
+      // base class so that this class can be instantiated -- confirm.
+      virtual void Instantiatable(void) {};
+
+      // EOFA-specific operations
+      virtual void  Omega      (const FermionField& in, FermionField& out, int sign, int dag);
+      virtual void  Dtilde     (const FermionField& in, FermionField& out);
+      virtual void  DtildeInv  (const FermionField& in, FermionField& out);
+
+      // override multiply
+      virtual RealD M          (const FermionField& in, FermionField& out);
+      virtual RealD Mdag       (const FermionField& in, FermionField& out);
+
+      // half checkerboard operations
+      virtual void  Mooee      (const FermionField& in, FermionField& out);
+      virtual void  MooeeDag   (const FermionField& in, FermionField& out);
+      virtual void  MooeeInv   (const FermionField& in, FermionField& out);
+      virtual void  MooeeInvDag(const FermionField& in, FermionField& out);
+
+      // Two-argument fifth-dimension hopping term and its dagger.
+      virtual void   M5D       (const FermionField& psi, FermionField& chi);
+      virtual void   M5Ddag    (const FermionField& psi, FermionField& chi);
+
+      /////////////////////////////////////////////////////
+      // Instantiate different versions depending on Impl
+      /////////////////////////////////////////////////////
+      // For every s: chi[s] = diag[s]*phi[s] + upper[s]*P(psi[s+1]) + lower[s]*P'(psi[s-1]),
+      // with wrap-around at s=0 and s=Ls-1. M5D uses spProj5m for the "upper"
+      // term and spProj5p for the "lower" term; M5Ddag swaps the two projectors.
+      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
+        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+
+      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
+        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+
+      // Applies Mooee, optionally inverted (inv) and/or daggered (dag), through
+      // explicit Ls x Ls matrices for the two chiralities.
+      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
+
+      // NOTE(review): presumably fills Matp / Matm with the SIMD-packed matrices
+      // consumed by the *Asm kernels below -- definition not visible here; confirm.
+      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+
+      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
+        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+
+      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
+        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+
+      virtual void RefreshShiftCoefficients(RealD new_shift);
+
+      // Constructors
+      DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
+        RealD _M5, const ImplParams& p=ImplParams());
+
+    protected:
+      void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
+  };
+}}
+
+// Explicitly instantiates, for implementation type A, the four fifth-dimension
+// kernels of DomainWallEOFAFermion<A>: the six-argument M5D and M5Ddag overloads,
+// MooeeInv and MooeeInvDag.
+// FermionField and Coeff_t are used unqualified, so the macro must be expanded
+// in a scope where those names resolve.
+#define INSTANTIATE_DPERP_DWF_EOFA(A)\
+template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
+  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
+template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
+  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
+template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
+template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
+
+// Build-time selection of the fifth-dimension kernel implementation.
+// Each DomainWallEOFAFermion*.cc wraps its explicit instantiations in
+// #ifdef on one of these macros (dense.cc -> _DENSE, cache.cc -> _CACHE,
+// ssp.cc -> _LINALG).
+// NOTE(review): _CACHE and _VEC are both defined; presumably vec.cc instantiates
+// a disjoint set of members under _VEC -- confirm to rule out duplicate instantiation.
+#undef  DOMAIN_WALL_EOFA_DPERP_DENSE
+#define DOMAIN_WALL_EOFA_DPERP_CACHE
+#undef  DOMAIN_WALL_EOFA_DPERP_LINALG
+#define DOMAIN_WALL_EOFA_DPERP_VEC
+
+#endif
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
@@ -0,0 +1,248 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+    // Pminus forwards
+    // Pplus  backwards
+    // Fifth-dimension hopping term, evaluated independently on every block of
+    // Ls consecutive outer sites:
+    //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s_up]) + lower[s]*spProj5p(psi[s_dn])
+    // where s_up / s_dn are the s+1 / s-1 neighbours with wrap-around at the ends.
+    // psi and phi must share a checkerboard; chi inherits it.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+    {
+        GridBase* grid = psi._grid;
+        int Ls = this->Ls;
+
+        assert(phi.checkerboard == psi.checkerboard);
+        chi.checkerboard = psi.checkerboard;
+
+        // Flops = 6.0*(Nc*Ns) *Ls*vol
+        this->M5Dcalls++;
+        this->M5Dtime -= usecond();
+
+        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
+            for(int s=0; s<Ls; s++){
+                // Neighbour slices: only the two boundary slices wrap around.
+                const int sUp = (s==0) ? (s+1) : ((s==(Ls-1)) ? 0 : (s+1));
+                const int sDn = (s==0) ? (Ls-1) : (s-1);
+
+                auto proj = psi._odata[0]; // scratch of the right site type; overwritten below
+
+                spProj5m(proj, psi._odata[ss+sUp]);
+                chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
+
+                spProj5p(proj, psi._odata[ss+sDn]);
+                chi[ss+s] = chi[ss+s] + lower[s]*proj;
+            }
+        }
+
+        this->M5Dtime += usecond();
+    }
+
+    // Daggered fifth-dimension hopping term. Same stencil as M5D with the two
+    // projectors exchanged:
+    //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s_up]) + lower[s]*spProj5m(psi[s_dn])
+    // The caller supplies the coefficient arrays; they are used as given.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+    {
+        GridBase* grid = psi._grid;
+        int Ls = this->Ls;
+
+        assert(phi.checkerboard == psi.checkerboard);
+        chi.checkerboard = psi.checkerboard;
+
+        // Flops = 6.0*(Nc*Ns) *Ls*vol
+        this->M5Dcalls++;
+        this->M5Dtime -= usecond();
+
+        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
+            auto proj = psi._odata[0]; // scratch of the right site type; overwritten below
+            for(int s=0; s<Ls; s++){
+                // Neighbour slices: only the two boundary slices wrap around.
+                const int sUp = (s==0) ? (s+1) : ((s==(Ls-1)) ? 0 : (s+1));
+                const int sDn = (s==0) ? (Ls-1) : (s-1);
+
+                spProj5p(proj, psi._odata[ss+sUp]);
+                chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
+
+                spProj5m(proj, psi._odata[ss+sDn]);
+                chi[ss+s] = chi[ss+s] + lower[s]*proj;
+            }
+        }
+
+        this->M5Dtime += usecond();
+    }
+
+    // Applies the inverse of Mooee to psi, writing the result to chi.
+    //
+    // Each block of Ls consecutive outer sites is processed independently by a
+    // sequence of in-place sweeps over chi, using the coefficient arrays
+    // lee, leem, ueem, dee and uee held by the object. dee is read at index Ls
+    // as well as 0..Ls-1, i.e. it carries one extra entry.
+    // The sweeps depend on each other's results, so their order must not change.
+    // Updates the MooeeInvCalls / MooeeInvTime counters.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+    {
+        GridBase* grid = psi._grid;
+        int Ls = this->Ls;
+
+        chi.checkerboard = psi.checkerboard;
+
+        this->MooeeInvCalls++;
+        this->MooeeInvTime -= usecond();
+
+        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
+
+            // Scratch values of the site type; contents are overwritten before use.
+            auto tmp1 = psi._odata[0];
+            auto tmp2 = psi._odata[0];
+
+            // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+            // Apply (L^{\prime})^{-1}
+            // Forward recursion: each slice uses the already-updated previous slice of chi.
+            chi[ss] = psi[ss]; // chi[0]=psi[0]
+            for(int s=1; s<Ls; s++){
+                spProj5p(tmp1, chi[ss+s-1]);
+                chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
+            }
+
+            // L_m^{-1}
+            // Accumulates into the last slice only.
+            for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+                spProj5m(tmp1, chi[ss+s]);
+                chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
+            }
+
+            // U_m^{-1} D^{-1}
+            // The last slice is not modified inside this loop, so tmp1 holds the
+            // same spProj5p(chi[Ls-1]) on every iteration.
+            for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+                spProj5p(tmp1, chi[ss+Ls-1]);
+                chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
+            }
+            // tmp1 is deliberately reused from the loop above (spProj5p of the last
+            // slice); this relies on the loop having run at least once, i.e. Ls >= 2.
+            spProj5m(tmp2, chi[ss+Ls-1]);
+            chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
+
+            // Apply U^{-1}
+            // Backward recursion: each slice uses the already-updated next slice of chi.
+            for(int s=Ls-2; s>=0; s--){
+                spProj5m(tmp1, chi[ss+s+1]);
+                chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
+            }
+        }
+
+        this->MooeeInvTime += usecond();
+    }
+
+    // Applies the inverse of Mooee^dagger to psi, writing the result to chi.
+    //
+    // Mirrors MooeeInv with the roles of the upper / lower factors and of the
+    // two spin projectors exchanged, and with every coefficient complex
+    // conjugated. Each block of Ls consecutive outer sites is processed
+    // independently by in-place sweeps over chi whose order must be preserved.
+    // Updates the MooeeInvCalls / MooeeInvTime counters.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+    {
+        GridBase* grid = psi._grid;
+        int Ls = this->Ls;
+
+        // chi is a pure output: it simply inherits the checkerboard of psi.
+        // (The former assert compared psi.checkerboard with itself and could never fire.)
+        chi.checkerboard = psi.checkerboard;
+
+        // Conjugated copies of the coefficient arrays; dee carries one extra entry.
+        std::vector<Coeff_t> ueec(Ls);
+        std::vector<Coeff_t> deec(Ls+1);
+        std::vector<Coeff_t> leec(Ls);
+        std::vector<Coeff_t> ueemc(Ls);
+        std::vector<Coeff_t> leemc(Ls);
+
+        for(int s=0; s<Ls; s++){
+            ueec[s]  = conjugate(this->uee[s]);
+            deec[s]  = conjugate(this->dee[s]);
+            leec[s]  = conjugate(this->lee[s]);
+            ueemc[s] = conjugate(this->ueem[s]);
+            leemc[s] = conjugate(this->leem[s]);
+        }
+        deec[Ls] = conjugate(this->dee[Ls]);
+
+        this->MooeeInvCalls++;
+        this->MooeeInvTime -= usecond();
+
+        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
+
+            // Scratch values of the site type; contents are overwritten before use.
+            auto tmp1 = psi._odata[0];
+            auto tmp2 = psi._odata[0];
+
+            // Apply (U^{\prime})^{-dagger}
+            // Forward recursion: each slice uses the already-updated previous slice of chi.
+            chi[ss] = psi[ss];
+            for(int s=1; s<Ls; s++){
+                spProj5m(tmp1, chi[ss+s-1]);
+                chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
+            }
+
+            // U_m^{-\dagger}
+            // Accumulates into the last slice only.
+            for(int s=0; s<Ls-1; s++){
+                spProj5p(tmp1, chi[ss+s]);
+                chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
+            }
+
+            // L_m^{-\dagger} D^{-dagger}
+            // The last slice is not modified inside this loop, so tmp1 holds the
+            // same spProj5m(chi[Ls-1]) on every iteration.
+            for(int s=0; s<Ls-1; s++){
+                spProj5m(tmp1, chi[ss+Ls-1]);
+                chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
+            }
+            // tmp1 is deliberately reused from the loop above (spProj5m of the last
+            // slice); this relies on the loop having run at least once, i.e. Ls >= 2.
+            spProj5p(tmp2, chi[ss+Ls-1]);
+            chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
+
+            // Apply L^{-dagger}
+            // Backward recursion: each slice uses the already-updated next slice of chi.
+            for(int s=Ls-2; s>=0; s--){
+                spProj5p(tmp1, chi[ss+s+1]);
+                chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
+            }
+        }
+
+        this->MooeeInvTime += usecond();
+    }
+
+    // Emit the explicit instantiations of the kernels defined in this file only
+    // when the cache variant is selected in DomainWallEOFAFermion.h.
+    #ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
+
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+    #endif
+
+}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
@@ -0,0 +1,159 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+    /*
+    * Dense matrix versions of routines
+    */
+    // Dense variant: inverse of Mooee^dagger, delegated to MooeeInternal with
+    // both the dagger and the inverse flags set.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+    {
+        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
+    }
+
+    // Dense variant: inverse of Mooee, delegated to MooeeInternal with the
+    // inverse flag set and no dagger.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+    {
+        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
+    }
+
+    // Applies Mooee (optionally inverted and/or daggered) through explicit
+    // Ls x Ls real matrices, one per chirality.
+    //
+    //   psi : input field          chi : output field (inherits psi's checkerboard)
+    //   dag : non-zero -> use the adjoint of the matrices
+    //   inv : non-zero -> use the inverse of the matrices
+    //
+    // Requires the first reduced grid dimension to equal Ls (asserted), i.e. the
+    // s-direction is not vectorised.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+    {
+        int Ls = this->Ls;
+        int LLs = psi._grid->_rdimensions[0];
+        int vol = psi._grid->oSites()/LLs;
+
+        chi.checkerboard = psi.checkerboard;
+
+        assert(Ls==LLs);
+
+        // Assemble the two matrices: bee on the diagonal, -cee on one
+        // off-diagonal each, and the dp / dm corner entries.
+        Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+        Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+
+        for(int s=0; s<Ls; s++){
+            Pplus(s,s)  = this->bee[s];
+            Pminus(s,s) = this->bee[s];
+            if(s < Ls-1){
+                Pminus(s,s+1) = -this->cee[s];
+                Pplus(s+1,s)  = -this->cee[s+1];
+            }
+        }
+
+        // Corner entries are written last so they take precedence.
+        Pplus (0,Ls-1) = this->dp;
+        Pminus(Ls-1,0) = this->dm;
+
+        // Pick the matrices that are actually applied.
+        Eigen::MatrixXd PplusMat  = Pplus;
+        Eigen::MatrixXd PminusMat = Pminus;
+
+        if(inv) {
+            PplusMat  = Pplus.inverse();
+            PminusMat = Pminus.inverse();
+        }
+
+        if(dag){
+            PplusMat.adjointInPlace();
+            PminusMat.adjointInPlace();
+        }
+
+        // For the non-vectorised s-direction this is simple:
+        // a dense matrix-vector product in s at every outer site.
+        for(auto site=0; site<vol; site++){
+
+            SiteSpinor     acc;
+            SiteHalfSpinor halfPlus;
+            SiteHalfSpinor halfMinus;
+
+            for(int s1=0; s1<Ls; s1++){
+                acc = zero;
+                for(int s2=0; s2<Ls; s2++){
+                    const int src = s2 + Ls*site;
+                    const double wPlus  = PplusMat(s1,s2);
+                    const double wMinus = PminusMat(s1,s2);
+                    // Skip exact zeros to avoid needless projections.
+                    if(wPlus != 0.0){
+                        spProj5p(halfPlus, psi[src]);
+                        accumRecon5p(acc, wPlus*halfPlus);
+                    }
+                    if(wMinus != 0.0){
+                        spProj5m(halfMinus, psi[src]);
+                        accumRecon5m(acc, wMinus*halfMinus);
+                    }
+                }
+                chi[s1+Ls*site] = acc*0.5;
+            }
+        }
+    }
+
+    // Emit the explicit instantiations of the kernels defined in this file only
+    // when the dense variant is selected in DomainWallEOFAFermion.h.
+    // MooeeInternal is not covered by INSTANTIATE_DPERP_DWF_EOFA, so it is
+    // instantiated separately for each implementation type.
+    #ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
+
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+
+        template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+        template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+    #endif
+
+}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
@@ -0,0 +1,168 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+    // Pminus forwards
+    // Pplus  backwards
+    // Fifth-dimension hopping term built from whole-slice axpby operations.
+    // For every slice s two calls are issued, in this order:
+    //   1. axpby_ssp_pminus with (diag[s], phi) and (upper[s], psi) on slices (s, s_up)
+    //   2. axpby_ssp_pplus  with (one, chi)     and (lower[s], psi) on slices (s, s_dn)
+    // where s_up / s_dn are the s+1 / s-1 neighbours with wrap-around at the ends.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+    {
+        Coeff_t one(1.0);
+        int Ls = this->Ls;
+        for(int s=0; s<Ls; s++){
+            // Neighbour slices: only the two boundary slices wrap around.
+            const int sUp = (s==0) ? (s+1) : ((s==(Ls-1)) ? 0 : (s+1));
+            const int sDn = (s==0) ? (Ls-1) : (s-1);
+
+            axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, sUp);
+            axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, sDn);
+        }
+    }
+
+    // Daggered fifth-dimension hopping term built from whole-slice axpby
+    // operations. Same stencil as M5D with the two projected variants swapped:
+    //   1. axpby_ssp_pplus  with (diag[s], phi) and (upper[s], psi) on slices (s, s_up)
+    //   2. axpby_ssp_pminus with (one, chi)     and (lower[s], psi) on slices (s, s_dn)
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+    {
+        Coeff_t one(1.0);
+        int Ls = this->Ls;
+        for(int s=0; s<Ls; s++){
+            // Neighbour slices: only the two boundary slices wrap around.
+            const int sUp = (s==0) ? (s+1) : ((s==(Ls-1)) ? 0 : (s+1));
+            const int sDn = (s==0) ? (Ls-1) : (s-1);
+
+            axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, sUp);
+            axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, sDn);
+        }
+    }
+
+    // Applies the inverse of Mooee to psi, writing the result to chi, using
+    // whole-slice axpby operations. The sweeps update chi in place and feed
+    // into one another, so the call order below is significant.
+    // dee is read at index Ls as well as 0..Ls-1.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+    {
+        Coeff_t unit(1.0);
+        Coeff_t nil(0.0);
+        chi.checkerboard = psi.checkerboard;
+        int Ls = this->Ls;
+        const int last = Ls-1;   // index of the final s-slice
+
+        FermionField scratch(psi._grid);
+
+        // Apply (L^{\prime})^{-1}: chi[0]=psi[0], then forward recursion
+        // chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
+        axpby_ssp(chi, unit, psi, nil, psi, 0, 0);
+        for(int s=1; s<=last; s++){
+            axpby_ssp_pplus(chi, unit, psi, -this->lee[s-1], chi, s, s-1);
+        }
+
+        // L_m^{-1}: accumulate -leem[s] P_- chi[s] into the last slice
+        for(int s=0; s<last; s++){
+            axpby_ssp_pminus(chi, unit, chi, -this->leem[s], chi, last, s);
+        }
+
+        // U_m^{-1} D^{-1}: rescale the bulk slices and subtract the last-slice term
+        for(int s=0; s<last; s++){
+            axpby_ssp_pplus(chi, unit/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, last);
+        }
+        // Last slice: the two projected halves are scaled by different dee
+        // entries, so the first half is staged in scratch.
+        axpby_ssp_pminus(scratch, nil, chi, unit/this->dee[last], chi, last, last);
+        axpby_ssp_pplus(chi, unit, scratch, unit/this->dee[Ls], chi, last, last);
+
+        // Apply U^{-1}: backward recursion from slice Ls-2 down to 0
+        for(int s=last-1; s>=0; s--){
+            axpby_ssp_pminus(chi, unit, chi, -this->uee[s], chi, s, s+1);
+        }
+    }
+
+    // Applies the inverse of Mooee^dagger to psi, writing the result to chi,
+    // using whole-slice axpby operations. Mirrors MooeeInv with the upper /
+    // lower coefficient arrays and the two projected variants exchanged and
+    // all coefficients conjugated. The call order below is significant.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+    {
+        Coeff_t unit(1.0);
+        Coeff_t nil(0.0);
+        chi.checkerboard = psi.checkerboard;
+        int Ls = this->Ls;
+        const int last = Ls-1;   // index of the final s-slice
+
+        FermionField scratch(psi._grid);
+
+        // Apply (U^{\prime})^{-dagger}: chi[0]=psi[0], then forward recursion
+        axpby_ssp(chi, unit, psi, nil, psi, 0, 0);
+        for(int s=1; s<=last; s++){
+            axpby_ssp_pminus(chi, unit, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+        }
+
+        // U_m^{-\dagger}: accumulate into the last slice
+        for(int s=0; s<last; s++){
+            axpby_ssp_pplus(chi, unit, chi, -conjugate(this->ueem[s]), chi, last, s);
+        }
+
+        // L_m^{-\dagger} D^{-dagger}: rescale the bulk slices and subtract the last-slice term
+        for(int s=0; s<last; s++){
+            axpby_ssp_pminus(chi, unit/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, last);
+        }
+        // Last slice: the two projected halves are scaled by different dee
+        // entries, so the first half is staged in scratch.
+        axpby_ssp_pminus(scratch, nil, chi, unit/conjugate(this->dee[last]), chi, last, last);
+        axpby_ssp_pplus(chi, unit, scratch, unit/conjugate(this->dee[Ls]), chi, last, last);
+
+        // Apply L^{-dagger}: backward recursion from slice Ls-2 down to 0
+        for(int s=last-1; s>=0; s--){
+            axpby_ssp_pplus(chi, unit, chi, -conjugate(this->lee[s]), chi, s, s+1);
+        }
+    }
+
+    // Emit the explicit instantiations of the kernels defined in this file only
+    // when the linear-algebra (ssp) variant is selected in DomainWallEOFAFermion.h.
+    #ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
+
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+    #endif
+
+}}
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@@ -0,0 +1,605 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+    /*
+    * Dense matrix versions of routines
+    */
+    // Forwards to MooeeInternal with both the dagger and the inverse flag set.
+    //   psi : input field
+    //   chi : output field, written by MooeeInternal
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+    {
+        const int daggerFlag  = DaggerYes;
+        const int inverseFlag = InverseYes;
+        this->MooeeInternal(psi, chi, daggerFlag, inverseFlag);
+    }
+
+    // Forwards to MooeeInternal with the inverse flag set and no dagger.
+    //   psi : input field
+    //   chi : output field, written by MooeeInternal
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+    {
+        const int daggerFlag  = DaggerNo;
+        const int inverseFlag = InverseYes;
+        this->MooeeInternal(psi, chi, daggerFlag, inverseFlag);
+    }
+
+    // Fifth-dimension stencil for fields whose s-direction is spread over SIMD lanes.
+    // For every slice v of a block of LLs consecutive outer sites the active
+    // (#else) branch computes
+    //   spin rows 0,1 of chi[v] = diag*phi[v] + lower*(spin rows 0,1 of psi[vm])
+    //   spin rows 2,3 of chi[v] = diag*phi[v] + upper*(spin rows 2,3 of psi[vp])
+    // where vp/vm are the next/previous slice inside the block (wrapping round).
+    //
+    //   psi, phi           : input fields; must share the same checkerboard
+    //   chi                : output field; its checkerboard is set to psi's
+    //   lower, diag, upper : per-s coefficients, read at indices 0..Ls-1
+    // Side effects: increments this->M5Dcalls and accumulates elapsed time in
+    // this->M5Dtime.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+    {
+        GridBase* grid = psi._grid;
+        int Ls  = this->Ls;
+        int LLs = grid->_rdimensions[0];
+        const int nsimd = Simd::Nsimd();
+
+        // SIMD-packed copies of the coefficient vectors, one vector word per slice.
+        Vector<iSinglet<Simd> > u(LLs);
+        Vector<iSinglet<Simd> > l(LLs);
+        Vector<iSinglet<Simd> > d(LLs);
+
+        // The full fifth dimension must be exactly LLs slices times nsimd lanes.
+        assert(Ls/LLs == nsimd);
+        assert(phi.checkerboard == psi.checkerboard);
+
+        chi.checkerboard = psi.checkerboard;
+
+        // just directly address via type pun
+        typedef typename Simd::scalar_type scalar_type;
+        scalar_type* u_p = (scalar_type*) &u[0];
+        scalar_type* l_p = (scalar_type*) &l[0];
+        scalar_type* d_p = (scalar_type*) &d[0];
+
+        // Coefficient for s = o + i*LLs is stored in scalar slot i of vector word o.
+        for(int o=0;o<LLs;o++){ // outer
+        for(int i=0;i<nsimd;i++){ //inner
+            int s  = o + i*LLs;
+            int ss = o*nsimd + i;
+            u_p[ss] = upper[s];
+            l_p[ss] = lower[s];
+            d_p[ss] = diag[s];
+        }}
+
+        this->M5Dcalls++;
+        this->M5Dtime -= usecond();
+
+        // The unrolled kernel below names exactly three colour components.
+        assert(Nc == 3);
+
+        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
+
+            // Disabled reference implementation written with the generic
+            // projector/reconstruct helpers; kept for comparison with the
+            // unrolled version in the #else branch.
+            #if 0
+
+                alignas(64) SiteHalfSpinor hp;
+                alignas(64) SiteHalfSpinor hm;
+                alignas(64) SiteSpinor fp;
+                alignas(64) SiteSpinor fm;
+
+                for(int v=0; v<LLs; v++){
+
+                    int vp = (v+1)%LLs;
+                    int vm = (v+LLs-1)%LLs;
+
+                    spProj5m(hp, psi[ss+vp]);
+                    spProj5p(hm, psi[ss+vm]);
+
+                    if (vp <= v){ rotate(hp, hp, 1); }
+                    if (vm >= v){ rotate(hm, hm, nsimd-1); }
+
+                    hp = 0.5*hp;
+                    hm = 0.5*hm;
+
+                    spRecon5m(fp, hp);
+                    spRecon5p(fm, hm);
+
+                    chi[ss+v] = d[v]*phi[ss+v];
+                    chi[ss+v] = chi[ss+v] + u[v]*fp;
+                    chi[ss+v] = chi[ss+v] + l[v]*fm;
+
+                }
+
+            #else
+
+                for(int v=0; v<LLs; v++){
+
+                    // Prefetch the matching slice of the next block.
+                    // NOTE(review): for the last block this index is >= oSites(); confirm
+                    // that forming/prefetching that address is acceptable for this container.
+                    vprefetch(psi[ss+v+LLs]);
+
+                    // Neighbouring slices within the block, wrapping round at the ends.
+                    int vp = (v==LLs-1) ? 0     : v+1;
+                    int vm = (v==0)     ? LLs-1 : v-1;
+
+                    // Spin rows 2,3 of the "plus" neighbour.
+                    Simd hp_00 = psi[ss+vp]()(2)(0);
+                    Simd hp_01 = psi[ss+vp]()(2)(1);
+                    Simd hp_02 = psi[ss+vp]()(2)(2);
+                    Simd hp_10 = psi[ss+vp]()(3)(0);
+                    Simd hp_11 = psi[ss+vp]()(3)(1);
+                    Simd hp_12 = psi[ss+vp]()(3)(2);
+
+                    // Spin rows 0,1 of the "minus" neighbour.
+                    Simd hm_00 = psi[ss+vm]()(0)(0);
+                    Simd hm_01 = psi[ss+vm]()(0)(1);
+                    Simd hm_02 = psi[ss+vm]()(0)(2);
+                    Simd hm_10 = psi[ss+vm]()(1)(0);
+                    Simd hm_11 = psi[ss+vm]()(1)(1);
+                    Simd hm_12 = psi[ss+vm]()(1)(2);
+
+                    // The neighbour index wrapped: rotate the SIMD lanes instead.
+                    // Counterpart of rotate(hp, hp, 1) in the reference branch; the
+                    // template argument is presumably counted in real words (2 per
+                    // complex lane) -- TODO confirm against tRotate.
+                    if(vp <= v){
+                        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+                        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+                        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+                        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+                        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+                        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+                    }
+
+                    // Same for the minus neighbour; counterpart of rotate(hm, hm, nsimd-1).
+                    if(vm >= v){
+                        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+                        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+                        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+                        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+                        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+                        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+                    }
+
+                    // Can force these to real arithmetic and save 2x.
+                    // Rows 0,1 combine diag*phi with lower*hm; rows 2,3 combine diag*phi with upper*hp.
+                    Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+                    Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+                    Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+                    Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+                    Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+                    Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+                    Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+                    Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+                    Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+                    Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+                    Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+                    Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+
+                    // Write all twelve spin-colour components of the result.
+                    vstream(chi[ss+v]()(0)(0), p_00);
+                    vstream(chi[ss+v]()(0)(1), p_01);
+                    vstream(chi[ss+v]()(0)(2), p_02);
+                    vstream(chi[ss+v]()(1)(0), p_10);
+                    vstream(chi[ss+v]()(1)(1), p_11);
+                    vstream(chi[ss+v]()(1)(2), p_12);
+                    vstream(chi[ss+v]()(2)(0), p_20);
+                    vstream(chi[ss+v]()(2)(1), p_21);
+                    vstream(chi[ss+v]()(2)(2), p_22);
+                    vstream(chi[ss+v]()(3)(0), p_30);
+                    vstream(chi[ss+v]()(3)(1), p_31);
+                    vstream(chi[ss+v]()(3)(2), p_32);
+                }
+
+            #endif
+        }
+
+        this->M5Dtime += usecond();
+    }
+
+    // Daggered counterpart of M5D for fields whose s-direction is spread over
+    // SIMD lanes. For every slice v of a block of LLs consecutive outer sites the
+    // active (#else) branch computes
+    //   spin rows 0,1 of chi[v] = diag*phi[v] + upper*(spin rows 0,1 of psi[vp])
+    //   spin rows 2,3 of chi[v] = diag*phi[v] + lower*(spin rows 2,3 of psi[vm])
+    // where vp/vm are the next/previous slice inside the block (wrapping round).
+    //
+    //   psi, phi           : input fields; must share the same checkerboard
+    //   chi                : output field; its checkerboard is set to psi's
+    //   lower, diag, upper : per-s coefficients, read at indices 0..Ls-1
+    // Side effects: increments this->M5Dcalls and accumulates elapsed time in
+    // this->M5Dtime (the same counters M5D uses).
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+    {
+        GridBase* grid = psi._grid;
+        int Ls  = this->Ls;
+        int LLs = grid->_rdimensions[0];
+        const int nsimd = Simd::Nsimd();
+
+        // SIMD-packed copies of the coefficient vectors, one vector word per slice.
+        Vector<iSinglet<Simd> > u(LLs);
+        Vector<iSinglet<Simd> > l(LLs);
+        Vector<iSinglet<Simd> > d(LLs);
+
+        // The full fifth dimension must be exactly LLs slices times nsimd lanes.
+        assert(Ls/LLs == nsimd);
+        assert(phi.checkerboard == psi.checkerboard);
+
+        chi.checkerboard = psi.checkerboard;
+
+        // just directly address via type pun
+        typedef typename Simd::scalar_type scalar_type;
+        scalar_type* u_p = (scalar_type*) &u[0];
+        scalar_type* l_p = (scalar_type*) &l[0];
+        scalar_type* d_p = (scalar_type*) &d[0];
+
+        // Coefficient for s = o + i*LLs is stored in scalar slot i of vector word o.
+        for(int o=0; o<LLs; o++){ // outer
+        for(int i=0; i<nsimd; i++){ //inner
+            int s  = o + i*LLs;
+            int ss = o*nsimd + i;
+            u_p[ss] = upper[s];
+            l_p[ss] = lower[s];
+            d_p[ss] = diag[s];
+        }}
+
+        this->M5Dcalls++;
+        this->M5Dtime -= usecond();
+
+        // The unrolled kernel below names exactly three colour components, so
+        // guard against other colour counts exactly as M5D does.
+        assert(Nc == 3);
+
+        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
+
+        // Disabled reference implementation written with the generic
+        // projector/reconstruct helpers; kept for comparison with the unrolled
+        // version in the #else branch.
+        #if 0
+
+            alignas(64) SiteHalfSpinor hp;
+            alignas(64) SiteHalfSpinor hm;
+            alignas(64) SiteSpinor fp;
+            alignas(64) SiteSpinor fm;
+
+            for(int v=0; v<LLs; v++){
+
+                int vp = (v+1)%LLs;
+                int vm = (v+LLs-1)%LLs;
+
+                spProj5p(hp, psi[ss+vp]);
+                spProj5m(hm, psi[ss+vm]);
+
+                if(vp <= v){ rotate(hp, hp, 1); }
+                if(vm >= v){ rotate(hm, hm, nsimd-1); }
+
+                hp = hp*0.5;
+                hm = hm*0.5;
+                spRecon5p(fp, hp);
+                spRecon5m(fm, hm);
+
+                chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+                chi[ss+v] = chi[ss+v]     +l[v]*fm;
+            }
+
+        #else
+
+            for(int v=0; v<LLs; v++){
+
+                // Prefetch the matching slice of the next block.
+                vprefetch(psi[ss+v+LLs]);
+
+                // Neighbouring slices within the block, wrapping round at the ends.
+                int vp = (v == LLs-1) ? 0     : v+1;
+                int vm = (v == 0    ) ? LLs-1 : v-1;
+
+                // Spin rows 0,1 of the "plus" neighbour.
+                Simd hp_00 = psi[ss+vp]()(0)(0);
+                Simd hp_01 = psi[ss+vp]()(0)(1);
+                Simd hp_02 = psi[ss+vp]()(0)(2);
+                Simd hp_10 = psi[ss+vp]()(1)(0);
+                Simd hp_11 = psi[ss+vp]()(1)(1);
+                Simd hp_12 = psi[ss+vp]()(1)(2);
+
+                // Spin rows 2,3 of the "minus" neighbour.
+                Simd hm_00 = psi[ss+vm]()(2)(0);
+                Simd hm_01 = psi[ss+vm]()(2)(1);
+                Simd hm_02 = psi[ss+vm]()(2)(2);
+                Simd hm_10 = psi[ss+vm]()(3)(0);
+                Simd hm_11 = psi[ss+vm]()(3)(1);
+                Simd hm_12 = psi[ss+vm]()(3)(2);
+
+                // The neighbour index wrapped: rotate the SIMD lanes instead
+                // (counterpart of rotate(hp, hp, 1) in the reference branch).
+                if (vp <= v){
+                    hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+                    hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+                    hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+                    hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+                    hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+                    hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+                }
+
+                // Same for the minus neighbour (counterpart of rotate(hm, hm, nsimd-1)).
+                if(vm >= v){
+                    hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+                    hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+                    hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+                    hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+                    hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+                    hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+                }
+
+                // Rows 0,1 combine diag*phi with upper*hp; rows 2,3 combine diag*phi with lower*hm.
+                Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+                Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+                Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+                Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+                Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+                Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+                Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+                Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+                Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+                Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+                Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+                Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+
+                // Write all twelve spin-colour components of the result.
+                vstream(chi[ss+v]()(0)(0), p_00);
+                vstream(chi[ss+v]()(0)(1), p_01);
+                vstream(chi[ss+v]()(0)(2), p_02);
+                vstream(chi[ss+v]()(1)(0), p_10);
+                vstream(chi[ss+v]()(1)(1), p_11);
+                vstream(chi[ss+v]()(1)(2), p_12);
+                vstream(chi[ss+v]()(2)(0), p_20);
+                vstream(chi[ss+v]()(2)(1), p_21);
+                vstream(chi[ss+v]()(2)(2), p_22);
+                vstream(chi[ss+v]()(3)(0), p_30);
+                vstream(chi[ss+v]()(3)(1), p_31);
+                vstream(chi[ss+v]()(3)(2), p_32);
+            }
+        #endif
+
+        }
+
+        this->M5Dtime += usecond();
+    }
+
+    #ifdef AVX512
+        #include<simd/Intel512common.h>
+        #include<simd/Intel512avx.h>
+        #include<simd/Intel512single.h>
+    #endif
+
+    // Applies the dense s-space matrices Matp / Matm to one block of LLs slices.
+    // For each output slice s1 it accumulates, over every input slice s2 and every
+    // SIMD lane l, Matp * (spin rows 0,1 of psi) and Matm * (spin rows 2,3 of psi),
+    // then stores the result into chi.
+    //
+    //   psi        : input field
+    //   chi        : output field; slices site*LLs .. site*LLs+LLs-1 are written
+    //   LLs        : number of slices per block
+    //   site       : block index; slice s of the block lives at s + LLs*site
+    //   Matp, Matm : matrices applied to spin rows 0,1 and 2,3 respectively
+    //
+    // Two implementations are selected at compile time: portable C++ when AVX512
+    // is not defined, hand-written AVX-512 inline assembly otherwise.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
+        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+    {
+        #ifndef AVX512
+        {
+            // Broadcast buffers and accumulators for the two spin halves.
+            SiteHalfSpinor BcastP;
+            SiteHalfSpinor BcastM;
+            SiteHalfSpinor SiteChiP;
+            SiteHalfSpinor SiteChiM;
+
+            // Ls*Ls * 2 * 12 * vol flops
+            for(int s1=0; s1<LLs; s1++){
+
+                for(int s2=0; s2<LLs; s2++){
+                for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
+
+                    // s is the matrix row for (slice s2, lane l); lex is the
+                    // index of the input slice inside psi.
+                    int s = s2 + l*LLs;
+                    int lex = s2 + LLs*site;
+
+                    // Reset the accumulators at the start of every output slice.
+                    if( s2==0 && l==0 ){
+                        SiteChiP=zero;
+                        SiteChiM=zero;
+                    }
+
+                    // Broadcast lane l of spin rows 0,1 ...
+                    for(int sp=0; sp<2;  sp++){
+                    for(int co=0; co<Nc; co++){
+                        vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
+                    }}
+
+                    // ... and of spin rows 2,3.
+                    for(int sp=0; sp<2;  sp++){
+                    for(int co=0; co<Nc; co++){
+                        vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
+                    }}
+
+                    // Multiply-accumulate with the matrix element at LLs*s+s1.
+                    for(int sp=0; sp<2;  sp++){
+                    for(int co=0; co<Nc; co++){
+                        SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
+                        SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
+                    }}
+                }}
+
+                // Store the finished output slice s1.
+                {
+                    int lex = s1 + LLs*site;
+                    for(int sp=0; sp<2;  sp++){
+                    for(int co=0; co<Nc; co++){
+                        vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
+                        vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+                    }}
+                }
+            }
+
+        }
+        #else
+        {
+            // pointers
+            //  MASK_REGS;
+            // Register allocation for the asm below: zmm1-zmm12 hold the twelve
+            // spin-colour accumulators, zmm13-zmm24 the broadcast inputs.
+            #define Chi_00 %%zmm1
+            #define Chi_01 %%zmm2
+            #define Chi_02 %%zmm3
+            #define Chi_10 %%zmm4
+            #define Chi_11 %%zmm5
+            #define Chi_12 %%zmm6
+            #define Chi_20 %%zmm7
+            #define Chi_21 %%zmm8
+            #define Chi_22 %%zmm9
+            #define Chi_30 %%zmm10
+            #define Chi_31 %%zmm11
+            #define Chi_32 %%zmm12
+
+            #define BCAST0  %%zmm13
+            #define BCAST1  %%zmm14
+            #define BCAST2  %%zmm15
+            #define BCAST3  %%zmm16
+            #define BCAST4  %%zmm17
+            #define BCAST5  %%zmm18
+            #define BCAST6  %%zmm19
+            #define BCAST7  %%zmm20
+            #define BCAST8  %%zmm21
+            #define BCAST9  %%zmm22
+            #define BCAST10 %%zmm23
+            #define BCAST11 %%zmm24
+
+            // Byte stride between the matrix elements of consecutive lanes: the
+            // portable branch indexes LLs*(s2 + l*LLs) + s1, i.e. LLs*LLs elements per lane.
+            int incr = LLs*LLs*sizeof(iSinglet<Simd>);
+            for(int s1=0; s1<LLs; s1++){
+
+                for(int s2=0; s2<LLs; s2++){
+
+                    int lex = s2 + LLs*site;
+                    uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
+                    uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
+                    uint64_t a2 = (uint64_t) &psi[lex];
+
+                    for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
+                        // First contribution to this output slice: plain multiply
+                        // (VMULMEM) initialises the accumulators; every later one
+                        // uses multiply-add (VMADDMEM).
+                        // NOTE(review): the asm statements list no zmm clobbers and the
+                        // accumulators are expected to survive between separate asm
+                        // statements -- confirm this is safe with the compilers in use.
+                        if((s2+l)==0) {
+                            asm(
+                                    VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
+                                    VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
+                                    VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
+                                    VBCASTCDUP(0,%2,BCAST0)
+                                    VBCASTCDUP(1,%2,BCAST1)
+                                    VBCASTCDUP(2,%2,BCAST2)
+                                    VBCASTCDUP(3,%2,BCAST3)
+                                    VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
+                                    VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
+                                    VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
+                                    VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
+                                    VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
+                                    VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
+                                    VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
+                                    VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
+                                    VMULMEM(0,%1,BCAST8,Chi_22)
+                                    VMULMEM(0,%1,BCAST9,Chi_30)
+                                    VMULMEM(0,%1,BCAST10,Chi_31)
+                                    VMULMEM(0,%1,BCAST11,Chi_32)
+                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
+                        } else {
+                            asm(
+                                    VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
+                                    VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
+                                    VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
+                                    VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
+                                    VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
+                                    VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
+                                    VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
+                                    VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
+                                    VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
+                                    VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
+                                    VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
+                                    VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
+                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
+                        }
+                        // Advance to the next lane: next matrix element and the
+                        // next scalar inside the psi vector word.
+                        a0 = a0 + incr;
+                        a1 = a1 + incr;
+                        a2 = a2 + sizeof(typename Simd::scalar_type);
+                    }
+                }
+
+                // Store the twelve accumulators into output slice s1.
+                {
+                  int lexa = s1+LLs*site;
+                  asm (
+                     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
+                     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
+                     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
+                     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
+                     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+                }
+            }
+        }
+
+        // Remove the register aliases so they do not leak into later code.
+        #undef Chi_00
+        #undef Chi_01
+        #undef Chi_02
+        #undef Chi_10
+        #undef Chi_11
+        #undef Chi_12
+        #undef Chi_20
+        #undef Chi_21
+        #undef Chi_22
+        #undef Chi_30
+        #undef Chi_31
+        #undef Chi_32
+
+        #undef BCAST0
+        #undef BCAST1
+        #undef BCAST2
+        #undef BCAST3
+        #undef BCAST4
+        #undef BCAST5
+        #undef BCAST6
+        #undef BCAST7
+        #undef BCAST8
+        #undef BCAST9
+        #undef BCAST10
+        #undef BCAST11
+        #endif
+    };
+
+    // Z-mobius version.
+    // Not implemented for EOFA: every call prints an error on stdout and
+    // terminates the process with exit(-1). None of the arguments are used.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
+        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+    {
+        const char* notImplemented = "Error: zMobius not implemented for EOFA";
+        std::cout << notImplemented << std::endl;
+        exit(-1);
+    }
+
+    // Applies the dense s-space operator to psi and writes the result to chi.
+    //   dag, inv : select which pair of matrices is applied
+    //              inv && dag   -> cached MatpInvDag / MatmInvDag
+    //              inv && !dag  -> cached MatpInv    / MatmInv
+    //              !inv         -> matrices built on the fly by MooeeInternalCompute
+    // The matrices are then applied block by block inside a parallel loop.
+    // Side effects: sets chi.checkerboard, increments this->MooeeInvCalls and
+    // accumulates elapsed time in this->MooeeInvTime.
+    template<class Impl>
+    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+    {
+        const int Ls      = this->Ls;
+        const int LLs     = psi._grid->_rdimensions[0];
+        const int nBlocks = psi._grid->oSites()/LLs;
+
+        chi.checkerboard = psi.checkerboard;
+
+        // Scratch storage, filled only when the matrices are not cached.
+        Vector<iSinglet<Simd> > scratchP;
+        Vector<iSinglet<Simd> > scratchM;
+        Vector<iSinglet<Simd> >* matP = nullptr;
+        Vector<iSinglet<Simd> >* matM = nullptr;
+
+        //  MooeeInternalCompute(dag,inv,Matp,Matm);
+        if(!inv){
+            MooeeInternalCompute(dag, inv, scratchP, scratchM);
+            matP = &scratchP;
+            matM = &scratchM;
+        } else if(dag){
+            matP = &this->MatpInvDag;
+            matM = &this->MatmInvDag;
+        } else {
+            matP = &this->MatpInv;
+            matM = &this->MatmInv;
+        }
+
+        assert(matP->size() == Ls*LLs);
+
+        this->MooeeInvCalls++;
+        this->MooeeInvTime -= usecond();
+
+        // Complex coefficients go to the Z variant, real ones to the plain kernel.
+        const bool complexCoeffs = switcheroo<Coeff_t>::iscomplex();
+        if(complexCoeffs){
+            parallel_for(int block=0; block<nBlocks; block++){
+                MooeeInternalZAsm(psi, chi, LLs, block, *matP, *matM);
+            }
+        } else {
+            parallel_for(int block=0; block<nBlocks; block++){
+                MooeeInternalAsm(psi, chi, LLs, block, *matP, *matM);
+            }
+        }
+
+        this->MooeeInvTime += usecond();
+    }
+
+    // Explicit instantiations for the vectorised-5d implementations. They are
+    // emitted only when the build defines DOMAIN_WALL_EOFA_DPERP_VEC.
+    // INSTANTIATE_DPERP_DWF_EOFA is defined elsewhere (presumably it covers the
+    // Dperp-related members -- TODO confirm); MooeeInternal is instantiated
+    // explicitly for each Impl below.
+    #ifdef DOMAIN_WALL_EOFA_DPERP_VEC
+
+        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
+        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
+
+        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
+        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
+        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
+
+        template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+        template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+        template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+    #endif
+
+}}
--- a/Grid/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@@ -0,0 +1,134 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/DomainWallFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
+#define  GRID_QCD_DOMAIN_WALL_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Domain wall fermion: a CayleyFermion5D whose coefficients are set in the
+    // constructor from Approx::higham data via SetCoefficientsTanh. Also provides
+    // free-field propagators computed in momentum space.
+    template<class Impl>
+    class DomainWallFermion : public CayleyFermion5D<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+
+      // Free propagator with twisted boundary conditions.
+      //   in    : source field
+      //   out   : result field (overwritten)
+      //   mass  : forwarded to the momentum-space propagator
+      //   twist : twist per space-time direction; must have exactly Nd entries
+      //   fiveD : true  -> lattice index 0 is the fifth dimension; it is excluded
+      //                    from the FFT and MomentumSpacePropagatorHt_5d is used
+      //           false -> all dimensions are transformed and
+      //                    MomentumSpacePropagatorHt is used
+      // The source is multiplied by exp(-2*pi*i*ph) before the forward FFT and
+      // the result by exp(+2*pi*i*ph) afterwards, where ph accumulates
+      // twist[nu]*x_nu/L_nu over the Nd directions.
+      void FreePropagator(const FermionField &in,FermionField &out,RealD mass, std::vector<double> twist, bool fiveD) {
+        FermionField in_k(in._grid);
+        FermionField prop_k(in._grid);
+
+        FFT theFFT((GridCartesian *) in._grid);
+
+        //phase for boundary condition
+        ComplexField coor(in._grid);
+        ComplexField ph(in._grid);  ph = zero;
+        FermionField in_buf(in._grid); in_buf = zero;
+        Complex ci(0.0,1.0);
+        assert(twist.size() == Nd);//check that twist is Nd
+        int shift = 0;
+        if(fiveD) shift = 1;
+        for(int nu = 0; nu < Nd; nu++)
+        {
+          // Shift coordinate lattice index by 1 to account for 5th dimension.
+          LatticeCoordinate(coor, nu + shift);
+          ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu+shift])));
+        }
+        in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in;
+
+        if(fiveD){//FFT only on temporal and spatial dimensions
+          std::vector<int> mask(Nd+1,1); mask[0] = 0;
+          theFFT.FFT_dim_mask(in_k,in_buf,mask,FFT::forward);
+          this->MomentumSpacePropagatorHt_5d(prop_k,in_k,mass,twist);
+          theFFT.FFT_dim_mask(out,prop_k,mask,FFT::backward);
+        }
+        else{
+          // Transform the phase-rotated source, exactly as the 5d branch does.
+          // Feeding the raw `in` here would drop the source phase while the
+          // sink phase below is still applied (no difference for zero twist).
+          theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
+          this->MomentumSpacePropagatorHt(prop_k,in_k,mass,twist);
+          theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+        }
+
+        //phase for boundary condition
+        out = out * exp((Real)(2.0*M_PI)*ci*ph);
+      }
+
+      // Twisted propagator on a 5d field.
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {
+        bool fiveD = true; //5d propagator by default
+        FreePropagator(in,out,mass,twist,fiveD);
+      }
+
+      // Untwisted propagator; the caller chooses 4d or 5d.
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass, bool fiveD) {
+        std::vector<double> twist(Nd,0.0); //default: periodic boundaries in all directions
+        FreePropagator(in,out,mass,twist,fiveD);
+      }
+
+      // Untwisted propagator on a 5d field.
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+        bool fiveD = true; //5d propagator by default
+        std::vector<double> twist(Nd,0.0); //default: periodic boundaries in all directions
+        FreePropagator(in,out,mass,twist,fiveD);
+      }
+
+      virtual void   Instantiatable(void) {};
+
+      // Constructor: forwards every argument to CayleyFermion5D, then sets the
+      // coefficients from Approx::higham data of size Ls.
+      DomainWallFermion(GaugeField &_Umu,
+                        GridCartesian         &FiveDimGrid,
+                        GridRedBlackCartesian &FiveDimRedBlackGrid,
+                        GridCartesian         &FourDimGrid,
+                        GridRedBlackCartesian &FourDimRedBlackGrid,
+                        RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) :
+
+
+      CayleyFermion5D<Impl>(_Umu,
+                            FiveDimGrid,
+                            FiveDimRedBlackGrid,
+                            FourDimGrid,
+                            FourDimRedBlackGrid,_mass,_M5,p)
+
+      {
+        RealD eps = 1.0;
+
+        Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
+        assert(zdata->n==this->Ls);
+
+        std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+        // Call base setter
+        this->SetCoefficientsTanh(zdata,1.0,0.0);
+
+        // The approximation data is no longer needed once the coefficients are set.
+        Approx::zolotarev_free(zdata);
+      }
+
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -0,0 +1,323 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_FERMION_H
+#define  GRID_QCD_FERMION_H
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Explicit template instantiation is still required in the .cc files
+//
+// - CayleyFermion5D.cc
+// - PartialFractionFermion5D.cc
+// - WilsonFermion5D.cc
+// - WilsonKernelsHand.cc
+// - ContinuedFractionFermion5D.cc
+// - WilsonFermion.cc
+// - WilsonKernels.cc
+// - DomainWallEOFAFermion.cc
+// - MobiusEOFAFermion.cc
+//
+// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
+// for EVERY .cc file. This define centralises the list and restores global push of impl cases
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////
+// Fermion operators / actions
+////////////////////////////////////////////
+
+#include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>     // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+
+#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
+#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+#include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
+#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
+#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
+#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+///////////////////////////////////////////////////////////////////////////////
+// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
+///////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/g5HermitianLinop.h>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// More maintainable to maintain the following typedef list centrally, as more "impl" targets
+// are added, (e.g. extension for gparity, half precision project in comms etc..)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Convenience typedefs: Wilson-like, domain wall / Mobius, overlap, G-parity and staggered operators
+namespace Grid {
+  namespace QCD {
+
+// Wilson operator, one alias per implementation policy
+using WilsonFermionR = WilsonFermion<WilsonImplR>;
+using WilsonFermionF = WilsonFermion<WilsonImplF>;
+using WilsonFermionD = WilsonFermion<WilsonImplD>;
+
+using WilsonFermionRL = WilsonFermion<WilsonImplRL>;
+using WilsonFermionFH = WilsonFermion<WilsonImplFH>;
+using WilsonFermionDF = WilsonFermion<WilsonImplDF>;
+
+// Adjoint implementation
+using WilsonAdjFermionR = WilsonFermion<WilsonAdjImplR>;
+using WilsonAdjFermionF = WilsonFermion<WilsonAdjImplF>;
+using WilsonAdjFermionD = WilsonFermion<WilsonAdjImplD>;
+
+// Two-index symmetric implementation
+using WilsonTwoIndexSymmetricFermionR = WilsonFermion<WilsonTwoIndexSymmetricImplR>;
+using WilsonTwoIndexSymmetricFermionF = WilsonFermion<WilsonTwoIndexSymmetricImplF>;
+using WilsonTwoIndexSymmetricFermionD = WilsonFermion<WilsonTwoIndexSymmetricImplD>;
+
+// Two-index antisymmetric implementation
+using WilsonTwoIndexAntiSymmetricFermionR = WilsonFermion<WilsonTwoIndexAntiSymmetricImplR>;
+using WilsonTwoIndexAntiSymmetricFermionF = WilsonFermion<WilsonTwoIndexAntiSymmetricImplF>;
+using WilsonTwoIndexAntiSymmetricFermionD = WilsonFermion<WilsonTwoIndexAntiSymmetricImplD>;
+
+// Twisted mass Wilson operator
+using WilsonTMFermionR = WilsonTMFermion<WilsonImplR>;
+using WilsonTMFermionF = WilsonTMFermion<WilsonImplF>;
+using WilsonTMFermionD = WilsonTMFermion<WilsonImplD>;
+
+// Wilson clover operator
+using WilsonCloverFermionR = WilsonCloverFermion<WilsonImplR>;
+using WilsonCloverFermionF = WilsonCloverFermion<WilsonImplF>;
+using WilsonCloverFermionD = WilsonCloverFermion<WilsonImplD>;
+
+// Clover, adjoint implementation
+using WilsonCloverAdjFermionR = WilsonCloverFermion<WilsonAdjImplR>;
+using WilsonCloverAdjFermionF = WilsonCloverFermion<WilsonAdjImplF>;
+using WilsonCloverAdjFermionD = WilsonCloverFermion<WilsonAdjImplD>;
+
+// Clover, two-index symmetric implementation
+using WilsonCloverTwoIndexSymmetricFermionR = WilsonCloverFermion<WilsonTwoIndexSymmetricImplR>;
+using WilsonCloverTwoIndexSymmetricFermionF = WilsonCloverFermion<WilsonTwoIndexSymmetricImplF>;
+using WilsonCloverTwoIndexSymmetricFermionD = WilsonCloverFermion<WilsonTwoIndexSymmetricImplD>;
+
+// Clover, two-index antisymmetric implementation
+using WilsonCloverTwoIndexAntiSymmetricFermionR = WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR>;
+using WilsonCloverTwoIndexAntiSymmetricFermionF = WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF>;
+using WilsonCloverTwoIndexAntiSymmetricFermionD = WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD>;
+
+// Domain wall operator
+using DomainWallFermionR = DomainWallFermion<WilsonImplR>;
+using DomainWallFermionF = DomainWallFermion<WilsonImplF>;
+using DomainWallFermionD = DomainWallFermion<WilsonImplD>;
+
+using DomainWallFermionRL = DomainWallFermion<WilsonImplRL>;
+using DomainWallFermionFH = DomainWallFermion<WilsonImplFH>;
+using DomainWallFermionDF = DomainWallFermion<WilsonImplDF>;
+
+// Domain wall EOFA operator
+using DomainWallEOFAFermionR = DomainWallEOFAFermion<WilsonImplR>;
+using DomainWallEOFAFermionF = DomainWallEOFAFermion<WilsonImplF>;
+using DomainWallEOFAFermionD = DomainWallEOFAFermion<WilsonImplD>;
+
+using DomainWallEOFAFermionRL = DomainWallEOFAFermion<WilsonImplRL>;
+using DomainWallEOFAFermionFH = DomainWallEOFAFermion<WilsonImplFH>;
+using DomainWallEOFAFermionDF = DomainWallEOFAFermion<WilsonImplDF>;
+
+// Mobius operator
+using MobiusFermionR = MobiusFermion<WilsonImplR>;
+using MobiusFermionF = MobiusFermion<WilsonImplF>;
+using MobiusFermionD = MobiusFermion<WilsonImplD>;
+
+using MobiusFermionRL = MobiusFermion<WilsonImplRL>;
+using MobiusFermionFH = MobiusFermion<WilsonImplFH>;
+using MobiusFermionDF = MobiusFermion<WilsonImplDF>;
+
+// Mobius EOFA operator
+using MobiusEOFAFermionR = MobiusEOFAFermion<WilsonImplR>;
+using MobiusEOFAFermionF = MobiusEOFAFermion<WilsonImplF>;
+using MobiusEOFAFermionD = MobiusEOFAFermion<WilsonImplD>;
+
+using MobiusEOFAFermionRL = MobiusEOFAFermion<WilsonImplRL>;
+using MobiusEOFAFermionFH = MobiusEOFAFermion<WilsonImplFH>;
+using MobiusEOFAFermionDF = MobiusEOFAFermion<WilsonImplDF>;
+
+// ZMobius operator (uses the ZWilson implementations)
+using ZMobiusFermionR = ZMobiusFermion<ZWilsonImplR>;
+using ZMobiusFermionF = ZMobiusFermion<ZWilsonImplF>;
+using ZMobiusFermionD = ZMobiusFermion<ZWilsonImplD>;
+
+using ZMobiusFermionRL = ZMobiusFermion<ZWilsonImplRL>;
+using ZMobiusFermionFH = ZMobiusFermion<ZWilsonImplFH>;
+using ZMobiusFermionDF = ZMobiusFermion<ZWilsonImplDF>;
+
+// Ls-vectorised variants (Vec5d implementations)
+using DomainWallFermionVec5dR = DomainWallFermion<DomainWallVec5dImplR>;
+using DomainWallFermionVec5dF = DomainWallFermion<DomainWallVec5dImplF>;
+using DomainWallFermionVec5dD = DomainWallFermion<DomainWallVec5dImplD>;
+
+using DomainWallFermionVec5dRL = DomainWallFermion<DomainWallVec5dImplRL>;
+using DomainWallFermionVec5dFH = DomainWallFermion<DomainWallVec5dImplFH>;
+using DomainWallFermionVec5dDF = DomainWallFermion<DomainWallVec5dImplDF>;
+
+// Domain wall EOFA, Ls vectorised
+using DomainWallEOFAFermionVec5dR = DomainWallEOFAFermion<DomainWallVec5dImplR>;
+using DomainWallEOFAFermionVec5dF = DomainWallEOFAFermion<DomainWallVec5dImplF>;
+using DomainWallEOFAFermionVec5dD = DomainWallEOFAFermion<DomainWallVec5dImplD>;
+
+using DomainWallEOFAFermionVec5dRL = DomainWallEOFAFermion<DomainWallVec5dImplRL>;
+using DomainWallEOFAFermionVec5dFH = DomainWallEOFAFermion<DomainWallVec5dImplFH>;
+using DomainWallEOFAFermionVec5dDF = DomainWallEOFAFermion<DomainWallVec5dImplDF>;
+
+// Mobius, Ls vectorised
+using MobiusFermionVec5dR = MobiusFermion<DomainWallVec5dImplR>;
+using MobiusFermionVec5dF = MobiusFermion<DomainWallVec5dImplF>;
+using MobiusFermionVec5dD = MobiusFermion<DomainWallVec5dImplD>;
+
+using MobiusFermionVec5dRL = MobiusFermion<DomainWallVec5dImplRL>;
+using MobiusFermionVec5dFH = MobiusFermion<DomainWallVec5dImplFH>;
+using MobiusFermionVec5dDF = MobiusFermion<DomainWallVec5dImplDF>;
+
+// Mobius EOFA, Ls vectorised
+using MobiusEOFAFermionVec5dR = MobiusEOFAFermion<DomainWallVec5dImplR>;
+using MobiusEOFAFermionVec5dF = MobiusEOFAFermion<DomainWallVec5dImplF>;
+using MobiusEOFAFermionVec5dD = MobiusEOFAFermion<DomainWallVec5dImplD>;
+
+using MobiusEOFAFermionVec5dRL = MobiusEOFAFermion<DomainWallVec5dImplRL>;
+using MobiusEOFAFermionVec5dFH = MobiusEOFAFermion<DomainWallVec5dImplFH>;
+using MobiusEOFAFermionVec5dDF = MobiusEOFAFermion<DomainWallVec5dImplDF>;
+
+// ZMobius, Ls vectorised (uses the ZDomainWallVec5d implementations)
+using ZMobiusFermionVec5dR = ZMobiusFermion<ZDomainWallVec5dImplR>;
+using ZMobiusFermionVec5dF = ZMobiusFermion<ZDomainWallVec5dImplF>;
+using ZMobiusFermionVec5dD = ZMobiusFermion<ZDomainWallVec5dImplD>;
+
+using ZMobiusFermionVec5dRL = ZMobiusFermion<ZDomainWallVec5dImplRL>;
+using ZMobiusFermionVec5dFH = ZMobiusFermion<ZDomainWallVec5dImplFH>;
+using ZMobiusFermionVec5dDF = ZMobiusFermion<ZDomainWallVec5dImplDF>;
+
+// Scaled Shamir operator
+using ScaledShamirFermionR = ScaledShamirFermion<WilsonImplR>;
+using ScaledShamirFermionF = ScaledShamirFermion<WilsonImplF>;
+using ScaledShamirFermionD = ScaledShamirFermion<WilsonImplD>;
+
+// Zolotarev variants of the Mobius and Shamir operators
+using MobiusZolotarevFermionR = MobiusZolotarevFermion<WilsonImplR>;
+using MobiusZolotarevFermionF = MobiusZolotarevFermion<WilsonImplF>;
+using MobiusZolotarevFermionD = MobiusZolotarevFermion<WilsonImplD>;
+using ShamirZolotarevFermionR = ShamirZolotarevFermion<WilsonImplR>;
+using ShamirZolotarevFermionF = ShamirZolotarevFermion<WilsonImplF>;
+using ShamirZolotarevFermionD = ShamirZolotarevFermion<WilsonImplD>;
+
+// Overlap Wilson, Cayley form (tanh and Zolotarev)
+using OverlapWilsonCayleyTanhFermionR = OverlapWilsonCayleyTanhFermion<WilsonImplR>;
+using OverlapWilsonCayleyTanhFermionF = OverlapWilsonCayleyTanhFermion<WilsonImplF>;
+using OverlapWilsonCayleyTanhFermionD = OverlapWilsonCayleyTanhFermion<WilsonImplD>;
+using OverlapWilsonCayleyZolotarevFermionR = OverlapWilsonCayleyZolotarevFermion<WilsonImplR>;
+using OverlapWilsonCayleyZolotarevFermionF = OverlapWilsonCayleyZolotarevFermion<WilsonImplF>;
+using OverlapWilsonCayleyZolotarevFermionD = OverlapWilsonCayleyZolotarevFermion<WilsonImplD>;
+
+// Overlap Wilson, continued-fraction form (tanh and Zolotarev)
+using OverlapWilsonContFracTanhFermionR = OverlapWilsonContFracTanhFermion<WilsonImplR>;
+using OverlapWilsonContFracTanhFermionF = OverlapWilsonContFracTanhFermion<WilsonImplF>;
+using OverlapWilsonContFracTanhFermionD = OverlapWilsonContFracTanhFermion<WilsonImplD>;
+using OverlapWilsonContFracZolotarevFermionR = OverlapWilsonContFracZolotarevFermion<WilsonImplR>;
+using OverlapWilsonContFracZolotarevFermionF = OverlapWilsonContFracZolotarevFermion<WilsonImplF>;
+using OverlapWilsonContFracZolotarevFermionD = OverlapWilsonContFracZolotarevFermion<WilsonImplD>;
+
+// Overlap Wilson, partial-fraction form (tanh and Zolotarev)
+using OverlapWilsonPartialFractionTanhFermionR = OverlapWilsonPartialFractionTanhFermion<WilsonImplR>;
+using OverlapWilsonPartialFractionTanhFermionF = OverlapWilsonPartialFractionTanhFermion<WilsonImplF>;
+using OverlapWilsonPartialFractionTanhFermionD = OverlapWilsonPartialFractionTanhFermion<WilsonImplD>;
+
+using OverlapWilsonPartialFractionZolotarevFermionR = OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR>;
+using OverlapWilsonPartialFractionZolotarevFermionF = OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF>;
+using OverlapWilsonPartialFractionZolotarevFermionD = OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD>;
+
+// G-parity implementations; the list is partial until more cases are tested
+using GparityWilsonFermionR = WilsonFermion<GparityWilsonImplR>;
+using GparityWilsonFermionF = WilsonFermion<GparityWilsonImplF>;
+using GparityWilsonFermionD = WilsonFermion<GparityWilsonImplD>;
+
+using GparityWilsonFermionRL = WilsonFermion<GparityWilsonImplRL>;
+using GparityWilsonFermionFH = WilsonFermion<GparityWilsonImplFH>;
+using GparityWilsonFermionDF = WilsonFermion<GparityWilsonImplDF>;
+
+// G-parity domain wall
+using GparityDomainWallFermionR = DomainWallFermion<GparityWilsonImplR>;
+using GparityDomainWallFermionF = DomainWallFermion<GparityWilsonImplF>;
+using GparityDomainWallFermionD = DomainWallFermion<GparityWilsonImplD>;
+
+using GparityDomainWallFermionRL = DomainWallFermion<GparityWilsonImplRL>;
+using GparityDomainWallFermionFH = DomainWallFermion<GparityWilsonImplFH>;
+using GparityDomainWallFermionDF = DomainWallFermion<GparityWilsonImplDF>;
+
+// G-parity domain wall EOFA
+using GparityDomainWallEOFAFermionR = DomainWallEOFAFermion<GparityWilsonImplR>;
+using GparityDomainWallEOFAFermionF = DomainWallEOFAFermion<GparityWilsonImplF>;
+using GparityDomainWallEOFAFermionD = DomainWallEOFAFermion<GparityWilsonImplD>;
+
+using GparityDomainWallEOFAFermionRL = DomainWallEOFAFermion<GparityWilsonImplRL>;
+using GparityDomainWallEOFAFermionFH = DomainWallEOFAFermion<GparityWilsonImplFH>;
+using GparityDomainWallEOFAFermionDF = DomainWallEOFAFermion<GparityWilsonImplDF>;
+
+// G-parity twisted mass
+using GparityWilsonTMFermionR = WilsonTMFermion<GparityWilsonImplR>;
+using GparityWilsonTMFermionF = WilsonTMFermion<GparityWilsonImplF>;
+using GparityWilsonTMFermionD = WilsonTMFermion<GparityWilsonImplD>;
+
+using GparityWilsonTMFermionRL = WilsonTMFermion<GparityWilsonImplRL>;
+using GparityWilsonTMFermionFH = WilsonTMFermion<GparityWilsonImplFH>;
+using GparityWilsonTMFermionDF = WilsonTMFermion<GparityWilsonImplDF>;
+
+// G-parity Mobius
+using GparityMobiusFermionR = MobiusFermion<GparityWilsonImplR>;
+using GparityMobiusFermionF = MobiusFermion<GparityWilsonImplF>;
+using GparityMobiusFermionD = MobiusFermion<GparityWilsonImplD>;
+
+using GparityMobiusFermionRL = MobiusFermion<GparityWilsonImplRL>;
+using GparityMobiusFermionFH = MobiusFermion<GparityWilsonImplFH>;
+using GparityMobiusFermionDF = MobiusFermion<GparityWilsonImplDF>;
+
+// G-parity Mobius EOFA
+using GparityMobiusEOFAFermionR = MobiusEOFAFermion<GparityWilsonImplR>;
+using GparityMobiusEOFAFermionF = MobiusEOFAFermion<GparityWilsonImplF>;
+using GparityMobiusEOFAFermionD = MobiusEOFAFermion<GparityWilsonImplD>;
+
+using GparityMobiusEOFAFermionRL = MobiusEOFAFermion<GparityWilsonImplRL>;
+using GparityMobiusEOFAFermionFH = MobiusEOFAFermion<GparityWilsonImplFH>;
+using GparityMobiusEOFAFermionDF = MobiusEOFAFermion<GparityWilsonImplDF>;
+
+// Improved staggered operator
+using ImprovedStaggeredFermionR = ImprovedStaggeredFermion<StaggeredImplR>;
+using ImprovedStaggeredFermionF = ImprovedStaggeredFermion<StaggeredImplF>;
+using ImprovedStaggeredFermionD = ImprovedStaggeredFermion<StaggeredImplD>;
+
+// Improved staggered 5D operator
+using ImprovedStaggeredFermion5DR = ImprovedStaggeredFermion5D<StaggeredImplR>;
+using ImprovedStaggeredFermion5DF = ImprovedStaggeredFermion5D<StaggeredImplF>;
+using ImprovedStaggeredFermion5DD = ImprovedStaggeredFermion5D<StaggeredImplD>;
+
+// Improved staggered 5D operator with the Vec5d implementations
+using ImprovedStaggeredFermionVec5dR = ImprovedStaggeredFermion5D<StaggeredVec5dImplR>;
+using ImprovedStaggeredFermionVec5dF = ImprovedStaggeredFermion5D<StaggeredVec5dImplF>;
+using ImprovedStaggeredFermionVec5dD = ImprovedStaggeredFermion5D<StaggeredVec5dImplD>;
+
+
+  }}
+
+////////////////////
+// Scalar QED actions
+// TODO: this needs to move to another header after rename to Fermion.h
+////////////////////
+#include <Grid/qcd/action/scalar/Scalar.h>
+#include <Grid/qcd/action/gauge/Photon.h>
+
+#endif
--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_FERMION_CORE_H
+#define  GRID_QCD_FERMION_CORE_H
+
+#include <Grid/GridCore.h>
+#include <Grid/GridQCDcore.h>
+#include <Grid/qcd/action/ActionCore.h>
+
+////////////////////////////////////////////
+// Fermion prereqs
+////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
+#include <Grid/qcd/action/fermion/FermionOperator.h>
+#include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all staggered type fermions
+
+// Explicitly instantiates class template A for StaggeredImplF and StaggeredImplD.
+#define FermOpStaggeredTemplateInstantiate(A)   \
+  template class A<StaggeredImplF>;             \
+  template class A<StaggeredImplD>;
+
+// Explicitly instantiates class template A for StaggeredVec5dImplF and StaggeredVec5dImplD.
+#define FermOpStaggeredVec5dTemplateInstantiate(A)   \
+  template class A<StaggeredVec5dImplF>;             \
+  template class A<StaggeredVec5dImplD>;
+
+// Explicitly instantiates class template A for the Wilson, ZWilson and
+// GparityWilson implementations, in their F, D, FH and DF flavours.
+#define FermOp4dVecTemplateInstantiate(A)   \
+  template class A<WilsonImplF>;            \
+  template class A<WilsonImplD>;            \
+  template class A<ZWilsonImplF>;           \
+  template class A<ZWilsonImplD>;           \
+  template class A<GparityWilsonImplF>;     \
+  template class A<GparityWilsonImplD>;     \
+  template class A<WilsonImplFH>;           \
+  template class A<WilsonImplDF>;           \
+  template class A<ZWilsonImplFH>;          \
+  template class A<ZWilsonImplDF>;          \
+  template class A<GparityWilsonImplFH>;    \
+  template class A<GparityWilsonImplDF>;
+
+
+// Explicitly instantiates class template A for WilsonAdjImplF and WilsonAdjImplD.
+#define AdjointFermOpTemplateInstantiate(A)   \
+  template class A<WilsonAdjImplF>;           \
+  template class A<WilsonAdjImplD>;
+
+// Explicitly instantiates class template A for the two-index symmetric and
+// antisymmetric Wilson implementations (F and D).
+#define TwoIndexFermOpTemplateInstantiate(A)            \
+  template class A<WilsonTwoIndexSymmetricImplF>;       \
+  template class A<WilsonTwoIndexSymmetricImplD>;       \
+  template class A<WilsonTwoIndexAntiSymmetricImplF>;   \
+  template class A<WilsonTwoIndexAntiSymmetricImplD>;
+
+// Explicitly instantiates class template A for the DomainWallVec5d and
+// ZDomainWallVec5d implementations, in their F, D, FH and DF flavours.
+#define FermOp5dVecTemplateInstantiate(A)    \
+  template class A<DomainWallVec5dImplF>;    \
+  template class A<DomainWallVec5dImplD>;    \
+  template class A<ZDomainWallVec5dImplF>;   \
+  template class A<ZDomainWallVec5dImplD>;   \
+  template class A<DomainWallVec5dImplFH>;   \
+  template class A<DomainWallVec5dImplDF>;   \
+  template class A<ZDomainWallVec5dImplFH>;  \
+  template class A<ZDomainWallVec5dImplDF>;
+
+// Aggregate: the 4d-vectorised list followed by the 5d-vectorised list.
+#define FermOpTemplateInstantiate(A)   \
+  FermOp4dVecTemplateInstantiate(A)    \
+  FermOp5dVecTemplateInstantiate(A)
+
+// Expands to nothing: no instantiations are generated through this macro.
+#define GparityFermOpTemplateInstantiate(A)
+
+#endif
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -0,0 +1,182 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/FermionOperator.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_FERMION_OPERATOR_H
+#define  GRID_QCD_FERMION_OPERATOR_H
+
+namespace Grid {
+
+  namespace QCD {
+
+    ////////////////////////////////////////////////////////////////
+    // Allow to select  between gauge representation rank bc's, flavours etc.
+    // and single/double precision.
+    ////////////////////////////////////////////////////////////////
+    
+    // Abstract interface for checkerboarded fermion operators. The class derives
+    // from the implementation policy Impl as well as from the sparse-matrix base,
+    // and pulls Impl's types into scope via INHERIT_IMPL_TYPES.
+    template<class Impl>
+    class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
+    {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {}
+      virtual ~FermionOperator(void) = default;
+
+      virtual FermionField &tmp(void) = 0;
+
+      // The linear-algebra routines only need these two accessors; both forward
+      // to the fermion grids supplied by the concrete operator.
+      GridBase * Grid(void)         { return FermionGrid(); }
+      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }
+
+      virtual GridBase *FermionGrid(void)         =0;
+      virtual GridBase *FermionRedBlackGrid(void) =0;
+      virtual GridBase *GaugeGrid(void)           =0;
+      virtual GridBase *GaugeRedBlackGrid(void)   =0;
+
+      // Full-lattice multiply and its adjoint
+      virtual RealD  M    (const FermionField &in, FermionField &out)=0;
+      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
+
+      // Properties of the even-even block, queried to make algorithmic decisions.
+      // Clover returns zero from ConstEE because its EE block depends on the gauge field.
+      virtual int    ConstEE(void)     { return 1; }
+      virtual int    isTrivialEE(void) { return 0; }
+      virtual RealD  Mass(void)        { return 0.0; }
+
+      // Half-checkerboard operations
+      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
+      virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out)=0;
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out)=0;
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out)=0;
+
+      // Non-hermitian hopping term, on half a checkerboard or on both.
+      // DhopDir is implemented by WilsonFermion and WilsonFermion5D.
+      virtual void Dhop   (const FermionField &in, FermionField &out,int dag)=0;
+      virtual void DhopOE (const FermionField &in, FermionField &out,int dag)=0;
+      virtual void DhopEO (const FermionField &in, FermionField &out,int dag)=0;
+      virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0;
+
+      // Force terms (five routines). The M/Moe/Meo derivatives default to the
+      // matching Dhop derivative; the diagonal Moo/Mee derivatives default to
+      // zero and can be overridden (clover does so).
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { DhopDeriv(mat,U,V,dag); }
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { DhopDerivOE(mat,U,V,dag); }
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { DhopDerivEO(mat,U,V,dag); }
+      virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { mat=zero; }
+      virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { mat=zero; }
+
+      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+
+      // Mdiag: same as Mooee, applied to both checkerboards.
+      // Mdir : defined case by case (Wilson, Clover, Cayley, ContFrac, PartFrac).
+      virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out); }
+      virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;
+
+      // The base class provides no momentum-space propagator: the default asserts.
+      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0); }
+
+      // Free propagator with twisted boundary conditions. The source is
+      // multiplied by exp(-2*pi*i*ph), Fourier transformed, passed through
+      // MomentumSpacePropagator, transformed back and multiplied by
+      // exp(+2*pi*i*ph), where ph = sum_nu twist[nu]*x_nu/L_nu is built from the
+      // lattice coordinate and the full lattice extent in each direction.
+      // twist must hold exactly Nd entries.
+      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist)
+      {
+        FFT fft((GridCartesian *) in._grid);
+
+        FermionField src_k(in._grid);
+        FermionField sol_k(in._grid);
+
+        // Accumulate the boundary-condition phase
+        ComplexField x_nu(in._grid);
+        ComplexField phase(in._grid);
+        phase = zero;
+        FermionField src_twisted(in._grid);
+        src_twisted = zero;
+        Complex imag_unit(0.0,1.0);
+
+        assert(twist.size() == Nd); // one twist angle per direction
+        for(unsigned int nu = 0; nu < Nd; nu++)
+        {
+          LatticeCoordinate(x_nu, nu);
+          phase = phase + twist[nu]*x_nu*((1./(in._grid->_fdimensions[nu])));
+        }
+        src_twisted = exp((Real)(2.0*M_PI)*imag_unit*phase*(-1.0))*in;
+
+        fft.FFT_all_dim(src_k,src_twisted,FFT::forward);
+        this->MomentumSpacePropagator(sol_k,src_k,mass,twist);
+        fft.FFT_all_dim(out,sol_k,FFT::backward);
+
+        // Restore the boundary-condition phase on the result
+        out = out * exp((Real)(2.0*M_PI)*imag_unit*phase);
+      }
+
+      // Free propagator with zero twist, i.e. periodic boundaries in all directions.
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass)
+      {
+        std::vector<double> periodicTwist(Nd,0.0);
+        FreePropagator(in,out,mass,periodicTwist);
+      }
+
+      ///////////////////////////////////////////////
+      // Gauge field update during HMC
+      ///////////////////////////////////////////////
+      virtual void ImportGauge(const GaugeField & _U)=0;
+
+      //////////////////////////////////////////////////////////////////////
+      // Conserved currents: contract at the sink, or insert sequentially.
+      //////////////////////////////////////////////////////////////////////
+      virtual void ContractConservedCurrent(PropagatorField &q_in_1,
+                                            PropagatorField &q_in_2,
+                                            PropagatorField &q_out,
+                                            Current curr_type,
+                                            unsigned int mu)=0;
+      virtual void SeqConservedCurrent(PropagatorField &q_in,
+                                       PropagatorField &q_out,
+                                       Current curr_type,
+                                       unsigned int mu,
+                                       unsigned int tmin,
+                                       unsigned int tmax,
+                                       ComplexField &lattice_cmplx)=0;
+
+      ///////////////////////////////////////////////
+      // Physical field import/export; every default is a plain copy.
+      ///////////////////////////////////////////////
+      virtual void Dminus   (const FermionField &psi, FermionField &chi) { chi=psi; }
+      virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
+
+      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)      { imported = input; }
+      virtual void ImportUnphysicalFermion(const FermionField &input,FermionField &imported)          { imported = input; }
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported) { exported = solution; }
+      virtual void ExportPhysicalFermionSource(const FermionField &solution,FermionField &exported)   { exported = solution; }
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
@@ -0,0 +1,604 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid.h>
+
+namespace Grid {
+namespace QCD {
+
+// Stencil geometry tables (16 entries each, matching npoint). Both lists are
+// handed together to the stencil constructors: displacements of +1, -1, +3
+// and -3, each listed for directions 0..3.
+const std::vector<int> ImprovedStaggeredFermionStatic::directions = {
+    0, 1, 2, 3,   0, 1, 2, 3,   0, 1, 2, 3,   0, 1, 2, 3};
+const std::vector<int> ImprovedStaggeredFermionStatic::displacements = {
+    1, 1, 1, 1,  -1, -1, -1, -1,   3, 3, 3, 3,  -3, -3, -3, -3};
+
+/////////////////////////////////
+// Constructor and gauge import
+/////////////////////////////////
+
+
+// Gauge-field-free constructor: wires up grids, stencils and storage, records
+// the action parameters, and builds the stencil surface lists. A gauge field
+// must be imported separately before the operator is applied.
+template <class Impl>
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid,
+                                                         GridRedBlackCartesian &Hgrid,
+                                                         RealD _mass,
+                                                         RealD _c1, RealD _c2, RealD _u0,
+                                                         const ImplParams &p)
+    : Kernels(p),
+      _grid(&Fgrid),
+      _cbgrid(&Hgrid),
+      Stencil(&Fgrid, npoint, Even, directions, displacements),
+      StencilEven(&Hgrid, npoint, Even, directions, displacements),  // source is Even
+      StencilOdd(&Hgrid, npoint, Odd, directions, displacements),    // source is Odd
+      mass(_mass),
+      Lebesgue(_grid),
+      LebesgueEvenOdd(_cbgrid),
+      Umu(&Fgrid),
+      UmuEven(&Hgrid),
+      UmuOdd(&Hgrid),
+      UUUmu(&Fgrid),
+      UUUmuEven(&Hgrid),
+      UUUmuOdd(&Hgrid),
+      _tmp(&Hgrid)
+{
+  // Action coefficients.
+  c1 = _c1;
+  c2 = _c2;
+  u0 = _u0;
+
+  // Surface lists: full-grid stencil first, then both checkerboard stencils.
+  int LLs = 1;
+  int fullSites = _grid->oSites();
+  Stencil.BuildSurfaceList(LLs, fullSites);
+
+  int halfSites = _cbgrid->oSites();
+  StencilEven.BuildSurfaceList(LLs, halfSites);
+  StencilOdd.BuildSurfaceList(LLs, halfSites);
+}
+
+// Convenience constructor: delegates all set-up to the gauge-field-free
+// constructor and then imports the thin and fat link fields.
+template <class Impl>
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,
+                                                         GaugeField &_Ufat,
+                                                         GridCartesian &Fgrid,
+                                                         GridRedBlackCartesian &Hgrid,
+                                                         RealD _mass,
+                                                         RealD _c1, RealD _c2, RealD _u0,
+                                                         const ImplParams &p)
+    : ImprovedStaggeredFermion(Fgrid, Hgrid, _mass, _c1, _c2, _u0, p)
+{
+  ImportGauge(_Uthin, _Ufat);
+}
+
+  ////////////////////////////////////////////////////////////
+  // Momentum space propagator should be 
+  // https://arxiv.org/pdf/hep-lat/9712010.pdf
+  //
+  // mom space action.
+  //   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
+  //
+  // must track through staggered flavour/spin reduction in literature to 
+  // turn to free propagator for the one component chi field, a la page 4/5
+  // of the above link, in order to implement a Fourier-based solver.
+  ////////////////////////////////////////////////////////////
+// Import gauge links that already carry phases, fattening and coefficients.
+// For each direction mu the forward link is stored at index mu and minus the
+// adjoint of the shifted link at index mu+4; the three-link field is shifted
+// by 3 sites and the fat field by 1 site.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,
+                                                       const GaugeField &_Ufat)
+{
+  GaugeLinkField link(GaugeGrid());
+
+  for (int mu = 0; mu < Nd; mu++) {
+    // Three-link term: forward, then backward.
+    link = PeekIndex<LorentzIndex>(_Utriple, mu);
+    PokeIndex<LorentzIndex>(UUUmu, link, mu);
+    link = adj(Cshift(link, mu, -3));
+    PokeIndex<LorentzIndex>(UUUmu, -link, mu + 4);
+
+    // One-link (fat) term: forward, then backward.
+    link = PeekIndex<LorentzIndex>(_Ufat, mu);
+    PokeIndex<LorentzIndex>(Umu, link, mu);
+    link = adj(Cshift(link, mu, -1));
+    PokeIndex<LorentzIndex>(Umu, -link, mu + 4);
+  }
+
+  // Refresh the even/odd copies used by the checkerboarded operators.
+  CopyGaugeCheckerboards();
+}
+// Import already double-stored link fields verbatim, then refresh the
+// even/odd copies.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,
+                                                       const DoubledGaugeField &_U)
+{
+  UUUmu = _UUU;
+  Umu   = _U;
+  CopyGaugeCheckerboards();
+}
+
+// Split the full-grid one-link and three-link fields into their even and odd
+// checkerboard copies.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
+{
+  // One-link field.
+  pickCheckerboard(Even, UmuEven, Umu);
+  pickCheckerboard(Odd, UmuOdd, Umu);
+
+  // Three-link field.
+  pickCheckerboard(Even, UUUmuEven, UUUmu);
+  pickCheckerboard(Odd, UUUmuOdd, UUUmu);
+}
+// Import thin and fat links: double-store them through the Impl, rescale each
+// stored link by the kinetic-term coefficient, then refresh the even/odd
+// copies.
+//   one-link   entries: +0.5*c1/u0   (index mu), -0.5*c1/u0   (index mu+4)
+//   three-link entries: +0.5*c2/u0^3 (index mu), -0.5*c2/u0^3 (index mu+4)
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,
+                                                 const GaugeField &_Ufat)
+{
+  GaugeLinkField link(GaugeGrid());
+
+  // Double store takes separate fields for the Naik and one-hop terms.
+  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat);
+
+  // Scale factors for  0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ).
+  // These could be passed into the double store to save work.
+  const RealD oneLinkFwd   =  0.5*c1/u0;
+  const RealD oneLinkBwd   = -0.5*c1/u0;
+  const RealD threeLinkFwd =  0.5*c2/u0/u0/u0;
+  const RealD threeLinkBwd = -0.5*c2/u0/u0/u0;
+
+  for (int mu = 0; mu < Nd; mu++) {
+    link = PeekIndex<LorentzIndex>(Umu, mu);
+    PokeIndex<LorentzIndex>(Umu, link*oneLinkFwd, mu);
+
+    link = PeekIndex<LorentzIndex>(Umu, mu + 4);
+    PokeIndex<LorentzIndex>(Umu, link*oneLinkBwd, mu + 4);
+
+    link = PeekIndex<LorentzIndex>(UUUmu, mu);
+    PokeIndex<LorentzIndex>(UUUmu, link*threeLinkFwd, mu);
+
+    link = PeekIndex<LorentzIndex>(UUUmu, mu + 4);
+    PokeIndex<LorentzIndex>(UUUmu, link*threeLinkBwd, mu + 4);
+  }
+
+  CopyGaugeCheckerboards();
+}
+
+/////////////////////////////
+// Implement the interface
+/////////////////////////////
+
+// Full operator: out = mass*in + Dhop(in). Returns the value produced by
+// axpy_norm for the result.
+template <class Impl>
+RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+
+  // Hopping term first, then add the mass term in place.
+  Dhop(in, out, DaggerNo);
+  const RealD result = axpy_norm(out, mass, in, out);
+  return result;
+}
+
+// Daggered operator: out = mass*in + Dhop^dag(in). Returns the value produced
+// by axpy_norm for the result.
+template <class Impl>
+RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+
+  // Daggered hopping term first, then add the mass term in place.
+  Dhop(in, out, DaggerYes);
+  const RealD result = axpy_norm(out, mass, in, out);
+  return result;
+}
+
+// Off-diagonal (checkerboard-changing) term: an Odd input goes through
+// DhopEO, any other input through DhopOE.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
+{
+  if (in.checkerboard != Odd) {
+    DhopOE(in, out, DaggerNo);
+    return;
+  }
+  DhopEO(in, out, DaggerNo);
+}
+// Daggered off-diagonal term: an Odd input goes through DhopEO, any other
+// input through DhopOE, both with DaggerYes.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
+{
+  if (in.checkerboard != Odd) {
+    DhopOE(in, out, DaggerYes);
+    return;
+  }
+  DhopEO(in, out, DaggerYes);
+}
+
+// Diagonal term: out = mass * in on the same checkerboard as the input.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  typedef typename FermionField::scalar_type Scalar;
+
+  out.checkerboard = in.checkerboard;
+  Scalar massScalar(mass);
+  out = massScalar * in;
+}
+
+// The diagonal term is a plain multiply by mass, so the daggered version
+// simply forwards to Mooee.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+  Mooee(in, out);
+}
+
+// Inverse of the diagonal term: out = (1/mass) * in. No guard is applied for
+// mass == 0.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+  const RealD inverseMass = 1.0 / (mass);
+  out = inverseMass * in;
+}
+
+// The inverse diagonal term is a plain multiply by 1/mass, so the daggered
+// version simply forwards to MooeeInv.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+  MooeeInv(in, out);
+}
+
+///////////////////////////////////
+// Internal
+///////////////////////////////////
+
+// Shared worker for the DhopDeriv* entry points (fermion force term).
+//
+// NOT IMPLEMENTED: after the halo exchange and the first single-hop pass the
+// routine hits assert(0). In builds where assert is compiled out the loop
+// runs to completion without ever writing to `mat`.
+//
+//   st     : stencil used for the halo exchange and the hop kernel
+//   U, UUU : double-stored one-link and three-link fields for the kernel
+//   mat    : force accumulator (currently never written)
+//   A, B   : fermion fields; B is exchanged and hopped into Btilde, A is only
+//            copied into Atilde
+//   dag    : must be DaggerNo or DaggerYes (asserted, otherwise unused here)
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, 
+						   GaugeField & mat,
+						   const FermionField &A, const FermionField &B, int dag) {
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  Compressor compressor;
+
+  FermionField Btilde(B._grid);
+  FermionField Atilde(B._grid);
+  Atilde = A;
+
+  st.HaloExchange(B, compressor);
+
+  for (int mu = 0; mu < Nd; mu++) {
+
+    ////////////////////////
+    // Call the single hop
+    ////////////////////////
+    PARALLEL_FOR_LOOP
+    for (int sss = 0; sss < B._grid->oSites(); sss++) {
+      Kernels::DhopDir(st, U, UUU, st.CommBuf(), sss, sss, B, Btilde, mu,1);
+    }
+
+    // Force in three link terms
+    //
+    //    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
+    //
+    // dU_ac(x)/dt = i p_ab U_bc(x)
+    //
+    // => dS_f/dt = dS_f/dU_ac(x) . dU_ac(x)/dt =  i p_ab U_bc(x) dS_f/dU_ac(x) 
+    //
+    // One link: form fragments S_f = A U B 
+    //
+    //         write Btilde = U(x) B(x+mu)
+    //
+    // mat+= TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+    // 
+    // Three link: form fragments S_f = A UUU B 
+    //
+    // mat+= outer ( A, UUUB) <-- Best take DhopDeriv with one link or identity matrix
+    // mat+= outer ( AU, UUB) <-- and then use covariant cshift?
+    // mat+= outer ( AUU, UB) <-- Returned from call to DhopDir
+
+    // Deliberate stop: the three-link force interface has not been designed yet.
+    assert(0);// need to figure out the force interface with a blasted three link term.
+    
+  }
+}
+
+// Force term on the full grid. U, V and mat must all be conformable with the
+// full grid; mat inherits U's checkerboard label.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat,
+                                               const FermionField &U,
+                                               const FermionField &V,
+                                               int dag)
+{
+  // Grid checks: operator grid, partner field, output field.
+  conformable(U._grid, _grid);
+  conformable(U._grid, V._grid);
+  conformable(U._grid, mat._grid);
+
+  mat.checkerboard = U.checkerboard;
+  DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
+}
+
+// Force term on the checkerboarded grid with V Even and U Odd; mat is
+// labelled Odd.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat,
+                                                 const FermionField &U,
+                                                 const FermionField &V,
+                                                 int dag)
+{
+  // Grid checks: half grid, partner field, output field.
+  conformable(U._grid, _cbgrid);
+  conformable(U._grid, V._grid);
+  conformable(U._grid, mat._grid);
+
+  // Parity checks.
+  assert(V.checkerboard == Even);
+  assert(U.checkerboard == Odd);
+
+  mat.checkerboard = Odd;
+  DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
+}
+
+// Force term on the checkerboarded grid with V Odd and U Even; mat is
+// labelled Even.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat,
+                                                 const FermionField &U,
+                                                 const FermionField &V,
+                                                 int dag)
+{
+  // Grid checks: half grid, partner field, output field.
+  conformable(U._grid, _cbgrid);
+  conformable(U._grid, V._grid);
+  conformable(U._grid, mat._grid);
+
+  // Parity checks.
+  assert(V.checkerboard == Odd);
+  assert(U.checkerboard == Even);
+
+  mat.checkerboard = Even;
+  DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
+}
+
+// Hopping term on the full grid. Counted as two calls because the flop
+// estimate in Report() is normalised per half-grid application.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
+{
+  DhopCalls += 2;
+
+  conformable(in._grid, _grid);      // input must be on the full grid
+  conformable(in._grid, out._grid);  // output on the same grid as input
+
+  out.checkerboard = in.checkerboard;
+  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
+}
+
+// Hopping term taking an Even half-grid field to an Odd one.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
+{
+  DhopCalls += 1;
+
+  conformable(in._grid, _cbgrid);    // input must be on the half grid
+  conformable(in._grid, out._grid);  // grids only; checkerboard not compared
+
+  assert(in.checkerboard == Even);
+  out.checkerboard = Odd;
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
+}
+
+// Hopping term taking an Odd half-grid field to an Even one.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
+{
+  DhopCalls += 1;
+
+  conformable(in._grid, _cbgrid);    // input must be on the half grid
+  conformable(in._grid, out._grid);  // grids only; checkerboard not compared
+
+  assert(in.checkerboard == Odd);
+  out.checkerboard = Even;
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
+}
+
+// Directional operator: a straight alias for DhopDir.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out,
+                                          int dir, int disp)
+{
+  DhopDir(in, out, dir, disp);
+}
+
+// Apply the single-direction hop kernel on every outer site of the full grid,
+// after a halo exchange of the input on the full-grid stencil.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,
+                                             int dir, int disp)
+{
+  Compressor compressor;
+  Stencil.HaloExchange(in, compressor);
+
+  PARALLEL_FOR_LOOP
+  for (int site = 0; site < in._grid->oSites(); site++) {
+    Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), site, site, in, out, dir, disp);
+  }
+}
+
+// Dispatch to the overlapped-comms or serial-comms implementation. The
+// overlapped path is only selectable in OpenMP builds, and only when the
+// global staggered comms mode is CommsAndCompute.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+                                                  DoubledGaugeField &U,
+                                                  DoubledGaugeField &UUU,
+                                                  const FermionField &in,
+                                                  FermionField &out, int dag)
+{
+#ifdef GRID_OMP
+  const bool overlapComms =
+      (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute);
+#else
+  const bool overlapComms = false;
+#endif
+
+  if (overlapComms) {
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+  } else {
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+  }
+}
+// Hopping term with communication overlapped with interior computation.
+// Only functional in OpenMP builds; otherwise it asserts.
+//
+// Phases:
+//   1. gather halo faces and merge the shared-memory part   (DhopFaceTime)
+//   2. one parallel region: the first ncomms threads drive the
+//      communication, the rest compute the interior sites   (DhopComputeTime)
+//   3. merge the received halo data                         (DhopFaceTime)
+//   4. compute the exterior contribution on surface sites   (DhopComputeTime2)
+// DhopTotalTime brackets all four phases.
+//
+//   st, lo : stencil and Lebesgue ordering handed to the site kernels
+//   U, UUU : double-stored one-link and three-link fields
+//   in/out : source and result fields
+//   dag    : DaggerYes selects DhopSiteDag, anything else DhopSite
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+								 DoubledGaugeField &U,
+								 DoubledGaugeField &UUU,
+								 const FermionField &in,
+								 FermionField &out, int dag) 
+{
+#ifdef GRID_OMP
+  Compressor compressor; 
+  int len =  U._grid->oSites();
+
+  DhopTotalTime   -= usecond();
+
+  DhopFaceTime    -= usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommsMergeSHM(compressor);
+  DhopFaceTime    += usecond();
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime    -= usecond();
+#pragma omp parallel 
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    // At least one thread must be left over for compute.
+    assert(nthreads > ncomms);
+
+    if (tid >= ncomms) {
+      // Compute thread: split the len sites as evenly as possible over the
+      // remaining threads; the first `rem` threads take one extra site.
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = len;
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+          // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); 
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
+        }
+      }
+    } else {
+      // Communication thread.
+      st.CommunicateThreaded();
+    }
+  }
+  DhopComputeTime    += usecond();
+
+  // First to enter, last to leave timing
+  DhopFaceTime    -= usecond();
+  st.CommsMerge(compressor);
+  // Stop the face timer: this must be +=, a second -= would subtract the
+  // merge start time twice and never add the stop time.
+  DhopFaceTime    += usecond();
+
+  // Exterior = 1, Interior = 0: finish the sites on the communication surface.
+  DhopComputeTime2    -= usecond();
+  if (dag == DaggerYes) {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
+    }
+  } else {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
+    }
+  }
+  DhopComputeTime2    += usecond();
+
+  // Close the total-time bracket opened at the top, as the serial path does.
+  DhopTotalTime   += usecond();
+#else
+  assert(0);
+#endif
+}
+
+
+// Hopping term with a blocking halo exchange followed by the site kernel on
+// every outer site. Accumulates DhopCommTime, DhopComputeTime and
+// DhopTotalTime.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+                                                             DoubledGaugeField &U,
+                                                             DoubledGaugeField &UUU,
+                                                             const FermionField &in,
+                                                             FermionField &out, int dag)
+{
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  DhopTotalTime   -= usecond();
+
+  // Communication phase.
+  DhopCommTime    -= usecond();
+  Compressor compressor;
+  st.HaloExchange(in, compressor);
+  DhopCommTime    += usecond();
+
+  // Compute phase: DaggerYes selects the daggered kernel, anything else the
+  // plain one.
+  DhopComputeTime -= usecond();
+  if (dag != DaggerYes) {
+    parallel_for (int site = 0; site < in._grid->oSites(); site++) {
+      Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, site, in, out);
+    }
+  } else {
+    parallel_for (int site = 0; site < in._grid->oSites(); site++) {
+      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, site, in, out);
+    }
+  }
+  DhopComputeTime += usecond();
+
+  DhopTotalTime   += usecond();
+}
+
+  ////////////////////////////////////////////////////////////////
+  // Reporting
+  ////////////////////////////////////////////////////////////////
+// Print the Dhop performance counters and the three stencil reports.
+//
+// The rank-averaged compute time is formed in a local variable, so the
+// DhopComputeTime member is left untouched and Report() can be called more
+// than once. The per-call and mflops lines are printed only when at least one
+// Dhop call has been counted, which avoids 0/0 output. The GlobalSum is still
+// executed unconditionally, so the collective is entered on every rank.
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::Report(void) 
+{
+  std::vector<int> latt = GridDefaultLatt();          
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  // Average the compute time over ranks without modifying the counter itself.
+  RealD AvgComputeTime = DhopComputeTime;
+  _grid->GlobalSum(AvgComputeTime);
+  AvgComputeTime/=NP;
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+
+  if ( DhopCalls > 0 ) {
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
+	      << DhopTotalTime   / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
+	      << DhopCommTime    / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
+	      << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+    RealD mflops = 1154*volume*DhopCalls/AvgComputeTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+  
+    RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+  }
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+// Reset every performance counter of this operator and of its three
+// stencils. DhopComputeTime2 (the exterior compute timer of the
+// overlapped-comms path) is reset as well, so it no longer carries over
+// between measurement windows.
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
+{
+  DhopCalls        = 0;
+  DhopTotalTime    = 0;
+  DhopCommTime     = 0;
+  DhopComputeTime  = 0;
+  DhopComputeTime2 = 0;
+  DhopFaceTime     = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
+
+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented.
+////////////////////////////////////////////////////////
+// Placeholder: the conserved-current contraction is not implemented for this
+// action. It asserts; when assert is compiled out the call is a no-op.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                              PropagatorField &q_in_2,
+                                                              PropagatorField &q_out,
+                                                              Current curr_type,
+                                                              unsigned int mu)
+{
+  assert(0);
+}
+
+// Placeholder: the sequential conserved-current insertion is not implemented
+// for this action. It asserts; when assert is compiled out the call is a
+// no-op.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu,
+                                                         unsigned int tmin,
+                                                         unsigned int tmax,
+                                                         ComplexField &lattice_cmplx)
+{
+  assert(0);
+}
+
+
+// Instantiate ImprovedStaggeredFermion through the project macro.
+// NOTE(review): presumably this expands to explicit instantiations for the
+// staggered Impl types -- confirm against the macro definition.
+FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
+
+  //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
+  //TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion);
+
+}}
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -0,0 +1,205 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_IMPR_STAG_FERMION_H
+#define GRID_QCD_IMPR_STAG_FERMION_H
+
+namespace Grid {
+
+namespace QCD {
+
+// Non-templated holder for the stencil geometry shared by every
+// ImprovedStaggeredFermion<Impl> instantiation. The two vectors are defined
+// in the matching .cc file.
+class ImprovedStaggeredFermionStatic {
+ public:
+  // Number of stencil points.
+  static const int npoint = 16;
+
+  // Direction and displacement tables handed to the stencil constructors.
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+};
+
+// Improved staggered fermion operator: one-link (fat) plus three-link (Naik)
+// hopping terms with a mass term, on a full grid and its red-black
+// (checkerboarded) half grid. The Impl policy supplies the field types and
+// the hop kernels via StaggeredKernels<Impl>.
+template <class Impl>
+class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedStaggeredFermionStatic {
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef StaggeredKernels<Impl> Kernels;
+
+  // Scratch field on the half grid, exposed through tmp().
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }
+
+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  // Counters start at zero so that the Dhop routines, which update them with
+  // += / -=, never read an indeterminate value when ZeroCounters() has not
+  // been called first. Times are accumulated from usecond().
+  double DhopTotalTime    = 0;
+  double DhopCalls        = 0;
+  double DhopCommTime     = 0;
+  double DhopComputeTime  = 0;
+  double DhopComputeTime2 = 0;
+  double DhopFaceTime     = 0;
+
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  // Gauge and fermion fields share the same full and half grids.
+  GridBase *GaugeGrid(void) { return _grid; }
+  GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
+  GridBase *FermionGrid(void) { return _grid; }
+  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
+
+  //////////////////////////////////////////////////////////////////
+  // override multiply; cut number routines if pass dagger argument
+  // and also make interface more uniformly consistent
+  //////////////////////////////////////////////////////////////////
+  RealD M(const FermionField &in, FermionField &out);
+  RealD Mdag(const FermionField &in, FermionField &out);
+
+  /////////////////////////////////////////////////////////
+  // half checkerboard operations
+  /////////////////////////////////////////////////////////
+  void Meooe(const FermionField &in, FermionField &out);
+  void MeooeDag(const FermionField &in, FermionField &out);
+  void Mooee(const FermionField &in, FermionField &out);
+  void MooeeDag(const FermionField &in, FermionField &out);
+  void MooeeInv(const FermionField &in, FermionField &out);
+  void MooeeInvDag(const FermionField &in, FermionField &out);
+
+  ////////////////////////
+  // Derivative interface
+  ////////////////////////
+  // Interface calls an internal routine
+  void DhopDeriv  (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+
+  ///////////////////////////////////////////////////////////////
+  // non-hermitian hopping term; half cb or both
+  ///////////////////////////////////////////////////////////////
+  void Dhop  (const FermionField &in, FermionField &out, int dag);
+  void DhopOE(const FermionField &in, FermionField &out, int dag);
+  void DhopEO(const FermionField &in, FermionField &out, int dag);
+
+  ///////////////////////////////////////////////////////////////
+  // Multigrid assistance; force term uses too
+  ///////////////////////////////////////////////////////////////
+  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
+
+  ///////////////////////////////////////////////////////////////
+  // Extra methods added by derived
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl &st, 
+		     DoubledGaugeField &U,DoubledGaugeField &UUU,
+		     GaugeField &mat, 
+		     const FermionField &A, const FermionField &B, int dag);
+
+  // DhopInternal dispatches to one of the two implementations below.
+  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                    const FermionField &in, FermionField &out, int dag);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Grid own interface Constructor
+  //////////////////////////////////////////////////////////////////////////
+  ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
+			   GridRedBlackCartesian &Hgrid, RealD _mass,
+			   RealD _c1, RealD _c2,RealD _u0,
+			   const ImplParams &p = ImplParams());
+
+  //////////////////////////////////////////////////////////////////////////
+  // MILC constructor no gauge fields
+  //////////////////////////////////////////////////////////////////////////
+  ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
+			   RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
+			   const ImplParams &p = ImplParams());
+
+  // DoubleStore impl dependent
+  // The single-field import is unsupported for this action: it asserts.
+  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+  void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat);
+  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
+  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  void CopyGaugeCheckerboards(void);
+
+  ///////////////////////////////////////////////////////////////
+  // Data members require to support the functionality
+  ///////////////////////////////////////////////////////////////
+
+  //    protected:
+ public:
+  // any other parameters of action ???
+  virtual int   isTrivialEE(void) { return 1; };
+  virtual RealD Mass(void) { return mass; }
+  RealD mass;
+  RealD u0;
+  RealD c1;
+  RealD c2;
+
+  // Full grid and its red-black half grid.
+  GridBase *_grid;
+  GridBase *_cbgrid;
+
+  // Defines the stencils for even and odd
+  StencilImpl Stencil;
+  StencilImpl StencilEven;
+  StencilImpl StencilOdd;
+
+  // Copy of the gauge field , with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;
+
+  // Three-link field, with even and odd subsets
+  DoubledGaugeField UUUmu;
+  DoubledGaugeField UUUmuEven;
+  DoubledGaugeField UUUmuOdd;
+
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                Current curr_type,
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in, 
+                           PropagatorField &q_out,
+                           Current curr_type, 
+                           unsigned int mu,
+                           unsigned int tmin, 
+                           unsigned int tmax,
+			   ComplexField &lattice_cmplx);
+};
+
+// Single- and double-precision instantiations of the operator.
+using ImprovedStaggeredFermionF = ImprovedStaggeredFermion<StaggeredImplF>;
+using ImprovedStaggeredFermionD = ImprovedStaggeredFermion<StaggeredImplD>;
+
+}
+}
+#endif
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -0,0 +1,654 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+#include <Grid/perfmon/PerfCount.h>
+
+namespace Grid {
+namespace QCD {
+  
+// The s-direction is INNERMOST and takes no part in the parity.
+// Sixteen stencil entries: displacements +1, -1, +3, -3, each listed for
+// dimensions 1..4 of the five-dimensional grid.
+const std::vector<int> ImprovedStaggeredFermion5DStatic::directions = {
+  1, 2, 3, 4,   1, 2, 3, 4,   1, 2, 3, 4,   1, 2, 3, 4};
+const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements = {
+  1, 1, 1, 1,   -1, -1, -1, -1,   3, 3, 3, 3,   -3, -3, -3, -3};
+
+  // 5d lattice: dimension 0 is the s-direction (extent Ls), dimensions 1..4 follow the 4d grid.
+// Construct the operator on the supplied grids without importing a gauge field.
+//   FiveDimGrid / FiveDimRedBlackGrid : grids used for the three stencils and _tmp
+//   FourDimGrid / FourDimRedBlackGrid : grids used for the doubled gauge fields
+//                                       and the Lebesgue orderings
+//   _mass, _c1, _c2, _u0              : stored in mass, c1, c2, u0
+//   p                                 : forwarded to the StaggeredKernels base
+// The grid layouts are validated by assertion, Ls is taken from dimension 0 of
+// the five-dimensional grid, and the stencil surface lists are built last.
+template<class Impl>
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
+                                                             GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                             GridCartesian         &FourDimGrid,
+                                                             GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                             RealD _mass,
+                                                             RealD _c1,RealD _c2, RealD _u0,
+                                                             const ImplParams &p) :
+  Kernels(p),
+  _FiveDimGrid        (&FiveDimGrid),
+  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
+  _FourDimGrid        (&FourDimGrid),
+  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
+  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements),
+  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
+  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  mass(_mass),
+  c1(_c1),
+  c2(_c2),
+  u0(_u0),
+  Umu(&FourDimGrid),
+  UmuEven(&FourDimRedBlackGrid),
+  UmuOdd (&FourDimRedBlackGrid),
+  UUUmu(&FourDimGrid),
+  UUUmuEven(&FourDimRedBlackGrid),
+  UUUmuOdd(&FourDimRedBlackGrid),
+  Lebesgue(&FourDimGrid),
+  LebesgueEvenOdd(&FourDimRedBlackGrid),
+  _tmp(&FiveDimRedBlackGrid)
+{
+
+  // Dimensionality checks
+  assert(FiveDimGrid._ndimension==5);
+  assert(FourDimGrid._ndimension==4);
+  assert(FourDimRedBlackGrid._ndimension==4);
+  assert(FiveDimRedBlackGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+
+  // extent of fifth dim and not spread out
+  Ls=FiveDimGrid._fdimensions[0];
+  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+  assert(FiveDimGrid._processors[0]         ==1);
+  assert(FiveDimRedBlackGrid._processors[0] ==1);
+
+  // Other dimensions must match the decomposition of the four-D fields
+  for(int d=0;d<4;d++){
+    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+
+    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+
+    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+  }
+
+  if (Impl::LsVectorised) {
+
+    int nsimd = Simd::Nsimd();
+
+    // Dimension zero of the five-d is the Ls direction
+    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
+
+    for(int d=0;d<4;d++){
+      // Compare, do not assign: these were written with '=' which always
+      // passed and overwrote the layout entry whenever assertions were enabled.
+      assert(FourDimGrid._simd_layout[d]==1);
+      assert(FourDimRedBlackGrid._simd_layout[d]==1);
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
+    }
+
+  } else {
+
+    // Dimension zero of the five-d is the Ls direction
+    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
+    assert(FiveDimGrid._simd_layout[0]        ==1);
+
+  }
+
+  // Surface lists: full stencil uses the full 4d volume, the even/odd
+  // stencils use the red-black 4d volume.
+  int LLs = FiveDimGrid._rdimensions[0];
+  int vol4= FourDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+
+  vol4=FourDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+  StencilOdd.BuildSurfaceList(LLs,vol4);
+}
+// Refresh the even/odd copies of both doubled gauge fields from the
+// corresponding full-lattice fields.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
+{
+  // Umu -> UmuEven / UmuOdd
+  pickCheckerboard(Even, UmuEven, Umu);
+  pickCheckerboard(Odd,  UmuOdd,  Umu);
+
+  // UUUmu -> UUUmuEven / UUUmuOdd
+  pickCheckerboard(Even, UUUmuEven, UUUmu);
+  pickCheckerboard(Odd,  UUUmuOdd,  UUUmu);
+}
+// Construct on the given grids (delegating to the grid-only constructor) and
+// then import the thin and fat gauge fields through ImportGauge.
+template<class Impl>
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
+                                                             GaugeField &_Ufat,
+                                                             GridCartesian         &FiveDimGrid,
+                                                             GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                             GridCartesian         &FourDimGrid,
+                                                             GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                             RealD _mass,
+                                                             RealD _c1, RealD _c2, RealD _u0,
+                                                             const ImplParams &p)
+  : ImprovedStaggeredFermion5D(FiveDimGrid, FiveDimRedBlackGrid,
+                               FourDimGrid, FourDimRedBlackGrid,
+                               _mass, _c1, _c2, _u0, p)
+{
+  ImportGauge(_Uthin, _Ufat);
+}
+
+///////////////////////////////////////////////////
+// For MILC use; pass three link U's and 1 link U
+///////////////////////////////////////////////////
+// Import gauge links as supplied: no scale factors are applied here (phases,
+// fattening and the like are expected to be pre-applied by the caller).
+//   _Utriple -> UUUmu, _Ufat -> Umu.
+// For each mu the link goes into slot mu, and the negated adjoint of the link
+// shifted back (by 3 for UUUmu, by 1 for Umu) goes into slot mu+4.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,
+                                                         const GaugeField &_Ufat)
+{
+  for (int mu = 0; mu < Nd; ++mu) {
+    int back = mu + 4;
+
+    // Three-link field.  The temporary is re-assigned rather than declaring
+    // fresh 'auto' variables so each expression is evaluated into a lattice.
+    auto link = PeekIndex<LorentzIndex>(_Utriple, mu);
+    Impl::InsertGaugeField(UUUmu, link, mu);
+    link = adj( Cshift(link, mu, -3));
+    Impl::InsertGaugeField(UUUmu, -link, back);
+
+    // One-link field.
+    link = PeekIndex<LorentzIndex>(_Ufat, mu);
+    Impl::InsertGaugeField(Umu, link, mu);
+    link = adj( Cshift(link, mu, -1));
+    Impl::InsertGaugeField(Umu, -link, back);
+  }
+
+  CopyGaugeCheckerboards();
+}
+// Import already-doubled gauge fields by plain assignment (no rescaling),
+// then refresh the even/odd copies.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,
+                                                         const DoubledGaugeField &_U)
+{
+  UUUmu = _UUU;
+  Umu   = _U;
+
+  CopyGaugeCheckerboards();
+}
+// Import thin and fat links via Impl::DoubleStore, then rescale the stored
+// links so the hopping term carries the kinetic-term coefficients:
+//   Umu   slots mu / mu+4 : +/- 0.5*c1/u0
+//   UUUmu slots mu / mu+4 : +/- 0.5*c2/u0^3
+// i.e. 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ).
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,
+                                                   const GaugeField &_Ufat)
+{
+  // Double store takes the two fields for the Naik and one-hop terms separately.
+  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
+
+  // Could pass the coefficients into the double store to save work.
+  const RealD oneLinkScale   = 0.5*c1/u0;
+  const RealD threeLinkScale = 0.5*c2/u0/u0/u0;
+
+  for (int mu = 0; mu < Nd; ++mu) {
+    int back = mu + 4;
+
+    auto link = PeekIndex<LorentzIndex>(Umu, mu);
+    PokeIndex<LorentzIndex>(Umu, link*( oneLinkScale), mu );
+
+    link = PeekIndex<LorentzIndex>(Umu, back);
+    PokeIndex<LorentzIndex>(Umu, link*(-oneLinkScale), back);
+
+    link = PeekIndex<LorentzIndex>(UUUmu, mu);
+    PokeIndex<LorentzIndex>(UUUmu, link*( threeLinkScale), mu );
+
+    link = PeekIndex<LorentzIndex>(UUUmu, back);
+    PokeIndex<LorentzIndex>(UUUmu, link*(-threeLinkScale), back);
+  }
+
+  CopyGaugeCheckerboards();
+}
+// Apply a single stencil direction on the full lattice.
+//   dir5 : direction in five-dimensional numbering; one is subtracted to drop
+//          the innermost fifth dimension and match the "directions" ordering
+//          passed to the stencil.
+//   disp : displacement, forwarded unchanged to the kernel.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
+{
+  int dir = dir5-1;
+
+  Compressor compressor;
+  Stencil.HaloExchange(in,compressor);
+
+  // Outer loop over gauge-grid sites, inner loop over the s index;
+  // the fermion site index is s + Ls*sU.
+  parallel_for(int sU=0;sU<Umu._grid->oSites();sU++){
+    for(int s=0;s<Ls;s++){
+      int sF = s+Ls*sU;
+      Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sF, sU, in, out, dir, disp);
+    }
+  }
+}
+
+// Force-term kernel: not implemented (no force terms in the multi-rhs
+// staggered solver).  Asserts unconditionally.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
+                                                     DoubledGaugeField & U,
+                                                     DoubledGaugeField & UUU,
+                                                     GaugeField &mat,
+                                                     const FermionField &A,
+                                                     const FermionField &B,
+                                                     int dag)
+{
+  assert(0);
+}
+
+// Derivative of the full hopping term: not implemented, asserts unconditionally.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
+                                                 const FermionField &A,
+                                                 const FermionField &B,
+                                                 int dag)
+{
+  assert(0);
+}
+
+// Derivative of the EO hopping term: not implemented, asserts unconditionally.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
+                                                   const FermionField &A,
+                                                   const FermionField &B,
+                                                   int dag)
+{
+  assert(0);
+}
+
+
+// Derivative of the OE hopping term: not implemented, asserts unconditionally.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
+                                                   const FermionField &A,
+                                                   const FermionField &B,
+                                                   int dag)
+{
+  assert(0);
+}
+
+/*CHANGE */
+// Dispatch the hopping term.  When built with GRID_OMP and the kernel comms
+// mode is CommsAndCompute, the overlapped comms/compute path is taken;
+// in every other case the serial-comms path is used.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+                                                    DoubledGaugeField & U,DoubledGaugeField & UUU,
+                                                    const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) {
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    return;
+  }
+#endif
+  DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+}
+
+// Hopping term with communication overlapped with computation (GRID_OMP builds
+// only; without GRID_OMP the body is just assert(0)).
+//
+// Sequence:
+//   1. Prepare the stencil, gather the halo and merge shared-memory comms
+//      (timed into DhopFaceTime).
+//   2. Inside one OpenMP parallel region, threads with tid < ncomms call
+//      st.CommunicateThreaded(); the remaining threads split the gauge-grid
+//      sites between them and call the site kernel with flags (1,0), which the
+//      inline comments label interior=1, exterior=0.
+//   3. After the region: collate threads, merge comms (DhopFaceTime again),
+//      then call the site kernel with flags (0,1) on the sites listed in
+//      st.surface_list (timed into DhopComputeTime2).
+//
+// NOTE(review): DhopTotalTime is not accumulated on this path, although
+// Report() divides by it -- confirm whether that is intended.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+								   DoubledGaugeField & U,DoubledGaugeField & UUU,
+								   const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+
+  Compressor compressor; 
+
+  int LLs = in._grid->_rdimensions[0];
+  // NOTE(review): 'len' is never read in this function.
+  int len =  U._grid->oSites();
+
+  DhopFaceTime-=usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
+
+  // Per-thread timings; the reduction below keeps the maximum over threads.
+  double ctime=0;
+  double ptime=0;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    // A setting of -1 is treated as one communication thread.
+    if (ncomms == -1) ncomms = 1;
+    // At least one thread must be left over for compute.
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      // Compute thread: renumber among the compute threads only.
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = U._grid->oSites(); // 4d vol
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      // Contiguous block per thread; the first 'rem' threads take one extra site.
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+	  // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
+        }
+      }
+        ptime = usecond() - start;
+    } else {
+      // Communication thread.
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
+    }
+  }
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;
+
+  // First to enter, last to leave timing
+  st.CollateThreads();
+
+  DhopFaceTime-=usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime+=usecond();
+
+  // Second pass over the surface sites only, with flags (0,1).
+  DhopComputeTime2-=usecond();
+  if (dag == DaggerYes) {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
+    }
+  } else {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
+    }
+  }
+  DhopComputeTime2+=usecond();
+#else
+  assert(0);
+#endif
+
+}
+
+// Hopping term with the halo exchange completed before any site kernel runs.
+// Accumulates DhopTotalTime (whole call), DhopCommTime (halo exchange) and
+// DhopComputeTime (site loop).
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+                                                               DoubledGaugeField & U,DoubledGaugeField & UUU,
+                                                               const FermionField &in, FermionField &out,int dag)
+{
+  Compressor compressor;
+  int LLs = in._grid->_rdimensions[0];
+
+  DhopTotalTime -= usecond();
+
+  // Communication phase
+  DhopCommTime -= usecond();
+  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
+
+  // Compute phase: loop over the 4d grid taken from U; the kernel is handed
+  // LLs so it can form the 5d fermion index.
+  DhopComputeTime -= usecond();
+  if (dag == DaggerYes) {
+    parallel_for (int sU = 0; sU < U._grid->oSites(); sU++) {
+      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
+    }
+  } else {
+    parallel_for (int sU = 0; sU < U._grid->oSites(); sU++) {
+      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
+    }
+  }
+  DhopComputeTime += usecond();
+
+  DhopTotalTime   += usecond();
+}
+/*CHANGE END*/
+
+/* ORG
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+						    DoubledGaugeField & U,DoubledGaugeField & UUU,
+						    const FermionField &in, FermionField &out,int dag)
+{
+  Compressor compressor;
+  int LLs = in._grid->_rdimensions[0];
+
+
+
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
+  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
+  
+  DhopComputeTime -= usecond();
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  if (dag == DaggerYes) {
+    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU=ss;
+      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
+    }
+  } else {
+    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU=ss;
+	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
+    }
+  }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
+}
+*/
+
+
+// Hopping term taking an Even-checkerboard input to an Odd-checkerboard
+// output.  Uses the Even-source stencil with the Odd gauge links.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
+{
+  DhopCalls += 1;
+
+  conformable(in._grid, FermionRedBlackGrid()); // input must live on the half grid
+  conformable(in._grid, out._grid);             // drops the cb check
+
+  assert(in.checkerboard==Even);
+  out.checkerboard = Odd;
+
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
+}
+// Hopping term taking an Odd-checkerboard input to an Even-checkerboard
+// output.  Uses the Odd-source stencil with the Even gauge links.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
+{
+  DhopCalls += 1;
+
+  conformable(in._grid, FermionRedBlackGrid()); // input must live on the half grid
+  conformable(in._grid, out._grid);             // drops the cb check
+
+  assert(in.checkerboard==Odd);
+  out.checkerboard = Even;
+
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
+}
+// Hopping term on the full lattice.  Counts as two calls in DhopCalls and
+// copies the input checkerboard label to the output.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
+{
+  DhopCalls += 2;
+
+  conformable(in._grid, FermionGrid()); // input must live on the full grid
+  conformable(in._grid, out._grid);
+
+  out.checkerboard = in.checkerboard;
+
+  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
+}
+
+// Print the accumulated Dhop counters, derived mflop/s figures and the three
+// stencil reports.  Note that this call replaces DhopComputeTime with its
+// global sum divided by the processor count before the mflop/s figures are
+// computed.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void)
+{
+  // Lattice volume including the s-direction.
+  std::vector<int> latt = GridDefaultLatt();
+  RealD volume = Ls;
+  for (int mu = 0; mu < Nd; mu++) {
+    volume = volume*latt[mu];
+  }
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  // Per-call timings (values taken before the global averaging below).
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : "
+            << DhopTotalTime / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : "
+            << DhopCommTime / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _FourDimGrid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  // Based on compute time only; the trailing /2 is for red black counting.
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2;
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  // Based on total time; the trailing /2 is for red black counting.
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2;
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" << std::endl;
+  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven" << std::endl;
+  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" << std::endl;
+  StencilOdd.Report();
+}
+// Reset every performance counter owned by this operator, then reset the
+// counters of the three stencils.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
+{
+  DhopCalls        = 0;
+  DhopTotalTime    = 0;
+  DhopCommTime     = 0;
+  DhopComputeTime  = 0;
+  // Accumulated by the overlapped-comms path; it was previously left out of
+  // the reset and so kept a stale or indeterminate value.
+  DhopComputeTime2 = 0;
+  DhopFaceTime     = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
+/////////////////////////////////////////////////////////////////////////
+// Implement the general interface. Here we use SAME mass on all slices
+/////////////////////////////////////////////////////////////////////////
+// Single-direction application; forwards directly to DhopDir.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
+{
+  DhopDir(in, out, dir, disp);
+}
+// Full operator: out = Dhop(in) without dagger, then combined with mass and
+// in through axpy_norm, whose return value is passed back to the caller.
+template <class Impl>
+RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+  Dhop(in, out, DaggerNo);
+  RealD result = axpy_norm(out, mass, in, out);
+  return result;
+}
+
+// Daggered full operator: out = Dhop(in) with dagger, then combined with mass
+// and in through axpy_norm, whose return value is passed back to the caller.
+template <class Impl>
+RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+  Dhop(in, out, DaggerYes);
+  RealD result = axpy_norm(out, mass, in, out);
+  return result;
+}
+
+// Off-diagonal (checkerboard-changing) term without dagger: an Odd input goes
+// through DhopEO, any other input through DhopOE.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
+{
+  if (in.checkerboard != Odd) {
+    DhopOE(in, out, DaggerNo);
+    return;
+  }
+  DhopEO(in, out, DaggerNo);
+}
+// Off-diagonal (checkerboard-changing) term with dagger: an Odd input goes
+// through DhopEO, any other input through DhopOE.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
+{
+  if (in.checkerboard != Odd) {
+    DhopOE(in, out, DaggerYes);
+    return;
+  }
+  DhopEO(in, out, DaggerYes);
+}
+
+// Diagonal term: out = mass * in, with the checkerboard label copied across.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+
+  // Convert the mass to the field's scalar type before multiplying.
+  typename FermionField::scalar_type massScalar(mass);
+  out = massScalar * in;
+}
+
+// Daggered diagonal term: delegates to Mooee after copying the checkerboard label.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+  Mooee(in, out);
+}
+
+// Inverse of the diagonal term: out = (1/mass) * in, with the checkerboard
+// label copied across.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+
+  const RealD inverseMass = 1.0 / (mass);
+  out = inverseMass * in;
+}
+
+// Daggered inverse diagonal term: delegates to MooeeInv after copying the
+// checkerboard label.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  out.checkerboard = in.checkerboard;
+  MooeeInv(in, out);
+}
+
+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented.
+////////////////////////////////////////////////////////
+// Conserved-current contraction: not yet implemented, asserts unconditionally.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                                PropagatorField &q_in_2,
+                                                                PropagatorField &q_out,
+                                                                Current curr_type,
+                                                                unsigned int mu)
+{
+  assert(0);
+}
+
+// Sequential conserved current: not yet implemented, asserts unconditionally.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                           PropagatorField &q_out,
+                                                           Current curr_type,
+                                                           unsigned int mu,
+                                                           unsigned int tmin,
+                                                           unsigned int tmax,
+                                                           ComplexField &lattice_cmplx)
+{
+  assert(0);
+}
+
+// Project macros applied to the class template; presumably they expand to
+// explicit instantiations for the supported staggered implementations
+// (the macro definitions are not in this file -- confirm there).
+FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
+FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
+  
+}}
+
+
+
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -0,0 +1,234 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: AzusaYamaguchi <ayamaguc@staffmail.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
+#define  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
+
+namespace Grid {
+namespace QCD {
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // This is the 4d red black case: the s-direction takes no part in the parity.
+  ////////////////////////////////////////////////////////////////////////////////
+
+    // Non-template base holding the stencil geometry shared by every
+    // instantiation of ImprovedStaggeredFermion5D.
+    class ImprovedStaggeredFermion5DStatic { 
+    public:
+      // S-direction is INNERMOST and takes no part in the parity.
+      // Per-stencil-point dimension index and displacement; both vectors are
+      // defined in the implementation file.
+      static const std::vector<int> directions;
+      static const std::vector<int> displacements;
+      // Number of stencil points (per-object constant, not static).
+      const int npoint = 16;
+    };
+
+    // Improved staggered fermion operator on a five-dimensional fermion grid
+    // with four-dimensional gauge fields.  Kernels come from
+    // StaggeredKernels<Impl>; the stencil geometry comes from
+    // ImprovedStaggeredFermion5DStatic.
+    //
+    // NOTE: member declaration order determines initialisation order; the
+    // constructor's initialiser list is written in a different order.
+    template<class Impl>
+    class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public ImprovedStaggeredFermion5DStatic 
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+      typedef StaggeredKernels<Impl> Kernels;
+
+      // Scratch fermion field; constructed on the five-dimensional red-black grid.
+      FermionField _tmp;
+      FermionField &tmp(void) { return _tmp; }
+
+      ////////////////////////////////////////
+      // Performance monitoring
+      ////////////////////////////////////////
+      void Report(void);
+      void ZeroCounters(void);
+      // Counters accumulated from usecond() differences in the Dhop paths.
+      // NOTE(review): none of these has an in-class initialiser and the
+      // constructor does not set them -- presumably callers invoke
+      // ZeroCounters() before Report(); confirm.
+      double DhopTotalTime;
+      double DhopCalls;
+      double DhopCommTime;
+      double DhopComputeTime;
+      double DhopComputeTime2;
+      double DhopFaceTime;
+
+      ///////////////////////////////////////////////////////////////
+      // Implement the abstract base
+      ///////////////////////////////////////////////////////////////
+      // Gauge fields live on the 4d grids, fermion fields on the 5d grids.
+      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+
+      // full checkerboard operations: Dhop followed by axpy_norm with the mass
+      RealD  M    (const FermionField &in, FermionField &out);
+      RealD  Mdag (const FermionField &in, FermionField &out);
+
+      // half checkerboard operations
+      // Meooe dispatches on the input checkerboard; Mooee multiplies by mass;
+      // MooeeInv multiplies by 1/mass.
+      void   Meooe       (const FermionField &in, FermionField &out);
+      void   Mooee       (const FermionField &in, FermionField &out);
+      void   MooeeInv    (const FermionField &in, FermionField &out);
+
+      void   MeooeDag    (const FermionField &in, FermionField &out);
+      void   MooeeDag    (const FermionField &in, FermionField &out);
+      void   MooeeInvDag (const FermionField &in, FermionField &out);
+
+      // Mdir forwards to DhopDir; 'dir' is in five-dimensional numbering.
+      void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+
+      // These can be overridden by fancy 5d chiral action
+      // (the implementations in this class assert unconditionally)
+      void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+      // Implement hopping term non-hermitian hopping term; half cb or both
+      void Dhop  (const FermionField &in, FermionField &out,int dag);
+      void DhopOE(const FermionField &in, FermionField &out,int dag);
+      void DhopEO(const FermionField &in, FermionField &out,int dag);
+
+    
+    ///////////////////////////////////////////////////////////////
+    // New methods added 
+    ///////////////////////////////////////////////////////////////
+    // Force-term kernel; the implementation asserts unconditionally.
+    void DerivInternal(StencilImpl & st,
+		       DoubledGaugeField & U,
+		       DoubledGaugeField & UUU,
+		       GaugeField &mat,
+		       const FermionField &A,
+		       const FermionField &B,
+		       int dag);
+    
+    // Dispatches to the overlapped or serial comms implementation below.
+    void DhopInternal(StencilImpl & st,
+		      LebesgueOrder &lo,
+		      DoubledGaugeField &U,
+		      DoubledGaugeField &UUU,
+		      const FermionField &in, 
+		      FermionField &out,
+		      int dag);
+    
+    // Communication overlapped with interior compute (GRID_OMP builds only).
+    void DhopInternalOverlappedComms(StencilImpl & st,
+		      LebesgueOrder &lo,
+		      DoubledGaugeField &U,
+		      DoubledGaugeField &UUU,
+		      const FermionField &in, 
+		      FermionField &out,
+		      int dag);
+
+    // Halo exchange completed before the site loop.
+    void DhopInternalSerialComms(StencilImpl & st,
+		      LebesgueOrder &lo,
+		      DoubledGaugeField &U,
+		      DoubledGaugeField &UUU,
+		      const FermionField &in, 
+		      FermionField &out,
+		      int dag);
+    
+    
+    // Constructors
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // Grid internal interface -- Thin link and fat link, with coefficients
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    ImprovedStaggeredFermion5D(GaugeField &_Uthin,
+			       GaugeField &_Ufat,
+			       GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       double _mass,
+			       RealD _c1, RealD _c2,RealD _u0,
+			       const ImplParams &p= ImplParams());
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       double _mass,
+			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
+			       const ImplParams &p= ImplParams());
+
+    // DoubleStore gauge field in operator
+    // The single-field overload is unsupported and asserts unconditionally.
+    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+    void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat);
+    // "Simple" imports store the links without applying scale factors.
+    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
+    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+    // Give a reference; can be used to do an assignment or copy back out after import
+    // if Carleton wants to cache them and not use the ImportSimple
+    DoubledGaugeField &GetU(void)   { return Umu ; } ;
+    DoubledGaugeField &GetUUU(void) { return UUUmu; };
+    // Refreshes the even/odd gauge copies from the full-lattice fields.
+    void CopyGaugeCheckerboards(void);
+    
+    ///////////////////////////////////////////////////////////////
+    // Data members required to support the functionality
+    ///////////////////////////////////////////////////////////////
+  public:
+
+    virtual int   isTrivialEE(void) { return 1; };
+    virtual RealD Mass(void) { return mass; }
+    
+    // Grids are held by pointer to the objects passed to the constructor.
+    GridBase *_FourDimGrid;
+    GridBase *_FourDimRedBlackGrid;
+    GridBase *_FiveDimGrid;
+    GridBase *_FiveDimRedBlackGrid;
+    
+    // mass and the c1, c2, u0 coefficients used by ImportGauge to rescale links;
+    // Ls is set in the constructor from dimension 0 of the 5d grid.
+    RealD mass;
+    RealD c1;
+    RealD c2;
+    RealD u0;
+    int Ls;
+    
+    //Defines the stencils for even and odd
+    StencilImpl Stencil; 
+    StencilImpl StencilEven; 
+    StencilImpl StencilOdd; 
+    
+    // Copy of the gauge field , with even and odd subsets
+    DoubledGaugeField Umu;
+    DoubledGaugeField UmuEven;
+    DoubledGaugeField UmuOdd;
+
+    DoubledGaugeField UUUmu;
+    DoubledGaugeField UUUmuEven;
+    DoubledGaugeField UUUmuOdd;
+    
+    LebesgueOrder Lebesgue;
+    LebesgueOrder LebesgueEvenOdd;
+    
+    // Comms buffer
+    // NOTE(review): the implementation file uses the stencil's CommBuf() and
+    // does not reference this member -- confirm whether it is still needed.
+    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities
+    ///////////////////////////////////////////////////////////////
+    // Both are declared for interface completeness; the implementations
+    // assert unconditionally.
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type,
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in, 
+                             PropagatorField &q_out,
+                             Current curr_type, 
+                             unsigned int mu,
+                             unsigned int tmin, 
+                             unsigned int tmax,
+                 	     ComplexField &lattice_cmplx);
+  };
+
+}}
+
+#endif
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
@@ -0,0 +1,502 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+  // Constructor: forwards every argument to the AbstractEOFAFermion base, sets the tanh
+  // coefficients from Approx::higham(1.0, Ls), then prepares the shift-operator tables.
+  template<class Impl>
+    MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
+      GaugeField &_Umu,
+      GridCartesian &FiveDimGrid, GridRedBlackCartesian &FiveDimRedBlackGrid,
+      GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
+      RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
+      RealD _M5, RealD _b, RealD _c, const ImplParams &p)
+    : AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
+                                _mq1, _mq2, _mq3, _shift, _pm, _M5, _b, _c, p)
+    {
+      const int Ls = this->Ls;
+
+      // Approximation data for the tanh coefficients; its n field must equal Ls.
+      RealD eps = 1.0;
+      Approx::zolotarev_data *approx = Approx::higham(eps, Ls);
+      assert(approx->n == Ls);
+
+      std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b
+                << ",c=" << _c << ") with Ls=" << Ls << std::endl;
+      this->SetCoefficientsTanh(approx, _b, _c);
+      std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1
+                << ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift
+                << ",pm=" << _pm << ")" << std::endl;
+
+      Approx::zolotarev_free(approx);
+
+      if(_shift == 0.0){
+        // No shift: the tables only need to hold Ls zero entries.
+        Mooee_shift.resize(Ls, 0.0);
+        MooeeInv_shift_lc.resize(Ls, 0.0);
+        MooeeInv_shift_norm.resize(Ls, 0.0);
+        MooeeInvDag_shift_lc.resize(Ls, 0.0);
+        MooeeInvDag_shift_norm.resize(Ls, 0.0);
+      } else {
+        SetCoefficientsPrecondShiftOps();
+      }
+    }
+
+    /****************************************************************
+     * Additional EOFA operators only called outside the inverter.  
+     * Since speed is not essential, simple axpby-style
+     * implementations should be fine.
+     ***************************************************************/
+    // Applies \Omega_{+} (sign == 1) or \Omega_{-} (sign == -1) to psi, or the dagger of
+    // that operator when dag == 1, and stores the result in Din.  Din is zeroed first, so
+    // any other (sign, dag) combination returns a zero field.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+    {
+      const int Ls = this->Ls;
+      const RealD alpha = this->alpha;
+
+      // Per-slice weights of \Omega_{+} and \Omega_{-}.
+      auto wplus  = [&](int s){ return 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s); };
+      auto wminus = [&](int s){ return 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1); };
+
+      Din = zero;
+      const bool sign_ok = (sign == 1) || (sign == -1);
+      const bool dag_ok  = (dag == 0) || (dag == 1);
+      if(!sign_ok || !dag_ok){ return; }
+
+      for(int s=0; s<Ls; ++s){
+        const auto w = (sign == 1) ? wplus(s) : wminus(s);
+        // dag == 0 passes 0.0 with psi and slices (s, 0); dag == 1 passes 1.0 with Din and slices (0, s).
+        if(dag == 0){ axpby_ssp(Din, 0.0, psi, w, psi, s, 0); }
+        else        { axpby_ssp(Din, 1.0, Din, w, psi, 0, s); }
+      }
+    }
+
+    // Operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6);
+    // it also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
+    {
+      const int Ls  = this->Ls;
+      const RealD b   = 0.5 * ( 1.0 + this->alpha );
+      const RealD c   = 0.5 * ( 1.0 - this->alpha );
+      const RealD mq1 = this->mq1;
+
+      for(int s=0; s<Ls; ++s){
+        // Edge slices use the wrap-around neighbour with coefficient mq1*c; all others use -c.
+        const bool first = (s == 0);
+        const bool last  = !first && (s == (Ls-1));
+        const int   s_up = last  ? 0      : s+1;
+        const int   s_dn = first ? Ls-1   : s-1;
+        const RealD c_up = last  ? mq1*c  : -c;
+        const RealD c_dn = first ? mq1*c  : -c;
+
+        axpby_ssp_pminus(chi, b, psi, c_up, psi, s, s_up);
+        axpby_ssp_pplus (chi, 1.0, chi, c_dn, psi, s, s_dn);
+      }
+    }
+
+    // Intended to apply the inverse of Dtilde: chi = Dtilde^{-1} psi, using the closed-form
+    // s-space coefficients DtInv_p (applied via axpby_ssp_pplus) and DtInv_m (axpby_ssp_pminus).
+    // The result is assembled in a zero-initialised scratch field and copied to chi at the
+    // end, so psi is only ever read and chi receives the complete sum.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
+    {
+      int Ls = this->Ls;
+      RealD m = this->mq1;
+      RealD c = 0.5 * this->alpha;
+      RealD d = 0.5;
+
+      RealD DtInv_p(0.0), DtInv_m(0.0);
+      RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
+      FermionField tmp(this->FermionGrid());
+      tmp = zero; // defined starting value: every term below is accumulated with coefficient 1.0
+
+      for(int s=0; s<Ls; ++s){
+      for(int sp=0; sp<Ls; ++sp){
+        DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
+        DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
+        DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
+        DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
+
+        // Accumulate both pieces; a 0.0 coefficient on the second call would discard the first.
+        axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
+        axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
+      }}
+      chi = tmp; // hand the assembled result back to the caller
+    }
+
+    /*****************************************************************************************************/
+
+    // Applies the full operator to psi, leaving the result in chi; returns norm2(chi).
+    template<class Impl>
+    RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+    {
+      FermionField work(psi._grid);
+
+      this->Meooe5D(psi, work);       // 5D hopping coefficients first ...
+      this->DW(work, chi, DaggerNo);  // ... then DW (no dagger) into chi
+      axpby(chi, 1.0, 1.0, chi, psi); // combine chi and psi with unit weights
+      this->M5D(psi, chi);            // chi is both input and output of M5D
+      const RealD nrm = norm2(chi);
+      return nrm;
+    }
+
+    // Applies the daggered operator to psi, leaving the result in chi; returns norm2(chi).
+    template<class Impl>
+    RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+    {
+      FermionField work(psi._grid);
+
+      this->DW(psi, work, DaggerYes);  // DW with dagger first ...
+      this->MeooeDag5D(work, chi);     // ... then the daggered 5D hopping coefficients
+      this->M5Ddag(psi, chi);          // chi is both input and output of M5Ddag
+      axpby(chi, 1.0, 1.0, chi, psi);  // combine chi and psi with unit weights
+      const RealD nrm = norm2(chi);
+      return nrm;
+    }
+
+    /********************************************************************
+     * Performance critical fermion operators called inside the inverter
+     ********************************************************************/
+
+    // Two-argument M5D: unit diagonal, -1 off-diagonals, mq1 in upper[Ls-1] and lower[0].
+    // chi is handed to the kernel as both the phi input and the output field.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+    {
+      const int Ls = this->Ls;
+
+      std::vector<Coeff_t> diag(Ls,1.0), upper(Ls,-1.0), lower(Ls,-1.0);
+      upper[Ls-1] = this->mq1;
+      lower[0]    = this->mq1;
+
+      const bool has_shift = !(this->shift == 0.0);
+      if(has_shift){ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); } // fused M + shift operation
+      else         { this->M5D(psi, chi, chi, lower, diag, upper); }                    // no shift term
+    }
+
+    // Two-argument M5Ddag: unit diagonal, -1 off-diagonals, mq1 in upper[Ls-1] and lower[0].
+    // chi is handed to the kernel as both the phi input and the output field.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+    {
+      const int Ls = this->Ls;
+
+      std::vector<Coeff_t> diag(Ls,1.0), upper(Ls,-1.0), lower(Ls,-1.0);
+      upper[Ls-1] = this->mq1;
+      lower[0]    = this->mq1;
+
+      const bool has_shift = !(this->shift == 0.0);
+      if(has_shift){ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); } // fused M + shift operation
+      else         { this->M5Ddag(psi, chi, chi, lower, diag, upper); }                    // no shift term
+    }
+
+    // half checkerboard operations
+    // Mooee: diagonal bee, off-diagonals -cee[s], with upper[Ls-1] and lower[0] multiplied by
+    // an extra -mq1.  psi is passed to the kernel as both the psi and phi inputs.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+    {
+      const int Ls = this->Ls;
+
+      // coefficients of Mooee
+      std::vector<Coeff_t> diag = this->bee;
+      std::vector<Coeff_t> upper(Ls);
+      std::vector<Coeff_t> lower(Ls);
+      for(int s=0; s<Ls; ++s){
+        upper[s] = lower[s] = -this->cee[s];
+      }
+      // the two wrap-around entries pick up a factor of -mq1
+      upper[Ls-1] *= -this->mq1;
+      lower[0]    *= -this->mq1;
+
+      // plain kernel when there is no shift term, fused M + shift kernel otherwise
+      if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
+      else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
+    }
+
+    // MooeeDag: diagonal bee; upper[s] = -cee[s+1] and lower[s] = -cee[s-1], except for the
+    // wrap-around entries lower[0] = mq1*cee[Ls-1] and upper[Ls-1] = mq1*cee[0].
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+    {
+      const int Ls = this->Ls;
+
+      // coefficients of MooeeDag
+      std::vector<Coeff_t> diag = this->bee;
+      std::vector<Coeff_t> upper(Ls);
+      std::vector<Coeff_t> lower(Ls);
+      for(int s=0; s<Ls; s++){
+        const bool first = (s == 0);
+        const bool last  = !first && (s == (Ls-1));
+
+        // upper neighbour: wraps to slice 0 on the last slice
+        if(last){ upper[s] = this->mq1*this->cee[0]; }
+        else    { upper[s] = -this->cee[s+1]; }
+
+        // lower neighbour: wraps to slice Ls-1 on the first slice
+        if(first){ lower[s] = this->mq1*this->cee[Ls-1]; }
+        else     { lower[s] = -this->cee[s-1]; }
+      }
+
+      // plain kernel when there is no shift term, fused M + shift kernel otherwise
+      if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
+      else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
+    }
+
+    /****************************************************************************************/
+
+    // Computes coefficients for applying Cayley preconditioned shift operators
+    //  (Mooee + \Delta) --> Mooee_shift
+    //  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
+    //  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
+    // For the latter two cases, the operation takes the form
+    //  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
+    //      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
+    {
+      int   Ls    = this->Ls;
+      int   pm    = this->pm;
+      RealD alpha = this->alpha;
+      RealD k     = this->k;
+      RealD mq1   = this->mq1;
+      RealD shift = this->shift;
+
+      // Initialize
+      Mooee_shift.resize(Ls);
+      MooeeInv_shift_lc.resize(Ls);
+      MooeeInv_shift_norm.resize(Ls);
+      MooeeInvDag_shift_lc.resize(Ls);
+      MooeeInvDag_shift_norm.resize(Ls);
+
+      // Construct Mooee_shift
+      int idx(0); // slot written in Mooee_shift: s for pm == 1, Ls-1-s otherwise
+      Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
+                  ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
+      for(int s=0; s<Ls; ++s){
+        idx = (pm == 1) ? (s) : (Ls-1-s);
+        Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
+      }
+
+      // Tridiagonal solve for MooeeInvDag_shift_lc
+      {
+        Coeff_t m(0.0); // elimination multiplier used in the pm == 1 forward sweep
+        std::vector<Coeff_t> d = Mooee_shift; // right-hand side of the y solve
+        std::vector<Coeff_t> u(Ls,0.0); // right-hand side of the q solve (unit vector set just below)
+        std::vector<Coeff_t> y(Ls,0.0);
+        std::vector<Coeff_t> q(Ls,0.0);
+        if(pm == 1){ u[0] = 1.0; }
+        else{ u[Ls-1] = 1.0; }
+
+        // Tridiagonal matrix algorithm + Sherman-Morrison formula
+        //
+        // We solve
+        //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
+        // where Mooee' is the tridiagonal part of Mooee_{+}, and
+        // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
+        // so that the outer-product u \otimes v gives the (0,Ls-1)
+        // entry of Mooee_{+}.
+        //
+        // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
+        // and then construct the solution to the original system
+        //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
+        if(pm == 1){ // forward elimination is performed only for pm == 1
+          for(int s=1; s<Ls; ++s){
+            m = -this->cee[s] / this->bee[s-1];
+            d[s] -= m*d[s-1];
+            u[s] -= m*u[s-1];
+          }
+        }
+        y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
+        q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
+        for(int s=Ls-2; s>=0; --s){
+          if(pm == 1){ // after the forward sweep only a division by the diagonal remains
+            y[s] = d[s] / this->bee[s];
+            q[s] = u[s] / this->bee[s];
+          } else { // pm != 1: back substitution using the entry at s+1
+            y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
+            q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
+          }
+        }
+
+        // Construct MooeeInvDag_shift_lc
+        for(int s=0; s<Ls; ++s){
+          if(pm == 1){
+            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
+              (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
+          } else {
+            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
+              (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
+          }
+        }
+
+        // Compute remaining coefficients
+        N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]); // N is reused as the common normalisation below
+        for(int s=0; s<Ls; ++s){
+
+          // MooeeInv_shift_lc
+          if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
+          else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
+
+          // MooeeInv_shift_norm
+          MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
+            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
+
+          // MooeeInvDag_shift_norm
+          if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
+            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
+          else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
+            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
+        }
+      }
+    }
+
+    // Recompute coefficients for a different value of shift constant.
+    // With new_shift == 0 every coefficient table is reset to Ls zeros.  assign() is used because
+    // resize(Ls, 0.0) on a table that already holds Ls entries would keep the stale values.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+    {
+      this->shift = new_shift;
+      if(new_shift != 0.0){ SetCoefficientsPrecondShiftOps(); return; }
+
+      const int Ls = this->Ls;
+      Mooee_shift.assign(Ls,0.0);
+      MooeeInv_shift_lc.assign(Ls,0.0);
+      MooeeInv_shift_norm.assign(Ls,0.0);
+      MooeeInvDag_shift_lc.assign(Ls,0.0);
+      MooeeInvDag_shift_norm.assign(Ls,0.0);
+    }
+    // Builds the dense Ls x Ls matrices Pplus/Pminus, optionally inverts (inv) and/or adjoints (dag) them, and packs them into Matp/Matm.
+    template<class Impl>
+    void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
+      Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+    {
+      int Ls = this->Ls;
+
+      GridBase* grid = this->FermionRedBlackGrid();
+      int LLs = grid->_rdimensions[0];
+
+      if(LLs == Ls){ return; } // Not vectorised in 5th direction
+
+      Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+      Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+
+      for(int s=0; s<Ls; s++){ // diagonal: bee for both matrices
+        Pplus(s,s)  = this->bee[s];
+        Pminus(s,s) = this->bee[s];
+      }
+
+      for(int s=0; s<Ls-1; s++){ // Pminus gets the super-diagonal, Pplus the sub-diagonal
+        Pminus(s,s+1) = -this->cee[s];
+        Pplus(s+1,s) = -this->cee[s+1];
+      }
+
+      Pplus (0,Ls-1) = this->mq1*this->cee[0]; // corner entries carry mq1
+      Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
+
+      if(this->shift != 0.0){ // shift term: added to the last column of Pplus (pm == 1) or the first column of Pminus
+        RealD c = 0.5 * this->alpha;
+        RealD d = 0.5;
+        RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
+        if(this->pm == 1) {
+          for(int s=0; s<Ls; ++s){
+            Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
+          }
+        } else {
+          for(int s=0; s<Ls; ++s){
+            Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
+          }
+        }
+      }
+
+      Eigen::MatrixXcd PplusMat ;
+      Eigen::MatrixXcd PminusMat;
+
+      if(inv) { // use the dense inverses when requested
+        PplusMat  = Pplus.inverse();
+        PminusMat = Pminus.inverse();
+      } else {
+        PplusMat  = Pplus;
+        PminusMat = Pminus;
+      }
+
+      if(dag){ // adjoint is applied after the optional inversion
+        PplusMat.adjointInPlace();
+        PminusMat.adjointInPlace();
+      }
+
+      typedef typename SiteHalfSpinor::scalar_type scalar_type;
+      const int Nsimd = Simd::Nsimd();
+      Matp.resize(Ls*LLs);
+      Matm.resize(Ls*LLs);
+
+      for(int s2=0; s2<Ls; s2++){ // s2: matrix column; s1: index within the reduced dimension
+      for(int s1=0; s1<LLs; s1++){
+        int istride = LLs;
+        int ostride = 1;
+        Simd Vp;
+        Simd Vm;
+        scalar_type *sp = (scalar_type*) &Vp;
+        scalar_type *sm = (scalar_type*) &Vm;
+        for(int l=0; l<Nsimd; l++){ // lane l takes matrix row l*istride + s1*ostride
+          if(switcheroo<Coeff_t>::iscomplex()) {
+            sp[l] = PplusMat (l*istride+s1*ostride,s2);
+            sm[l] = PminusMat(l*istride+s1*ostride,s2);
+          } else {
+            // if real
+            scalar_type tmp;
+            tmp = PplusMat (l*istride+s1*ostride,s2);
+            sp[l] = scalar_type(tmp.real(),tmp.real());
+            tmp = PminusMat(l*istride+s1*ostride,s2);
+            sm[l] = scalar_type(tmp.real(),tmp.real());
+          }
+        }
+        Matp[LLs*s2+s1] = Vp;
+        Matm[LLs*s2+s1] = Vm;
+      }}
+  }
+  // NOTE(review): presumably these project macros emit the explicit template instantiations of MobiusEOFAFermion -- confirm against their definitions.
+  FermOpTemplateInstantiate(MobiusEOFAFermion);
+  GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
+
+}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h
@@ -0,0 +1,133 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef  GRID_QCD_MOBIUS_EOFA_FERMION_H
+#define  GRID_QCD_MOBIUS_EOFA_FERMION_H
+
+#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+  // Mobius EOFA fermion operator: derives from AbstractEOFAFermion<Impl> and adds the shift-operator coefficient tables and s-space kernels declared below.
+  template<class Impl>
+  class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
+  {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+    public:
+      // Shift operator coefficients for red-black preconditioned Mobius EOFA
+      std::vector<Coeff_t> Mooee_shift;
+      std::vector<Coeff_t> MooeeInv_shift_lc;
+      std::vector<Coeff_t> MooeeInv_shift_norm;
+      std::vector<Coeff_t> MooeeInvDag_shift_lc;
+      std::vector<Coeff_t> MooeeInvDag_shift_norm;
+
+      virtual void Instantiatable(void) {}; // empty body
+
+      // EOFA-specific operations
+      virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
+      virtual void  Dtilde           (const FermionField& in, FermionField& out);
+      virtual void  DtildeInv        (const FermionField& in, FermionField& out);
+
+      // override multiply
+      virtual RealD M                (const FermionField& in, FermionField& out);
+      virtual RealD Mdag             (const FermionField& in, FermionField& out);
+
+      // half checkerboard operations
+      virtual void  Mooee            (const FermionField& in, FermionField& out);
+      virtual void  MooeeDag         (const FermionField& in, FermionField& out);
+      virtual void  MooeeInv         (const FermionField& in, FermionField& out);
+      virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
+      virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
+      virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);
+
+      virtual void   M5D             (const FermionField& psi, FermionField& chi);
+      virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);
+
+      /////////////////////////////////////////////////////
+      // Instantiate different versions depending on Impl
+      /////////////////////////////////////////////////////
+      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
+        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+      // _shift variants take one extra per-slice coefficient table (shift_coeffs)
+      void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
+        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+        std::vector<Coeff_t>& shift_coeffs);
+
+      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
+        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+
+      void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
+        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+        std::vector<Coeff_t>& shift_coeffs);
+
+      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
+      // Fills Matp/Matm with the (optionally inverted / adjointed) s-space matrices
+      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+
+      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
+        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+
+      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
+        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
+      // Stores new_shift and recomputes (or zeroes) the shift coefficient tables
+      virtual void RefreshShiftCoefficients(RealD new_shift);
+
+      // Constructors
+      MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
+        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
+        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
+        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());
+
+    protected:
+      void SetCoefficientsPrecondShiftOps(void); // fills the five *_shift* tables above from the current shift
+  };
+}}
+// Expands to explicit instantiations, for implementation type A, of the MobiusEOFAFermion member functions listed below.
+#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
+template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
+  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
+template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
+  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
+template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
+  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
+template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
+  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
+template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
+template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
+template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
+template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
+// Dperp switches: CACHE and VEC are defined, DENSE and LINALG are undefined. NOTE(review): presumably these gate which implementation files compile -- confirm.
+#undef  MOBIUS_EOFA_DPERP_DENSE
+#define MOBIUS_EOFA_DPERP_CACHE
+#undef  MOBIUS_EOFA_DPERP_LINALG
+#define MOBIUS_EOFA_DPERP_VEC
+
+#endif
--- a/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
@@ -0,0 +1,429 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // For every block of Ls consecutive outer sites, writes
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s_up]) + lower[s]*spProj5p(psi[s_dn])
+  // where s_up / s_dn are the neighbouring slices, wrapping around at the two ends.
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
+    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
+  {
+    int Ls = this->Ls;
+    GridBase *grid = psi._grid;
+
+    assert(phi.checkerboard == psi.checkerboard);
+    chi.checkerboard = psi.checkerboard;
+
+    // Flops = 6.0*(Nc*Ns) *Ls*vol
+    this->M5Dcalls++;
+    this->M5Dtime -= usecond();
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
+      for(int s=0; s<Ls; s++){
+        auto tmp = psi._odata[0];
+
+        // neighbour slices: the last slice wraps up to 0, the first wraps down to Ls-1
+        const bool first = (s == 0);
+        const bool last  = !first && (s == (Ls-1));
+        const int  s_up  = last  ? 0    : s+1;
+        const int  s_dn  = first ? Ls-1 : s-1;
+
+        // diagonal + upper term
+        spProj5m(tmp, psi._odata[ss+s_up]);
+        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+        // lower term
+        spProj5p(tmp, psi._odata[ss+s_dn]);
+        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+      }
+    }
+
+    this->M5Dtime += usecond();
+  }
+
+  // Same stencil as the six-argument M5D, fused with the shift term: every slice s additionally
+  // receives shift_coeffs[s] times the spProj5p (pm == 1) or spProj5m (otherwise) projection
+  // of psi at slice shift_s.
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
+    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
+    std::vector<Coeff_t> &shift_coeffs)
+  {
+    int Ls = this->Ls;
+    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
+    GridBase *grid = psi._grid;
+
+    assert(phi.checkerboard == psi.checkerboard);
+    chi.checkerboard = psi.checkerboard;
+
+    // Flops = 6.0*(Nc*Ns) *Ls*vol
+    this->M5Dcalls++;
+    this->M5Dtime -= usecond();
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
+      for(int s=0; s<Ls; s++){
+        auto tmp = psi._odata[0];
+
+        // neighbour slices: the last slice wraps up to 0, the first wraps down to Ls-1
+        const bool first = (s == 0);
+        const bool last  = !first && (s == (Ls-1));
+        const int  s_up  = last  ? 0    : s+1;
+        const int  s_dn  = first ? Ls-1 : s-1;
+
+        spProj5m(tmp, psi._odata[ss+s_up]);
+        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+        spProj5p(tmp, psi._odata[ss+s_dn]);
+        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+
+        // shift term, always sourced from slice shift_s
+        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
+        else             { spProj5m(tmp, psi._odata[ss+shift_s]); }
+        chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
+      }
+    }
+
+    this->M5Dtime += usecond();
+  }
+
+  // Daggered counterpart of the six-argument M5D.  For every block of Ls consecutive outer
+  // sites it writes
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s_up]) + lower[s]*spProj5m(psi[s_dn])
+  // where s_up / s_dn are the neighbouring slices, wrapping around at the two ends.
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
+    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
+  {
+    int Ls = this->Ls;
+    GridBase *grid = psi._grid;
+
+    assert(phi.checkerboard == psi.checkerboard);
+    chi.checkerboard = psi.checkerboard;
+
+    // Flops = 6.0*(Nc*Ns) *Ls*vol
+    this->M5Dcalls++;
+    this->M5Dtime -= usecond();
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
+      auto tmp = psi._odata[0];
+      for(int s=0; s<Ls; s++){
+        // neighbour slices: the last slice wraps up to 0, the first wraps down to Ls-1
+        const bool first = (s == 0);
+        const bool last  = !first && (s == (Ls-1));
+        const int  s_up  = last  ? 0    : s+1;
+        const int  s_dn  = first ? Ls-1 : s-1;
+
+        // diagonal + upper term
+        spProj5p(tmp, psi._odata[ss+s_up]);
+        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+        // lower term
+        spProj5m(tmp, psi._odata[ss+s_dn]);
+        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+      }
+    }
+
+    this->M5Dtime += usecond();
+  }
+
+  // Daggered counterpart of M5D_shift.  Each slice gets the same stencil as the six-argument
+  // M5Ddag; in addition slice shift_s receives sum_s shift_coeffs[s] * P(psi[s]), with P being
+  // spProj5p for pm == 1 and spProj5m otherwise.
+  // The shift sum is collected in a local accumulator and added once per block.  The
+  // two-argument M5Ddag passes the same field as phi and chi, so modifying chi[ss+shift_s]
+  // (or pre-zeroing chi[ss+Ls-1]) inside the s loop would overwrite phi before it is read.
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
+    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
+    std::vector<Coeff_t> &shift_coeffs)
+  {
+    int Ls = this->Ls;
+    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
+    GridBase *grid = psi._grid;
+
+    assert(phi.checkerboard == psi.checkerboard);
+    chi.checkerboard = psi.checkerboard;
+
+    // Flops = 6.0*(Nc*Ns) *Ls*vol
+    this->M5Dcalls++;
+    this->M5Dtime -= usecond();
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
+      auto tmp = psi._odata[0];
+      auto shift_sum = psi._odata[0]; // same site type as tmp; reset on the next line
+      shift_sum = zero;
+      for(int s=0; s<Ls; s++){
+        const bool first = (s == 0);
+        const bool last  = !first && (s == (Ls-1));
+        const int  s_up  = last  ? 0    : s+1; // last slice wraps up to 0
+        const int  s_dn  = first ? Ls-1 : s-1; // first slice wraps down to Ls-1
+        spProj5p(tmp, psi._odata[ss+s_up]);
+        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+        spProj5m(tmp, psi._odata[ss+s_dn]);
+        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
+        else             { spProj5m(tmp, psi._odata[ss+s]); }
+        shift_sum = shift_sum + shift_coeffs[s]*tmp;
+      }
+      chi[ss+shift_s] = chi[ss+shift_s] + shift_sum; // every phi slice has been consumed by now
+    }
+
+    this->M5Dtime += usecond();
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+  {
+    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; } // non-zero shift: dedicated routine
+
+    const int Ls   = this->Ls;
+    GridBase *grid = psi._grid;
+    chi.checkerboard = psi.checkerboard;
+
+    this->MooeeInvCalls++;
+    this->MooeeInvTime -= usecond();
+
+    // Each iteration owns one block of Ls consecutive s-slices and runs four sweeps over it.
+    parallel_for(int base=0; base<grid->oSites(); base+=Ls){
+      const int last = base+Ls-1; // index of this block's s = Ls-1 slice
+      auto proj = psi._odata[0];
+
+      // Sweep 1, (L^{\prime})^{-1}: forward recursion chi[s] = psi[s] - lee[s-1]*spProj5p(chi[s-1])
+      chi[base] = psi[base];
+      for(int s=1; s<Ls; s++){
+        spProj5p(proj, chi[base+s-1]);
+        chi[base+s] = psi[base+s] - this->lee[s-1]*proj;
+      }
+
+      // Sweep 2, L_m^{-1}: subtract leem[s]*spProj5m(chi[s]) of every earlier slice from the last slice
+      for(int s=0; s<Ls-1; s++){
+        spProj5m(proj, chi[base+s]);
+        chi[last] = chi[last] - this->leem[s]*proj;
+      }
+
+      // Sweep 3, U_m^{-1} D^{-1}: rescale by 1/dee; the last slice is read here before it is rescaled
+      for(int s=0; s<Ls-1; s++){
+        spProj5p(proj, chi[last]);
+        chi[base+s] = (1.0/this->dee[s])*chi[base+s] - (this->ueem[s]/this->dee[Ls-1])*proj;
+      }
+      chi[last] = (1.0/this->dee[Ls-1])*chi[last];
+
+      // Sweep 4, U^{-1}: backward recursion chi[s] -= uee[s]*spProj5m(chi[s+1])
+      for(int s=Ls-2; s>=0; s--){
+        spProj5m(proj, chi[base+s+1]);
+        chi[base+s] = chi[base+s] - this->uee[s]*proj;
+      }
+    }
+
+    this->MooeeInvTime += usecond();
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
+  {
+    GridBase *grid = psi._grid;
+    int Ls = this->Ls;
+    // Same four sweeps as MooeeInv, plus MooeeInv_shift_norm[s] times a projected sum over psi added to every slice.
+    chi.checkerboard = psi.checkerboard;
+
+    this->MooeeInvCalls++;
+    this->MooeeInvTime -= usecond();
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
+      // tmp1: scratch for projections; tmp2: running sum over s; tmp2_spProj: projection of the finished sum
+      auto tmp1        = psi._odata[0];
+      auto tmp2        = psi._odata[0];
+      auto tmp2_spProj = psi._odata[0];
+
+      // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
+      chi[ss] = psi[ss]; // chi[0]=psi[0]
+      tmp2 = MooeeInv_shift_lc[0]*psi[ss];
+      for(int s=1; s<Ls; s++){
+        spProj5p(tmp1, chi[ss+s-1]);
+        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
+        tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
+      }
+      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);} // pm selects which projection of the sum is used
+      else{ spProj5m(tmp2_spProj, tmp2); }
+
+      // L_m^{-1}
+      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+        spProj5m(tmp1, chi[ss+s]);
+        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
+      }
+
+      // U_m^{-1} D^{-1}
+      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+        spProj5p(tmp1, chi[ss+Ls-1]);
+        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
+      }
+      // Last slice: rescale, capture its projection in tmp1 for the U^{-1} sweep, and only then add its shift term.
+      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
+      spProj5m(tmp1, chi[ss+Ls-1]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
+
+      // Apply U^{-1} and add shift term; tmp1 always holds the slice above as it was before its shift term was added
+      for(int s=Ls-2; s>=0; s--){
+        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
+        spProj5m(tmp1, chi[ss+s]);
+        chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
+      }
+    }
+
+    this->MooeeInvTime += usecond();
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
+  {
+    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
+    // Unshifted adjoint inverse. Every coefficient goes through conjugate() so the result is the adjoint
+    // for complex Coeff_t as well; this matches the axpby_ssp-based implementation of MooeeInvDag.
+    GridBase *grid = psi._grid;
+    int Ls = this->Ls;
+    chi.checkerboard = psi.checkerboard;
+
+    this->MooeeInvCalls++;
+    this->MooeeInvTime -= usecond();
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
+
+      auto tmp = psi._odata[0];
+
+      // Apply (U^{\prime})^{-dag}: forward recursion over s
+      chi[ss] = psi[ss];
+      for(int s=1; s<Ls; s++){
+        spProj5m(tmp, chi[ss+s-1]);
+        chi[ss+s] = psi[ss+s] - conjugate(this->uee[s-1])*tmp;
+      }
+
+      // U_m^{-\dag}: fold every earlier slice into the last one
+      for(int s=0; s<Ls-1; s++){
+        spProj5p(tmp, chi[ss+s]);
+        chi[ss+Ls-1] = chi[ss+Ls-1] - conjugate(this->ueem[s])*tmp;
+      }
+
+      // L_m^{-\dag} D^{-dag}: the last slice is read here before it is rescaled below
+      for(int s=0; s<Ls-1; s++){
+        spProj5m(tmp, chi[ss+Ls-1]);
+        chi[ss+s] = (1.0/conjugate(this->dee[s]))*chi[ss+s] - conjugate(this->leem[s]/this->dee[Ls-1])*tmp;
+      }
+      chi[ss+Ls-1] = (1.0/conjugate(this->dee[Ls-1]))*chi[ss+Ls-1];
+
+      // Apply L^{-dag}: backward recursion over s
+      for(int s=Ls-2; s>=0; s--){
+        spProj5p(tmp, chi[ss+s+1]);
+        chi[ss+s] = chi[ss+s] - conjugate(this->lee[s])*tmp;
+      }
+    }
+
+    this->MooeeInvTime += usecond();
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
+  {
+    GridBase *grid = psi._grid;
+    int Ls = this->Ls;
+    // Adjoint inverse with shift. uee/ueem/dee/leem/lee are conjugated, as in the axpby_ssp-based version.
+    chi.checkerboard = psi.checkerboard;
+
+    this->MooeeInvCalls++;
+    this->MooeeInvTime -= usecond();
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
+      // tmp1: scratch for projections; tmp2: running sum over s; tmp2_spProj: projection of the finished sum
+      auto tmp1        = psi._odata[0];
+      auto tmp2        = psi._odata[0];
+      auto tmp2_spProj = psi._odata[0];
+
+      // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2 (table used as stored)
+      chi[ss] = psi[ss];
+      tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
+      for(int s=1; s<Ls; s++){
+        spProj5m(tmp1, chi[ss+s-1]);
+        chi[ss+s] = psi[ss+s] - conjugate(this->uee[s-1])*tmp1;
+        tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
+      }
+      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
+      else{ spProj5m(tmp2_spProj, tmp2); }
+
+      // U_m^{-\dag}
+      for(int s=0; s<Ls-1; s++){
+        spProj5p(tmp1, chi[ss+s]);
+        chi[ss+Ls-1] = chi[ss+Ls-1] - conjugate(this->ueem[s])*tmp1;
+      }
+
+      // L_m^{-\dag} D^{-dag}
+      for(int s=0; s<Ls-1; s++){
+        spProj5m(tmp1, chi[ss+Ls-1]);
+        chi[ss+s] = (1.0/conjugate(this->dee[s]))*chi[ss+s] - conjugate(this->leem[s]/this->dee[Ls-1])*tmp1;
+      }
+      chi[ss+Ls-1] = (1.0/conjugate(this->dee[Ls-1]))*chi[ss+Ls-1];
+      spProj5p(tmp1, chi[ss+Ls-1]); // projected before the shift term is added to this slice
+      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
+
+      // Apply L^{-dag}; tmp1 always holds the slice above as it was before its shift term was added
+      for(int s=Ls-2; s>=0; s--){
+        chi[ss+s] = chi[ss+s] - conjugate(this->lee[s])*tmp1;
+        spProj5p(tmp1, chi[ss+s]);
+        chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
+      }
+    }
+
+    this->MooeeInvTime += usecond();
+  }
+
+  #ifdef MOBIUS_EOFA_DPERP_CACHE
+    // Instantiations for this variant, emitted only when MOBIUS_EOFA_DPERP_CACHE is defined (macro defined elsewhere).
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+    // NOTE(review): the FH/DF implementations below look like mixed-precision counterparts of the six above -- confirm.
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+  #endif
+
+}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
@@ -0,0 +1,184 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+  /*
+  * Dense matrix versions of routines
+  */
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi) {
+    // Inverse without dagger: MooeeInternal builds the s-space matrices and applies their inverse.
+    this->MooeeInternal(psi, chi, /*dag=*/DaggerNo, /*inv=*/InverseYes);
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi) {
+    // Same call as MooeeInv: MooeeInternal tests this->shift itself and adds the shift term when it is non-zero.
+    this->MooeeInternal(psi, chi, /*dag=*/DaggerNo, /*inv=*/InverseYes);
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi) {
+    // Inverse with dagger: MooeeInternal inverts the s-space matrices and then takes their adjoint.
+    this->MooeeInternal(psi, chi, /*dag=*/DaggerYes, /*inv=*/InverseYes);
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi) {
+    // Same call as MooeeInvDag: MooeeInternal tests this->shift itself and adds the shift term when it is non-zero.
+    this->MooeeInternal(psi, chi, /*dag=*/DaggerYes, /*inv=*/InverseYes);
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+  {
+    // Dense s-space form of Mooee. One Ls x Ls matrix is built per projector (Pplus multiplies
+    // spProj5p(psi), Pminus multiplies spProj5m(psi)); both are inverted when inv is non-zero and
+    // replaced by their adjoint when dag is non-zero, then applied to each block of Ls slices.
+    int Ls = this->Ls;
+    int LLs = psi._grid->_rdimensions[0];
+    int vol = psi._grid->oSites()/LLs;
+
+    int pm      = this->pm;
+    RealD shift = this->shift;
+    RealD alpha = this->alpha;
+    RealD k     = this->k;
+    RealD mq1   = this->mq1;
+
+    chi.checkerboard = psi.checkerboard;
+    assert(Ls==LLs);
+
+    Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+    Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+
+    for(int s=0;s<Ls;s++){ // diagonal
+        Pplus(s,s)  = this->bee[s];
+        Pminus(s,s) = this->bee[s];
+    }
+    for(int s=0; s<Ls-1; s++){ // nearest-neighbour couplings: Pminus above the diagonal, Pplus below
+        Pminus(s,s+1) = -this->cee[s];
+        Pplus(s+1,s) = -this->cee[s+1];
+    }
+    Pplus (0,Ls-1) = mq1*this->cee[0]; // corner entries linking the two end slices, weighted by mq1
+    Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
+
+    // Shift term. It multiplies psi[Ls-1] (through Pplus) when pm == 1 and psi[0] (through Pminus)
+    // otherwise, so the affected column is Ls-1 or 0 respectively, as in M5D_shift / M5Ddag_shift.
+    if(shift != 0.0){
+      Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
+      for(int s=0; s<Ls; ++s){
+        const Coeff_t term = shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
+        if(pm == 1){ Pplus(s,Ls-1) += term; }
+        else{ Pminus(Ls-1-s,0) -= term; }
+      }
+    }
+
+    Eigen::MatrixXd PplusMat ;
+    Eigen::MatrixXd PminusMat;
+
+    if(inv){
+      PplusMat  = Pplus.inverse();
+      PminusMat = Pminus.inverse();
+    } else {
+      PplusMat  = Pplus;
+      PminusMat = Pminus;
+    }
+
+    if(dag){
+      PplusMat.adjointInPlace();
+      PminusMat.adjointInPlace();
+    }
+
+    // For the non-vectorised s-direction this is simple: chi[s1] = 0.5 * sum_s2 of both projector contributions
+    for(auto site=0; site<vol; site++){
+
+        SiteSpinor     SiteChi;
+        SiteHalfSpinor SitePplus;
+        SiteHalfSpinor SitePminus;
+
+        for(int s1=0; s1<Ls; s1++){
+            SiteChi = zero;
+            for(int s2=0; s2<Ls; s2++){
+                int lex2 = s2 + Ls*site;
+                if(PplusMat(s1,s2) != 0.0){ // skip exact zeros of the matrix
+                    spProj5p(SitePplus,psi[lex2]);
+                    accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
+                }
+                if(PminusMat(s1,s2) != 0.0){
+                    spProj5m(SitePminus, psi[lex2]);
+                    accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
+                }
+            }
+            chi[s1+Ls*site] = SiteChi*0.5;
+        }
+    }
+  }
+
+  #ifdef MOBIUS_EOFA_DPERP_DENSE
+    // Instantiations for the dense variant, emitted only when MOBIUS_EOFA_DPERP_DENSE is defined (macro defined elsewhere).
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+    // MooeeInternal is instantiated explicitly, one line per implementation, for the same six types.
+    template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    // NOTE(review): the FH/DF implementations below look like mixed-precision counterparts of the six above -- confirm.
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+    template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+  #endif
+
+}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
@@ -0,0 +1,290 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+  // Pminus forwards
+  // Pplus  backwards
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+  {
+    // Reading axpby_ssp_pX(z,a,x,b,y,s,sp) as z[s] = a*x[s] + b*P_X y[sp], every slice gets
+    //   chi[s] = diag[s]*phi[s] + upper[s]*P_- psi[s_up] + lower[s]*P_+ psi[s_dn].
+    Coeff_t unit(1.0);
+    const int Ls = this->Ls;
+
+    for(int s=0; s<Ls; s++){
+      int s_up = s+1; // slice passed through P_-
+      int s_dn = s-1; // slice passed through P_+
+      if(s==0){ s_dn = Ls-1; }          // first slice: lower neighbour wraps to the far end
+      else if(s==(Ls-1)){ s_up = 0; }   // last slice: upper neighbour wraps to the start
+
+      axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s_up);
+      axpby_ssp_pplus (chi, unit, chi, lower[s], psi, s, s_dn); // x operand is chi, so this adds to the line above
+    }
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+    std::vector<Coeff_t>& shift_coeffs)
+  {
+    // Same stencil as M5D, followed on each slice by the shift term
+    //   chi[s] += shift_coeffs[s] * P_+ psi[Ls-1]  (pm == 1)   or   shift_coeffs[s] * P_- psi[0]  (otherwise).
+    Coeff_t unit(1.0);
+    const int  Ls        = this->Ls;
+    const bool use_pplus = (this->pm == 1);
+    for(int s=0; s<Ls; s++){
+      int s_up = s+1, s_dn = s-1; // interior neighbours; the two end slices wrap around
+      if(s==0){ s_dn = Ls-1; }
+      else if(s==(Ls-1)){ s_up = 0; }
+
+      axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s_up);
+      axpby_ssp_pplus (chi, unit, chi, lower[s], psi, s, s_dn);
+
+      if(use_pplus){ axpby_ssp_pplus(chi, unit, chi, shift_coeffs[s], psi, s, Ls-1); }
+      else{ axpby_ssp_pminus(chi, unit, chi, shift_coeffs[s], psi, s, 0); }
+    }
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+  {
+    // Reading axpby_ssp_pX(z,a,x,b,y,s,sp) as z[s] = a*x[s] + b*P_X y[sp], every slice gets
+    //   chi[s] = diag[s]*phi[s] + upper[s]*P_+ psi[s_up] + lower[s]*P_- psi[s_dn].
+    Coeff_t unit(1.0);
+    const int Ls = this->Ls;
+
+    for(int s=0; s<Ls; s++){
+      int s_up = s+1; // slice passed through P_+
+      int s_dn = s-1; // slice passed through P_-
+      if(s==0){ s_dn = Ls-1; }          // first slice: lower neighbour wraps to the far end
+      else if(s==(Ls-1)){ s_up = 0; }   // last slice: upper neighbour wraps to the start
+
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s_up);
+      axpby_ssp_pminus(chi, unit, chi, lower[s], psi, s, s_dn); // x operand is chi, so this adds to the line above
+    }
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+    std::vector<Coeff_t>& shift_coeffs)
+  {
+    // M5Ddag stencil on every slice, then the shift term: the slice Ls-1 (pm == 1) or 0 (otherwise)
+    // additionally receives sum_s shift_coeffs[s] * P_+/- psi[s].
+    Coeff_t one(1.0);
+    int Ls = this->Ls;
+    for(int s=0; s<Ls; s++){
+      const int s_up = (s==(Ls-1)) ? 0 : s+1;    // cyclic upper neighbour
+      const int s_dn = (s==0) ? (Ls-1) : s-1;    // cyclic lower neighbour
+      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s_up);
+      axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s_dn);
+    }
+    // The shift pass runs only after every slice holds its stencil value. Interleaving it with the loop
+    // above would, for pm == 1, add into chi[Ls-1] before that slice is written and then overwrite the sum.
+    for(int s=0; s<Ls; s++){
+      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
+      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
+    }
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+  {
+    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; } // non-zero shift: dedicated routine
+
+    // Four in-place sweeps over the s slices of chi, reading axpby_ssp_pX(z,a,x,b,y,s,sp)
+    // as z[s] = a*x[s] + b*P_X y[sp].
+    Coeff_t unit(1.0), none(0.0);
+    const int Ls   = this->Ls;
+    const int last = Ls-1;
+    chi.checkerboard = psi.checkerboard;
+
+    // (L^{\prime})^{-1}: chi[0] = psi[0], then chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
+    axpby_ssp(chi, unit, psi, none, psi, 0, 0);
+    for(int s=1; s<Ls; s++){
+      axpby_ssp_pplus(chi, unit, psi, -this->lee[s-1], chi, s, s-1);
+    }
+
+    // L_m^{-1}: chi[Ls-1] -= leem[s] P_- chi[s] for each s < Ls-1
+    for(int s=0; s<last; s++){
+      axpby_ssp_pminus(chi, unit, chi, -this->leem[s], chi, last, s);
+    }
+
+    // U_m^{-1} D^{-1}: chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]; the last slice is only rescaled
+    for(int s=0; s<last; s++){
+      axpby_ssp_pplus(chi, unit/this->dee[s], chi, -this->ueem[s]/this->dee[last], chi, s, last);
+    }
+    axpby_ssp(chi, unit/this->dee[last], chi, none, chi, last, last);
+
+    // U^{-1}: backward recursion chi[s] -= uee[s] P_- chi[s+1]
+    for(int s=last-1; s>=0; s--){ axpby_ssp_pminus(chi, unit, chi, -this->uee[s], chi, s, s+1); }
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
+  {
+    // MooeeInv for a non-zero shift: the unshifted inverse, plus on every slice s the term
+    // MooeeInv_shift_norm[s] * P_pm ( sum_j MooeeInv_shift_lc[j]*psi[j] ), P_+ when pm == 1, else P_-.
+    Coeff_t one(1.0);
+    Coeff_t czero(0.0);
+    chi.checkerboard = psi.checkerboard;
+    int Ls = this->Ls;
+    FermionField tmp(psi._grid); // only slice 0 is used; it holds the running sum over j
+    tmp.checkerboard = psi.checkerboard;
+
+    // Apply (L^{\prime})^{-1}
+    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+    // Seed the sum with psi as the zero-weighted operand: tmp has not been assigned yet, and 0*tmp is
+    // not guaranteed to be 0 if its storage happens to hold NaN/Inf.
+    axpby_ssp(tmp, czero, psi, this->MooeeInv_shift_lc[0], psi, 0, 0);
+    for(int s=1; s<Ls; s++){
+      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+      axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
+    }
+    // L_m^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
+    }
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){
+      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
+    }
+    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
+    // Apply U^{-1} to completion first: the recursion must read slices without their shift term
+    // (for pm != 1 the shift term survives P_- and would otherwise leak into the slice below).
+    for(int s=Ls-2; s>=0; s--){ axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); }
+    for(int s=0; s<Ls; s++){ // now add the shift term to every slice
+      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
+      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
+    }
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+  {
+    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; } // non-zero shift: dedicated routine
+
+    // Adjoint counterpart of MooeeInv: same four sweeps with the roles of the l/u coefficient tables and
+    // of P_+/P_- exchanged, and every coefficient conjugated.
+    Coeff_t unit(1.0), none(0.0);
+    const int Ls   = this->Ls;
+    const int last = Ls-1;
+    chi.checkerboard = psi.checkerboard;
+
+    // (U^{\prime})^{-dagger}: chi[0] = psi[0], then chi[s] = psi[s] - conj(uee[s-1]) P_- chi[s-1]
+    axpby_ssp(chi, unit, psi, none, psi, 0, 0);
+    for(int s=1; s<Ls; s++){
+      axpby_ssp_pminus(chi, unit, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+    }
+
+    // U_m^{-\dagger}: chi[Ls-1] -= conj(ueem[s]) P_+ chi[s] for each s < Ls-1
+    for(int s=0; s<last; s++){
+      axpby_ssp_pplus(chi, unit, chi, -conjugate(this->ueem[s]), chi, last, s);
+    }
+
+    // L_m^{-\dagger} D^{-dagger}: rescale by 1/conj(dee); the last slice is read before it is rescaled
+    for(int s=0; s<last; s++){
+      axpby_ssp_pminus(chi, unit/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[last]), chi, s, last);
+    }
+    axpby_ssp(chi, unit/conjugate(this->dee[last]), chi, none, chi, last, last);
+
+    // L^{-dagger}: backward recursion chi[s] -= conj(lee[s]) P_+ chi[s+1]
+    for(int s=last-1; s>=0; s--){ axpby_ssp_pplus(chi, unit, chi, -conjugate(this->lee[s]), chi, s, s+1); }
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
+  {
+    // MooeeInvDag for a non-zero shift: the unshifted adjoint inverse, plus on every slice s the term
+    // MooeeInvDag_shift_norm[s] * P_pm ( sum_j MooeeInvDag_shift_lc[j]*psi[j] ), P_+ when pm == 1, else P_-.
+    Coeff_t one(1.0);
+    Coeff_t czero(0.0);
+    chi.checkerboard = psi.checkerboard;
+    int Ls = this->Ls;
+    FermionField tmp(psi._grid); // only slice 0 is used; it holds the running sum over j
+    tmp.checkerboard = psi.checkerboard;
+
+    // Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
+    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
+    // Seed the sum with psi as the zero-weighted operand: tmp has not been assigned yet, and 0*tmp is
+    // not guaranteed to be 0 if its storage happens to hold NaN/Inf.
+    axpby_ssp(tmp, czero, psi, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
+    for(int s=1; s<Ls; s++){
+      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
+      axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
+    }
+    // U_m^{-\dagger}
+    for(int s=0; s<Ls-1; s++){
+      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
+    }
+    // L_m^{-\dagger} D^{-dagger}
+    for(int s=0; s<Ls-1; s++){
+      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
+    }
+    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
+    // Apply L^{-dagger} to completion first: the recursion must read slices without their shift term
+    // (for pm == 1 the shift term survives P_+ and would otherwise leak into the slice below).
+    for(int s=Ls-2; s>=0; s--){ axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); }
+    for(int s=0; s<Ls; s++){ // now add the shift term to every slice
+      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
+      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
+    }
+  }
+
+  #ifdef MOBIUS_EOFA_DPERP_LINALG
+    // Instantiations for this variant, emitted only when MOBIUS_EOFA_DPERP_LINALG is defined (macro defined elsewhere).
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+    // NOTE(review): the FH/DF implementations below look like mixed-precision counterparts of the six above -- confirm.
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+  #endif
+
+}}
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
@@ -0,0 +1,983 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+  /*
+  * Dense matrix versions of routines
+  */
+  template<class Impl> // MooeeInv: thin wrapper that forwards to MooeeInternal with the flags (DaggerNo, InverseYes).
+  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+  { // psi is the read-only argument, chi the non-const one; both are passed through unchanged.
+    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
+  }
+
+  template<class Impl> // MooeeInv_shift: thin wrapper that forwards to MooeeInternal with the flags (DaggerNo, InverseYes).
+  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
+  { // NOTE(review): the call below is identical to the one in MooeeInv; no shift-specific argument is visible here - confirm MooeeInternal accounts for the shift.
+    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
+  }
+
+  template<class Impl> // MooeeInvDag: thin wrapper that forwards to MooeeInternal with the flags (DaggerYes, InverseYes).
+  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+  { // psi is the read-only argument, chi the non-const one; both are passed through unchanged.
+    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
+  }
+
+  template<class Impl> // MooeeInvDag_shift: thin wrapper that forwards to MooeeInternal with the flags (DaggerYes, InverseYes).
+  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
+  { // NOTE(review): the call below is identical to the one in MooeeInvDag; no shift-specific argument is visible here - confirm MooeeInternal accounts for the shift.
+    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
+  }
+
+  template<class Impl> // M5D: per block of LLs outer sites, chi[v] = d*phi[v] + l*(spin rows 0,1 of psi[v-1]) in rows 0,1 and d*phi[v] + u*(spin rows 2,3 of psi[v+1]) in rows 2,3, with v cyclic.
+  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+  { // psi: neighbour source; phi: diagonal source; chi: output; lower/diag/upper are read at indices o + i*LLs for o < LLs, i < nsimd.
+    GridBase* grid  = psi._grid;
+    int Ls          = this->Ls;
+    int LLs         = grid->_rdimensions[0];
+    const int nsimd = Simd::Nsimd();
+    // One SIMD coefficient vector per outer index v in [0, LLs).
+    Vector<iSinglet<Simd>> u(LLs);
+    Vector<iSinglet<Simd>> l(LLs);
+    Vector<iSinglet<Simd>> d(LLs);
+    // Sanity checks: Ls/LLs (integer division) must equal the SIMD width, and psi/phi must share a checkerboard.
+    assert(Ls/LLs == nsimd);
+    assert(phi.checkerboard == psi.checkerboard);
+
+    chi.checkerboard = psi.checkerboard;
+
+    // View the SIMD coefficient vectors as flat scalar arrays (type pun) so that individual lanes can be written.
+    typedef typename Simd::scalar_type scalar_type;
+    scalar_type* u_p = (scalar_type*) &u[0];
+    scalar_type* l_p = (scalar_type*) &l[0];
+    scalar_type* d_p = (scalar_type*) &d[0];
+    // Scatter: the coefficient with index s = o + i*LLs is stored in lane i of vector o.
+    for(int o=0; o<LLs; o++){ // outer
+    for(int i=0; i<nsimd; i++){ //inner
+      int s   = o + i*LLs;
+      int ss  = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+    this->M5Dcalls++;
+    this->M5Dtime -= usecond(); // start of timed region; the matching += usecond() is at the end of the function
+
+    assert(Nc == 3); // the unrolled kernel below only touches colour indices 0..2
+
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // ss advances by LLs: each iteration handles one block of LLs consecutive outer sites
+
+      #if 0
+        // Disabled reference implementation written with the spProj5*/spRecon5* helpers.
+        alignas(64) SiteHalfSpinor hp;
+        alignas(64) SiteHalfSpinor hm;
+        alignas(64) SiteSpinor fp;
+        alignas(64) SiteSpinor fm;
+
+        for(int v=0; v<LLs; v++){
+
+          int vp = (v+1)%LLs;
+          int vm = (v+LLs-1)%LLs;
+
+          spProj5m(hp, psi[ss+vp]);
+          spProj5p(hm, psi[ss+vm]);
+
+          if (vp <= v){ rotate(hp, hp, 1); }
+          if (vm >= v){ rotate(hm, hm, nsimd-1); }
+
+          hp = 0.5*hp;
+          hm = 0.5*hm;
+
+          spRecon5m(fp, hp);
+          spRecon5p(fm, hm);
+
+          chi[ss+v] = d[v]*phi[ss+v];
+          chi[ss+v] = chi[ss+v] + u[v]*fp;
+          chi[ss+v] = chi[ss+v] + l[v]*fm;
+
+        }
+
+      #else
+        // Active path: hand-unrolled over 4 spin rows x 3 colour components.
+        for(int v=0; v<LLs; v++){
+          // NOTE(review): in the last block ss+v+LLs is >= oSites(); presumably harmless for a prefetch - confirm.
+          vprefetch(psi[ss+v+LLs]);
+          // Cyclic neighbours of v inside the block.
+          int vp = (v == LLs-1) ? 0     : v+1;
+          int vm = (v == 0)     ? LLs-1 : v-1;
+          // hp_*: spin rows 2,3 of the v+1 neighbour; hm_* (below): spin rows 0,1 of the v-1 neighbour.
+          Simd hp_00 = psi[ss+vp]()(2)(0);
+          Simd hp_01 = psi[ss+vp]()(2)(1);
+          Simd hp_02 = psi[ss+vp]()(2)(2);
+          Simd hp_10 = psi[ss+vp]()(3)(0);
+          Simd hp_11 = psi[ss+vp]()(3)(1);
+          Simd hp_12 = psi[ss+vp]()(3)(2);
+
+          Simd hm_00 = psi[ss+vm]()(0)(0);
+          Simd hm_01 = psi[ss+vm]()(0)(1);
+          Simd hm_02 = psi[ss+vm]()(0)(2);
+          Simd hm_10 = psi[ss+vm]()(1)(0);
+          Simd hm_11 = psi[ss+vm]()(1)(1);
+          Simd hm_12 = psi[ss+vm]()(1)(2);
+          // Wrap-around of v+1 (vp <= v): rotate the SIMD lanes; tRotate<2> presumably shifts by one complex lane - confirm.
+          if(vp <= v){
+            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+          }
+          // Wrap-around of v-1 (vm >= v): rotate with 2*Nsimd-2, presumably the opposite-direction counterpart - confirm.
+          if(vm >= v){
+            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+          }
+
+          // Can force these to real arithmetic and save 2x.
+          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+          // Store all twelve components of chi[ss+v] through vstream.
+          vstream(chi[ss+v]()(0)(0), p_00);
+          vstream(chi[ss+v]()(0)(1), p_01);
+          vstream(chi[ss+v]()(0)(2), p_02);
+          vstream(chi[ss+v]()(1)(0), p_10);
+          vstream(chi[ss+v]()(1)(1), p_11);
+          vstream(chi[ss+v]()(1)(2), p_12);
+          vstream(chi[ss+v]()(2)(0), p_20);
+          vstream(chi[ss+v]()(2)(1), p_21);
+          vstream(chi[ss+v]()(2)(2), p_22);
+          vstream(chi[ss+v]()(3)(0), p_30);
+          vstream(chi[ss+v]()(3)(1), p_31);
+          vstream(chi[ss+v]()(3)(2), p_32);
+        }
+
+      #endif
+    }
+
+    this->M5Dtime += usecond();
+  }
+
+  template<class Impl> // M5D_shift: the M5D kernel with an extra term s[v]*hs added, where hs is taken once per block from psi at outer index vs (see below).
+  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+    std::vector<Coeff_t>& shift_coeffs)
+  { // psi: neighbour/shift source; phi: diagonal source; chi: output; all four coefficient vectors are read at indices o + i*LLs.
+    #if 0
+      // Disabled reference path: plain M5D followed by one projected axpby per s.
+      this->M5D(psi, phi, chi, lower, diag, upper);
+      // NOTE(review): for pm == 1 this path calls the *_pplus helper, whereas the active path reads spin rows 2,3, which M5D's disabled path obtains via spProj5m - confirm the intended convention.
+      // FIXME: possible gain from vectorizing shift operation as well?
+      Coeff_t one(1.0);
+      int Ls = this->Ls;
+      for(int s=0; s<Ls; s++){
+        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
+        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
+      }
+
+    #else
+      // Active path: unrolled M5D kernel with the shift term fused in.
+      GridBase* grid  = psi._grid;
+      int Ls          = this->Ls;
+      int LLs         = grid->_rdimensions[0];
+      const int nsimd = Simd::Nsimd();
+      // One SIMD coefficient vector per outer index v in [0, LLs); s holds the shift coefficients.
+      Vector<iSinglet<Simd>> u(LLs);
+      Vector<iSinglet<Simd>> l(LLs);
+      Vector<iSinglet<Simd>> d(LLs);
+      Vector<iSinglet<Simd>> s(LLs);
+      // Sanity checks: Ls/LLs (integer division) must equal the SIMD width, and psi/phi must share a checkerboard.
+      assert(Ls/LLs == nsimd);
+      assert(phi.checkerboard == psi.checkerboard);
+
+      chi.checkerboard = psi.checkerboard;
+
+      // View the SIMD coefficient vectors as flat scalar arrays (type pun) so that individual lanes can be written.
+      typedef typename Simd::scalar_type scalar_type;
+      scalar_type* u_p = (scalar_type*) &u[0];
+      scalar_type* l_p = (scalar_type*) &l[0];
+      scalar_type* d_p = (scalar_type*) &d[0];
+      scalar_type* s_p = (scalar_type*) &s[0];
+      // Scatter: the coefficient with index o + i*LLs is stored in lane i of vector o.
+      for(int o=0; o<LLs; o++){ // outer
+      for(int i=0; i<nsimd; i++){ //inner
+        int s   = o + i*LLs; // NOTE: this int s shadows the Vector s declared above for the rest of the loop body; s_p was taken before the loop
+        int ss  = o*nsimd + i;
+        u_p[ss] = upper[s];
+        l_p[ss] = lower[s];
+        d_p[ss] = diag[s];
+        s_p[ss] = shift_coeffs[s];
+      }}
+
+      this->M5Dcalls++;
+      this->M5Dtime -= usecond(); // start of timed region; the matching += usecond() is just before #endif
+
+      assert(Nc == 3); // the unrolled kernel below only touches colour indices 0..2
+
+      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // ss advances by LLs: each iteration handles one block of LLs consecutive outer sites
+        // Shift source, loaded once per block: outer index LLs-1 and spin rows 2,3 when pm == 1, otherwise outer index 0 and spin rows 0,1.
+        int vs     = (this->pm == 1) ? LLs-1 : 0;
+        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
+        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
+        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
+        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
+        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
+        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
+        // NOTE(review): hs_* live outside the v-loop, so a rotation applied inside the loop persists into all later iterations - confirm this is intended.
+        for(int v=0; v<LLs; v++){
+          // NOTE(review): in the last block ss+v+LLs is >= oSites(); presumably harmless for a prefetch - confirm.
+          vprefetch(psi[ss+v+LLs]);
+          // Cyclic neighbours of v inside the block.
+          int vp = (v == LLs-1) ? 0     : v+1;
+          int vm = (v == 0)     ? LLs-1 : v-1;
+          // hp_*: spin rows 2,3 of the v+1 neighbour; hm_* (below): spin rows 0,1 of the v-1 neighbour.
+          Simd hp_00 = psi[ss+vp]()(2)(0);
+          Simd hp_01 = psi[ss+vp]()(2)(1);
+          Simd hp_02 = psi[ss+vp]()(2)(2);
+          Simd hp_10 = psi[ss+vp]()(3)(0);
+          Simd hp_11 = psi[ss+vp]()(3)(1);
+          Simd hp_12 = psi[ss+vp]()(3)(2);
+
+          Simd hm_00 = psi[ss+vm]()(0)(0);
+          Simd hm_01 = psi[ss+vm]()(0)(1);
+          Simd hm_02 = psi[ss+vm]()(0)(2);
+          Simd hm_10 = psi[ss+vm]()(1)(0);
+          Simd hm_11 = psi[ss+vm]()(1)(1);
+          Simd hm_12 = psi[ss+vm]()(1)(2);
+          // Wrap-around of v+1 (vp <= v): rotate the SIMD lanes; tRotate<2> presumably shifts by one complex lane - confirm.
+          if(vp <= v){
+            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+          }
+          // Shift source for pm == 1: rotated when vs <= v; since vs == LLs-1 this fires only on the last v.
+          if(this->pm == 1 && vs <= v){
+            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
+            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
+            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
+            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
+            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
+            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
+          }
+          // Wrap-around of v-1 (vm >= v): rotate with 2*Nsimd-2, presumably the opposite-direction counterpart - confirm.
+          if(vm >= v){
+            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+          }
+          // Shift source for pm == -1: rotated when vs >= v; since vs == 0 this fires at v == 0 and the rotated value is reused for every later v.
+          if(this->pm == -1 && vs >= v){
+            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
+            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
+            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
+            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
+            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
+            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
+          }
+          // Rows 0,1 = d*phi + l*hm, rows 2,3 = d*phi + u*hp; the s[v]*hs term goes into rows 2,3 when pm == 1 and into rows 0,1 otherwise.
+          // Can force these to real arithmetic and save 2x.
+          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
+          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
+          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
+          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
+          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
+          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
+          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+          // Store all twelve components of chi[ss+v] through vstream.
+          vstream(chi[ss+v]()(0)(0), p_00);
+          vstream(chi[ss+v]()(0)(1), p_01);
+          vstream(chi[ss+v]()(0)(2), p_02);
+          vstream(chi[ss+v]()(1)(0), p_10);
+          vstream(chi[ss+v]()(1)(1), p_11);
+          vstream(chi[ss+v]()(1)(2), p_12);
+          vstream(chi[ss+v]()(2)(0), p_20);
+          vstream(chi[ss+v]()(2)(1), p_21);
+          vstream(chi[ss+v]()(2)(2), p_22);
+          vstream(chi[ss+v]()(3)(0), p_30);
+          vstream(chi[ss+v]()(3)(1), p_31);
+          vstream(chi[ss+v]()(3)(2), p_32);
+        }
+      }
+
+      this->M5Dtime += usecond();
+
+    #endif
+  }
+
+  template<class Impl> // M5Ddag: per block of LLs outer sites, chi[v] = d*phi[v] + u*(spin rows 0,1 of psi[v+1]) in rows 0,1 and d*phi[v] + l*(spin rows 2,3 of psi[v-1]) in rows 2,3, with v cyclic.
+  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+  { // psi: neighbour source; phi: diagonal source; chi: output; lower/diag/upper are read at indices o + i*LLs for o < LLs, i < nsimd.
+    GridBase* grid = psi._grid;
+    int Ls  = this->Ls;
+    int LLs = grid->_rdimensions[0];
+    int nsimd = Simd::Nsimd();
+    // One SIMD coefficient vector per outer index v in [0, LLs).
+    Vector<iSinglet<Simd>> u(LLs);
+    Vector<iSinglet<Simd>> l(LLs);
+    Vector<iSinglet<Simd>> d(LLs);
+    // Sanity checks: Ls/LLs (integer division) must equal the SIMD width, and psi/phi must share a checkerboard.
+    assert(Ls/LLs == nsimd);
+    assert(phi.checkerboard == psi.checkerboard);
+
+    chi.checkerboard = psi.checkerboard;
+
+    // View the SIMD coefficient vectors as flat scalar arrays (type pun) so that individual lanes can be written.
+    typedef typename Simd::scalar_type scalar_type;
+    scalar_type* u_p = (scalar_type*) &u[0];
+    scalar_type* l_p = (scalar_type*) &l[0];
+    scalar_type* d_p = (scalar_type*) &d[0];
+    // Scatter: the coefficient with index s = o + i*LLs is stored in lane i of vector o.
+    for(int o=0; o<LLs; o++){ // outer
+    for(int i=0; i<nsimd; i++){ //inner
+      int s  = o + i*LLs;
+      int ss = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+    this->M5Dcalls++;
+    this->M5Dtime -= usecond(); // start of timed region; the matching += usecond() is at the end of the function
+    assert(Nc == 3); // same guard as M5D / M5D_shift: the unrolled kernel below only touches colour indices 0..2
+    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // ss advances by LLs: each iteration handles one block of LLs consecutive outer sites
+
+      #if 0
+        // Disabled reference implementation written with the spProj5*/spRecon5* helpers.
+        alignas(64) SiteHalfSpinor hp;
+        alignas(64) SiteHalfSpinor hm;
+        alignas(64) SiteSpinor fp;
+        alignas(64) SiteSpinor fm;
+
+        for(int v=0; v<LLs; v++){
+
+          int vp = (v+1)%LLs;
+          int vm = (v+LLs-1)%LLs;
+
+          spProj5p(hp, psi[ss+vp]);
+          spProj5m(hm, psi[ss+vm]);
+
+          if(vp <= v){ rotate(hp, hp, 1); }
+          if(vm >= v){ rotate(hm, hm, nsimd-1); }
+
+          hp = hp*0.5;
+          hm = hm*0.5;
+          spRecon5p(fp, hp);
+          spRecon5m(fm, hm);
+
+          chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+          chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+        }
+
+      #else
+        // Active path: hand-unrolled over 4 spin rows x 3 colour components.
+        for(int v=0; v<LLs; v++){
+          // NOTE(review): in the last block ss+v+LLs is >= oSites(); presumably harmless for a prefetch - confirm.
+          vprefetch(psi[ss+v+LLs]);
+          // Cyclic neighbours of v inside the block.
+          int vp = (v == LLs-1) ? 0     : v+1;
+          int vm = (v == 0    ) ? LLs-1 : v-1;
+          // hp_*: spin rows 0,1 of the v+1 neighbour; hm_* (below): spin rows 2,3 of the v-1 neighbour (rows swapped relative to M5D).
+          Simd hp_00 = psi[ss+vp]()(0)(0);
+          Simd hp_01 = psi[ss+vp]()(0)(1);
+          Simd hp_02 = psi[ss+vp]()(0)(2);
+          Simd hp_10 = psi[ss+vp]()(1)(0);
+          Simd hp_11 = psi[ss+vp]()(1)(1);
+          Simd hp_12 = psi[ss+vp]()(1)(2);
+
+          Simd hm_00 = psi[ss+vm]()(2)(0);
+          Simd hm_01 = psi[ss+vm]()(2)(1);
+          Simd hm_02 = psi[ss+vm]()(2)(2);
+          Simd hm_10 = psi[ss+vm]()(3)(0);
+          Simd hm_11 = psi[ss+vm]()(3)(1);
+          Simd hm_12 = psi[ss+vm]()(3)(2);
+          // Wrap-around of v+1 (vp <= v): rotate the SIMD lanes; tRotate<2> presumably shifts by one complex lane - confirm.
+          if (vp <= v){
+            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+          }
+          // Wrap-around of v-1 (vm >= v): rotate with 2*Nsimd-2, presumably the opposite-direction counterpart - confirm.
+          if(vm >= v){
+            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+          }
+          // Rows 0,1 = d*phi + u*hp; rows 2,3 = d*phi + l*hm.
+          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+          // Store all twelve components of chi[ss+v] through vstream.
+          vstream(chi[ss+v]()(0)(0), p_00);
+          vstream(chi[ss+v]()(0)(1), p_01);
+          vstream(chi[ss+v]()(0)(2), p_02);
+          vstream(chi[ss+v]()(1)(0), p_10);
+          vstream(chi[ss+v]()(1)(1), p_11);
+          vstream(chi[ss+v]()(1)(2), p_12);
+          vstream(chi[ss+v]()(2)(0), p_20);
+          vstream(chi[ss+v]()(2)(1), p_21);
+          vstream(chi[ss+v]()(2)(2), p_22);
+          vstream(chi[ss+v]()(3)(0), p_30);
+          vstream(chi[ss+v]()(3)(1), p_31);
+          vstream(chi[ss+v]()(3)(2), p_32);
+
+        }
+
+      #endif
+
+    }
+
+    this->M5Dtime += usecond();
+  }
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
+    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+    std::vector<Coeff_t>& shift_coeffs)
+  {
+    #if 0
+
+      this->M5Ddag(psi, phi, chi, lower, diag, upper);
+
+      // FIXME: possible gain from vectorizing shift operation as well?
+      Coeff_t one(1.0);
+      int Ls = this->Ls;
+      for(int s=0; s<Ls; s++){
+        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
+        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
+      }
+
+    #else
+
+      GridBase* grid = psi._grid;
+      int Ls  = this->Ls;
+      int LLs = grid->_rdimensions[0];
+      int nsimd = Simd::Nsimd();
+
+      Vector<iSinglet<Simd>> u(LLs);
+      Vector<iSinglet<Simd>> l(LLs);
+      Vector<iSinglet<Simd>> d(LLs);
+      Vector<iSinglet<Simd>> s(LLs);
+
+      assert(Ls/LLs == nsimd);
+      assert(phi.checkerboard == psi.checkerboard);
+
+      chi.checkerboard = psi.checkerboard;
+
+      // just directly address via type pun
+      typedef typename Simd::scalar_type scalar_type;
+      scalar_type* u_p = (scalar_type*) &u[0];
+      scalar_type* l_p = (scalar_type*) &l[0];
+      scalar_type* d_p = (scalar_type*) &d[0];
+      scalar_type* s_p = (scalar_type*) &s[0];
+
+      for(int o=0; o<LLs; o++){ // outer
+      for(int i=0; i<nsimd; i++){ //inner
+        int s  = o + i*LLs;
+        int ss = o*nsimd + i;
+        u_p[ss] = upper[s];
+        l_p[ss] = lower[s];
+        d_p[ss] = diag[s];
+        s_p[ss] = shift_coeffs[s];
+      }}
+
+      this->M5Dcalls++;
+      this->M5Dtime -= usecond();
+
+      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
+
+        int vs     = (this->pm == 1) ? LLs-1 : 0;
+        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
+        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
+        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
+        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
+        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
+        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
+
+        for(int v=0; v<LLs; v++){
+
+          vprefetch(psi[ss+v+LLs]);
+
+          int vp = (v == LLs-1) ? 0     : v+1;
+          int vm = (v == 0    ) ? LLs-1 : v-1;
+
+          Simd hp_00 = psi[ss+vp]()(0)(0);
+          Simd hp_01 = psi[ss+vp]()(0)(1);
+          Simd hp_02 = psi[ss+vp]()(0)(2);
+          Simd hp_10 = psi[ss+vp]()(1)(0);
+          Simd hp_11 = psi[ss+vp]()(1)(1);
+          Simd hp_12 = psi[ss+vp]()(1)(2);
+
+          Simd hm_00 = psi[ss+vm]()(2)(0);
+          Simd hm_01 = psi[ss+vm]()(2)(1);
+          Simd hm_02 = psi[ss+vm]()(2)(2);
+          Simd hm_10 = psi[ss+vm]()(3)(0);
+          Simd hm_11 = psi[ss+vm]()(3)(1);
+          Simd hm_12 = psi[ss+vm]()(3)(2);
+
+          if (vp <= v){
+            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+          }
+
+          if(this->pm == 1 && vs <= v){
+            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
+            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
+            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
+            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
+            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
+            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
+          }
+
+          if(vm >= v){
+            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+          }
+
+          if(this->pm == -1 && vs >= v){
+            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
+            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
+            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
+            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
+            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
+            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
+          }
+
+          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
+          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
+          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
+          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
+          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
+          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
+
+          vstream(chi[ss+v]()(0)(0), p_00);
+          vstream(chi[ss+v]()(0)(1), p_01);
+          vstream(chi[ss+v]()(0)(2), p_02);
+          vstream(chi[ss+v]()(1)(0), p_10);
+          vstream(chi[ss+v]()(1)(1), p_11);
+          vstream(chi[ss+v]()(1)(2), p_12);
+          vstream(chi[ss+v]()(2)(0), p_20);
+          vstream(chi[ss+v]()(2)(1), p_21);
+          vstream(chi[ss+v]()(2)(2), p_22);
+          vstream(chi[ss+v]()(3)(0), p_30);
+          vstream(chi[ss+v]()(3)(1), p_31);
+          vstream(chi[ss+v]()(3)(2), p_32);
+
+        }
+
+      }
+
+      this->M5Dtime += usecond();
+
+    #endif
+  }
+
+  #ifdef AVX512
+    #include<simd/Intel512common.h>
+    #include<simd/Intel512avx.h>
+    #include<simd/Intel512single.h>
+  #endif
+  // MooeeInternalAsm: for one 4d site, builds every s-slice s1 of chi as a sum over (s2, simd lane) of Matp/Matm entries times psi.
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi, // psi: input field (read), chi: output field (written)
+    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm) // LLs: extent of the s1/s2 loops; site: selects slices [LLs*site, LLs*site+LLs); Matp -> spin 0,1, Matm -> spin 2,3
+  {
+    #ifndef AVX512
+      {
+        SiteHalfSpinor BcastP;
+        SiteHalfSpinor BcastM;
+        SiteHalfSpinor SiteChiP;
+        SiteHalfSpinor SiteChiM;
+
+        // Ls*Ls * 2 * 12 * vol flops
+        for(int s1=0; s1<LLs; s1++){
+          // s1 is the output slice; the two inner loops visit every (s2, lane) contribution to it
+          for(int s2=0; s2<LLs; s2++){
+          for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
+            // s: matrix row for the pair (s2, lane l); lex: slice s2 of psi for this site
+            int s = s2 + l*LLs;
+            int lex = s2 + LLs*site;
+            // first contribution for this s1: clear both accumulators
+            if( s2==0 && l==0 ){
+              SiteChiP=zero;
+              SiteChiM=zero;
+            }
+            // spin components 0,1 of psi[lex] go to BcastP (NOTE(review): vbroadcast presumably replicates lane l across the vector - confirm)
+            for(int sp=0; sp<2;  sp++){
+            for(int co=0; co<Nc; co++){
+              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
+            }}
+            // spin components 2,3 of psi[lex] go to BcastM
+            for(int sp=0; sp<2;  sp++){
+            for(int co=0; co<Nc; co++){
+              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
+            }}
+            // accumulate into SiteChiP/SiteChiM (NOTE(review): real_madd(a,b,c) presumably returns a*b+c - confirm)
+            for(int sp=0; sp<2;  sp++){
+            for(int co=0; co<Nc; co++){
+              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
+              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
+            }}
+          }}
+          // write slice s1 of chi: SiteChiP fills spin 0,1 and SiteChiM fills spin 2,3
+          {
+            int lex = s1 + LLs*site;
+            for(int sp=0; sp<2;  sp++){
+            for(int co=0; co<Nc; co++){
+              vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
+              vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+            }}
+          }
+        }
+      }
+    #else
+      {
+        // register names for the asm below: zmm1-zmm12 hold the twelve chi accumulators, zmm13-zmm24 the broadcast psi components
+        //  MASK_REGS;
+        #define Chi_00 %%zmm1
+        #define Chi_01 %%zmm2
+        #define Chi_02 %%zmm3
+        #define Chi_10 %%zmm4
+        #define Chi_11 %%zmm5
+        #define Chi_12 %%zmm6
+        #define Chi_20 %%zmm7
+        #define Chi_21 %%zmm8
+        #define Chi_22 %%zmm9
+        #define Chi_30 %%zmm10
+        #define Chi_31 %%zmm11
+        #define Chi_32 %%zmm12
+
+        #define BCAST0  %%zmm13
+        #define BCAST1  %%zmm14
+        #define BCAST2  %%zmm15
+        #define BCAST3  %%zmm16
+        #define BCAST4  %%zmm17
+        #define BCAST5  %%zmm18
+        #define BCAST6  %%zmm19
+        #define BCAST7  %%zmm20
+        #define BCAST8  %%zmm21
+        #define BCAST9  %%zmm22
+        #define BCAST10 %%zmm23
+        #define BCAST11 %%zmm24
+        // byte step between the Matp/Matm entries of consecutive simd lanes (LLs*LLs elements apart)
+        int incr = LLs*LLs*sizeof(iSinglet<Simd>);
+
+        for(int s1=0; s1<LLs; s1++){
+
+          for(int s2=0; s2<LLs; s2++){
+            // raw addresses handed to the asm as register operands: %0 = a0 (Matp), %1 = a1 (Matm), %2 = a2 (psi)
+            int lex = s2 + LLs*site;
+            uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
+            uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
+            uint64_t a2 = (uint64_t) &psi[lex];
+
+            for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
+              // first (s2, lane) term uses VMULMEM, all later terms VMADDMEM (NOTE(review): macros come from the Intel512 headers; presumably multiply vs multiply-add - confirm)
+              if((s2+l)==0) {
+                asm(
+                      VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
+                      VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
+                      VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
+                      VBCASTCDUP(0,%2,BCAST0)
+                      VBCASTCDUP(1,%2,BCAST1)
+                      VBCASTCDUP(2,%2,BCAST2)
+                      VBCASTCDUP(3,%2,BCAST3)
+                      VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
+                      VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
+                      VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
+                      VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
+                      VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
+                      VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
+                      VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
+                      VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
+                      VMULMEM(0,%1,BCAST8,Chi_22)
+                      VMULMEM(0,%1,BCAST9,Chi_30)
+                      VMULMEM(0,%1,BCAST10,Chi_31)
+                      VMULMEM(0,%1,BCAST11,Chi_32)
+                      : : "r" (a0), "r" (a1), "r" (a2)                            );
+              } else {
+                asm(
+                      VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
+                      VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
+                      VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
+                      VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
+                      VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
+                      VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
+                      VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
+                      VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
+                      VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
+                      VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
+                      VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
+                      VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
+                      : : "r" (a0), "r" (a1), "r" (a2)                            );
+              }
+              // step to the next simd lane: next Matp/Matm block, and one scalar further into psi
+              a0 = a0 + incr;
+              a1 = a1 + incr;
+              a2 = a2 + sizeof(typename Simd::scalar_type);
+            }
+          }
+          // store the twelve accumulator registers to slice s1 of chi for this site
+          {
+            int lexa = s1+LLs*site;
+            asm (
+               VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
+               VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
+               VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
+               VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
+               : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+          }
+        }
+      }
+
+      #undef Chi_00
+      #undef Chi_01
+      #undef Chi_02
+      #undef Chi_10
+      #undef Chi_11
+      #undef Chi_12
+      #undef Chi_20
+      #undef Chi_21
+      #undef Chi_22
+      #undef Chi_30
+      #undef Chi_31
+      #undef Chi_32
+
+      #undef BCAST0
+      #undef BCAST1
+      #undef BCAST2
+      #undef BCAST3
+      #undef BCAST4
+      #undef BCAST5
+      #undef BCAST6
+      #undef BCAST7
+      #undef BCAST8
+      #undef BCAST9
+      #undef BCAST10
+      #undef BCAST11
+
+    #endif
+  };
+
+  // Z-mobius version
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
+    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+  {
+    std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
+    exit(-1);
+  };
+
+  template<class Impl>
+  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+  {
+    // Runs the per-site Mooee kernel over psi, writing chi.  With inv set, the stored
+    // MatpInv*/MatmInv* members are used (the Dag pair when dag is also set); otherwise
+    // a temporary pair is built by MooeeInternalCompute.
+    int Ls        = this->Ls;
+    int LLs       = psi._grid->_rdimensions[0];
+    int num_sites = psi._grid->oSites()/LLs;
+
+    chi.checkerboard = psi.checkerboard;
+
+    // Scratch matrices: only filled on the !inv path.
+    Vector<iSinglet<Simd>> scratch_p;
+    Vector<iSinglet<Simd>> scratch_m;
+
+    Vector<iSinglet<Simd>>* mat_p;
+    Vector<iSinglet<Simd>>* mat_m;
+    if(!inv){
+      MooeeInternalCompute(dag, inv, scratch_p, scratch_m);
+      mat_p = &scratch_p;
+      mat_m = &scratch_m;
+    } else if(dag){
+      mat_p = &this->MatpInvDag;
+      mat_m = &this->MatmInvDag;
+    } else {
+      mat_p = &this->MatpInv;
+      mat_m = &this->MatmInv;
+    }
+
+    assert(mat_p->size() == Ls*LLs);
+
+    this->MooeeInvCalls++;
+    this->MooeeInvTime -= usecond();
+
+    // Real coefficients use the plain kernel, complex coefficients the Z variant.
+    if(!switcheroo<Coeff_t>::iscomplex()){
+      parallel_for(auto site=0; site<num_sites; site++){
+        MooeeInternalAsm(psi, chi, LLs, site, *mat_p, *mat_m);
+      }
+    } else {
+      parallel_for(auto site=0; site<num_sites; site++){
+        MooeeInternalZAsm(psi, chi, LLs, site, *mat_p, *mat_m);
+      }
+    }
+
+    this->MooeeInvTime += usecond();
+  }
+
+  #ifdef MOBIUS_EOFA_DPERP_VEC
+    // Compiled only when MOBIUS_EOFA_DPERP_VEC is defined. NOTE(review): INSTANTIATE_DPERP_MOBIUS_EOFA is defined outside this excerpt; presumably it instantiates the Dperp kernels per Impl - confirm.
+    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
+    // same macro applied to the DF / FH implementation variants
+    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
+    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
+    // explicit instantiations of MooeeInternal for the F / D Vec5d implementations
+    template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    // explicit instantiations of MooeeInternal for the FH / DF Vec5d implementations
+    template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+    template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+  #endif
+
+}}
--- a/Grid/qcd/action/fermion/MobiusFermion.h
+++ b/Grid/qcd/action/fermion/MobiusFermion.h
@@ -0,0 +1,80 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/MobiusFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_MOBIUS_FERMION_H
+#define  GRID_QCD_MOBIUS_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Mobius fermion layered on CayleyFermion5D.  The constructor requests approximation
+    // data from Approx::higham for this->Ls and hands it, together with b and c, to
+    // SetCoefficientsTanh; the data is released again before the constructor returns.
+    template<class Impl>
+    class MobiusFermion : public CayleyFermion5D<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+      virtual void Instantiatable(void) {};
+
+      // _Umu, the four grids, _mass, _M5 and p are forwarded unchanged to CayleyFermion5D.
+      MobiusFermion(GaugeField            &_Umu,
+                    GridCartesian         &FiveDimGrid,
+                    GridRedBlackCartesian &FiveDimRedBlackGrid,
+                    GridCartesian         &FourDimGrid,
+                    GridRedBlackCartesian &FourDimRedBlackGrid,
+                    RealD _mass, RealD _M5,
+                    RealD b, RealD c,
+                    const ImplParams &p = ImplParams())
+        : CayleyFermion5D<Impl>(_Umu,
+                                FiveDimGrid, FiveDimRedBlackGrid,
+                                FourDimGrid, FourDimRedBlackGrid,
+                                _mass, _M5, p)
+      {
+        std::cout << GridLogMessage
+                  << "MobiusFermion (b=" << b << ",c=" << c << ") with Ls= " << this->Ls << " Tanh approx"
+                  << std::endl;
+
+        // The first argument (eps) is ignored by higham, so a fixed 1.0 is passed.
+        Approx::zolotarev_data *tanh_data = Approx::higham(1.0, this->Ls);
+        assert(tanh_data->n == this->Ls);
+
+        this->SetCoefficientsTanh(tanh_data, b, c);
+
+        Approx::zolotarev_free(tanh_data);
+      }
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -0,0 +1,81 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/MobiusZolotarevFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
+#define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Mobius fermion layered on CayleyFermion5D, using Approx::zolotarev data built from
+    // eps = lo/hi; the data plus hi, b and c are handed to SetCoefficientsZolotarev.
+    template<class Impl>
+    class MobiusZolotarevFermion : public CayleyFermion5D<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+      virtual void Instantiatable(void) {};
+
+      // _Umu, the four grids, _mass, _M5 and p are forwarded unchanged to CayleyFermion5D.
+      MobiusZolotarevFermion(GaugeField            &_Umu,
+                             GridCartesian         &FiveDimGrid,
+                             GridRedBlackCartesian &FiveDimRedBlackGrid,
+                             GridCartesian         &FourDimGrid,
+                             GridRedBlackCartesian &FourDimRedBlackGrid,
+                             RealD _mass, RealD _M5,
+                             RealD b, RealD c,
+                             RealD lo, RealD hi,
+                             const ImplParams &p = ImplParams())
+        : CayleyFermion5D<Impl>(_Umu,
+                                FiveDimGrid, FiveDimRedBlackGrid,
+                                FourDimGrid, FourDimRedBlackGrid,
+                                _mass, _M5, p)
+      {
+        RealD range_ratio = lo/hi;
+        Approx::zolotarev_data *zolo = Approx::zolotarev(range_ratio, this->Ls, 0);
+        assert(zolo->n == this->Ls);
+
+        std::cout << GridLogMessage
+                  << "MobiusZolotarevFermion (b=" << b << ",c=" << c << ") with Ls= " << this->Ls
+                  << " Zolotarev range [" << lo << "," << hi << "]" << std::endl;
+
+        // The base-class setter consumes the data before it is freed.
+        this->SetCoefficientsZolotarev(hi, zolo, b, c);
+
+        Approx::zolotarev_free(zolo);
+      }
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -0,0 +1,69 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
+#define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Overlap-Wilson (Cayley, tanh) fermion: a MobiusFermion constructed with
+    // b = c = scale/2, i.e. b+c = scale and b-c = 0.
+    template<class Impl>
+    class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+      // Forwards to MomentumSpacePropagatorHw with the same arguments.
+      void MomentumSpacePropagator(FermionField &out, const FermionField &in,
+                                   RealD _m, std::vector<double> twist)
+      {
+        this->MomentumSpacePropagatorHw(out, in, _m, twist);
+      };
+
+      OverlapWilsonCayleyTanhFermion(GaugeField            &_Umu,
+                                     GridCartesian         &FiveDimGrid,
+                                     GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                     GridCartesian         &FourDimGrid,
+                                     GridRedBlackCartesian &FourDimRedBlackGrid,
+                                     RealD _mass, RealD _M5,
+                                     RealD scale,
+                                     const ImplParams &p = ImplParams())
+        : MobiusFermion<Impl>(_Umu,
+                              FiveDimGrid, FiveDimRedBlackGrid,
+                              FourDimGrid, FourDimRedBlackGrid,
+                              _mass, _M5, 0.5*scale, 0.5*scale, p)
+      {}
+    };
+  }
+}
+#endif
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@@ -0,0 +1,68 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
+#define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Overlap-Wilson (Cayley, Zolotarev) fermion: a MobiusZolotarevFermion constructed
+    // with b = c = 1/2, i.e. b+c = 1 and b-c = 0.
+    template<class Impl>
+    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+
+      // lo and hi are passed on as the range arguments of the MobiusZolotarevFermion constructor.
+      OverlapWilsonCayleyZolotarevFermion(GaugeField            &_Umu,
+                                          GridCartesian         &FiveDimGrid,
+                                          GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                          GridCartesian         &FourDimGrid,
+                                          GridRedBlackCartesian &FourDimRedBlackGrid,
+                                          RealD _mass, RealD _M5,
+                                          RealD lo, RealD hi,
+                                          const ImplParams &p = ImplParams())
+        : MobiusZolotarevFermion<Impl>(_Umu,
+                                       FiveDimGrid, FiveDimRedBlackGrid,
+                                       FourDimGrid, FourDimRedBlackGrid,
+                                       _mass, _M5,
+                                       0.5, 0.5,
+                                       lo, hi, p)
+      {}
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -0,0 +1,71 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
+#define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Overlap-Wilson continued-fraction fermion, tanh variant.  Asserts that Ls is odd and
+    // passes Approx::higham data of order Ls-1, together with scale, to SetCoefficientsTanh.
+    template<class Impl>
+    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+      virtual void Instantiatable(void){};
+
+      OverlapWilsonContFracTanhFermion(GaugeField            &_Umu,
+                                       GridCartesian         &FiveDimGrid,
+                                       GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                       GridCartesian         &FourDimGrid,
+                                       GridRedBlackCartesian &FourDimRedBlackGrid,
+                                       RealD _mass, RealD _M5,
+                                       RealD scale,
+                                       const ImplParams &p = ImplParams())
+        : ContinuedFractionFermion5D<Impl>(_Umu,
+                                           FiveDimGrid, FiveDimRedBlackGrid,
+                                           FourDimGrid, FourDimRedBlackGrid,
+                                           _mass, _M5, p)
+      {
+        assert((this->Ls&0x1)==1); // Odd Ls required
+
+        int order = this->Ls-1; // even rational order
+        Approx::zolotarev_data *tanh_data = Approx::higham(1.0, order); // eps is ignored for higham
+        this->SetCoefficientsTanh(tanh_data, scale);
+        Approx::zolotarev_free(tanh_data);
+      }
+    };
+  }
+}
+#endif
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -0,0 +1,74 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
+#define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Overlap-Wilson continued-fraction fermion, Zolotarev variant.  Asserts that Ls is
+    // odd, builds Approx::zolotarev data of order Ls for eps = lo/hi and passes it,
+    // together with hi, to SetCoefficientsZolotarev.
+    template<class Impl>
+    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+      virtual void Instantiatable(void){};
+
+      OverlapWilsonContFracZolotarevFermion(GaugeField            &_Umu,
+                                            GridCartesian         &FiveDimGrid,
+                                            GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                            GridCartesian         &FourDimGrid,
+                                            GridRedBlackCartesian &FourDimRedBlackGrid,
+                                            RealD _mass, RealD _M5,
+                                            RealD lo, RealD hi,
+                                            const ImplParams &p = ImplParams())
+        : ContinuedFractionFermion5D<Impl>(_Umu,
+                                           FiveDimGrid, FiveDimRedBlackGrid,
+                                           FourDimGrid, FourDimRedBlackGrid,
+                                           _mass, _M5, p)
+      {
+        assert((this->Ls&0x1)==1); // Odd Ls required
+
+        int   order       = this->Ls; // odd rational order
+        RealD range_ratio = lo/hi;
+
+        Approx::zolotarev_data *zolo = Approx::zolotarev(range_ratio, order, 0);
+        this->SetCoefficientsZolotarev(hi, zolo);
+        Approx::zolotarev_free(zolo);
+      }
+    };
+  }
+}
+#endif
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -0,0 +1,71 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
+#define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Overlap-Wilson partial-fraction fermion, tanh variant.  Asserts that Ls is odd and
+    // passes Approx::higham data of order Ls-1, together with scale, to SetCoefficientsTanh.
+    template<class Impl>
+    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+      virtual void Instantiatable(void){};
+
+      OverlapWilsonPartialFractionTanhFermion(GaugeField            &_Umu,
+                                              GridCartesian         &FiveDimGrid,
+                                              GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                              GridCartesian         &FourDimGrid,
+                                              GridRedBlackCartesian &FourDimRedBlackGrid,
+                                              RealD _mass, RealD _M5,
+                                              RealD scale,
+                                              const ImplParams &p = ImplParams())
+        : PartialFractionFermion5D<Impl>(_Umu,
+                                         FiveDimGrid, FiveDimRedBlackGrid,
+                                         FourDimGrid, FourDimRedBlackGrid,
+                                         _mass, _M5, p)
+      {
+        assert((this->Ls&0x1)==1); // Odd Ls required
+
+        int order = this->Ls-1; // even rational order
+        Approx::zolotarev_data *tanh_data = Approx::higham(1.0, order); // eps is ignored for higham
+        this->SetCoefficientsTanh(tanh_data, scale);
+        Approx::zolotarev_free(tanh_data);
+      }
+    };
+  }
+}
+#endif
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -0,0 +1,74 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
+#define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Overlap operator in partial-fraction form whose coefficients come from
+    // Approx::zolotarev() evaluated at eps = lo/hi; the operator routines
+    // themselves are inherited from PartialFractionFermion5D<Impl>.
+    template<class Impl>
+    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+      // Concrete definition of the hook that the base class declares pure virtual.
+      virtual void Instantiatable(void) {}
+
+      // Forwards the gauge field, grids, mass, M5 and implementation parameters
+      // to the base class, then calls SetCoefficientsZolotarev with `hi` and a
+      // Zolotarev table of order Ls built for eps = lo/hi. Ls must be odd.
+      OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
+                                                   GridCartesian         &FiveDimGrid,
+                                                   GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                   GridCartesian         &FourDimGrid,
+                                                   GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                   RealD _mass, RealD _M5,
+                                                   RealD lo, RealD hi, const ImplParams &p = ImplParams())
+        : PartialFractionFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+                                         FourDimGrid, FourDimRedBlackGrid, _mass, _M5, p)
+      {
+        assert((this->Ls&0x1)==1); // Odd Ls required
+
+        const int   order = this->Ls; // odd rational order
+        const RealD eps   = lo/hi;
+
+        Approx::zolotarev_data *approx = Approx::zolotarev(eps, order, 0);
+        this->SetCoefficientsZolotarev(hi, approx);
+        Approx::zolotarev_free(approx);
+      }
+    };
+  }
+}
+#endif
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -0,0 +1,459 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
+
+namespace Grid {
+  namespace QCD {
+
+
+    // Directional hopping term: DhopDir followed by an in-place, slice-by-slice
+    // ag5xpby_ssp rescaling of chi. Slices pair up as (s, s+1) with coefficients
+    // -scale / +scale; the last slice Ls-1 uses p[nblock]*scale/amax.
+    // this does both dag and undag but is trivial; make a common helper routine
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+      int Ls = this->Ls;
+
+      this->DhopDir(psi,chi,dir,disp);
+
+      int nblock=(Ls-1)/2;
+      for(int b=0;b<nblock;b++){
+	int s = 2*b;
+	ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
+	ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
+      }
+      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
+    }
+    // Checkerboarded hopping term: DhopEO when psi is on the Odd checkerboard,
+    // DhopOE otherwise, followed by an in-place slice-by-slice rescaling of chi.
+    // NOTE(review): `dag` is not consulted (DaggerNo is always passed) -- confirm intended.
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
+    {
+      int Ls = this->Ls;
+      if ( psi.checkerboard == Odd ) {
+	this->DhopEO(psi,chi,DaggerNo);
+      } else {
+	this->DhopOE(psi,chi,DaggerNo);
+      }
+      int nblock=(Ls-1)/2;
+      for(int b=0;b<nblock;b++){
+	int s = 2*b;
+	ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
+	ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
+      }
+      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
+    }
+
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
+    {
+      // Same block structure as M_internal with the constant dw_diag in place of the Wilson term; again dag and undag are trivially related
+      int sign = dag ? (-1) : 1;
+      int Ls = this->Ls;
+      // nblock 2x2 blocks occupy slices 0..Ls-2 (Ls odd); slice Ls-1 is the remaining row/column.
+      int nblock=(Ls-1)/2;
+      for(int b=0;b<nblock;b++){
+	// Coefficients are consumed in reverse order: block b uses p[nblock-1-b] and q[nblock-1-b].
+	int s = 2*b;
+	RealD pp = p[nblock-1-b];
+	RealD qq = q[nblock-1-b];
+	
+	// Do each 2x2 block aligned at s and multiplies Dw site diagonal by G5 so Hw
+	ag5xpby_ssp(chi,-dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s  ,s+1); 
+	ag5xpby_ssp(chi, dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s+1,s);
+	axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1); // pick up last column (slice Ls-1); sign flips with dag
+      }
+      
+      {
+	RealD R=(1+mass)/(1-mass); // local R; shadows the member of the same name
+	//R g5 psi[Ls-1] + (p[nblock]/amax)*dw_diag g5 psi[Ls-1], both times scale (as the name ag5xpbg5y_ssp suggests)
+	ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
+	// Last row: accumulate the couplings to the odd slice (2b+1) of every block.
+	for(int b=0;b<nblock;b++){
+	  int s = 2*b+1;
+	  RealD pp = p[nblock-1-b];
+	  axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
+	}
+      }
+    }
+
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
+    {
+      int sign = dag ? (-1) : 1;
+      int Ls = this->Ls;
+      // Applies the inverse in three stages, in this order: Linv (psi -> chi), Dinv (chi -> tmp), Uinv (tmp -> chi).
+      FermionField tmp(psi._grid);
+      
+      ///////////////////////////////////////////////////////////////////////////////////////
+      //Linv
+      ///////////////////////////////////////////////////////////////////////////////////////
+      int nblock=(Ls-1)/2;
+
+      axpy(chi,0.0,psi,psi); // Identity piece
+      // Then accumulate the contributions of slices s and s+1 of every block into slice Ls-1.
+      for(int b=0;b<nblock;b++){
+	int s = 2*b;
+	RealD pp = p[nblock-1-b];
+	RealD qq = q[nblock-1-b];
+	RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
+	RealD coeff2=sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
+	axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s);
+	axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
+      }
+      
+      ///////////////////////////////////////////////////////////////////////////////////////
+      //Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
+      // Compute Seeinv (coeff of gamma5)
+      ///////////////////////////////////////////////////////////////////////////////////////
+      RealD R=(1+mass)/(1-mass); // local R; shadows the member of the same name
+      RealD Seeinv = R + p[nblock]*dw_diag/amax;
+      for(int b=0;b<nblock;b++){
+	Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
+      }    
+      Seeinv = 1.0/Seeinv;
+      
+      for(int b=0;b<nblock;b++){
+	int s = 2*b;
+	RealD pp = p[nblock-1-b]; // NOTE(review): pp is not used in this loop
+	RealD qq = q[nblock-1-b];
+	RealD coeff1=dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
+	RealD coeff2=amax*sqrt(qq) / ( dw_diag*dw_diag + amax*amax* qq);
+	ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
+	ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
+      }
+      ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
+      
+      ///////////////////////////////////////////////////////////////////////////////////////
+      // Uinv
+      ///////////////////////////////////////////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++){
+	int s = 2*b;
+	RealD pp = p[nblock-1-b];
+	RealD qq = q[nblock-1-b];
+	RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
+	RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
+	axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1); // every output slice is divided by scale
+	axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
+      }
+      axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
+    }
+
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
+    {
+      FermionField D(psi._grid);
+      // Writes the full 5d operator applied to psi into chi; D receives DW(psi) further down.
+      int Ls = this->Ls;
+      int sign = dag ? (-1) : 1;
+      // sign only flips the couplings between the 2x2 blocks and the last slice Ls-1.
+      // For partial frac Hw case (b5=c5=1) chroma quirkily computes
+      //
+      // Conventions for partfrac appear to be a mess.
+      // Tony's Nara lectures have
+      //
+      // BlockDiag(  H/p_i  1             | 1       )    
+      //          (  1      p_i H / q_i^2 | 0       )  
+      //           ---------------------------------
+      //           ( -1      0                | R  +p0 H  )
+      //
+      //Chroma     ( -2H    2sqrt(q_i)    |   0         )
+      //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
+      //           ---------------------------------
+      //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
+      //
+      // Edwards/Joo/Kennedy/Wenger
+      //
+      // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
+      // incorporate the approx scale factor. This is obtained by propagating the
+      // scale on "H" out to the off diagonal elements as follows:
+      //
+      // BlockDiag(  H/p_i  1             | 1       ) 
+      //          (  1      p_i H / q_i^2 | 0       )  
+      //           ---------------------------------
+      //          ( -1      0                | R  + p_0 H  )
+      //
+      // becomes:
+      // BlockDiag(  H/ sp_i  1               | 1             ) 
+      //          (  1      sp_i H / s^2q_i^2 | 0             )  
+      //           ---------------------------------
+      //           ( -1      0                | R + p_0/s H   )
+      //
+      //
+      // This is implemented in Chroma by
+      //           p0' = p0/approxMax
+      //           p_i' = p_i*approxMax
+      //           q_i' = q_i*approxMax*approxMax
+      //
+      // After the equivalence transform is applied the matrix becomes
+      // 
+      //Chroma     ( -2H    sqrt(q'_i)    |   0         )
+      //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
+      //           ---------------------------------
+      //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
+      //
+      //     =     ( -2H    sqrt(q_i)amax    |   0              )
+      //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
+      //           ---------------------------------
+      //           ( 0     -sqrt(p_i*amax)   |  2 R gamma_5 + p0/amax 2H
+      //
+      // D = DW psi; DaggerNo is passed for both values of dag.
+      this->DW(psi,D,DaggerNo); 
+
+      int nblock=(Ls-1)/2;
+      for(int b=0;b<nblock;b++){
+	// Coefficients are consumed in reverse order: block b uses p[nblock-1-b] and q[nblock-1-b].
+	int s = 2*b;
+	double pp = p[nblock-1-b];
+	double qq = q[nblock-1-b];
+	
+	// Do each 2x2 block aligned at slices s and s+1
+	ag5xpby_ssp(chi,-1.0*scale,D,amax*sqrt(qq)*scale,psi, s  ,s+1); // Multiplies Dw by G5 so Hw
+	ag5xpby_ssp(chi, 1.0*scale,D,amax*sqrt(qq)*scale,psi, s+1,s);
+	
+	// Pick up last column
+	axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
+      }
+	// Last row: slice Ls-1.
+      {
+	double R=(1+this->mass)/(1-this->mass);
+	//R g5 psi[Ls-1] + (p[nblock]/amax) H, both times scale
+	ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
+	
+	for(int b=0;b<nblock;b++){
+	  int s = 2*b+1;
+	  double pp = p[nblock-1-b];
+	  axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
+	}
+      }
+
+    }
+
+    // Applies the operator (DaggerNo) to `in`; returns norm2 of the result.
+    template<class Impl>
+    RealD  PartialFractionFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
+      this->M_internal(in, out, DaggerNo);
+      return norm2(out);
+    }
+    // Applies the daggered operator (DaggerYes) to `in`; returns norm2 of the result.
+    template<class Impl>
+    RealD  PartialFractionFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
+      this->M_internal(in, out, DaggerYes);
+      return norm2(out);
+    }
+
+    // Half-checkerboard wrapper: forwards to Meooe_internal with DaggerNo.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
+      this->Meooe_internal(in, out, DaggerNo);
+    }
+    // Half-checkerboard wrapper: forwards to Meooe_internal with DaggerYes.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+      this->Meooe_internal(in, out, DaggerYes);
+    }
+    // Half-checkerboard wrapper: forwards to Mooee_internal with DaggerNo.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
+      this->Mooee_internal(in, out, DaggerNo);
+    }
+    // Half-checkerboard wrapper: forwards to Mooee_internal with DaggerYes.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+      this->Mooee_internal(in, out, DaggerYes);
+    }
+
+    // Half-checkerboard wrapper: forwards to MooeeInv_internal with DaggerNo.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+      this->MooeeInv_internal(in, out, DaggerNo);
+    }
+    // Half-checkerboard wrapper: forwards to MooeeInv_internal with DaggerYes.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+      this->MooeeInv_internal(in, out, DaggerYes);
+    }
+
+
+  // force terms; five routines; default to Dhop on diagonal
+    // Force term: builds an ag5xpby_ssp-transformed copy of U slice by slice
+    // (pairs (s, s+1) with -scale / +scale, last slice with p[nblock]*scale/amax)
+    // and hands it to DhopDeriv with V. `dag` is not consulted; DaggerNo is used.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+    {
+      const int Ls     = this->Ls;
+      const int last   = Ls-1;
+      const int nblock = (Ls-1)/2;
+      FermionField scaledU(V._grid);
+      for(int s=0;s<2*nblock;s+=2){
+        ag5xpby_ssp(scaledU,-scale,U,0.0,U,s,s);
+        ag5xpby_ssp(scaledU, scale,U,0.0,U,s+1,s+1);
+      }
+      ag5xpby_ssp(scaledU,p[nblock]*scale/amax,U,0.0,U,last,last);
+      this->DhopDeriv(mat,scaledU,V,DaggerNo);
+    }
+    // Force term: builds an ag5xpby_ssp-transformed copy of U slice by slice
+    // (pairs (s, s+1) with -scale / +scale, last slice with p[nblock]*scale/amax)
+    // and hands it to DhopDerivOE with V. `dag` is not consulted; DaggerNo is used.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+    {
+      const int Ls     = this->Ls;
+      const int last   = Ls-1;
+      const int nblock = (Ls-1)/2;
+      FermionField scaledU(V._grid);
+      for(int s=0;s<2*nblock;s+=2){
+        ag5xpby_ssp(scaledU,-scale,U,0.0,U,s,s);
+        ag5xpby_ssp(scaledU, scale,U,0.0,U,s+1,s+1);
+      }
+      ag5xpby_ssp(scaledU,p[nblock]*scale/amax,U,0.0,U,last,last);
+      this->DhopDerivOE(mat,scaledU,V,DaggerNo);
+    }
+    // Force term: builds an ag5xpby_ssp-transformed copy of U slice by slice
+    // (pairs (s, s+1) with -scale / +scale, last slice with p[nblock]*scale/amax)
+    // and hands it to DhopDerivEO with V. `dag` is not consulted; DaggerNo is used.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+    {
+      const int Ls     = this->Ls;
+      const int last   = Ls-1;
+      const int nblock = (Ls-1)/2;
+      FermionField scaledU(V._grid);
+      for(int s=0;s<2*nblock;s+=2){
+        ag5xpby_ssp(scaledU,-scale,U,0.0,U,s,s);
+        ag5xpby_ssp(scaledU, scale,U,0.0,U,s+1,s+1);
+      }
+      ag5xpby_ssp(scaledU,p[nblock]*scale/amax,U,0.0,U,last,last);
+      this->DhopDerivEO(mat,scaledU,V,DaggerNo);
+    }
+
+    // Tanh variant: reuses the Zolotarev setup with zolo_hi = 1/scale.
+    template<class Impl> void PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale) {
+      this->SetCoefficientsZolotarev(1.0/scale, zdata); // `scale` is the parameter here; it shadows the member of the same name
+    }
+    // Loads the partial-fraction coefficients from an approximation table.
+    //   zolo_hi : stored in amax, the rescaling factor used by the matrix routines
+    //   zdata   : supplies da entries of alpha[] (-> p) and dd entries of ap[] (-> q);
+    //             only read here, the caller keeps ownership
+    // Also refreshes the members R, dw_diag and scale.
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
+
+      int Ls = this->Ls;
+
+      // Degree matching: the matrix routines read p[0..nblock] and q[0..nblock-1]
+      // with nblock=(Ls-1)/2, so da must equal nblock+1 and dd must be at least
+      // nblock; otherwise they would index past the end of p or q.
+      assert(Ls == (2*zdata->da -1) );
+      assert(zdata->dd >= zdata->da - 1);
+
+      // Part frac
+      R=(1+mass)/(1-mass);
+      dw_diag = (4.0-this->M5);
+
+      p.resize(zdata->da);
+      q.resize(zdata->dd);
+
+      for(int n=0;n<zdata->da;n++){
+	p[n] = zdata -> alpha[n];
+      }
+      for(int n=0;n<zdata->dd;n++){
+	q[n] = -zdata -> ap[n]; // q holds the negated ap[] entries
+      }
+
+      // Overall factor on the 5d matrix: 2 in the Chroma convention, 1 otherwise.
+      scale= part_frac_chroma_convention ? 2.0 : 1.0; // Chroma conventions annoy me
+
+      amax=zolo_hi;
+    }
+
+    // Extracts the 4d field from the 5d solution. NOTE(review): Ls-1 is passed as both trailing ExtractSlice arguments -- confirm the last one.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d) {
+      const int physical_s = this->Ls - 1;
+      conformable(solution5d._grid,this->FermionGrid());
+      conformable(exported4d._grid,this->GaugeGrid());
+      ExtractSlice(exported4d, solution5d, physical_s, physical_s);
+    }
+    // Builds the 5d source: inserts the 4d input into a zeroed 5d field, multiplies by gamma5, applies Dminus.
+    // NOTE(review): Ls-1 is passed as both trailing InsertSlice arguments -- confirm the last one.
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) {
+      const int physical_s = this->Ls - 1;
+      conformable(imported5d._grid,this->FermionGrid());
+      conformable(input4d._grid   ,this->GaugeGrid());
+      FermionField source5d(this->FermionGrid()); source5d=zero;
+      InsertSlice(input4d, source5d, physical_s, physical_s);
+      source5d=Gamma(Gamma::Algebra::Gamma5)*source5d;
+      this->Dminus(source5d,imported5d);
+    }
+
+      // Constructors
+    // Constructor: forwards the gauge field, grids, M5 and implementation
+    // parameters to WilsonFermion5D, stores the mass, and installs default
+    // coefficients from a Higham table of order Ls-1 with scale 1.0.
+    // Ls must be odd (asserted).
+    template<class Impl>
+    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+                                                             GridCartesian         &FiveDimGrid,
+                                                             GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                             GridCartesian         &FourDimGrid,
+                                                             GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                             RealD _mass, RealD M5,
+                                                             const ImplParams &p)
+      : WilsonFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+                              FourDimGrid, FourDimRedBlackGrid, M5, p),
+        mass(_mass)
+    {
+      int Ls = this->Ls;
+      assert((Ls&0x1)==1); // Odd Ls required
+
+      // NB: chroma uses a cast to "float" for the zolotarev range(!?).
+      // this creates a real difference in the operator which I do not like but
+      // it could be replicated here to demonstrate compatibility, using a
+      // Zolotarev table built from eps = zolo_lo / zolo_hi instead of the
+      // Higham table below.
+      const int order = Ls-1;
+      Approx::zolotarev_data *approx = Approx::higham(1.0, order);
+
+      // Default coefficients; the Overlap* subclasses in this directory call
+      // SetCoefficientsTanh / SetCoefficientsZolotarev again in their own constructors.
+      SetCoefficientsTanh(approx,1.0);
+
+      Approx::zolotarev_free(approx);
+    }
+ 
+    FermOpTemplateInstantiate(PartialFractionFermion5D); // NOTE(review): presumably emits the explicit template instantiations -- confirm against the macro definition
+
+ }
+}
+
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -0,0 +1,107 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_PARTIAL_FRACTION_H
+#define  GRID_QCD_PARTIAL_FRACTION_H
+
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    template<class Impl>
+    class PartialFractionFermion5D : public WilsonFermion5D<Impl>
+    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+      // Non-zero selects scale=2.0 in SetCoefficientsZolotarev, otherwise scale=1.0.
+      const int part_frac_chroma_convention=1;
+      // Shared implementations; dag is DaggerNo or DaggerYes.
+      void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
+      void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
+      void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
+      void   M_internal(const FermionField &in, FermionField &out,int dag);
+
+      // override multiply; both return norm2 of the output field
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);
+
+      // half checkerboard operations
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+
+      // force terms; default to Dhop on diagonal (three routines are declared here)
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+      virtual void   Instantiatable(void) =0; // pure virtual: keeps this class abstract until a subclass defines it
+
+      // Efficient support for multigrid coarsening
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+
+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+
+      // Constructors (the definition asserts that Ls is odd)
+      PartialFractionFermion5D(GaugeField &_Umu,
+			       GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       RealD _mass,RealD M5,const ImplParams &p= ImplParams());
+
+    protected:
+      // Coefficient setup; SetCoefficientsTanh forwards to SetCoefficientsZolotarev with zolo_hi = 1/scale.
+      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
+      virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
+
+      // Part frac
+      RealD mass;    // set by the constructor
+      RealD dw_diag; // 4.0 - M5, set in SetCoefficientsZolotarev
+      RealD R;       // (1+mass)/(1-mass), set in SetCoefficientsZolotarev
+      RealD amax;    // zolo_hi as passed to SetCoefficientsZolotarev
+      RealD scale;   // 2.0 or 1.0, see part_frac_chroma_convention
+      std::vector<double> p; // copy of zdata->alpha[0..da-1]
+      std::vector<double> q; // negated copy of zdata->ap[0..dd-1]
+
+    };
+
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/ScaledShamirFermion.h
+++ b/Grid/qcd/action/fermion/ScaledShamirFermion.h
@@ -0,0 +1,69 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ScaledShamirFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
+#define  GRID_QCD_SCALED_SHAMIR_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Mobius fermion specialised so that b+c = scale and b-c = 1.
+    template<class Impl>
+    class ScaledShamirFermion : public MobiusFermion<Impl> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+      // Forwards everything to MobiusFermion, deriving its two coefficient
+      // arguments from `scale`:
+      //   b + c = scale and b - c = 1  <=>  b = (scale+1)/2, c = (scale-1)/2
+      ScaledShamirFermion(GaugeField &_Umu,
+                          GridCartesian         &FiveDimGrid,
+                          GridRedBlackCartesian &FiveDimRedBlackGrid,
+                          GridCartesian         &FourDimGrid,
+                          GridRedBlackCartesian &FourDimRedBlackGrid,
+                          RealD _mass, RealD _M5,
+                          RealD scale, const ImplParams &p = ImplParams())
+        : MobiusFermion<Impl>(_Umu,
+                              FiveDimGrid,
+                              FiveDimRedBlackGrid,
+                              FourDimGrid,
+                              FourDimRedBlackGrid,
+                              _mass, _M5,
+                              0.5*(scale+1.0),  // b
+                              0.5*(scale-1.0),  // c
+                              p)
+      {}
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
+++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
@@ -0,0 +1,102 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: SchurDiagTwoKappa.h
+
+    Copyright (C) 2017
+
+Author: Christoph Lehner
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  _SCHUR_DIAG_TWO_KAPPA_H
+#define  _SCHUR_DIAG_TWO_KAPPA_H
+
+namespace Grid {
+
+  // This is specific to (Z)mobius fermions
+  // Site-wise rescaling by kappa[i] = 1/(2*(bs[i]*(4-M5)+1)) or its conjugate / inverse / conjugate-inverse.
+  template<class Matrix, class Field>
+    class KappaSimilarityTransform {
+  public:
+    INHERIT_IMPL_TYPES(Matrix);
+    std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
+    // Tabulates one coefficient of each kind per entry of zmob.bs.
+    KappaSimilarityTransform (Matrix &zmob) {
+      for (int i=0;i<(int)zmob.bs.size();i++) {
+	Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) );
+	kappa.push_back( k );
+	kappaDag.push_back( conj(k) );
+	kappaInv.push_back( 1.0 / k );
+	kappaInvDag.push_back( 1.0 / conj(k) );
+      }
+    }
+  // out[ss] = s[ss % Ls] * in[ss] with Ls = grid->_rdimensions[0]. NOTE(review): s must hold at least Ls entries; not checked here.
+  template<typename vobj>
+    void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) {
+    GridBase *grid=out._grid;
+    out.checkerboard = in.checkerboard;
+    assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
+    int Ls = grid->_rdimensions[0];
+    parallel_for(int ss=0;ss<grid->oSites();ss++){
+      vobj tmp = s[ss % Ls]*in._odata[ss];
+      vstream(out._odata[ss],tmp);
+    }
+  }
+  // Rescales `in` into `out` with the coefficient table s, then returns norm2(out).
+  RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) {
+    sscale(in,out,s);
+    return norm2(out);
+  }
+  // data() is used instead of &v[0], which is undefined on an empty table.
+  virtual RealD M       (const Field& in, Field& out) { return sscale_norm(in,out,kappa.data());   }
+  virtual RealD MDag    (const Field& in, Field& out) { return sscale_norm(in,out,kappaDag.data());}
+  virtual RealD MInv    (const Field& in, Field& out) { return sscale_norm(in,out,kappaInv.data());}
+  virtual RealD MInvDag (const Field& in, Field& out) { return sscale_norm(in,out,kappaInvDag.data());}
+  };
+
+  // Wraps SchurDiagTwoOperator (_Mat) between the kappa rescaling (_S) and its inverse.
+  template<class Matrix,class Field>
+    class SchurDiagTwoKappaOperator :  public SchurOperatorBase<Field> {
+  public:
+    KappaSimilarityTransform<Matrix, Field> _S;
+    SchurDiagTwoOperator<Matrix, Field> _Mat;
+
+    SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {}
+
+    // out = _S.M( _Mat.Mpc( _S.MInv(in) ) ); returns the value reported by the final _S.M stage.
+    virtual  RealD Mpc      (const Field &in, Field &out) {
+      Field scratch(in._grid);
+      _S.MInv(in,out);            // `out` doubles as the temporary for the first stage
+      _Mat.Mpc(out,scratch);
+      return _S.M(scratch,out);
+    }
+    // out = _S.MInvDag( _Mat.MpcDag( _S.MDag(in) ) ); returns the value reported by the final stage.
+    virtual  RealD MpcDag   (const Field &in, Field &out){
+      Field scratch(in._grid);
+      _S.MDag(in,out);            // `out` doubles as the temporary for the first stage
+      _Mat.MpcDag(out,scratch);
+      return _S.MInvDag(scratch,out);
+    }
+  };
+
+}
+
+#endif
--- a/Grid/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/ShamirZolotarevFermion.h
@@ -0,0 +1,69 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ShamirZolotarevFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
+#define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Mobius-Zolotarev fermion with the coefficient pair fixed to b=1, c=0.
+    template<class Impl>
+    class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+      // Forwards all arguments to MobiusZolotarevFermion, inserting the fixed
+      // coefficients between _M5 and the (lo, hi) pair:
+      //   b + c = 1 and b - c = 1  =>  b = 1, c = 0
+      ShamirZolotarevFermion(GaugeField &_Umu,
+                             GridCartesian         &FiveDimGrid,
+                             GridRedBlackCartesian &FiveDimRedBlackGrid,
+                             GridCartesian         &FourDimGrid,
+                             GridRedBlackCartesian &FourDimRedBlackGrid,
+                             RealD _mass, RealD _M5,
+                             RealD lo, RealD hi, const ImplParams &p = ImplParams())
+        : MobiusZolotarevFermion<Impl>(_Umu,
+                                       FiveDimGrid,
+                                       FiveDimRedBlackGrid,
+                                       FourDimGrid,
+                                       FourDimRedBlackGrid,
+                                       _mass, _M5,
+                                       1.0,  // b
+                                       0.0,  // c
+                                       lo, hi, p)
+      {}
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/StaggeredKernels.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernels.cc
@@ -0,0 +1,294 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/StaggeredKernels.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+namespace QCD {
+
+// Storage for the static kernel-selection switches declared in StaggeredKernelsStatic.
+// Opt is the value the DhopSite dispatcher switches on; it starts as OptGeneric.
+int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
+// Comms starts as CommsAndCompute; it is not read anywhere in this file.
+int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
+
+// One stencil leg for the combined (interior + exterior) kernel.
+// Expands in a scope that must already provide: st, ptype, sF, sU, SE, chi, chi_p,
+// in, buf and Uchi. It looks up stencil entry Dir+skew for site sF and takes the
+// neighbour spinor from
+//   - a permuted copy placed in chi     (local entry with _permute set),
+//   - in._odata[SE->_offset] directly   (local entry, no permute),
+//   - buf[SE->_offset]                  (non-local entry),
+// then calls multLink(Uchi, U._odata[sU], *chi_p, Dir). Note that the direction
+// handed to multLink is Dir, without the skew.
+// The expansion is a sequence of statements, not a single one, so it must not be
+// used as the unbraced body of an if/for.
+#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
+  SE = st.GetEntry(ptype, Dir+skew, sF);			\
+  if (SE->_is_local ) {						\
+    if (SE->_permute) {						\
+      chi_p = &chi;						\
+      permute(chi,  in._odata[SE->_offset], ptype);		\
+    } else {							\
+      chi_p = &in._odata[SE->_offset];				\
+    }								\
+  } else {							\
+    chi_p = &buf[SE->_offset];					\
+  }								\
+  multLink(Uchi, U._odata[sU], *chi_p, Dir);			
+
+// Interior-only variant of the stencil leg. Needs the same caller-scope names
+// (st, ptype, sF, sU, SE, chi, chi_p, in, buf, Uchi).
+// The leg contributes only when the stencil entry is local or st.same_node[Dir]
+// is set; otherwise chi_p is left untouched and multLink is skipped.
+// Note that same_node is indexed by Dir, not by Dir+skew.
+#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
+  SE = st.GetEntry(ptype, Dir+skew, sF);			\
+  if (SE->_is_local ) {						\
+    if (SE->_permute) {						\
+      chi_p = &chi;						\
+      permute(chi,  in._odata[SE->_offset], ptype);		\
+    } else {							\
+      chi_p = &in._odata[SE->_offset];				\
+    }								\
+  } else if ( st.same_node[Dir] ) {				\
+    chi_p = &buf[SE->_offset];					\
+  }								\
+  if (SE->_is_local || st.same_node[Dir] ) {			\
+    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
+  }
+
+// Exterior-only variant of the stencil leg. The leg contributes only when the
+// entry is not local and st.same_node[Dir] is false; the neighbour then comes from
+// buf[SE->_offset]. Each contributing leg increments nmu, so the expanding scope
+// must provide nmu in addition to st, ptype, sF, sU, SE, chi_p, buf and Uchi.
+#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
+  SE = st.GetEntry(ptype, Dir+skew, sF);			\
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+    nmu++;							\
+    chi_p = &buf[SE->_offset];					\
+    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
+  }
+
+// Constructor: hands the implementation parameters straight to the
+// FermionOperator<Impl> base (aliased as Base in the class).
+template <class Impl>
+StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p)
+  : Base(p)
+{
+}
+
+////////////////////////////////////////////////////////////////////////////////////
+// Generic implementation; move to different file?
+// Int, Ext, Int+Ext cases for comms overlap
+////////////////////////////////////////////////////////////////////////////////////
+// Combined kernel: for every s in [0,LLs) it works on site sF = LLs*sU + s,
+// sums eight legs through U (stencil entries Dir+0) and eight legs through UUU
+// (stencil entries Dir+8), and streams the result into out._odata[sF].
+// The very first leg uses multLink rather than multLinkAdd and there is no
+// explicit zeroing of Uchi, so multLink presumably overwrites it.
+// A non-zero dag negates the result. lo is not used by this routine.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
+                                             DoubledGaugeField &U, DoubledGaugeField &UUU,
+                                             SiteSpinor *buf, int LLs, int sU,
+                                             const FermionField &in, FermionField &out, int dag)
+{
+  // These names are referenced directly by the GENERIC_STENCIL_LEG expansion.
+  const SiteSpinor *chi_p;
+  SiteSpinor chi;
+  SiteSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+
+  const int skewU   = 0;   // stencil offset of the legs multiplied by U
+  const int skewUUU = 8;   // stencil offset of the legs multiplied by UUU
+
+  for (int s = 0; s < LLs; ++s) {
+    const int sF = LLs * sU + s;
+
+    GENERIC_STENCIL_LEG(U,   Xp, skewU,   Impl::multLink);
+    GENERIC_STENCIL_LEG(U,   Yp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,   Zp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,   Tp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,   Xm, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,   Ym, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,   Zm, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,   Tm, skewU,   Impl::multLinkAdd);
+
+    GENERIC_STENCIL_LEG(UUU, Xp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU, Yp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU, Zp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU, Tp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU, Xm, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU, Ym, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU, Zm, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU, Tm, skewUUU, Impl::multLinkAdd);
+
+    if (dag) {
+      Uchi = -Uchi;
+    }
+    vstream(out._odata[sF], Uchi);
+  }
+}
+
+  ///////////////////////////////////////////////////
+  // Only contributions from interior of our node
+  ///////////////////////////////////////////////////
+// Interior pass: same sixteen legs as DhopSiteGeneric, but each leg only
+// contributes when GENERIC_STENCIL_LEG_INT accepts it (local entry, or
+// st.same_node[Dir] set). Uchi is zeroed for every s because any leg, including
+// the first, may be skipped. The result (negated when dag is non-zero) is
+// streamed into out._odata[sF], overwriting what was there.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
+                                                DoubledGaugeField &U, DoubledGaugeField &UUU,
+                                                SiteSpinor *buf, int LLs, int sU,
+                                                const FermionField &in, FermionField &out, int dag)
+{
+  // These names are referenced directly by the GENERIC_STENCIL_LEG_INT expansion.
+  const SiteSpinor *chi_p;
+  SiteSpinor chi;
+  SiteSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+
+  const int skewU   = 0;   // stencil offset of the legs multiplied by U
+  const int skewUUU = 8;   // stencil offset of the legs multiplied by UUU
+
+  for (int s = 0; s < LLs; ++s) {
+    const int sF = LLs * sU + s;
+    Uchi = zero;
+
+    GENERIC_STENCIL_LEG_INT(U,   Xp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,   Yp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,   Zp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,   Tp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,   Xm, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,   Ym, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,   Zm, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,   Tm, skewU,   Impl::multLinkAdd);
+
+    GENERIC_STENCIL_LEG_INT(UUU, Xp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU, Yp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU, Zp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU, Tp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU, Xm, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU, Ym, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU, Zm, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU, Tm, skewUUU, Impl::multLinkAdd);
+
+    if (dag) {
+      Uchi = -Uchi;
+    }
+    vstream(out._odata[sF], Uchi);
+  }
+}
+
+
+  ///////////////////////////////////////////////////
+  // Only contributions from exterior of our node
+  ///////////////////////////////////////////////////
+// Exterior pass: accumulates only the legs accepted by GENERIC_STENCIL_LEG_EXT
+// (entry not local and st.same_node[Dir] false) and adds the sum to, or for a
+// non-zero dag subtracts it from, the value already in out._odata[sF].
+// nmu counts the accepted legs. It is initialised once, before the s loop, and is
+// never reset, so once any leg has been accepted the update of out is performed
+// for every remaining s as well (with Uchi equal to zero if no leg fired there).
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
+                                                DoubledGaugeField &U, DoubledGaugeField &UUU,
+                                                SiteSpinor *buf, int LLs, int sU,
+                                                const FermionField &in, FermionField &out, int dag)
+{
+  // These names are referenced directly by the GENERIC_STENCIL_LEG_EXT expansion.
+  const SiteSpinor *chi_p;
+  SiteSpinor chi;
+  SiteSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  int nmu = 0;
+
+  const int skewU   = 0;   // stencil offset of the legs multiplied by U
+  const int skewUUU = 8;   // stencil offset of the legs multiplied by UUU
+
+  for (int s = 0; s < LLs; ++s) {
+    const int sF = LLs * sU + s;
+    Uchi = zero;
+
+    GENERIC_STENCIL_LEG_EXT(U,   Xp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,   Yp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,   Zp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,   Tp, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,   Xm, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,   Ym, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,   Zm, skewU,   Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,   Tm, skewU,   Impl::multLinkAdd);
+
+    GENERIC_STENCIL_LEG_EXT(UUU, Xp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU, Yp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU, Zp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU, Tp, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU, Xm, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU, Ym, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU, Zm, skewUUU, Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU, Tm, skewUUU, Impl::multLinkAdd);
+
+    if (nmu) {
+      if (dag) {
+        out._odata[sF] = out._odata[sF] - Uchi;
+      } else {
+        out._odata[sF] = out._odata[sF] + Uchi;
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////
+// Driving / wrapping routine to select right kernel
+////////////////////////////////////////////////////////////////////////////////////
+
+// Daggered entry point: forwards to the three-flag DhopSite overload with the
+// dag flag fixed to 1.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
+                                         DoubledGaugeField &U, DoubledGaugeField &UUU,
+                                         SiteSpinor *buf, int LLs, int sU,
+                                         const FermionField &in, FermionField &out,
+                                         int interior, int exterior)
+{
+  const int daggered = 1;
+  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, daggered, interior, exterior);
+}
+
+// Non-daggered entry point: forwards to the three-flag DhopSite overload with
+// the dag flag fixed to 0.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
+                                      DoubledGaugeField &U, DoubledGaugeField &UUU,
+                                      SiteSpinor *buf, int LLs, int sU,
+                                      const FermionField &in, FermionField &out,
+                                      int interior, int exterior)
+{
+  const int notDaggered = 0;
+  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, notDaggered, interior, exterior);
+}
+
+// Dispatcher: picks the kernel family from the static Opt switch and, within a
+// family, the variant from the interior/exterior flags:
+//   both set       -> combined kernel
+//   interior only  -> *Int kernel
+//   exterior only  -> *Ext kernel
+//   neither        -> nothing is called (hand-unrolled and generic families)
+// The assembly family is only compiled in when AVX512 is defined and supports
+// just the combined case; anything else logs an error and asserts.
+// An Opt value with no matching case prints a message and asserts.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
+                                      DoubledGaugeField &U, DoubledGaugeField &UUU,
+                                      SiteSpinor *buf, int LLs, int sU,
+                                      const FermionField &in, FermionField &out,
+                                      int dag, int interior, int exterior)
+{
+  const bool wantInterior = (interior != 0);
+  const bool wantExterior = (exterior != 0);
+  const bool wantBoth     = wantInterior && wantExterior;
+
+  switch (Opt) {
+#ifdef AVX512
+  case OptInlineAsm:
+    if (!wantBoth) {
+      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
+      assert(0);
+    } else {
+      DhopSiteAsm(st, lo, U, UUU, buf, LLs, sU, in, out, dag);
+    }
+    break;
+#endif
+
+  case OptHandUnroll:
+    if (wantBoth) {
+      DhopSiteHand(st, lo, U, UUU, buf, LLs, sU, in, out, dag);
+    } else if (wantInterior) {
+      DhopSiteHandInt(st, lo, U, UUU, buf, LLs, sU, in, out, dag);
+    } else if (wantExterior) {
+      DhopSiteHandExt(st, lo, U, UUU, buf, LLs, sU, in, out, dag);
+    }
+    break;
+
+  case OptGeneric:
+    if (wantBoth) {
+      DhopSiteGeneric(st, lo, U, UUU, buf, LLs, sU, in, out, dag);
+    } else if (wantInterior) {
+      DhopSiteGenericInt(st, lo, U, UUU, buf, LLs, sU, in, out, dag);
+    } else if (wantExterior) {
+      DhopSiteGenericExt(st, lo, U, UUU, buf, LLs, sU, in, out, dag);
+    }
+    break;
+
+  default:
+    std::cout<<"Oops Opt = "<<Opt<<std::endl;
+    assert(0);
+    break;
+  }
+}
+
+// Single-direction hop. Not implemented for the staggered kernels: the body is an
+// unconditional assert(0), so any call aborts in builds where assert is active.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,  DoubledGaugeField &UUU, SiteSpinor *buf, int sF,
+				      int sU, const FermionField &in, FermionField &out, int dir, int disp) 
+{
+  // Notes left for a future implementation:
+  //  - disp should be one of +1, -1, +3, -3;
+  //  - open question: how should "dag" be handled?
+  //  - the stated motivation is "because we work out pU . dS/dU".
+  assert(0);
+}
+
+// Instantiation macros defined elsewhere in the project; presumably they expand to
+// explicit instantiations of StaggeredKernels for each supported staggered Impl
+// (including the Vec5d variants) -- confirm against their definitions.
+FermOpStaggeredTemplateInstantiate(StaggeredKernels);
+FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+
+}}
+
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -0,0 +1,122 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/StaggeredKernels.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_STAGGERED_KERNELS_H
+#define GRID_QCD_STAGGERED_KERNELS_H
+
+namespace Grid {
+namespace QCD {
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Helper routines that implement Staggered stencil for a single site.
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Non-template holder for the kernel-selection switches. StaggeredKernels<Impl>
+// derives from it, so one pair of static ints serves every instantiation.
+class StaggeredKernelsStatic { 
+ public:
+  // Values for Opt: which implementation family DhopSite dispatches to.
+  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
+  // Values for Comms.
+  enum { CommsAndCompute, CommsThenCompute };
+  static int Opt;    // storage and initial value live in StaggeredKernels.cc
+  static int Comms;  // storage and initial value live in StaggeredKernels.cc
+};
+ 
+// Per-site staggered stencil kernels. Inherits the fermion-operator interface
+// from FermionOperator<Impl> and the static Opt/Comms switches from
+// StaggeredKernelsStatic.
+template<class Impl>
+class StaggeredKernels : public FermionOperator<Impl>,
+                         public StaggeredKernelsStatic
+{
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  typedef FermionOperator<Impl> Base;
+
+  // Single-direction hop (dir, disp).
+  void DhopDir(StencilImpl &st,
+               DoubledGaugeField &U, DoubledGaugeField &UUU,
+               SiteSpinor * buf, int sF, int sU,
+               const FermionField &in, FermionField &out,
+               int dir, int disp);
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Generic Nc kernels: combined, interior-only and exterior-only passes
+  ///////////////////////////////////////////////////////////////////////////////////////
+  void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
+                       DoubledGaugeField &U, DoubledGaugeField &UUU,
+                       SiteSpinor * buf, int LLs, int sU,
+                       const FermionField &in, FermionField &out, int dag);
+
+  void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
+                          DoubledGaugeField &U, DoubledGaugeField &UUU,
+                          SiteSpinor * buf, int LLs, int sU,
+                          const FermionField &in, FermionField &out, int dag);
+
+  void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
+                          DoubledGaugeField &U, DoubledGaugeField &UUU,
+                          SiteSpinor * buf, int LLs, int sU,
+                          const FermionField &in, FermionField &out, int dag);
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Nc=3 specific kernels (hand unrolled): combined, interior-only, exterior-only
+  ///////////////////////////////////////////////////////////////////////////////////////
+  void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
+                    DoubledGaugeField &U, DoubledGaugeField &UUU,
+                    SiteSpinor * buf, int LLs, int sU,
+                    const FermionField &in, FermionField &out, int dag);
+
+  void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
+                       DoubledGaugeField &U, DoubledGaugeField &UUU,
+                       SiteSpinor * buf, int LLs, int sU,
+                       const FermionField &in, FermionField &out, int dag);
+
+  void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
+                       DoubledGaugeField &U, DoubledGaugeField &UUU,
+                       SiteSpinor * buf, int LLs, int sU,
+                       const FermionField &in, FermionField &out, int dag);
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Asm Nc=3 specific kernel (combined pass only)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
+                   DoubledGaugeField &U, DoubledGaugeField &UUU,
+                   SiteSpinor * buf, int LLs, int sU,
+                   const FermionField &in, FermionField &out, int dag);
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Generic interface; fans out to the right routine.
+  // The two convenience overloads default to interior=1, exterior=1; the last
+  // overload takes the dag flag explicitly and has no defaults.
+  ///////////////////////////////////////////////////////////////////////////////////////
+  void DhopSite(StencilImpl &st, LebesgueOrder &lo,
+                DoubledGaugeField &U, DoubledGaugeField &UUU,
+                SiteSpinor * buf, int LLs, int sU,
+                const FermionField &in, FermionField &out,
+                int interior=1, int exterior=1);
+
+  void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
+                   DoubledGaugeField &U, DoubledGaugeField &UUU,
+                   SiteSpinor * buf, int LLs, int sU,
+                   const FermionField &in, FermionField &out,
+                   int interior=1, int exterior=1);
+
+  void DhopSite(StencilImpl &st, LebesgueOrder &lo,
+                DoubledGaugeField &U, DoubledGaugeField &UUU,
+                SiteSpinor * buf, int LLs, int sU,
+                const FermionField &in, FermionField &out,
+                int dag, int interior, int exterior);
+
+  // Forwards p to the FermionOperator<Impl> base.
+  StaggeredKernels(const ImplParams &p = ImplParams());
+
+};
+    
+}}
+
+#endif
--- a/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
@@ -0,0 +1,968 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/StaggeredKernelsAsm.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+
+#ifdef AVX512
+#include <simd/Intel512common.h>
+#include <simd/Intel512avx.h>
+#endif
+
+// Interleave operations from two directions.
+// This looks just like a two-spin multiply and reuses the same sequence as the
+// Wilson kernel, but the spin index becomes a mu index instead.
+//
+// Register map used by the MULT_* macros below.
+// Chi_<m><k>: input registers zmm0..zmm11, four groups (m = 0..3) of three.
+#define Chi_00 %zmm0
+#define Chi_01 %zmm1
+#define Chi_02 %zmm2
+#define Chi_10 %zmm3
+#define Chi_11 %zmm4
+#define Chi_12 %zmm5
+#define Chi_20 %zmm6
+#define Chi_21 %zmm7
+#define Chi_22 %zmm8
+#define Chi_30 %zmm9
+#define Chi_31 %zmm10
+#define Chi_32 %zmm11
+
+// UChi_<m><j>: result registers zmm12..zmm23, laid out like the Chi groups.
+#define UChi_00 %zmm12
+#define UChi_01 %zmm13
+#define UChi_02 %zmm14
+#define UChi_10 %zmm15
+#define UChi_11 %zmm16
+#define UChi_12 %zmm17
+#define UChi_20 %zmm18
+#define UChi_21 %zmm19
+#define UChi_22 %zmm20
+#define UChi_30 %zmm21
+#define UChi_31 %zmm22
+#define UChi_32 %zmm23
+
+// p-prefixed spellings of the same registers with a doubled '%'.
+// NOTE(review): presumably for asm templates that carry operands, where a literal
+// '%' has to be written as '%%' -- confirm at the use sites.
+#define pChi_00 %%zmm0
+#define pChi_01 %%zmm1
+#define pChi_02 %%zmm2
+#define pChi_10 %%zmm3
+#define pChi_11 %%zmm4
+#define pChi_12 %%zmm5
+#define pChi_20 %%zmm6
+#define pChi_21 %%zmm7
+#define pChi_22 %%zmm8
+#define pChi_30 %%zmm9
+#define pChi_31 %%zmm10
+#define pChi_32 %%zmm11
+
+#define pUChi_00 %%zmm12
+#define pUChi_01 %%zmm13
+#define pUChi_02 %%zmm14
+#define pUChi_10 %%zmm15
+#define pUChi_11 %%zmm16
+#define pUChi_12 %%zmm17
+#define pUChi_20 %%zmm18
+#define pUChi_21 %%zmm19
+#define pUChi_22 %%zmm20
+#define pUChi_30 %%zmm21
+#define pUChi_31 %%zmm22
+#define pUChi_32 %%zmm23
+
+// Temporaries T0..T3: zmm24..zmm27.
+#define T0 %zmm24
+#define T1 %zmm25
+#define T2 %zmm26
+#define T3 %zmm27
+
+// Z00 is the same register as T2 (zmm26) and Z10 the same as T3 (zmm27), so a
+// macro cannot keep live values in T2/T3 and in Z00/Z10 (or Z0) at the same time.
+#define Z00 %zmm26
+#define Z10 %zmm27
+#define Z0 Z00
+#define Z1 %zmm28
+#define Z2 %zmm29
+
+// Z5 and Z6 are not fresh registers: they are Chi_31 (zmm10) and Chi_32 (zmm11).
+// Any macro that writes Z5 or Z6 overwrites those two Chi registers.
+#define Z3 %zmm30
+#define Z4 %zmm31
+#define Z5 Chi_31
+#define Z6 Chi_32
+
+// MULT_ADD_LS(g0,g1,g2,g3): four link-times-vector products in one pass.
+// g0..g3 are loaded into r8..r11; gm is paired with inputs Chi_m0..Chi_m2 and
+// results UChi_m0..UChi_m2 (m = 0..3). Memory slot 3*j+k of each link combines
+// input component k into output component j.
+// Every arithmetic step is a VMADDSUB*DUP macro (defined in the Intel512 headers,
+// not shown here), so UChi_* is presumably accumulated into rather than
+// overwritten -- the caller is expected to have initialised it.
+// T0..T3 receive the VSHUF'ed copies of the Chi registers and are clobbered.
+// NOTE(review): the pointers are moved into r8..r11 by one asm statement and
+// consumed by a second one that declares no operands or clobbers; nothing visible
+// here tells the compiler to keep r8..r11 (or the zmm registers) intact between
+// and across the two statements.
+#define MULT_ADD_LS(g0,g1,g2,g3)					\
+  asm ( "movq %0, %%r8 \n\t"					\
+	"movq %1, %%r9 \n\t"						\
+        "movq %2, %%r10 \n\t"						\
+        "movq %3, %%r11 \n\t" :  : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
+  asm (									\
+  VSHUF(Chi_00,T0)      VSHUF(Chi_10,T1)				\
+  VSHUF(Chi_20,T2)      VSHUF(Chi_30,T3)				\
+  VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10)		\
+  VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11)		\
+  VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12)		\
+  VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30)		\
+  VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31)		\
+  VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32)		\
+  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
+  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
+  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
+  VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
+  VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
+  VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
+  VSHUF(Chi_01,T0)	  VSHUF(Chi_11,T1)				\
+  VSHUF(Chi_21,T2)	  VSHUF(Chi_31,T3)				\
+  VMADDSUBIDUP(1,%r8,T0,UChi_00)     VMADDSUBIDUP(1,%r9,T1,UChi_10)	\
+  VMADDSUBIDUP(4,%r8,T0,UChi_01)     VMADDSUBIDUP(4,%r9,T1,UChi_11)	\
+  VMADDSUBIDUP(7,%r8,T0,UChi_02)     VMADDSUBIDUP(7,%r9,T1,UChi_12)	\
+  VMADDSUBIDUP(1,%r10,T2,UChi_20)     VMADDSUBIDUP(1,%r11,T3,UChi_30)	\
+  VMADDSUBIDUP(4,%r10,T2,UChi_21)     VMADDSUBIDUP(4,%r11,T3,UChi_31)	\
+  VMADDSUBIDUP(7,%r10,T2,UChi_22)     VMADDSUBIDUP(7,%r11,T3,UChi_32)	\
+  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
+  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
+  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
+  VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
+  VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
+  VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
+  VSHUF(Chi_02,T0)    VSHUF(Chi_12,T1)					\
+  VSHUF(Chi_22,T2)    VSHUF(Chi_32,T3)					\
+  VMADDSUBIDUP(2,%r8,T0,UChi_00)     VMADDSUBIDUP(2,%r9,T1,UChi_10)     \
+  VMADDSUBIDUP(5,%r8,T0,UChi_01)     VMADDSUBIDUP(5,%r9,T1,UChi_11)     \
+  VMADDSUBIDUP(8,%r8,T0,UChi_02)     VMADDSUBIDUP(8,%r9,T1,UChi_12)     \
+  VMADDSUBIDUP(2,%r10,T2,UChi_20)     VMADDSUBIDUP(2,%r11,T3,UChi_30)     \
+  VMADDSUBIDUP(5,%r10,T2,UChi_21)     VMADDSUBIDUP(5,%r11,T3,UChi_31)     \
+  VMADDSUBIDUP(8,%r10,T2,UChi_22)     VMADDSUBIDUP(8,%r11,T3,UChi_32)     \
+  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
+  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
+  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
+  VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
+  VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
+  VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
+
+// MULT_LS(g0,g1,g2,g3): same register assignment and slot pattern as MULT_ADD_LS
+// (g0..g3 in r8..r11, gm paired with Chi_m* / UChi_m*, slot 3*j+k combining input
+// component k into output component j). The only difference is the first step for
+// slots 0, 3 and 6, which uses VMULIDUP instead of VMADDSUBIDUP; presumably that
+// overwrites UChi_* so no prior initialisation is needed -- confirm against the
+// VMULIDUP definition in the Intel512 headers.
+// T0..T3 are clobbered.
+// NOTE(review): as in MULT_ADD_LS, the second asm statement relies on r8..r11
+// surviving from the first and declares no operands or clobbers.
+#define MULT_LS(g0,g1,g2,g3)					\
+  asm ( "movq %0, %%r8 \n\t"					\
+	"movq %1, %%r9 \n\t"						\
+        "movq %2, %%r10 \n\t"						\
+        "movq %3, %%r11 \n\t" :  : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
+  asm (									\
+  VSHUF(Chi_00,T0)      VSHUF(Chi_10,T1)				\
+  VSHUF(Chi_20,T2)      VSHUF(Chi_30,T3)				\
+  VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10)		\
+  VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11)		\
+  VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12)		\
+  VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30)		\
+  VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31)		\
+  VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32)		\
+  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
+  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
+  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
+  VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
+  VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
+  VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
+  VSHUF(Chi_01,T0)	  VSHUF(Chi_11,T1)				\
+  VSHUF(Chi_21,T2)	  VSHUF(Chi_31,T3)				\
+  VMADDSUBIDUP(1,%r8,T0,UChi_00)     VMADDSUBIDUP(1,%r9,T1,UChi_10)	\
+  VMADDSUBIDUP(4,%r8,T0,UChi_01)     VMADDSUBIDUP(4,%r9,T1,UChi_11)	\
+  VMADDSUBIDUP(7,%r8,T0,UChi_02)     VMADDSUBIDUP(7,%r9,T1,UChi_12)	\
+  VMADDSUBIDUP(1,%r10,T2,UChi_20)     VMADDSUBIDUP(1,%r11,T3,UChi_30)	\
+  VMADDSUBIDUP(4,%r10,T2,UChi_21)     VMADDSUBIDUP(4,%r11,T3,UChi_31)	\
+  VMADDSUBIDUP(7,%r10,T2,UChi_22)     VMADDSUBIDUP(7,%r11,T3,UChi_32)	\
+  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
+  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
+  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
+  VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
+  VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
+  VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
+  VSHUF(Chi_02,T0)    VSHUF(Chi_12,T1)					\
+  VSHUF(Chi_22,T2)    VSHUF(Chi_32,T3)					\
+  VMADDSUBIDUP(2,%r8,T0,UChi_00)     VMADDSUBIDUP(2,%r9,T1,UChi_10)     \
+  VMADDSUBIDUP(5,%r8,T0,UChi_01)     VMADDSUBIDUP(5,%r9,T1,UChi_11)     \
+  VMADDSUBIDUP(8,%r8,T0,UChi_02)     VMADDSUBIDUP(8,%r9,T1,UChi_12)     \
+  VMADDSUBIDUP(2,%r10,T2,UChi_20)     VMADDSUBIDUP(2,%r11,T3,UChi_30)     \
+  VMADDSUBIDUP(5,%r10,T2,UChi_21)     VMADDSUBIDUP(5,%r11,T3,UChi_31)     \
+  VMADDSUBIDUP(8,%r10,T2,UChi_22)     VMADDSUBIDUP(8,%r11,T3,UChi_32)     \
+  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
+  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
+  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
+  VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
+  VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
+  VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
+
+// MULT_ADD_XYZTa(g0,g1): two link-times-vector products per invocation.
+// g0 (in r8) is paired with Chi_00..Chi_02 / UChi_00..UChi_02 and g1 (in r9) with
+// Chi_10..Chi_12 / UChi_10..UChi_12. Link slots are first moved into Z0..Z2
+// (VMOVIDUP) or Z3..Z5 (VMOVRDUP) and then combined with VMADDSUB; slot 3*j+k
+// combines input component k into output component j. All updates are VMADDSUB,
+// so UChi_* is presumably accumulated into (macro bodies are not shown here).
+// Clobbers T0, T1 and Z0..Z5. Z0 is zmm26 (the T2 register) and Z5 is Chi_31
+// (zmm10), so both of those are overwritten as well.
+// NOTE(review): the second asm statement relies on r8/r9 surviving from the first
+// and declares no operands or clobbers.
+#define MULT_ADD_XYZTa(g0,g1)					\
+  asm ( "movq %0, %%r8 \n\t"					\
+	"movq %1, %%r9 \n\t"	 :  : "r"(g0), "r"(g1) : "%r8","%r9");\
+	   __asm__ (						\
+	   VSHUF(Chi_00,T0)				\
+	   VSHUF(Chi_10,T1)						\
+	   VMOVIDUP(0,%r8,Z0 )						\
+           VMOVIDUP(3,%r8,Z1 )						\
+           VMOVIDUP(6,%r8,Z2 )						\
+           VMADDSUB(Z0,T0,UChi_00)					\
+	   VMADDSUB(Z1,T0,UChi_01)					\
+	   VMADDSUB(Z2,T0,UChi_02)					\
+									\
+	   VMOVIDUP(0,%r9,Z0 )						\
+           VMOVIDUP(3,%r9,Z1 )						\
+           VMOVIDUP(6,%r9,Z2 )						\
+           VMADDSUB(Z0,T1,UChi_10)					\
+           VMADDSUB(Z1,T1,UChi_11)            \
+           VMADDSUB(Z2,T1,UChi_12)            \
+	   							\
+								\
+	   VMOVRDUP(0,%r8,Z3 )					\
+	   VMOVRDUP(3,%r8,Z4 )					\
+	   VMOVRDUP(6,%r8,Z5 )					\
+           VMADDSUB(Z3,Chi_00,UChi_00)/*rr * ir = ri rr*/	\
+           VMADDSUB(Z4,Chi_00,UChi_01)				\
+           VMADDSUB(Z5,Chi_00,UChi_02)				\
+								\
+	   VMOVRDUP(0,%r9,Z3 )					\
+	   VMOVRDUP(3,%r9,Z4 )					\
+	   VMOVRDUP(6,%r9,Z5 )					\
+           VMADDSUB(Z3,Chi_10,UChi_10)				\
+           VMADDSUB(Z4,Chi_10,UChi_11)\
+           VMADDSUB(Z5,Chi_10,UChi_12)				\
+	   							\
+								\
+	   VMOVIDUP(1,%r8,Z0 )					\
+	   VMOVIDUP(4,%r8,Z1 )					\
+	   VMOVIDUP(7,%r8,Z2 )					\
+	   VSHUF(Chi_01,T0)					\
+           VMADDSUB(Z0,T0,UChi_00)				\
+           VMADDSUB(Z1,T0,UChi_01)				\
+           VMADDSUB(Z2,T0,UChi_02)				\
+								\
+	   VMOVIDUP(1,%r9,Z0 )					\
+	   VMOVIDUP(4,%r9,Z1 )					\
+	   VMOVIDUP(7,%r9,Z2 )					\
+	   VSHUF(Chi_11,T1)					\
+           VMADDSUB(Z0,T1,UChi_10)				\
+           VMADDSUB(Z1,T1,UChi_11)				\
+           VMADDSUB(Z2,T1,UChi_12)				\
+								\
+	   VMOVRDUP(1,%r8,Z3 )					\
+	   VMOVRDUP(4,%r8,Z4 )					\
+	   VMOVRDUP(7,%r8,Z5 )					\
+           VMADDSUB(Z3,Chi_01,UChi_00)				\
+           VMADDSUB(Z4,Chi_01,UChi_01)				\
+           VMADDSUB(Z5,Chi_01,UChi_02)				\
+								\
+	   VMOVRDUP(1,%r9,Z3 )					\
+	   VMOVRDUP(4,%r9,Z4 )					\
+	   VMOVRDUP(7,%r9,Z5 )					\
+           VMADDSUB(Z3,Chi_11,UChi_10)				\
+           VMADDSUB(Z4,Chi_11,UChi_11)				\
+           VMADDSUB(Z5,Chi_11,UChi_12)				\
+	   							\
+	   VSHUF(Chi_02,T0)					\
+	   VSHUF(Chi_12,T1)					\
+	   VMOVIDUP(2,%r8,Z0 )					\
+	   VMOVIDUP(5,%r8,Z1 )					\
+	   VMOVIDUP(8,%r8,Z2 )					\
+           VMADDSUB(Z0,T0,UChi_00)				\
+           VMADDSUB(Z1,T0,UChi_01)			      \
+           VMADDSUB(Z2,T0,UChi_02)			      \
+	   VMOVIDUP(2,%r9,Z0 )					\
+	   VMOVIDUP(5,%r9,Z1 )					\
+	   VMOVIDUP(8,%r9,Z2 )					\
+           VMADDSUB(Z0,T1,UChi_10)			      \
+           VMADDSUB(Z1,T1,UChi_11)			      \
+           VMADDSUB(Z2,T1,UChi_12)			      \
+	   /*55*/					      \
+	   VMOVRDUP(2,%r8,Z3 )		  \
+	   VMOVRDUP(5,%r8,Z4 )					\
+	   VMOVRDUP(8,%r8,Z5 )				      \
+           VMADDSUB(Z3,Chi_02,UChi_00)			      \
+           VMADDSUB(Z4,Chi_02,UChi_01)			      \
+           VMADDSUB(Z5,Chi_02,UChi_02)			      \
+	   VMOVRDUP(2,%r9,Z3 )		  \
+	   VMOVRDUP(5,%r9,Z4 )					\
+	   VMOVRDUP(8,%r9,Z5 )				      \
+           VMADDSUB(Z3,Chi_12,UChi_10)			      \
+           VMADDSUB(Z4,Chi_12,UChi_11)			      \
+           VMADDSUB(Z5,Chi_12,UChi_12)			      \
+	   /*61 insns*/							);
+
+// MULT_ADD_XYZT(g0,g1): alternative scheduling of the two-direction
+// multiply-accumulate. g0 (in r8) is paired with Chi_0* / UChi_0* and g1 (in r9)
+// with Chi_1* / UChi_1*.
+// One set of partial products is built up in Z1,Z3,Z5 (for g0) and Z2,Z4,Z6 (for
+// g1) via VMUL/VMADD and finished with VMADDSUBMEM; a second set goes straight
+// into UChi_* via VMADDMEM. The two are summed by the closing VADDs.
+// Register side effects visible in this sequence:
+//   - VIDUP(Chi_xk,Chi_xk) names the same register twice, so the Chi_0*/Chi_1*
+//     inputs are presumably overwritten in place -- confirm against the VIDUP
+//     definition;
+//   - Z00/Z10 are zmm26/zmm27 (the T2/T3 registers) and are clobbered;
+//   - Z5/Z6 are Chi_31/Chi_32 (zmm10/zmm11) and are clobbered;
+//   - T0, T1 and Z1..Z4 are clobbered.
+// NOTE(review): the second asm statement relies on r8/r9 surviving from the first
+// and declares no operands or clobbers.
+#define MULT_ADD_XYZT(g0,g1)					\
+  asm ( "movq %0, %%r8 \n\t"					\
+	"movq %1, %%r9 \n\t"	 :  : "r"(g0), "r"(g1) : "%r8","%r9");\
+  __asm__ (							  \
+  VSHUFMEM(0,%r8,Z00)		   VSHUFMEM(0,%r9,Z10)			\
+  VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)	          \
+   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
+   VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
+   VSHUFMEM(3,%r8,Z00)	      VSHUFMEM(3,%r9,Z10)		  \
+   VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
+   VSHUFMEM(6,%r8,Z00)	      VSHUFMEM(6,%r9,Z10)		  \
+   VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
+   VMADDMEM(0,%r8,T0,UChi_00)  VMADDMEM(0,%r9,T1,UChi_10)		  \
+   VMADDMEM(3,%r8,T0,UChi_01)  VMADDMEM(3,%r9,T1,UChi_11)		  \
+   VMADDMEM(6,%r8,T0,UChi_02)  VMADDMEM(6,%r9,T1,UChi_12)		  \
+   VSHUFMEM(1,%r8,Z00)	      VSHUFMEM(1,%r9,Z10)		  \
+   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
+   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
+   VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
+   VSHUFMEM(4,%r8,Z00)	      VSHUFMEM(4,%r9,Z10)		  \
+   VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
+   VSHUFMEM(7,%r8,Z00)	      VSHUFMEM(7,%r9,Z10)		  \
+   VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
+   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
+   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
+   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
+   VSHUFMEM(2,%r8,Z00)	      VSHUFMEM(2,%r9,Z10)			\
+   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)			\
+   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)			\
+   VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
+   VSHUFMEM(5,%r8,Z00)	      VSHUFMEM(5,%r9,Z10)		  \
+   VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
+   VSHUFMEM(8,%r8,Z00)	      VSHUFMEM(8,%r9,Z10)		  \
+   VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
+   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
+   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)	          \
+   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)	       \
+   VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
+   VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
+   VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
+
+// MULT_XYZT(g0,g1): start a fresh accumulation for two stencil legs at once.
+// The first asm statement copies g0 into %r8 and g1 into %r9; the second one
+// reads nine elements (indices 0..8) through each pointer and combines them
+// with the Chi_0x registers (leg 0, via %r8) and Chi_1x registers (leg 1, via
+// %r9). The first products are formed with VMUL, so UChi_00..02 and
+// UChi_10..12 are overwritten rather than accumulated into; every later term
+// is folded in with VMADDSUB. T0/T1 hold the VSHUF'd copy of the current Chi,
+// Z0..Z2 receive the VMOVIDUP loads and Z3..Z5 the VMOVRDUP loads.
+// NOTE(review): g0/g1 presumably point at 3x3 colour link matrices and
+// VMOVIDUP/VMOVRDUP presumably duplicate imaginary/real parts -- confirm
+// against the Intel512 macro headers.
+// NOTE(review): %r8/%r9 are written in one asm statement and consumed in a
+// separate one that declares no operands or clobbers, so this relies on the
+// compiler leaving those registers (and the vector registers) alone in between.
+#define MULT_XYZT(g0,g1)					\
+    asm ( "movq %0, %%r8 \n\t"						\
+	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
+	   __asm__ (						\
+	   VSHUF(Chi_00,T0)				\
+	   VSHUF(Chi_10,T1)						\
+	   VMOVIDUP(0,%r8,Z0 )						\
+           VMOVIDUP(3,%r8,Z1 )						\
+           VMOVIDUP(6,%r8,Z2 )						\
+	   /*6*/							\
+           VMUL(Z0,T0,UChi_00)            \
+           VMUL(Z1,T0,UChi_01)            \
+           VMUL(Z2,T0,UChi_02)            \
+	   VMOVIDUP(0,%r9,Z0 )						\
+           VMOVIDUP(3,%r9,Z1 )						\
+           VMOVIDUP(6,%r9,Z2 )						\
+           VMUL(Z0,T1,UChi_10)            \
+           VMUL(Z1,T1,UChi_11)            \
+           VMUL(Z2,T1,UChi_12)            \
+	   VMOVRDUP(0,%r8,Z3 )					\
+	   VMOVRDUP(3,%r8,Z4 )					\
+	   VMOVRDUP(6,%r8,Z5 )					\
+	   /*18*/						\
+           VMADDSUB(Z3,Chi_00,UChi_00)				\
+           VMADDSUB(Z4,Chi_00,UChi_01)\
+           VMADDSUB(Z5,Chi_00,UChi_02) \
+	   VMOVRDUP(0,%r9,Z3 )					\
+	   VMOVRDUP(3,%r9,Z4 )					\
+	   VMOVRDUP(6,%r9,Z5 )					\
+           VMADDSUB(Z3,Chi_10,UChi_10)				\
+           VMADDSUB(Z4,Chi_10,UChi_11)\
+           VMADDSUB(Z5,Chi_10,UChi_12)				\
+	   VMOVIDUP(1,%r8,Z0 )					\
+	   VMOVIDUP(4,%r8,Z1 )					\
+	   VMOVIDUP(7,%r8,Z2 )					\
+	   /*28*/						\
+	   VSHUF(Chi_01,T0)					\
+           VMADDSUB(Z0,T0,UChi_00)      \
+           VMADDSUB(Z1,T0,UChi_01)       \
+           VMADDSUB(Z2,T0,UChi_02)        \
+	   VMOVIDUP(1,%r9,Z0 )					\
+	   VMOVIDUP(4,%r9,Z1 )					\
+	   VMOVIDUP(7,%r9,Z2 )					\
+	   VSHUF(Chi_11,T1)					\
+           VMADDSUB(Z0,T1,UChi_10)				\
+           VMADDSUB(Z1,T1,UChi_11)				\
+           VMADDSUB(Z2,T1,UChi_12)        \
+	   VMOVRDUP(1,%r8,Z3 )					\
+	   VMOVRDUP(4,%r8,Z4 )					\
+	   VMOVRDUP(7,%r8,Z5 )					\
+           /*38*/						\
+           VMADDSUB(Z3,Chi_01,UChi_00)    \
+           VMADDSUB(Z4,Chi_01,UChi_01)    \
+           VMADDSUB(Z5,Chi_01,UChi_02)    \
+	   VMOVRDUP(1,%r9,Z3 )					\
+	   VMOVRDUP(4,%r9,Z4 )					\
+	   VMOVRDUP(7,%r9,Z5 )					\
+           VMADDSUB(Z3,Chi_11,UChi_10)				\
+           VMADDSUB(Z4,Chi_11,UChi_11)    \
+           VMADDSUB(Z5,Chi_11,UChi_12)				\
+	   /*48*/						\
+	   VSHUF(Chi_02,T0)					\
+	   VSHUF(Chi_12,T1)					\
+	   VMOVIDUP(2,%r8,Z0 )					\
+	   VMOVIDUP(5,%r8,Z1 )					\
+	   VMOVIDUP(8,%r8,Z2 )					\
+           VMADDSUB(Z0,T0,UChi_00)				\
+           VMADDSUB(Z1,T0,UChi_01)			      \
+           VMADDSUB(Z2,T0,UChi_02)			      \
+	   VMOVIDUP(2,%r9,Z0 )					\
+	   VMOVIDUP(5,%r9,Z1 )					\
+	   VMOVIDUP(8,%r9,Z2 )					\
+           VMADDSUB(Z0,T1,UChi_10)			      \
+           VMADDSUB(Z1,T1,UChi_11)			      \
+           VMADDSUB(Z2,T1,UChi_12)			      \
+	   /*55*/					      \
+	   VMOVRDUP(2,%r8,Z3 )		  \
+	   VMOVRDUP(5,%r8,Z4 )					\
+	   VMOVRDUP(8,%r8,Z5 )				      \
+           VMADDSUB(Z3,Chi_02,UChi_00)			      \
+           VMADDSUB(Z4,Chi_02,UChi_01)			      \
+           VMADDSUB(Z5,Chi_02,UChi_02)			      \
+	   VMOVRDUP(2,%r9,Z3 )		  \
+	   VMOVRDUP(5,%r9,Z4 )					\
+	   VMOVRDUP(8,%r9,Z5 )				      \
+           VMADDSUB(Z3,Chi_12,UChi_10)			      \
+           VMADDSUB(Z4,Chi_12,UChi_11)			      \
+           VMADDSUB(Z5,Chi_12,UChi_12)			      \
+	   /*61 insns*/							);
+
+// MULT_XYZTa(g0,g1): alternative formulation of MULT_XYZT. The register
+// set-up is the same (g0 -> %r8, g1 -> %r9), but the arithmetic is built from
+// VSHUFMEM / VRDUP / VIDUP / VMULMEM / VMADDMEM / VMADDSUBMEM. Partial sums are
+// kept in Z1..Z6 and added into UChi_0x / UChi_1x by the closing VADDs.
+// The kernels further down call MULT_XYZT, not this variant.
+#define MULT_XYZTa(g0,g1)					\
+  asm ( "movq %0, %%r8 \n\t"					\
+	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
+  __asm__ (							  \
+   VSHUFMEM(0,%r8,Z00)		   VSHUFMEM(0,%r9,Z10)	  \
+   VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)	          \
+   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
+   VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
+   VSHUFMEM(3,%r8,Z00)	      VSHUFMEM(3,%r9,Z10)		  \
+   VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
+   VSHUFMEM(6,%r8,Z00)	      VSHUFMEM(6,%r9,Z10)		  \
+   VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
+   VMULMEM(0,%r8,T0,UChi_00)  VMULMEM(0,%r9,T1,UChi_10)		  \
+   VMULMEM(3,%r8,T0,UChi_01)  VMULMEM(3,%r9,T1,UChi_11)		  \
+   VMULMEM(6,%r8,T0,UChi_02)  VMULMEM(6,%r9,T1,UChi_12)		  \
+   VSHUFMEM(1,%r8,Z00)	      VSHUFMEM(1,%r9,Z10)		  \
+   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
+   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
+   VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
+   VSHUFMEM(4,%r8,Z00)	      VSHUFMEM(4,%r9,Z10)		  \
+   VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
+   VSHUFMEM(7,%r8,Z00)	      VSHUFMEM(7,%r9,Z10)		  \
+   VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
+   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
+   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
+   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
+   VSHUFMEM(2,%r8,Z00)	      VSHUFMEM(2,%r9,Z10)			\
+   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)			\
+   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)			\
+   VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
+   VSHUFMEM(5,%r8,Z00)	      VSHUFMEM(5,%r9,Z10)		  \
+   VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
+   VSHUFMEM(8,%r8,Z00)	      VSHUFMEM(8,%r9,Z10)		  \
+   VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
+   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
+   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)	          \
+   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)	       \
+   VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
+   VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
+   VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
+
+
+// LOAD_CHI(a0,a1,a2,a3): load four neighbour spinors, one asm statement each.
+// Every address is moved into %r8 and three consecutive words (indices 0,1,2)
+// are loaded with VLOAD into pChi_N0, pChi_N1, pChi_N2 for N = 0..3.
+// NOTE(review): the three words are presumably the colour components of one
+// staggered SiteSpinor -- confirm.
+#define LOAD_CHI(a0,a1,a2,a3)						\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VLOAD(0,%%r8,pChi_00)						\
+       VLOAD(1,%%r8,pChi_01)						\
+       VLOAD(2,%%r8,pChi_02)						\
+       : : "r" (a0) : "%r8" );						\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VLOAD(0,%%r8,pChi_10)						\
+       VLOAD(1,%%r8,pChi_11)						\
+       VLOAD(2,%%r8,pChi_12)						\
+       : : "r" (a1) : "%r8" );						\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VLOAD(0,%%r8,pChi_20)						\
+       VLOAD(1,%%r8,pChi_21)						\
+       VLOAD(2,%%r8,pChi_22)						\
+       : : "r" (a2) : "%r8" );						\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VLOAD(0,%%r8,pChi_30)						\
+       VLOAD(1,%%r8,pChi_31)						\
+       VLOAD(2,%%r8,pChi_32)						\
+       : : "r" (a3) : "%r8" );						
+
+// LOAD_CHIa(a0,a1): two-spinor variant of LOAD_CHI. Loads words 0..2 at a0
+// into pChi_00..02 and words 0..2 at a1 into pChi_10..12; pChi_2x / pChi_3x
+// are not touched.
+#define LOAD_CHIa(a0,a1)						\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VLOAD(0,%%r8,pChi_00)						\
+       VLOAD(1,%%r8,pChi_01)						\
+       VLOAD(2,%%r8,pChi_02)						\
+       : : "r" (a0) : "%r8" );						\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VLOAD(0,%%r8,pChi_10)						\
+       VLOAD(1,%%r8,pChi_11)						\
+       VLOAD(2,%%r8,pChi_12)						\
+       : : "r" (a1) : "%r8" );						
+
+// Spinor prefetch hooks.
+// PF_CHI is the name PREPARE expands; it is defined empty, so no prefetch
+// instructions are emitted on that path. PF_CHIa is the prefetching
+// implementation (VPREFETCH1 on words 0..2 at a0), kept under a separate name.
+// The last line of PF_CHIa deliberately has no trailing backslash, so the
+// definition cannot swallow whatever line follows it.
+#define PF_CHI(a0)
+#define PF_CHIa(a0)							\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VPREFETCH1(0,%%r8)						\
+       VPREFETCH1(1,%%r8)						\
+       VPREFETCH1(2,%%r8)						\
+       : : "r" (a0) : "%r8" );
+
+// Gauge-link prefetch hooks for the xyzt kernels.
+// PF_GAUGE_XYZT is the name PREPARE_XYZT expands; it is defined empty, so no
+// prefetch is issued. PF_GAUGE_XYZTa is the prefetching implementation
+// (VPREFETCH1 on words 0..8 at a0), kept under a separate name.
+// The last line of PF_GAUGE_XYZTa deliberately has no trailing backslash, so
+// the definition cannot swallow whatever line follows it.
+#define PF_GAUGE_XYZT(a0)
+#define PF_GAUGE_XYZTa(a0)						\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VPREFETCH1(0,%%r8)						\
+       VPREFETCH1(1,%%r8)						\
+       VPREFETCH1(2,%%r8)						\
+       VPREFETCH1(3,%%r8)						\
+       VPREFETCH1(4,%%r8)						\
+       VPREFETCH1(5,%%r8)						\
+       VPREFETCH1(6,%%r8)						\
+       VPREFETCH1(7,%%r8)						\
+       VPREFETCH1(8,%%r8)						\
+       : : "r" (a0) : "%r8" );
+
+// Gauge-link prefetch hooks for the 5th-direction vectorised kernels.
+// PF_GAUGE_LS is the name PREPARE_LS expands; it is defined empty, so no
+// prefetch is issued. PF_GAUGE_LSa is the prefetching implementation
+// (VPREFETCH1 on words 0 and 1 at a0), kept under a separate name.
+// The last line of PF_GAUGE_LSa deliberately has no trailing backslash, so the
+// definition cannot swallow whatever line follows it.
+#define PF_GAUGE_LS(a0)
+#define PF_GAUGE_LSa(a0)							\
+  asm (									\
+       "movq %0, %%r8 \n\t"						\
+       VPREFETCH1(0,%%r8)						\
+       VPREFETCH1(1,%%r8)						\
+       : : "r" (a0) : "%r8" );
+  
+
+// REDUCE(out): combine the four per-leg accumulators and store the result.
+// The VADDs pair UChi_0x with UChi_1x and UChi_3x with UChi_2x, then fold the
+// UChi_3x totals into UChi_0x (x = 0,1,2); the three totals are written to
+// words 0..2 at `out` with VSTORE. Used for the non-dag result of the
+// 5th-direction vectorised kernels.
+// NOTE(review): this reads the last VADD operand as the destination, as the
+// other arithmetic macros in this file suggest -- confirm in the macro header.
+#define REDUCE(out)					\
+  asm (							\
+  VADD(UChi_00,UChi_10,UChi_00)				\
+  VADD(UChi_01,UChi_11,UChi_01)				\
+  VADD(UChi_02,UChi_12,UChi_02)				\
+  VADD(UChi_30,UChi_20,UChi_30)				\
+  VADD(UChi_31,UChi_21,UChi_31)				\
+  VADD(UChi_32,UChi_22,UChi_32)				\
+  VADD(UChi_00,UChi_30,UChi_00)				\
+  VADD(UChi_01,UChi_31,UChi_01)				\
+  VADD(UChi_02,UChi_32,UChi_02)				);	\
+  asm (								\
+       VSTORE(0,%0,pUChi_00)					\
+       VSTORE(1,%0,pUChi_01)					\
+       VSTORE(2,%0,pUChi_02)					\
+       : : "r" (out) : "memory" );
+
+// nREDUCE(out): same four-accumulator reduction as REDUCE, followed by
+// VZERO(Chi_00) and a VSUB of each total against that zeroed register before
+// the store. Selected by the dag flag in the 5th-direction vectorised kernels.
+// NOTE(review): the intent is presumably to store the negated sum; the VSUB
+// operand order is defined in the macro header, not here -- confirm.
+#define nREDUCE(out)							\
+  asm (									\
+       VADD(UChi_00,UChi_10,UChi_00)					\
+       VADD(UChi_01,UChi_11,UChi_01)					\
+       VADD(UChi_02,UChi_12,UChi_02)					\
+       VADD(UChi_30,UChi_20,UChi_30)					\
+       VADD(UChi_31,UChi_21,UChi_31)					\
+       VADD(UChi_32,UChi_22,UChi_32)					\
+       VADD(UChi_00,UChi_30,UChi_00)					\
+       VADD(UChi_01,UChi_31,UChi_01)					\
+       VADD(UChi_02,UChi_32,UChi_02)				);	\
+  asm (VZERO(Chi_00)							\
+       VSUB(UChi_00,Chi_00,UChi_00)					\
+       VSUB(UChi_01,Chi_00,UChi_01)					\
+       VSUB(UChi_02,Chi_00,UChi_02)				);	\
+  asm (								\
+       VSTORE(0,%0,pUChi_00)					\
+       VSTORE(1,%0,pUChi_01)					\
+       VSTORE(2,%0,pUChi_02)					\
+       : : "r" (out) : "memory" );
+
+// REDUCEa(out): two-accumulator reduction used by the xyzt kernels for the
+// non-dag result. Combines UChi_0x with UChi_1x (x = 0,1,2) via VADD and
+// stores the three totals to words 0..2 at `out`.
+#define REDUCEa(out)					\
+  asm (							\
+  VADD(UChi_00,UChi_10,UChi_00)				\
+  VADD(UChi_01,UChi_11,UChi_01)				\
+  VADD(UChi_02,UChi_12,UChi_02)	);			\
+  asm (									\
+       VSTORE(0,%0,pUChi_00)						\
+       VSTORE(1,%0,pUChi_01)						\
+       VSTORE(2,%0,pUChi_02)						\
+       : : "r" (out) : "memory" );
+
+// nREDUCEa(out): two-accumulator reduction for the dag case of the xyzt
+// kernels. After the VADDs, Chi_00 is zeroed and each total goes through
+// VSUB against it before being stored to words 0..2 at `out`.
+// FIXME is sign right in the VSUB ?
+// (Still open: whether this yields 0 - sum or sum - 0 depends on the VSUB
+// operand order, which is defined in the macro header rather than here. The
+// C++ hand kernel stores -even - odd in its dag branch, which is presumably
+// the intended result.)
+#define nREDUCEa(out)					\
+  asm (							\
+  VADD(UChi_00,UChi_10,UChi_00)				\
+  VADD(UChi_01,UChi_11,UChi_01)				\
+  VADD(UChi_02,UChi_12,UChi_02)	);			\
+  asm (VZERO(Chi_00)							\
+       VSUB(UChi_00,Chi_00,UChi_00)					\
+       VSUB(UChi_01,Chi_00,UChi_01)					\
+       VSUB(UChi_02,Chi_00,UChi_02)				);	\
+  asm (									\
+       VSTORE(0,%0,pUChi_00)				\
+       VSTORE(1,%0,pUChi_01)				\
+       VSTORE(2,%0,pUChi_02)				\
+       : : "r" (out) : "memory" );
+
+// PERMUTE_DIR(dir): call permute<dir> (name built by token pasting) in place
+// on the C++ variables Chi_0, Chi_1 and Chi_2. The assembler kernels below do
+// not expand it; they use the asm-based PERMUTE_DIR0..3 macros instead.
+#define PERMUTE_DIR(dir)			\
+      permute##dir(Chi_0,Chi_0);\
+      permute##dir(Chi_1,Chi_1);\
+      permute##dir(Chi_2,Chi_2);
+
+namespace Grid {
+namespace QCD {
+
+// Generic fallback for DhopSiteAsm: there is no assembler kernel for an
+// arbitrary Impl, so reaching this is treated as a programming error. The
+// supported implementations are provided as explicit specialisations below.
+// NOTE: when NDEBUG is defined the assert compiles away and this function
+// returns without writing to `out`.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+					 DoubledGaugeField &U, DoubledGaugeField &UUU,
+					 SiteSpinor *buf, int LLs, int sU, 
+					 const FermionField &in, FermionField &out,int dag) 
+{
+  assert(0);
+}
+
+
+//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in._odata[o] ; } else { out =(uint64_t) &buf[o]; }
+
+// CONDITIONAL_MOVE(l,o,out): out = address (as uint64_t) of element o of in_p
+// when l is non-zero, otherwise of element o of buf. in_p and buf are picked
+// up from the enclosing scope. Arguments are parenthesised so that expression
+// arguments expand with the intended precedence.
+#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = (l) ? in_p : buf; (out) = (uint64_t) &ptr[(o)]; }
+
+// PREPARE_XYZT(X,Y,Z,T,skew,UU): PREPARE followed by a PF_GAUGE_XYZT prefetch
+// hook on each of gauge0..gauge3. PF_GAUGE_XYZT is currently defined empty, so
+// this expands to the same code as PREPARE. The kernels below call PREPARE
+// directly.
+#define PREPARE_XYZT(X,Y,Z,T,skew,UU)			\
+  PREPARE(X,Y,Z,T,skew,UU);				\
+  PF_GAUGE_XYZT(gauge0);					\
+  PF_GAUGE_XYZT(gauge1);					\
+  PF_GAUGE_XYZT(gauge2);					\
+  PF_GAUGE_XYZT(gauge3);					
+
+// PREPARE_LS(X,Y,Z,T,skew,UU): PREPARE followed by a PF_GAUGE_LS prefetch hook
+// on each of gauge0..gauge3. PF_GAUGE_LS is currently defined empty, so this
+// expands to the same code as PREPARE. The kernels below call PREPARE directly.
+#define PREPARE_LS(X,Y,Z,T,skew,UU)			\
+  PREPARE(X,Y,Z,T,skew,UU);				\
+  PF_GAUGE_LS(gauge0);					\
+  PF_GAUGE_LS(gauge1);					\
+  PF_GAUGE_LS(gauge2);					\
+  PF_GAUGE_LS(gauge3);					
+
+// PREPARE(X,Y,Z,T,skew,UU): set up one group of four stencil legs.
+// For each of the four directions it fetches the stencil entry for point
+// (direction + skew) at site sF, copies its _offset / _is_local / _permute
+// into oN / lN / pN, resolves the neighbour address into addrN with
+// CONDITIONAL_MOVE and expands the PF_CHI prefetch hook on it. Finally
+// gauge0..3 receive the addresses of UU._odata[sU](X), (Y), (Z) and (T).
+// Relies on st, sF, sU, ptype, SE0..3, o0..3, l0..3, p0..3, addr0..3 and
+// gauge0..3 being declared in the expanding scope.
+#define PREPARE(X,Y,Z,T,skew,UU)					\
+  SE0=st.GetEntry(ptype,X+skew,sF);					\
+  o0 = SE0->_offset;							\
+  l0 = SE0->_is_local;							\
+  p0 = SE0->_permute;							\
+  CONDITIONAL_MOVE(l0,o0,addr0);					\
+  PF_CHI(addr0);							\
+  									\
+  SE1=st.GetEntry(ptype,Y+skew,sF);					\
+  o1 = SE1->_offset;							\
+  l1 = SE1->_is_local;							\
+  p1 = SE1->_permute;							\
+  CONDITIONAL_MOVE(l1,o1,addr1);					\
+  PF_CHI(addr1);							\
+  									\
+  SE2=st.GetEntry(ptype,Z+skew,sF);					\
+  o2 = SE2->_offset;							\
+  l2 = SE2->_is_local;							\
+  p2 = SE2->_permute;							\
+  CONDITIONAL_MOVE(l2,o2,addr2);					\
+  PF_CHI(addr2);							\
+  									\
+  SE3=st.GetEntry(ptype,T+skew,sF);					\
+  o3 = SE3->_offset;							\
+  l3 = SE3->_is_local;							\
+  p3 = SE3->_permute;							\
+  CONDITIONAL_MOVE(l3,o3,addr3);					\
+  PF_CHI(addr3);							\
+  									\
+  gauge0 =(uint64_t)&UU._odata[sU]( X );				\
+  gauge1 =(uint64_t)&UU._odata[sU]( Y );				\
+  gauge2 =(uint64_t)&UU._odata[sU]( Z );				\
+  gauge3 =(uint64_t)&UU._odata[sU]( T ); 
+  
+  // This is the single precision 5th direction vectorised kernel
+#include <simd/Intel512single.h>
+// Single precision, 5th-direction vectorised (StaggeredVec5dImplF) kernel.
+// For 4d site sU and every s in [0,LLs) it processes four groups of four
+// stencil legs -- Xp..Tp and Xm..Tm with U at stencil offset 0, then the same
+// directions with UUU at stencil offset 8 -- and stores the reduced result to
+// out._odata[sF]. The dag flag selects nREDUCE instead of REDUCE.
+// The `lo` argument is not used. Without AVX512 defined the body is assert(0).
+// NOTE(review): U / UUU are presumably the one-link and three-link (Naik)
+// fields of the improved staggered action -- confirm.
+template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+								    DoubledGaugeField &U, DoubledGaugeField &UUU,
+								    SiteSpinor *buf, int LLs, int sU, 
+								    const FermionField &in, FermionField &out,int dag) 
+{
+#ifdef AVX512
+  // All of the names below are written by the PREPARE macro.
+  uint64_t gauge0,gauge1,gauge2,gauge3;
+  uint64_t addr0,addr1,addr2,addr3;
+  const SiteSpinor *in_p; in_p = &in._odata[0];
+
+  int o0,o1,o2,o3; // offsets
+  int l0,l1,l2,l3; // local 
+  int p0,p1,p2,p3; // perm
+  int ptype;
+  StencilEntry *SE0;
+  StencilEntry *SE1;
+  StencilEntry *SE2;
+  StencilEntry *SE3;
+
+   for(int s=0;s<LLs;s++){
+
+    int sF=s+LLs*sU;
+    // Xp, Yp, Zp, Tp
+    // First group uses MULT_LS; the remaining three use MULT_ADD_LS.
+    PREPARE(Xp,Yp,Zp,Tp,0,U);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
+
+    PREPARE(Xm,Ym,Zm,Tm,0,U);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
+
+    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
+
+    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
+
+    // addr0 is reused as the destination address for the store.
+    addr0 = (uint64_t) &out._odata[sF];
+    if ( dag ) {
+      nREDUCE(addr0);
+    } else { 
+      REDUCE(addr0);
+    }
+   }
+#else 
+    assert(0);
+#endif
+   
+}
+
+#include <simd/Intel512double.h>
+// Double precision, 5th-direction vectorised (StaggeredVec5dImplD) kernel.
+// Same sequence as the single precision version; the instruction macros are
+// the ones in effect after the simd/Intel512double.h include just above.
+// For 4d site sU and every s in [0,LLs) it processes Xp..Tp and Xm..Tm with U
+// at stencil offset 0, then with UUU at stencil offset 8, and stores the
+// reduced result to out._odata[sF]; dag selects nREDUCE instead of REDUCE.
+// The `lo` argument is not used. Without AVX512 defined the body is assert(0).
+template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+								    DoubledGaugeField &U, DoubledGaugeField &UUU,
+								    SiteSpinor *buf, int LLs, int sU, 
+								    const FermionField &in, FermionField &out,int dag) 
+{
+#ifdef AVX512
+  // All of the names below are written by the PREPARE macro.
+  uint64_t gauge0,gauge1,gauge2,gauge3;
+  uint64_t addr0,addr1,addr2,addr3;
+  const SiteSpinor *in_p; in_p = &in._odata[0];
+
+  int o0,o1,o2,o3; // offsets
+  int l0,l1,l2,l3; // local 
+  int p0,p1,p2,p3; // perm
+  int ptype;
+  StencilEntry *SE0;
+  StencilEntry *SE1;
+  StencilEntry *SE2;
+  StencilEntry *SE3;
+
+  for(int s=0;s<LLs;s++){
+    int sF=s+LLs*sU;
+    // Xp, Yp, Zp, Tp
+    // First group uses MULT_LS; the remaining three use MULT_ADD_LS.
+    PREPARE(Xp,Yp,Zp,Tp,0,U);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
+
+    PREPARE(Xm,Ym,Zm,Tm,0,U);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
+
+    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
+
+    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
+    LOAD_CHI(addr0,addr1,addr2,addr3);
+    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
+
+    // addr0 is reused as the destination address for the store.
+    addr0 = (uint64_t) &out._odata[sF];
+    if ( dag ) {
+      nREDUCE(addr0);
+    } else { 
+      REDUCE(addr0);
+    }
+  }
+#else 
+  assert(0);
+#endif
+}
+   
+   
+
+
+// Asm permute helpers for the xyzt kernels. Each applies one VPERMn in place
+// to the three words of one loaded spinor: PERMUTE_DIR3 and PERMUTE_DIR1 act
+// on Chi_00..02, PERMUTE_DIR2 and PERMUTE_DIR0 on Chi_10..12.
+// NOTE(review): Chi_0x / Chi_1x are presumably the registers filled by
+// LOAD_CHIa from its first / second address -- confirm in the macro header.
+#define PERMUTE_DIR3 __asm__ (	\
+  VPERM3(Chi_00,Chi_00)	\
+  VPERM3(Chi_01,Chi_01)	\
+  VPERM3(Chi_02,Chi_02)	);
+
+#define PERMUTE_DIR2 __asm__ (	\
+  VPERM2(Chi_10,Chi_10)	\
+  VPERM2(Chi_11,Chi_11)	\
+  VPERM2(Chi_12,Chi_12) );
+
+#define PERMUTE_DIR1 __asm__ (	\
+  VPERM1(Chi_00,Chi_00)	\
+  VPERM1(Chi_01,Chi_01)	\
+  VPERM1(Chi_02,Chi_02)	);
+
+#define PERMUTE_DIR0 __asm__ (			\
+  VPERM0(Chi_10,Chi_10)	\
+  VPERM0(Chi_11,Chi_11)	\
+  VPERM0(Chi_12,Chi_12) );
+
+// PERMUTE01: after loading legs 0 and 1, permute the first spinor when p0 is
+// set (PERMUTE_DIR3) and the second when p1 is set (PERMUTE_DIR2).
+// Expands to two separate if statements, so use it only as a full statement.
+#define PERMUTE01 \
+  if ( p0 ) { PERMUTE_DIR3; }\
+  if ( p1 ) { PERMUTE_DIR2; }
+
+// PERMUTE23: same for legs 2 and 3, keyed on p2 (PERMUTE_DIR1) and p3
+// (PERMUTE_DIR0).
+#define PERMUTE23 \
+  if ( p2 ) { PERMUTE_DIR1; }\
+  if ( p3 ) { PERMUTE_DIR0; }
+
+  // This is the single precision xyzt (4d) vectorised kernel
+
+#include <simd/Intel512single.h>
+// Single precision xyzt-vectorised (StaggeredImplF) assembler kernel.
+// Each group of four legs is handled as two pairs: legs 0,1 are loaded with
+// LOAD_CHIa(addr0,addr1), conditionally permuted (PERMUTE01) and multiplied by
+// gauge0/gauge1; legs 2,3 follow with PERMUTE23 and gauge2/gauge3. Only the
+// very first pair uses MULT_XYZT, which overwrites the accumulators; every
+// later pair accumulates with MULT_ADD_XYZT. The groups are Xp..Tp and Xm..Tm
+// with U at stencil offset 0, then the same with UUU at stencil offset 8.
+// REDUCEa (or nREDUCEa when dag is set) stores the result to out._odata[sF].
+// The `lo` argument is not used. Without AVX512 defined the body is assert(0).
+template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+							       DoubledGaugeField &U, DoubledGaugeField &UUU,
+							       SiteSpinor *buf, int LLs, int sU, 
+							       const FermionField &in, FermionField &out,int dag) 
+{
+#ifdef AVX512
+  // All of the names below are written by the PREPARE macro.
+  uint64_t gauge0,gauge1,gauge2,gauge3;
+  uint64_t addr0,addr1,addr2,addr3;
+  const SiteSpinor *in_p; in_p = &in._odata[0];
+
+  int o0,o1,o2,o3; // offsets
+  int l0,l1,l2,l3; // local 
+  int p0,p1,p2,p3; // perm
+  int ptype;
+  StencilEntry *SE0;
+  StencilEntry *SE1;
+  StencilEntry *SE2;
+  StencilEntry *SE3;
+
+  for(int s=0;s<LLs;s++){
+    
+    int sF=s+LLs*sU;
+    // Xp, Yp, Zp, Tp
+    PREPARE(Xp,Yp,Zp,Tp,0,U);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+
+    PREPARE(Xm,Ym,Zm,Tm,0,U);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_ADD_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+
+    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_ADD_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+    
+    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_ADD_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+
+    // addr0 is reused as the destination address for the store.
+    addr0 = (uint64_t) &out._odata[sF];
+    if ( dag ) { 
+      nREDUCEa(addr0);
+    } else { 
+      REDUCEa(addr0);
+    }
+  }
+#else 
+  assert(0);
+#endif
+}
+
+#include <simd/Intel512double.h>
+// Double precision xyzt-vectorised (StaggeredImplD) assembler kernel.
+// Same sequence as the single precision version; the instruction macros are
+// the ones in effect after the simd/Intel512double.h include just above.
+// Legs are processed in pairs (LOAD_CHIa, PERMUTE01/PERMUTE23, multiply); only
+// the very first pair uses MULT_XYZT, all later ones MULT_ADD_XYZT. The groups
+// are Xp..Tp and Xm..Tm with U at stencil offset 0, then with UUU at offset 8.
+// REDUCEa (or nREDUCEa when dag is set) stores the result to out._odata[sF].
+// The `lo` argument is not used. Without AVX512 defined the body is assert(0).
+template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+							       DoubledGaugeField &U, DoubledGaugeField &UUU,
+							       SiteSpinor *buf, int LLs, int sU, 
+							       const FermionField &in, FermionField &out,int dag) 
+{
+#ifdef AVX512
+  // All of the names below are written by the PREPARE macro.
+  uint64_t gauge0,gauge1,gauge2,gauge3;
+  uint64_t addr0,addr1,addr2,addr3;
+  const SiteSpinor *in_p; in_p = &in._odata[0];
+
+  int o0,o1,o2,o3; // offsets
+  int l0,l1,l2,l3; // local 
+  int p0,p1,p2,p3; // perm
+  int ptype;
+  StencilEntry *SE0;
+  StencilEntry *SE1;
+  StencilEntry *SE2;
+  StencilEntry *SE3;
+
+  for(int s=0;s<LLs;s++){
+    
+    int sF=s+LLs*sU;
+    // Xp, Yp, Zp, Tp
+    PREPARE(Xp,Yp,Zp,Tp,0,U);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+    
+    PREPARE(Xm,Ym,Zm,Tm,0,U);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_ADD_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+    
+    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_ADD_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+    
+    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
+    LOAD_CHIa(addr0,addr1);
+    PERMUTE01;
+    MULT_ADD_XYZT(gauge0,gauge1);
+    LOAD_CHIa(addr2,addr3);
+    PERMUTE23;
+    MULT_ADD_XYZT(gauge2,gauge3);  
+    
+    // addr0 is reused as the destination address for the store.
+    addr0 = (uint64_t) &out._odata[sF];
+    if ( dag ) {
+      nREDUCEa(addr0);
+    } else { 
+      REDUCEa(addr0);
+    }
+  }
+#else 
+  assert(0);
+#endif
+}
+
+// KERNEL_INSTANTIATE(CLASS,FUNC,IMPL): emit an explicit instantiation of
+// CLASS<IMPL>::FUNC with the DhopSite* kernel signature. It is expanded below
+// for the four implementations that also have explicit specialisations above.
+// The macro body already ends in ';', so the ';' at each use site is an extra
+// empty declaration.
+#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL)			    \
+  template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo,	\
+				  DoubledGaugeField &U,			\
+				  DoubledGaugeField &UUU,		\
+				  SiteSpinor *buf, int LLs,		\
+				  int sU, const FermionField &in, FermionField &out,int dag);
+
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
+KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
+
+}}
+
--- a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -0,0 +1,399 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/StaggeredKernelsHand.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+
+
+// LOAD_CHI(b): copy the three components of the spinor b[offset] into Chi_0,
+// Chi_1 and Chi_2. `offset` and the Chi_* variables come from the enclosing
+// scope. Declares a local reference named `ref`, so each expansion needs its
+// own block scope.
+#define LOAD_CHI(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+    Chi_0=ref()()(0);\
+    Chi_1=ref()()(1);\
+    Chi_2=ref()()(2);
+
+
+// MULT(A,UChi): UChi_i = sum_j link(i,j) * Chi_j for i = 0,1,2, where link is
+// U._odata[sU](A). The first three products *assign* UChi_0..2, so this starts
+// a new accumulation. Unlike MULT_ADD the gauge field is not a parameter: the
+// name U is taken from the enclosing scope. Declares a local `ref`, so each
+// expansion needs its own block scope.
+// To splat or not to splat depends on the implementation
+#define MULT(A,UChi)				\
+  auto & ref(U._odata[sU](A));			\
+   Impl::loadLinkElement(U_00,ref()(0,0));      \
+   Impl::loadLinkElement(U_10,ref()(1,0));      \
+   Impl::loadLinkElement(U_20,ref()(2,0));      \
+   Impl::loadLinkElement(U_01,ref()(0,1));      \
+   Impl::loadLinkElement(U_11,ref()(1,1));      \
+   Impl::loadLinkElement(U_21,ref()(2,1));      \
+   Impl::loadLinkElement(U_02,ref()(0,2));     \
+   Impl::loadLinkElement(U_12,ref()(1,2));     \
+   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    UChi ## _0  = U_00*Chi_0;	       \
+    UChi ## _1  = U_10*Chi_0;\
+    UChi ## _2  = U_20*Chi_0;\
+    UChi ## _0 += U_01*Chi_1;\
+    UChi ## _1 += U_11*Chi_1;\
+    UChi ## _2 += U_21*Chi_1;\
+    UChi ## _0 += U_02*Chi_2;\
+    UChi ## _1 += U_12*Chi_2;\
+    UChi ## _2 += U_22*Chi_2;
+
+// MULT_ADD(U,A,UChi): UChi_i += sum_j link(i,j) * Chi_j for i = 0,1,2, where
+// link is U._odata[sU](A) and U is the gauge field passed as first argument.
+// Every term uses +=, so UChi_0..2 must already hold a value. Declares a local
+// `ref`, so each expansion needs its own block scope.
+#define MULT_ADD(U,A,UChi)			\
+  auto & ref(U._odata[sU](A));			\
+   Impl::loadLinkElement(U_00,ref()(0,0));      \
+   Impl::loadLinkElement(U_10,ref()(1,0));      \
+   Impl::loadLinkElement(U_20,ref()(2,0));      \
+   Impl::loadLinkElement(U_01,ref()(0,1));      \
+   Impl::loadLinkElement(U_11,ref()(1,1));      \
+   Impl::loadLinkElement(U_21,ref()(2,1));      \
+   Impl::loadLinkElement(U_02,ref()(0,2));     \
+   Impl::loadLinkElement(U_12,ref()(1,2));     \
+   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    UChi ## _0 += U_00*Chi_0;	       \
+    UChi ## _1 += U_10*Chi_0;\
+    UChi ## _2 += U_20*Chi_0;\
+    UChi ## _0 += U_01*Chi_1;\
+    UChi ## _1 += U_11*Chi_1;\
+    UChi ## _2 += U_21*Chi_1;\
+    UChi ## _0 += U_02*Chi_2;\
+    UChi ## _1 += U_12*Chi_2;\
+    UChi ## _2 += U_22*Chi_2;
+
+
+// PERMUTE_DIR(dir): call permute<dir> (name built by token pasting) in place
+// on Chi_0, Chi_1 and Chi_2.
+#define PERMUTE_DIR(dir)			\
+  permute##dir(Chi_0,Chi_0);			\
+  permute##dir(Chi_1,Chi_1);			\
+  permute##dir(Chi_2,Chi_2);
+
+
+// HAND_STENCIL_LEG_BASE(Dir,Perm,skew): fetch the neighbour spinor for one
+// stencil leg into Chi_0..2. Looks up stencil point Dir+skew at site sF; a
+// local neighbour is read from in._odata and permuted with permute<Perm> when
+// the entry asks for it, any other neighbour is read from buf (no permute).
+// Writes SE, offset, local and perm in the enclosing scope.
+#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
+  SE=st.GetEntry(ptype,Dir+skew,sF);	\
+  offset = SE->_offset;			\
+  local  = SE->_is_local;		\
+  perm   = SE->_permute;		\
+  if ( local ) {						\
+    LOAD_CHI(in._odata);					\
+    if ( perm) {						\
+      PERMUTE_DIR(Perm);					\
+    }								\
+  } else {							\
+    LOAD_CHI(buf);						\
+  }								
+
+// HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even): first leg of an accumulator.
+// Loads the neighbour (HAND_STENCIL_LEG_BASE) and then uses MULT, which
+// assigns even_0..2 instead of adding to them. The link comes from the
+// variable named U in the enclosing scope.
+#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
+  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
+  {								\
+    MULT(Dir,even);						\
+  }
+
+// HAND_STENCIL_LEG(U,Dir,Perm,skew,even): subsequent leg of an accumulator.
+// Loads the neighbour (HAND_STENCIL_LEG_BASE) and adds link(Dir)*Chi from
+// gauge field U into even_0..2 with MULT_ADD.
+#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even)			\
+  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
+  {								\
+    MULT_ADD(U,Dir,even);					\
+  }
+
+
+
+// HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even): interior-only version of
+// HAND_STENCIL_LEG. The leg is accumulated only when the stencil entry is
+// local (read from in._odata, permuted if requested) or st.same_node[Dir] is
+// set (read from buf); otherwise nothing is loaded and nothing is added.
+// Note that same_node is indexed by Dir, not Dir+skew.
+#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
+  SE=st.GetEntry(ptype,Dir+skew,sF);			\
+  offset = SE->_offset;					\
+  local  = SE->_is_local;				\
+  perm   = SE->_permute;				\
+  if ( local ) {					\
+    LOAD_CHI(in._odata);				\
+    if ( perm) {					\
+      PERMUTE_DIR(Perm);				\
+    }							\
+  } else if ( st.same_node[Dir] ) {			\
+    LOAD_CHI(buf);					\
+  }							\
+  if (SE->_is_local || st.same_node[Dir] ) {		\
+    MULT_ADD(U,Dir,even);				\
+  }
+
+// HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even): exterior-only complement of
+// HAND_STENCIL_LEG_INT. The leg is accumulated (always from buf, no permute)
+// only when the stencil entry is not local and st.same_node[Dir] is not set.
+// Each such leg increments nmu, which must be declared in the enclosing scope.
+// The Perm argument is not used.
+#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even)	\
+  SE=st.GetEntry(ptype,Dir+skew,sF);			\
+  offset = SE->_offset;					\
+  local  = SE->_is_local;				\
+  perm   = SE->_permute;				\
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+    nmu++;							\
+    { LOAD_CHI(buf);	  }					\
+    { MULT_ADD(U,Dir,even); }					\
+  }								
+
+namespace Grid {
+namespace QCD {
+
+
+// Hand-unrolled staggered hopping term for 4d site sU and all LLs slices.
+// Sixteen stencil legs are accumulated per site: eight through U at stencil
+// offset 0 and eight through UUU at stencil offset 8. Legs alternate between
+// two independent accumulators (even_*, odd_*), which are summed at the end;
+// the sum is negated when dag is set. The result is streamed to out._odata[sF].
+// The `lo` argument is not used.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
+					  DoubledGaugeField &U, DoubledGaugeField &UUU,
+					  SiteSpinor *buf, int LLs, int sU,
+					  const FermionField &in, FermionField &out, int dag)
+{
+  using S = typename Simd::scalar_type;
+  using V = typename Simd::vector_type;
+
+  // The names below are fixed: the HAND_STENCIL_LEG* / MULT* / LOAD_CHI macros
+  // refer to them textually.
+
+  // Two accumulators, three colour components each.
+  Simd even_0, even_1, even_2;
+  Simd odd_0, odd_1, odd_2;
+
+  // Neighbour spinor for the leg currently being processed.
+  Simd Chi_0, Chi_1, Chi_2;
+
+  // Link matrix elements for the leg currently being processed.
+  Simd U_00, U_10, U_20;
+  Simd U_01, U_11, U_21;
+  Simd U_02, U_12, U_22;
+
+  // Stencil lookup scratch.
+  StencilEntry *SE;
+  int offset, local, perm, ptype;
+
+  SiteSpinor result;
+
+  for (int s = 0; s < LLs; ++s) {
+    int sF = s + LLs * sU;
+
+    // Stencil offset 0, links from U. The first even and first odd leg use the
+    // _BEGIN form, which assigns the accumulator instead of adding to it.
+    HAND_STENCIL_LEG_BEGIN(Xp, 3, 0, even);
+    HAND_STENCIL_LEG_BEGIN(Yp, 2, 0, odd);
+    HAND_STENCIL_LEG(U, Zp, 1, 0, even);
+    HAND_STENCIL_LEG(U, Tp, 0, 0, odd);
+    HAND_STENCIL_LEG(U, Xm, 3, 0, even);
+    HAND_STENCIL_LEG(U, Ym, 2, 0, odd);
+    HAND_STENCIL_LEG(U, Zm, 1, 0, even);
+    HAND_STENCIL_LEG(U, Tm, 0, 0, odd);
+
+    // Stencil offset 8, links from UUU.
+    HAND_STENCIL_LEG(UUU, Xp, 3, 8, even);
+    HAND_STENCIL_LEG(UUU, Yp, 2, 8, odd);
+    HAND_STENCIL_LEG(UUU, Zp, 1, 8, even);
+    HAND_STENCIL_LEG(UUU, Tp, 0, 8, odd);
+    HAND_STENCIL_LEG(UUU, Xm, 3, 8, even);
+    HAND_STENCIL_LEG(UUU, Ym, 2, 8, odd);
+    HAND_STENCIL_LEG(UUU, Zm, 1, 8, even);
+    HAND_STENCIL_LEG(UUU, Tm, 0, 8, odd);
+
+    if (!dag) {
+      result()()(0) = even_0 + odd_0;
+      result()()(1) = even_1 + odd_1;
+      result()()(2) = even_2 + odd_2;
+    } else {
+      result()()(0) = - even_0 - odd_0;
+      result()()(1) = - even_1 - odd_1;
+      result()()(2) = - even_2 - odd_2;
+    }
+    vstream(out._odata[sF], result);
+  }
+}
+
+
+// Interior part of the hand-unrolled staggered hopping term.
+// Same sixteen legs as DhopSiteHand, but each leg contributes only when
+// HAND_STENCIL_LEG_INT accepts it (stencil entry local, or same_node set for
+// that direction). Both accumulators therefore start from zero. The sum,
+// negated when dag is set, is streamed to out._odata[sF].
+// The `lo` argument is not used.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
+					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     SiteSpinor *buf, int LLs, int sU,
+					     const FermionField &in, FermionField &out, int dag)
+{
+  using S = typename Simd::scalar_type;
+  using V = typename Simd::vector_type;
+
+  // The names below are fixed: the HAND_STENCIL_LEG* / MULT* / LOAD_CHI macros
+  // refer to them textually.
+
+  // Two accumulators, three colour components each.
+  Simd even_0, even_1, even_2;
+  Simd odd_0, odd_1, odd_2;
+
+  // Neighbour spinor for the leg currently being processed.
+  Simd Chi_0, Chi_1, Chi_2;
+
+  // Link matrix elements for the leg currently being processed.
+  Simd U_00, U_10, U_20;
+  Simd U_01, U_11, U_21;
+  Simd U_02, U_12, U_22;
+
+  // Stencil lookup scratch.
+  StencilEntry *SE;
+  int offset, local, perm, ptype;
+
+  SiteSpinor result;
+
+  for (int s = 0; s < LLs; ++s) {
+    int sF = s + LLs * sU;
+
+    // Any leg may be skipped, so nothing can rely on a first-leg assignment.
+    even_0 = zero;  even_1 = zero;  even_2 = zero;
+    odd_0  = zero;  odd_1  = zero;  odd_2  = zero;
+
+    // Stencil offset 0, links from U.
+    HAND_STENCIL_LEG_INT(U, Xp, 3, 0, even);
+    HAND_STENCIL_LEG_INT(U, Yp, 2, 0, odd);
+    HAND_STENCIL_LEG_INT(U, Zp, 1, 0, even);
+    HAND_STENCIL_LEG_INT(U, Tp, 0, 0, odd);
+    HAND_STENCIL_LEG_INT(U, Xm, 3, 0, even);
+    HAND_STENCIL_LEG_INT(U, Ym, 2, 0, odd);
+    HAND_STENCIL_LEG_INT(U, Zm, 1, 0, even);
+    HAND_STENCIL_LEG_INT(U, Tm, 0, 0, odd);
+
+    // Stencil offset 8, links from UUU.
+    HAND_STENCIL_LEG_INT(UUU, Xp, 3, 8, even);
+    HAND_STENCIL_LEG_INT(UUU, Yp, 2, 8, odd);
+    HAND_STENCIL_LEG_INT(UUU, Zp, 1, 8, even);
+    HAND_STENCIL_LEG_INT(UUU, Tp, 0, 8, odd);
+    HAND_STENCIL_LEG_INT(UUU, Xm, 3, 8, even);
+    HAND_STENCIL_LEG_INT(UUU, Ym, 2, 8, odd);
+    HAND_STENCIL_LEG_INT(UUU, Zm, 1, 8, even);
+    HAND_STENCIL_LEG_INT(UUU, Tm, 0, 8, odd);
+
+    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
+    if (!dag) {
+      result()()(0) = even_0 + odd_0;
+      result()()(1) = even_1 + odd_1;
+      result()()(2) = even_2 + odd_2;
+    } else {
+      result()()(0) = - even_0 - odd_0;
+      result()()(1) = - even_1 - odd_1;
+      result()()(2) = - even_2 - odd_2;
+    }
+    vstream(out._odata[sF], result);
+  }
+}
+
+
+// Exterior part of the hand-unrolled staggered hopping term.
+// Visits the same sixteen legs, but HAND_STENCIL_LEG_EXT accumulates only
+// those whose stencil entry is neither local nor on the same node, counting
+// them in nmu. When at least one leg contributed, the sum (negated when dag is
+// set) is added to the value already held in out._odata[sF]; otherwise the
+// site is left untouched. The `lo` argument is not used.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
+					     DoubledGaugeField &U, DoubledGaugeField &UUU,
+					     SiteSpinor *buf, int LLs, int sU,
+					     const FermionField &in, FermionField &out, int dag)
+{
+  using S = typename Simd::scalar_type;
+  using V = typename Simd::vector_type;
+
+  // The names below are fixed: the HAND_STENCIL_LEG* / MULT* / LOAD_CHI macros
+  // refer to them textually.
+
+  // Two accumulators, three colour components each.
+  Simd even_0, even_1, even_2;
+  Simd odd_0, odd_1, odd_2;
+
+  // Neighbour spinor for the leg currently being processed.
+  Simd Chi_0, Chi_1, Chi_2;
+
+  // Link matrix elements for the leg currently being processed.
+  Simd U_00, U_10, U_20;
+  Simd U_01, U_11, U_21;
+  Simd U_02, U_12, U_22;
+
+  // Stencil lookup scratch.
+  StencilEntry *SE;
+  int offset, local, perm, ptype;
+
+  SiteSpinor result;
+
+  for (int s = 0; s < LLs; ++s) {
+    int sF = s + LLs * sU;
+
+    even_0 = zero;  even_1 = zero;  even_2 = zero;
+    odd_0  = zero;  odd_1  = zero;  odd_2  = zero;
+
+    // Number of exterior legs that contributed at this site.
+    int nmu = 0;
+
+    // Stencil offset 0, links from U.
+    HAND_STENCIL_LEG_EXT(U, Xp, 3, 0, even);
+    HAND_STENCIL_LEG_EXT(U, Yp, 2, 0, odd);
+    HAND_STENCIL_LEG_EXT(U, Zp, 1, 0, even);
+    HAND_STENCIL_LEG_EXT(U, Tp, 0, 0, odd);
+    HAND_STENCIL_LEG_EXT(U, Xm, 3, 0, even);
+    HAND_STENCIL_LEG_EXT(U, Ym, 2, 0, odd);
+    HAND_STENCIL_LEG_EXT(U, Zm, 1, 0, even);
+    HAND_STENCIL_LEG_EXT(U, Tm, 0, 0, odd);
+
+    // Stencil offset 8, links from UUU.
+    HAND_STENCIL_LEG_EXT(UUU, Xp, 3, 8, even);
+    HAND_STENCIL_LEG_EXT(UUU, Yp, 2, 8, odd);
+    HAND_STENCIL_LEG_EXT(UUU, Zp, 1, 8, even);
+    HAND_STENCIL_LEG_EXT(UUU, Tp, 0, 8, odd);
+    HAND_STENCIL_LEG_EXT(UUU, Xm, 3, 8, even);
+    HAND_STENCIL_LEG_EXT(UUU, Ym, 2, 8, odd);
+    HAND_STENCIL_LEG_EXT(UUU, Zm, 1, 8, even);
+    HAND_STENCIL_LEG_EXT(UUU, Tm, 0, 8, odd);
+
+    // Nothing exterior at this site: leave out._odata[sF] as it is.
+    if (!nmu) continue;
+
+    // Add sum of all exterior connected stencil legs
+    if (!dag) {
+      result()()(0) = even_0 + odd_0;
+      result()()(1) = even_1 + odd_1;
+      result()()(2) = even_2 + odd_2;
+    } else {
+      result()()(0) = - even_0 - odd_0;
+      result()()(1) = - even_1 - odd_1;
+      result()()(2) = - even_2 - odd_2;
+    }
+    out._odata[sF] = out._odata[sF] + result;
+  }
+}
+
+
+// DHOP_SITE_HAND_INSTANTIATE(IMPL): explicitly instantiate the three hand
+// kernels (DhopSiteHand, DhopSiteHandInt, DhopSiteHandExt) for one
+// implementation. The last line deliberately carries no trailing backslash,
+// so the definition cannot swallow whatever line follows it.
+#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
+  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
+						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     SiteSpinor *buf, int LLs, int sU, \
+						     const FermionField &in, FermionField &out, int dag); \
+									\
+  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
+						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     SiteSpinor *buf, int LLs, int sU, \
+						     const FermionField &in, FermionField &out, int dag); \
+									\
+  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
+						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
+						     SiteSpinor *buf, int LLs, int sU, \
+						     const FermionField &in, FermionField &out, int dag);
+
+DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
+
+
+}
+}
+
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.cc
@@ -0,0 +1,243 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+
+    Copyright (C) 2017
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+//#include <Grid/Eigen/Dense>
+#include <Grid/qcd/spin/Dirac.h>
+
+namespace Grid
+{
+namespace QCD
+{
+
+// Unpreconditioned (*NOT* even-odd) operator:
+//   out = Dhop(in) + Mooee(in);   the return value is norm2(out).
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+{
+  // Wilson hopping term is written straight into out
+  out.checkerboard = in.checkerboard;
+  this->Dhop(in, out, DaggerNo);
+
+  // Site-local clover term is evaluated separately and accumulated on top
+  FermionField clover_part(out._grid);
+  Mooee(in, clover_part);
+  out += clover_part;
+
+  return norm2(out);
+}
+
+// Daggered counterpart of M:
+//   out = Dhop(in, DaggerYes) + MooeeDag(in);   the return value is norm2(out).
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+{
+  // Daggered Wilson hopping term is written straight into out
+  out.checkerboard = in.checkerboard;
+  this->Dhop(in, out, DaggerYes);
+
+  // Daggered site-local clover term is evaluated separately and accumulated
+  FermionField clover_part(out._grid);
+  MooeeDag(in, clover_part);
+  out += clover_part;
+
+  return norm2(out);
+}
+
+// Import a gauge configuration and rebuild every cached clover field.
+//
+// Steps, in order:
+//   1. forward the links to WilsonFermion<Impl>::ImportGauge;
+//   2. compute the six field-strength components with WilsonLoops;
+//   3. assemble CloverTerm = sum of the fillClover* spin structures, scaled
+//      by csw_r (YZ, XZ, XY) or csw_t (XT, YT, ZT), plus diag_mass;
+//   4. invert the (Ns*DimRep) x (Ns*DimRep) site matrix with Eigen, one local
+//      site at a time, and store the result in CloverTermInv;
+//   5. split CloverTerm, CloverTermInv and their adjoints into the Even and
+//      Odd checkerboard copies read by MooeeInternal.
+template <class Impl>
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+{
+  WilsonFermion<Impl>::ImportGauge(_Umu);
+  GridBase *grid = _Umu._grid;
+  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
+
+  // Compute the field strength terms mu>nu
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Compute the Clover Operator acting on Colour and Spin
+  // multiply here by the clover coefficients for the anisotropy
+  // (purely spatial planes take csw_r, planes containing T take csw_t)
+  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += diag_mass;
+
+  int lvol = _Umu._grid->lSites();
+  int DimRep = Impl::Dimension;
+
+  // Dense scratch matrices for the per-site inversion
+  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+
+  std::vector<int> lcoor;
+  typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
+
+  // Serial loop over local sites: peek the site matrix, invert it, poke it back
+  for (int site = 0; site < lvol; site++)
+  {
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+    peekLocalSite(Qx, CloverTerm, lcoor);
+    Qxinv = zero;
+    //if (csw!=0){
+    // Flatten (spin j,k ; colour a,b) into a dense matrix; row = a + j*DimRep
+    for (int j = 0; j < Ns; j++)
+      for (int k = 0; k < Ns; k++)
+        for (int a = 0; a < DimRep; a++)
+          for (int b = 0; b < DimRep; b++)
+            EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
+    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
+
+    EigenInvCloverOp = EigenCloverOp.inverse();
+    //std::cout << EigenInvCloverOp << std::endl;
+    // Unflatten the inverse with the same index convention
+    for (int j = 0; j < Ns; j++)
+      for (int k = 0; k < Ns; k++)
+        for (int a = 0; a < DimRep; a++)
+          for (int b = 0; b < DimRep; b++)
+            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
+    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
+    //  }
+    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
+  }
+
+  // Separate the even and odd parts
+  pickCheckerboard(Even, CloverTermEven, CloverTerm);
+  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
+
+  // Adjoint of the clover term, per checkerboard
+  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+
+  // Inverse clover term, per checkerboard
+  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
+  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
+
+  // Adjoint of the inverse clover term, per checkerboard
+  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+}
+
+// Apply the clover term: forwards to MooeeInternal with DaggerNo, InverseNo.
+template <class Impl>
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  this->MooeeInternal(in, out, DaggerNo, InverseNo);
+}
+
+// Apply the adjoint clover term: forwards to MooeeInternal with DaggerYes, InverseNo.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  this->MooeeInternal(in, out, DaggerYes, InverseNo);
+}
+
+// Apply the inverse clover term: forwards to MooeeInternal with DaggerNo, InverseYes.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  this->MooeeInternal(in, out, DaggerNo, InverseYes);
+}
+
+// Apply the adjoint of the inverse clover term: forwards to MooeeInternal
+// with DaggerYes, InverseYes.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  this->MooeeInternal(in, out, DaggerYes, InverseYes);
+}
+
+// Multiply `in` by one of the cached clover fields and store the result in
+// `out`.
+//   dag != 0 : use the adjoint of the selected field
+//   inv != 0 : use the inverse clover term instead of the clover term
+// On a checkerboarded grid the Even/Odd copy matching in.checkerboard is
+// used (the adjoint copies are precomputed); on a full grid the full field
+// is used and the adjoint is taken on the fly.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+{
+  out.checkerboard = in.checkerboard;
+  assert(in.checkerboard == Odd || in.checkerboard == Even);
+
+  // Full (non-checkerboarded) grid: only the adjoint needs special handling
+  if (!in._grid->_isCheckerBoarded)
+  {
+    CloverFieldType *full = (inv) ? &CloverTermInv : &CloverTerm;
+    if (dag)
+      out = adj(*full) * in;
+    else
+      out = *full * in;
+    return;
+  }
+
+  // Checkerboarded grid: select one of the eight precomputed half fields
+  const bool odd = (in.checkerboard == Odd);
+  CloverFieldType *half;
+  if (dag)
+  {
+    if (inv)
+      half = odd ? &CloverTermInvDagOdd : &CloverTermInvDagEven;
+    else
+      half = odd ? &CloverTermDagOdd : &CloverTermDagEven;
+  }
+  else
+  {
+    if (inv)
+      half = odd ? &CloverTermInvOdd : &CloverTermInvEven;
+    else
+      half = odd ? &CloverTermOdd : &CloverTermEven;
+  }
+  out = *half * in;
+
+} // MooeeInternal
+
+
+// Derivative parts
+// Not implemented: the body is a bare assert(0).
+// NOTE(review): when NDEBUG is defined the assert expands to nothing and this
+// returns without touching `mat`.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+{
+  assert(0);
+}
+
+// Derivative parts
+// Not implemented: the body is a bare assert(0).
+// NOTE(review): when NDEBUG is defined the assert expands to nothing and this
+// returns without touching `mat`.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  assert(0); // not implemented yet
+}
+
+// Explicit template instantiations of WilsonCloverFermion through the
+// project-provided instantiation macros (defined outside this file).
+// The G-parity set is deliberately left disabled.
+FermOpTemplateInstantiate(WilsonCloverFermion);
+AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
+//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
+}
+}
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -0,0 +1,366 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
+
+    Copyright (C) 2017
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: David Preti <>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
+#define GRID_QCD_WILSON_CLOVER_FERMION_H
+
+#include <Grid/Grid.h>
+
+namespace Grid
+{
+namespace QCD
+{
+
+///////////////////////////////////////////////////////////////////
+// Wilson Clover
+//
+// Operator ( with anisotropy coefficients):
+//
+// Q =   1 + (Nd-1)/xi_0 + m
+//     + W_t + (nu/xi_0) * W_s
+//     - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_r/xi_0) * sum_ss (sigma_ss F_ss)  ]
+//
+// s spatial, t temporal directions.
+// where W_t and W_s are the temporal and spatial components of the
+// Wilson Dirac operator
+//
+// csw_r = csw_t to recover the isotropic version
+//////////////////////////////////////////////////////////////////
+
+template <class Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>
+{
+public:
+  // Types definitions
+  INHERIT_IMPL_TYPES(Impl);
+  // Per-site clover object: an Ns x Ns spin matrix whose entries are
+  // Impl::Dimension x Impl::Dimension matrices of vtype.
+  template <typename vtype>
+  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
+  typedef iImplClover<Simd> SiteCloverType;
+  // Lattice field of per-site clover objects
+  typedef Lattice<SiteCloverType> CloverFieldType;
+
+public:
+  // Shorthand for the Wilson base class
+  typedef WilsonFermion<Impl> WilsonBase;
+
+  // Empty virtual hook with no body of its own
+  virtual void Instantiatable(void){};
+  // Constructors
+  //
+  // Forwards the gauge field, grids, mass, implementation parameters and
+  // anisotropy coefficients to WilsonFermion<Impl>, allocates the clover
+  // fields (full fields on Fgrid, Even/Odd copies on Hgrid), stores the
+  // rescaled coefficients and diagonal mass, then calls ImportGauge(_Umu).
+  //
+  //   _csw_r / _csw_t   : spatial / temporal clover coefficients; stored
+  //                       multiplied by 0.5 (csw_r additionally divided by
+  //                       xi_0 when the anisotropy is switched on)
+  //   clover_anisotropy : isAnisotropic, xi_0 and nu are read here
+  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                      GridRedBlackCartesian &Hgrid,
+                      const RealD _mass,
+                      const RealD _csw_r = 0.0,
+                      const RealD _csw_t = 0.0,
+                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
+                      const ImplParams &impl_p = ImplParams())
+      : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy),
+        CloverTerm(&Fgrid),
+        CloverTermInv(&Fgrid),
+        CloverTermEven(&Hgrid),
+        CloverTermOdd(&Hgrid),
+        CloverTermInvEven(&Hgrid),
+        CloverTermInvOdd(&Hgrid),
+        CloverTermDagEven(&Hgrid),
+        CloverTermDagOdd(&Hgrid),
+        CloverTermInvDagEven(&Hgrid),
+        CloverTermInvDagOdd(&Hgrid)
+  {
+    assert(Nd == 4); // require 4 dimensions
+
+    const bool anisotropic = clover_anisotropy.isAnisotropic;
+
+    // Temporal coefficient never depends on the anisotropy
+    csw_t = _csw_t * 0.5;
+
+    // Spatial coefficient and diagonal mass do
+    csw_r = anisotropic ? _csw_r * 0.5 / clover_anisotropy.xi_0
+                        : _csw_r * 0.5;
+    diag_mass = anisotropic ? _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0)
+                            : 4.0 + _mass;
+
+    if (csw_r == 0)
+    {
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
+    }
+    if (csw_t == 0)
+    {
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
+    }
+
+    ImportGauge(_Umu);
+  }
+
+  // Full operator and its dagger; both return norm2 of the result
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);
+
+  // Clover term and its dagger / inverse / inverse-dagger.
+  // The four wrappers all forward to MooeeInternal(in, out, dag, inv).
+  virtual void Mooee(const FermionField &in, FermionField &out);
+  virtual void MooeeDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInv(const FermionField &in, FermionField &out);
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
+
+  //virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  // Both derivative hooks are assert(0) stubs in the .cc file
+  virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+
+  // Imports the links into the Wilson base and rebuilds all clover fields
+  void ImportGauge(const GaugeField &_Umu);
+
+  // Derivative parts unpreconditioned pseudofermions
+  //
+  // Writes into `force` the sum of
+  //   - the Wilson hopping-term derivative (DhopDeriv), and
+  //   - the clover-term derivative, built from the outer product of X and Y,
+  //     the sigma_{mu nu} matrices and the staple insertions of Cmunu().
+  //
+  //   force : output; zeroed here before anything is accumulated
+  //   X, Y  : fermion fields; must be conformable with each other and `force`
+  //   dag   : forwarded to DhopDeriv
+  //
+  // Each (mu,nu) plane is weighted with 2*csw_t when it contains the temporal
+  // direction and with 2*csw_r otherwise, matching the coefficients that
+  // ImportGauge applies to the XT/YT/ZT and YZ/XZ/XY planes respectively.
+  // The factor 2 undoes the 0.5 folded into csw_r/csw_t by the constructor.
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  {
+    conformable(X._grid, Y._grid);
+    conformable(X._grid, force._grid);
+    GaugeLinkField force_mu(force._grid), lambda(force._grid);
+    GaugeField clover_force(force._grid);
+    PropagatorField Lambda(force._grid);
+
+    // Guido: Here we are hitting some performance issues:
+    // need to extract the components of the DoubledGaugeField
+    // for each call
+    // Possible solution
+    // Create a vector object to store them? (cons: wasting space)
+    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
+
+    Impl::extractLinkField(U, this->Umu);
+
+    force = zero;
+    // Derivative of the Wilson hopping term
+    this->DhopDeriv(force, X, Y, dag);
+
+    ///////////////////////////////////////////////////////////
+    // Clover term derivative
+    ///////////////////////////////////////////////////////////
+    Impl::outerProductImpl(Lambda, X, Y);
+    //std::cout << "Lambda:" << Lambda << std::endl;
+
+    Gamma::Algebra sigma[] = {
+        Gamma::Algebra::SigmaXY,
+        Gamma::Algebra::SigmaXZ,
+        Gamma::Algebra::SigmaXT,
+        Gamma::Algebra::MinusSigmaXY,
+        Gamma::Algebra::SigmaYZ,
+        Gamma::Algebra::SigmaYT,
+        Gamma::Algebra::MinusSigmaXZ,
+        Gamma::Algebra::MinusSigmaYZ,
+        Gamma::Algebra::SigmaZT,
+        Gamma::Algebra::MinusSigmaXT,
+        Gamma::Algebra::MinusSigmaYT,
+        Gamma::Algebra::MinusSigmaZT};
+
+    /*
+      sigma_{\mu \nu}=
+      | 0         sigma[0]  sigma[1]  sigma[2] |
+      | sigma[3]    0       sigma[4]  sigma[5] |
+      | sigma[6]  sigma[7]     0      sigma[8] |
+      | sigma[9]  sigma[10] sigma[11]   0      |
+    */
+
+    // Index of the temporal direction. The table above puts the *T entries
+    // in row/column 3, i.e. Nd-1 (the constructor asserts Nd == 4).
+    // The previous test compared against 4, which mu and nu never reach, so
+    // csw_t was silently ignored in the force.
+    const int tdir = Nd - 1;
+
+    int count = 0; // walks the sigma table in (mu,nu) row-major order, mu != nu
+    clover_force = zero;
+    for (int mu = 0; mu < 4; mu++)
+    {
+      force_mu = zero;
+      for (int nu = 0; nu < 4; nu++)
+      {
+        if (mu == nu)
+          continue;
+
+        // Planes containing the temporal direction take the temporal
+        // coefficient, purely spatial planes take the spatial one.
+        const RealD factor = (mu == tdir || nu == tdir) ? 2.0 * csw_t
+                                                        : 2.0 * csw_r;
+
+        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
+        count++;
+      }
+
+      pokeLorentz(clover_force, U[mu] * force_mu, mu);
+    }
+    //clover_force *= csw;
+    force += clover_force;
+  }
+
+  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
+  //
+  //   U      : the Nd link fields, indexed by direction
+  //   lambda : link-valued field inserted into the staples
+  //   mu, nu : plane directions
+  //
+  // Returns the four "+" terms (C1+..C4+) minus the four "-" terms
+  // (C1-..C4-). Each term is a staple in the (mu,nu) plane built from
+  // covariant shifts, with lambda or adj(lambda) inserted at a different
+  // position. lambda and U[0] must live on the same grid.
+  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
+  {
+    conformable(lambda._grid, U[0]._grid);
+    GaugeLinkField out(lambda._grid), tmp(lambda._grid);
+    // insertion in upper staple
+    // please check redundancy of shift operations
+
+    // C1+
+    tmp = lambda * U[nu];
+    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C2+
+    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C3+
+    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
+
+    // C4+
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
+
+    // insertion in lower staple
+    // C1-
+    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C2-
+    tmp = adj(lambda) * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C3-
+    tmp = lambda * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
+
+    // C4-
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
+
+    return out;
+  }
+
+private:
+  // here fixing the 4 dimensions, make it more general?
+
+  // csw_r and csw_t hold the user-supplied coefficients multiplied by 0.5
+  // (csw_r additionally divided by xi_0 in the anisotropic case); see the
+  // constructor. All clover fields below are filled by ImportGauge.
+  RealD csw_r;                                               // Clover coefficient - spatial
+  RealD csw_t;                                               // Clover coefficient - temporal
+  RealD diag_mass;                                           // Mass term
+  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+
+  // eventually these can be compressed into 6x6 blocks instead of the 12x12
+  // using the DeGrand-Rossi basis for the gamma matrices
+  //
+  // YZ plane: returns a clover field that is zero except for the spin
+  // entries (0,1), (1,0), (2,3), (3,2), each set to -i * F.
+  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
+  {
+    CloverFieldType out(F._grid);
+    out = zero;
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
+    {
+      const auto &link = F._odata[ss]()(); // colour matrix at this site
+      auto &spin = out._odata[ss]();       // spin matrix at this site
+      spin(0, 1) = timesMinusI(link);
+      spin(1, 0) = timesMinusI(link);
+      spin(2, 3) = timesMinusI(link);
+      spin(3, 2) = timesMinusI(link);
+    }
+
+    return out;
+  }
+
+  // XZ plane: returns a clover field that is zero except for the spin
+  // entries (0,1) = -F, (1,0) = +F, (2,3) = -F, (3,2) = +F.
+  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
+  {
+    CloverFieldType out(F._grid);
+    out = zero;
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
+    {
+      const auto &link = F._odata[ss]()(); // colour matrix at this site
+      auto &spin = out._odata[ss]();       // spin matrix at this site
+      spin(0, 1) = -link;
+      spin(1, 0) = link;
+      spin(2, 3) = -link;
+      spin(3, 2) = link;
+    }
+
+    return out;
+  }
+
+  // XY plane: returns a clover field that is zero except for the spin
+  // diagonal, set to (-i, +i, -i, +i) * F.
+  CloverFieldType fillCloverXY(const GaugeLinkField &F)
+  {
+    CloverFieldType out(F._grid);
+    out = zero;
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
+    {
+      const auto &link = F._odata[ss]()(); // colour matrix at this site
+      auto &spin = out._odata[ss]();       // spin matrix at this site
+      spin(0, 0) = timesMinusI(link);
+      spin(1, 1) = timesI(link);
+      spin(2, 2) = timesMinusI(link);
+      spin(3, 3) = timesI(link);
+    }
+
+    return out;
+  }
+
+  // XT plane: returns a clover field that is zero except for the spin
+  // entries (0,1) = (1,0) = +i * F and (2,3) = (3,2) = -i * F.
+  CloverFieldType fillCloverXT(const GaugeLinkField &F)
+  {
+    CloverFieldType out(F._grid);
+    out = zero;
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
+    {
+      const auto &link = F._odata[ss]()(); // colour matrix at this site
+      auto &spin = out._odata[ss]();       // spin matrix at this site
+      spin(0, 1) = timesI(link);
+      spin(1, 0) = timesI(link);
+      spin(2, 3) = timesMinusI(link);
+      spin(3, 2) = timesMinusI(link);
+    }
+
+    return out;
+  }
+
+  // YT plane: returns a clover field that is zero except for the spin
+  // entries (0,1) = -F, (1,0) = +F, (2,3) = +F, (3,2) = -F.
+  CloverFieldType fillCloverYT(const GaugeLinkField &F)
+  {
+    CloverFieldType out(F._grid);
+    out = zero;
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
+    {
+      const auto &link = F._odata[ss]()(); // colour matrix at this site
+      auto &spin = out._odata[ss]();       // spin matrix at this site
+      spin(0, 1) = -(link);
+      spin(1, 0) = (link);
+      spin(2, 3) = (link);
+      spin(3, 2) = -(link);
+    }
+
+    return out;
+  }
+
+  // ZT plane: returns a clover field that is zero except for the spin
+  // diagonal, set to (+i, -i, -i, +i) * F.
+  CloverFieldType fillCloverZT(const GaugeLinkField &F)
+  {
+    CloverFieldType out(F._grid);
+    out = zero;
+    PARALLEL_FOR_LOOP
+    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
+    {
+      const auto &link = F._odata[ss]()(); // colour matrix at this site
+      auto &spin = out._odata[ss]();       // spin matrix at this site
+      spin(0, 0) = timesI(link);
+      spin(1, 1) = timesMinusI(link);
+      spin(2, 2) = timesMinusI(link);
+      spin(3, 3) = timesI(link);
+    }
+
+    return out;
+  }
+};
+}
+}
+
+#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -0,0 +1,373 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonCompressor.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_WILSON_COMPRESSOR_H
+#define  GRID_QCD_WILSON_COMPRESSOR_H
+
+namespace Grid {
+namespace QCD {
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+// optimised versions supporting half precision too
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+// Primary template, declared only. The two partial specialisations below are
+// selected through the trailing SFINAE parameter: one when _HCspinor and
+// _Hspinor are the same type, the other when they differ.
+template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
+class WilsonCompressorTemplate;
+
+
+// Specialisation for _HCspinor == _Hspinor: the comms buffers hold half
+// spinors in their working type, so no precision change and no
+// decompression step is involved.
+template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
+class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
+  typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
+{
+ public:
+
+  using SiteSpinor         = _Spinor;
+  using SiteHalfSpinor     = _Hspinor;
+  using SiteHalfCommSpinor = _HCspinor;
+  using vComplexLow        = typename SiteHalfCommSpinor::vector_type;
+  using vComplexHigh       = typename SiteHalfSpinor::vector_type;
+  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
+
+  int mu;   // stencil point passed to the projector; set through Point()
+  int dag;  // dagger flag passed to the projector
+
+  WilsonCompressorTemplate(int _dag=0) : dag(_dag) {}
+
+  // Select the stencil point used by subsequent projections
+  void Point(int p) { mu=p; }
+
+  // Size in bytes of one site as it sits in the comms buffer
+  inline int CommDatumSize(void) {
+    return sizeof(SiteHalfCommSpinor);
+  }
+
+  /*****************************************************/
+  /* Compress: project `in` and stream it to buf[o]    */
+  /*****************************************************/
+  inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
+    SiteHalfSpinor projected;
+    projector::Proj(projected,in,mu,dag);
+    vstream(buf[o],projected);
+  }
+
+  /*****************************************************/
+  /* Exchange: exchange() vp0[o] with vp1[o] and       */
+  /* stream the pair to mp[2*o], mp[2*o+1]             */
+  /*****************************************************/
+  inline void Exchange(SiteHalfSpinor * __restrict__ mp,
+                       const SiteHalfSpinor * __restrict__ vp0,
+                       const SiteHalfSpinor * __restrict__ vp1,
+                       Integer type,Integer o){
+    SiteHalfSpinor first;
+    SiteHalfSpinor second;
+    exchange(first,second,vp0[o],vp1[o],type);
+    vstream(mp[2*o  ],first);
+    vstream(mp[2*o+1],second);
+  }
+
+  /*****************************************************/
+  /* No decompression step exists for this             */
+  /* specialisation; calling it is a usage error       */
+  /*****************************************************/
+  inline void Decompress(SiteHalfSpinor * __restrict__ out,
+                         SiteHalfSpinor * __restrict__ in, Integer o) {
+    assert(0);
+  }
+
+  /*****************************************************/
+  /* Compress Exchange: project in[k] and in[m],       */
+  /* exchange() them, stream to out0[j] and out1[j]    */
+  /*****************************************************/
+  inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
+                               SiteHalfSpinor * __restrict__ out1,
+                               const SiteSpinor * __restrict__ in,
+                               Integer j,Integer k, Integer m,Integer type){
+    SiteHalfSpinor proj_k, proj_m;
+    projector::Proj(proj_k,in[k],mu,dag);
+    projector::Proj(proj_m,in[m],mu,dag);
+
+    SiteHalfSpinor first, second;
+    exchange(first,second,proj_k,proj_m,type);
+    vstream(out0[j],first);
+    vstream(out1[j],second);
+  }
+
+  /*****************************************************/
+  /* Pass the info to the stencil */
+  /*****************************************************/
+  inline bool DecompressionStep(void) { return false; }
+
+};
+
+// Specialisation for _HCspinor != _Hspinor: the comms buffers are
+// reinterpreted as SiteHalfCommSpinor and every transfer goes through
+// precisionChange() over Nw vector words; a decompression step is required.
+template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
+class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
+  typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
+{
+ public:
+
+  using SiteSpinor         = _Spinor;
+  using SiteHalfSpinor     = _Hspinor;
+  using SiteHalfCommSpinor = _HCspinor;
+  using vComplexLow        = typename SiteHalfCommSpinor::vector_type;
+  using vComplexHigh       = typename SiteHalfSpinor::vector_type;
+  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
+
+  int mu;   // stencil point passed to the projector; set through Point()
+  int dag;  // dagger flag passed to the projector
+
+  WilsonCompressorTemplate(int _dag=0) : dag(_dag) {}
+
+  // Select the stencil point used by subsequent projections
+  void Point(int p) { mu=p; }
+
+  // Size in bytes of one site as it sits in the comms buffer
+  inline int CommDatumSize(void) {
+    return sizeof(SiteHalfCommSpinor);
+  }
+
+  /*****************************************************/
+  /* Compress: project `in`, then precision-change the */
+  /* result into slot o of the comms-typed buffer      */
+  /*****************************************************/
+  inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
+    SiteHalfCommSpinor *comm_buf = reinterpret_cast<SiteHalfCommSpinor *>(buf);
+    SiteHalfSpinor projected;
+    projector::Proj(projected,in,mu,dag);
+    precisionChange(reinterpret_cast<vComplexLow *>(&comm_buf[o]),
+                    reinterpret_cast<vComplexHigh *>(&projected),Nw);
+  }
+
+  /*****************************************************/
+  /* Exchange: precision-change slot o of both comms   */
+  /* buffers, then exchange() into mp[2*o], mp[2*o+1]  */
+  /*****************************************************/
+  inline void Exchange(SiteHalfSpinor *mp,
+                       SiteHalfSpinor *vp0,
+                       SiteHalfSpinor *vp1,
+                       Integer type,Integer o){
+    SiteHalfCommSpinor *comm0 = reinterpret_cast<SiteHalfCommSpinor *>(vp0);
+    SiteHalfCommSpinor *comm1 = reinterpret_cast<SiteHalfCommSpinor *>(vp1);
+    SiteHalfSpinor wide0;
+    SiteHalfSpinor wide1;
+    precisionChange(reinterpret_cast<vComplexHigh *>(&wide0),
+                    reinterpret_cast<vComplexLow *>(&comm0[o]),Nw);
+    precisionChange(reinterpret_cast<vComplexHigh *>(&wide1),
+                    reinterpret_cast<vComplexLow *>(&comm1[o]),Nw);
+    exchange(mp[2*o],mp[2*o+1],wide0,wide1,type);
+  }
+
+  /*****************************************************/
+  /* Decompress: precision-change slot o of the comms  */
+  /* buffer `in` into out[o]                           */
+  /*****************************************************/
+  inline void Decompress(SiteHalfSpinor *out,
+                         SiteHalfSpinor *in, Integer o){
+    SiteHalfCommSpinor *comm_in = reinterpret_cast<SiteHalfCommSpinor *>(in);
+    precisionChange(reinterpret_cast<vComplexHigh *>(&out[o]),
+                    reinterpret_cast<vComplexLow *>(&comm_in[o]),Nw);
+  }
+
+  /*****************************************************/
+  /* Compress Exchange: project in[k] and in[m],       */
+  /* exchange() them, precision-change into slot j of  */
+  /* both comms-typed output buffers                   */
+  /*****************************************************/
+  inline void CompressExchange(SiteHalfSpinor *out0,
+                               SiteHalfSpinor *out1,
+                               const SiteSpinor *in,
+                               Integer j,Integer k, Integer m,Integer type){
+    SiteHalfCommSpinor *comm_out0 = reinterpret_cast<SiteHalfCommSpinor *>(out0);
+    SiteHalfCommSpinor *comm_out1 = reinterpret_cast<SiteHalfCommSpinor *>(out1);
+
+    SiteHalfSpinor proj_k, proj_m;
+    projector::Proj(proj_k,in[k],mu,dag);
+    projector::Proj(proj_m,in[m],mu,dag);
+
+    SiteHalfSpinor first, second;
+    exchange(first,second,proj_k,proj_m,type);
+    precisionChange(reinterpret_cast<vComplexLow *>(&comm_out0[j]),
+                    reinterpret_cast<vComplexHigh *>(&first),Nw);
+    precisionChange(reinterpret_cast<vComplexLow *>(&comm_out1[j]),
+                    reinterpret_cast<vComplexHigh *>(&second),Nw);
+  }
+
+  /*****************************************************/
+  /* Pass the info to the stencil */
+  /*****************************************************/
+  inline bool DecompressionStep(void) { return true; }
+
+};
+
+// DECLARE_PROJ(PROJ_CLASS, COMP_ALIAS, PROJ_FN) emits
+//   - class PROJ_CLASS with a static Proj(result,in,mu,dag) that always calls
+//     PROJ_FN(result,in); mu and dag are accepted but not used, and
+//   - the alias template
+//     COMP_ALIAS<HCS,HS,S> = WilsonCompressorTemplate<HCS,HS,S,PROJ_CLASS>.
+#define DECLARE_PROJ(PROJ_CLASS,COMP_ALIAS,PROJ_FN)                         \
+  class PROJ_CLASS {                                                        \
+  public:                                                                   \
+    template<class HalfSp,class FullSp>                                     \
+    static void Proj(HalfSp &result,const FullSp &in,int mu,int dag){       \
+      PROJ_FN(result,in);                                                   \
+    }                                                                       \
+  };                                                                        \
+  template<typename HCS,typename HS,typename S>                             \
+  using COMP_ALIAS = WilsonCompressorTemplate<HCS,HS,S,PROJ_CLASS>;
+
+// One fixed-direction projector/compressor pair per stencil leg
+DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp);
+DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp);
+DECLARE_PROJ(WilsonZpProjector,WilsonZpCompressor,spProjZp);
+DECLARE_PROJ(WilsonTpProjector,WilsonTpCompressor,spProjTp);
+DECLARE_PROJ(WilsonXmProjector,WilsonXmCompressor,spProjXm);
+DECLARE_PROJ(WilsonYmProjector,WilsonYmCompressor,spProjYm);
+DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm);
+DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm);
+
+// Projector whose direction is chosen at run time.
+// With dag set the projection for stencil point `mu` itself is applied;
+// otherwise the one for point (mu+Nd)%(2*Nd) is applied. Any value outside
+// the eight known points trips assert(0).
+class WilsonProjector {
+ public:
+  template<class hsp,class fsp>
+  static void Proj(hsp &result,const fsp &in,int mu,int dag){
+    int point = mu;
+    if ( !dag ) {
+      point = (mu+Nd)%(2*Nd);
+    }
+    switch(point) {
+    case Xp:
+      spProjXp(result,in);
+      break;
+    case Yp:
+      spProjYp(result,in);
+      break;
+    case Zp:
+      spProjZp(result,in);
+      break;
+    case Tp:
+      spProjTp(result,in);
+      break;
+    case Xm:
+      spProjXm(result,in);
+      break;
+    case Ym:
+      spProjYm(result,in);
+      break;
+    case Zm:
+      spProjZm(result,in);
+      break;
+    case Tm:
+      spProjTm(result,in);
+      break;
+    default:
+      assert(0);
+      break;
+    }
+  }
+};
+// Compressor that uses the run-time projector above
+template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCompressorTemplate<HCS,HS,S,WilsonProjector>;
+
+// Fast comms buffer manipulation which should inline right through (avoiding
+// direction-dependent logic that would prevent inlining).
+// Stencil specialised for the Wilson hopping term: adds timers and a halo
+// gather that uses one fixed-direction compressor per stencil point instead of
+// a single compressor that branches on direction.
+template<class vobj,class cobj>
+class WilsonStencil : public CartesianStencil<vobj,cobj> {
+public:
+  // Accumulated usecond() intervals for the phases of the optimised halo exchange.
+  double timer0;   // HaloGatherOpt (accumulated in HaloExchangeOptGather)
+  double timer1;   // Communicate
+  double timer2;   // CommsMerge
+  double timer3;   // CommsMergeSHM
+  double timer4;   // reset and reported here, never accumulated in this class
+  double timer5;   // reset only
+  double timer6;   // reset only
+  uint64_t callsi; // HaloExchangeOptGather invocations since the last reset
+
+  // Reset every timer and the call counter.
+  void ZeroCountersi(void)
+  {
+    timer0=0;
+    timer1=0;
+    timer2=0;
+    timer3=0;
+    timer4=0;
+    timer5=0;
+    timer6=0;
+    callsi=0;
+  }
+
+  // Print the non-zero timers. timer0..timer3 are divided by the caller-supplied
+  // call count; timer4 is printed as accumulated.
+  void Reporti(int calls)
+  {
+    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
+    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
+    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
+    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
+    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
+  }
+
+  // Forwards all arguments to CartesianStencil and starts with cleared counters.
+  WilsonStencil(GridBase *grid,
+                int npoints,
+                int checkerboard,
+                const std::vector<int> &directions,
+                const std::vector<int> &distances)
+    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
+  {
+    ZeroCountersi();
+  };
+
+
+  // Gather, communicate and merge in sequence, timing each phase.
+  template < class compressor>
+  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
+  {
+    std::vector<std::vector<CommsRequest_t> > reqs;
+    this->HaloExchangeOptGather(source,compress);
+    double t1=usecond();
+    // Asynchronous MPI calls multidirectional, Isend etc...
+    //    this->CommunicateBegin(reqs);
+    //    this->CommunicateComplete(reqs);
+    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
+    this->Communicate();
+    double t2=usecond(); timer1 += t2-t1;
+    this->CommsMerge(compress);
+    double t3=usecond(); timer2 += t3-t2;
+    this->CommsMergeSHM(compress);
+    double t4=usecond(); timer3 += t4-t3;
+  }
+
+  // Prepare the stencil, run the gather, and account its time and call count.
+  template <class compressor>
+  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
+  {
+    this->Prepare();
+    double t0=usecond();
+    this->HaloGatherOpt(source,compress);
+    double t1=usecond();
+    timer0 += t1-t0;
+    callsi++;
+  }
+
+  // Gather the halo for all eight stencil points.
+  //
+  // Only the nested types and the dag flag of `compress` are used; the gather
+  // itself is done with per-direction compressors built from those types. When
+  // dag is zero each point is paired with the compressor of the opposite sign.
+  //
+  // Every HaloGatherDir call is executed unconditionally and its result is
+  // checked afterwards. Calling it inside assert() would remove the gather
+  // entirely whenever NDEBUG is defined.
+  template <class compressor>
+  void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
+  {
+    // Strategy. Inherit types from Compressor.
+    // Use types to select the write direction by direction compressor
+    typedef typename compressor::SiteSpinor         SiteSpinor;
+    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
+    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
+
+    this->mpi3synctime_g-=usecond();
+    this->_grid->StencilBarrier();
+    this->mpi3synctime_g+=usecond();
+
+    assert(source._grid==this->_grid);
+    this->halogtime-=usecond();
+
+    this->u_comm_offset=0;
+
+    WilsonXpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XpCompress;
+    WilsonYpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YpCompress;
+    WilsonZpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZpCompress;
+    WilsonTpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TpCompress;
+    WilsonXmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XmCompress;
+    WilsonYmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YmCompress;
+    WilsonZmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZmCompress;
+    WilsonTmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TmCompress;
+
+    int dag = compress.dag;
+    int face_idx=0;
+    if ( dag ) {
+      auto same = this->HaloGatherDir(source,XpCompress,Xp,face_idx); assert(this->same_node[Xp]==same);
+      same      = this->HaloGatherDir(source,YpCompress,Yp,face_idx); assert(this->same_node[Yp]==same);
+      same      = this->HaloGatherDir(source,ZpCompress,Zp,face_idx); assert(this->same_node[Zp]==same);
+      same      = this->HaloGatherDir(source,TpCompress,Tp,face_idx); assert(this->same_node[Tp]==same);
+      same      = this->HaloGatherDir(source,XmCompress,Xm,face_idx); assert(this->same_node[Xm]==same);
+      same      = this->HaloGatherDir(source,YmCompress,Ym,face_idx); assert(this->same_node[Ym]==same);
+      same      = this->HaloGatherDir(source,ZmCompress,Zm,face_idx); assert(this->same_node[Zm]==same);
+      same      = this->HaloGatherDir(source,TmCompress,Tm,face_idx); assert(this->same_node[Tm]==same);
+      (void)same; // keeps NDEBUG builds free of unused-variable warnings
+    } else {
+      auto same = this->HaloGatherDir(source,XmCompress,Xp,face_idx); assert(this->same_node[Xp]==same);
+      same      = this->HaloGatherDir(source,YmCompress,Yp,face_idx); assert(this->same_node[Yp]==same);
+      same      = this->HaloGatherDir(source,ZmCompress,Zp,face_idx); assert(this->same_node[Zp]==same);
+      same      = this->HaloGatherDir(source,TmCompress,Tp,face_idx); assert(this->same_node[Tp]==same);
+      same      = this->HaloGatherDir(source,XpCompress,Xm,face_idx); assert(this->same_node[Xm]==same);
+      same      = this->HaloGatherDir(source,YpCompress,Ym,face_idx); assert(this->same_node[Ym]==same);
+      same      = this->HaloGatherDir(source,ZpCompress,Zm,face_idx); assert(this->same_node[Zm]==same);
+      same      = this->HaloGatherDir(source,TpCompress,Tm,face_idx); assert(this->same_node[Tm]==same);
+      (void)same; // keeps NDEBUG builds free of unused-variable warnings
+    }
+    this->face_table_computed=1;
+    assert(this->u_comm_offset==this->_unified_buffer_size);
+    this->halogtime+=usecond();
+  }
+
+ };
+
+}} // namespace close
+#endif
--- a/Grid/qcd/action/fermion/WilsonFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion.cc
@@ -0,0 +1,562 @@
+
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+// Stencil geometry handed to every stencil built in the WilsonFermion constructor:
+// entry i pairs directions[i] with displacements[i] (eight entries each).
+const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
+const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
+// Defined without an explicit initialiser.
+int WilsonFermionStatic::HandOptDslash;
+
+/////////////////////////////////
+// Constructor and gauge import
+/////////////////////////////////
+
+// Builds the Wilson operator on the full grid Fgrid and the red-black grid
+// Hgrid: sets up the three stencils, imports the gauge field and fixes the
+// site-diagonal coefficient diag_mass.
+template <class Impl>
+WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                                   GridRedBlackCartesian &Hgrid, RealD _mass,
+                                   const ImplParams &p,
+                                   const WilsonAnisotropyCoefficients &anis)
+    // Written in member declaration order, which is the order C++ runs the
+    // initialisers in regardless of how they are listed.
+    : Kernels(p),
+      _tmp(&Hgrid),
+      mass(_mass),
+      _grid(&Fgrid),
+      _cbgrid(&Hgrid),
+      Stencil(&Fgrid, npoint, Even, directions, displacements),
+      StencilEven(&Hgrid, npoint, Even, directions,displacements),  // source is Even
+      StencilOdd(&Hgrid, npoint, Odd, directions,displacements),  // source is Odd
+      Umu(&Fgrid),
+      UmuEven(&Hgrid),
+      UmuOdd(&Hgrid),
+      Lebesgue(_grid),
+      LebesgueEvenOdd(_cbgrid),
+      anisotropyCoeff(anis)
+{
+  // Fill Umu / UmuEven / UmuOdd from the supplied gauge field.
+  ImportGauge(_Umu);
+
+  // Diagonal term: 4 + m in the isotropic case, otherwise the spatial
+  // directions are weighted by nu/xi_0.
+  if (!anisotropyCoeff.isAnisotropic) {
+    diag_mass = 4.0 + mass;
+  } else {
+    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+  }
+}
+
+// Stores the gauge field scaled by -1/2 (and, when anisotropic, by nu/xi_0 in
+// every direction except t_direction) into Umu via Impl::DoubleStore, then
+// extracts the even and odd checkerboards.
+template <class Impl>
+void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
+  GaugeField HUmu(_Umu._grid);
+
+  if (!anisotropyCoeff.isAnisotropic) {
+    // Isotropic: a single global rescale.
+    HUmu = _Umu * (-0.5);
+  } else {
+    // Anisotropic: rescale link by link so the non-temporal directions can
+    // pick up the extra coefficient.
+    for (int mu = 0; mu < Nd; mu++) {
+      GaugeLinkField link = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
+      const bool isTemporal = (mu == anisotropyCoeff.t_direction);
+      if (!isTemporal) {
+        link *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+      }
+      PokeIndex<LorentzIndex>(HUmu, link, mu);
+    }
+  }
+
+  Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
+  pickCheckerboard(Even, UmuEven, Umu);
+  pickCheckerboard(Odd, UmuOdd, Umu);
+}
+
+/////////////////////////////
+// Implement the interface
+/////////////////////////////
+
+// Full-grid operator: writes Dhop(in) (no dagger) into out, then returns
+// axpy_norm(out, diag_mass, in, out).
+// NOTE(review): presumably that call leaves out = diag_mass*in + Dhop(in) and
+// returns its norm -- confirm against axpy_norm.
+template <class Impl>
+RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
+  out.checkerboard = in.checkerboard;
+  Dhop(in, out, DaggerNo);
+  return axpy_norm(out, diag_mass, in, out);
+}
+
+// Daggered counterpart of M: identical except that Dhop is called with DaggerYes.
+template <class Impl>
+RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+  out.checkerboard = in.checkerboard;
+  Dhop(in, out, DaggerYes);
+  return axpy_norm(out, diag_mass, in, out);
+}
+
+// Checkerboard-changing hop without dagger. The input's checkerboard decides
+// which routine runs: Odd input goes through DhopEO, anything else through DhopOE.
+template <class Impl>
+void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
+  const bool inputIsOdd = (in.checkerboard == Odd);
+  if (!inputIsOdd) {
+    DhopOE(in, out, DaggerNo);
+    return;
+  }
+  DhopEO(in, out, DaggerNo);
+}
+
+// Daggered checkerboard-changing hop. Odd input goes through DhopEO, anything
+// else through DhopOE, both with DaggerYes.
+template <class Impl>
+void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+  const bool inputIsOdd = (in.checkerboard == Odd);
+  if (!inputIsOdd) {
+    DhopOE(in, out, DaggerYes);
+    return;
+  }
+  DhopEO(in, out, DaggerYes);
+}
+  
+// Site-diagonal term: out = diag_mass * in on the same checkerboard as in.
+template <class Impl>
+void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  out.checkerboard = in.checkerboard;
+  // Convert diag_mass to the field's own scalar type before multiplying.
+  typename FermionField::scalar_type scal(diag_mass);
+  out = scal * in;
+}
+
+// Daggered site-diagonal term; simply forwards to Mooee.
+template <class Impl>
+void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+  out.checkerboard = in.checkerboard;
+  Mooee(in, out);
+}
+
+// Inverse of the site-diagonal term: out = in / diag_mass.
+template<class Impl>
+void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+  out.checkerboard = in.checkerboard;
+  out = (1.0/(diag_mass))*in;
+}
+  
+// Daggered inverse of the site-diagonal term; simply forwards to MooeeInv.
+template<class Impl>
+void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+  out.checkerboard = in.checkerboard;
+  MooeeInv(in,out);
+}
+// Applies the free Wilson propagator in momentum space to `in`:
+//   out = [ -i gmu sin k + 2 sin^2 k/2 + m ] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ] * in
+// with k_mu = (2 pi / L_mu) * (n_mu + twist[mu]), n_mu being the lattice coordinate.
+//   _m    : mass used in the Wilson term (the member `mass` is not read here)
+//   twist : per-direction twist; twist[mu] is read for every mu < Nd and its
+//           size is not checked here
+template<class Impl>
+void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
+{  
+  typedef typename FermionField::vector_type vector_type;
+  typedef typename FermionField::scalar_type ScalComplex;
+  typedef Lattice<iSinglet<vector_type> > LatComplex;
+  
+  // what type LatticeComplex 
+  conformable(_grid,out._grid);
+  
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+  
+  std::vector<int> latt_size   = _grid->_fdimensions;
+  
+  FermionField   num  (_grid); num  = zero;
+  LatComplex    wilson(_grid); wilson= zero;
+  LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);
+  
+  LatComplex denom(_grid); denom= zero;
+  LatComplex kmu(_grid); 
+  ScalComplex ci(0.0,1.0);
+  // momphase = n * 2pi / L
+  for(int mu=0;mu<Nd;mu++) {
+    
+    LatticeCoordinate(kmu,mu);
+    
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    
+    kmu = TwoPiL * kmu;
+    kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
+    
+    wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
+    
+    num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);    // derivative term
+    
+    denom=denom + sin(kmu)*sin(kmu);
+  }
+  
+  wilson = wilson + _m;     // 2 sin^2 k/2 + m
+  
+  num   = num + wilson*in;     // -i gmu sin k + 2 sin^2 k/2 + m
+  
+  denom= denom+wilson*wilson; // sin^2 k + (2 sin^2 k/2 + m)^2
+  
+  denom= one/denom;
+  
+  out = num*denom; // [ -i gmu sin k + 2 sin^2 k/2 + m] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ]
+  
+}
+  
+
+///////////////////////////////////
+// Internal
+///////////////////////////////////
+
+// Shared worker for the DhopDeriv* entry points. After a halo exchange of B,
+// it applies the single-direction hop to B for each mu (result in Btilde) and
+// hands Btilde and a copy of A to Impl::InsertForce4D, which writes into mat.
+//   st  : stencil matching the checkerboard of B
+//   U   : doubled gauge field passed to the hop kernel
+//   dag : DaggerNo or DaggerYes (asserted)
+template <class Impl>
+void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
+                                        GaugeField &mat, const FermionField &A,
+                                        const FermionField &B, int dag) {
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  Compressor compressor(dag);
+
+  FermionField Btilde(B._grid);
+  FermionField Atilde(B._grid);
+  Atilde = A;//redundant
+
+  st.HaloExchange(B, compressor);
+
+  for (int mu = 0; mu < Nd; mu++) {
+    ////////////////////////////////////////////////////////////////////////
+    // Flip gamma (1+g)<->(1-g) if dag
+    ////////////////////////////////////////////////////////////////////////
+    int gamma = mu;
+    if (!dag) gamma += Nd;
+
+    ////////////////////////
+    // Call the single hop
+    ////////////////////////
+    parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
+      Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma);
+    }
+
+    //////////////////////////////////////////////////
+    // spin trace outer product
+    //////////////////////////////////////////////////
+    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
+  }
+}
+
+// Full-grid derivative: checks that U, V and mat are conformable with the full
+// grid, tags mat with U's checkerboard and delegates to DerivInternal using the
+// full stencil and gauge field.
+template <class Impl>
+void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+  conformable(U._grid, _grid);
+  conformable(U._grid, V._grid);
+  conformable(U._grid, mat._grid);
+
+  mat.checkerboard = U.checkerboard;
+
+  DerivInternal(Stencil, Umu, mat, U, V, dag);
+}
+
+// Half-grid derivative with V on the Even and U on the Odd checkerboard (both
+// asserted); mat is tagged Odd. Uses StencilEven together with UmuOdd.
+template <class Impl>
+void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+  conformable(U._grid, _cbgrid);
+  conformable(U._grid, V._grid);
+  //conformable(U._grid, mat._grid); not general, leaving as a comment (Guido)
+  // Motivation: look at the SchurDiff operator
+  
+  assert(V.checkerboard == Even);
+  assert(U.checkerboard == Odd);
+  mat.checkerboard = Odd;
+
+  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
+}
+
+// Half-grid derivative with V on the Odd and U on the Even checkerboard (both
+// asserted); mat is tagged Even. Uses StencilOdd together with UmuEven.
+template <class Impl>
+void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+  conformable(U._grid, _cbgrid);
+  conformable(U._grid, V._grid);
+  //conformable(U._grid, mat._grid);
+
+  assert(V.checkerboard == Odd);
+  assert(U.checkerboard == Even);
+  mat.checkerboard = Even;
+
+  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
+}
+
+// Hopping term on the full grid; out inherits the checkerboard tag of in.
+template <class Impl>
+void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+  conformable(in._grid, _grid);  // verifies full grid
+  conformable(in._grid, out._grid);
+
+  out.checkerboard = in.checkerboard;
+
+  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
+}
+
+// Hopping term from an Even input (asserted) to an Odd output on the half grid.
+template <class Impl>
+void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+  conformable(in._grid, _cbgrid);    // verifies half grid
+  conformable(in._grid, out._grid);  // drops the cb check
+
+  assert(in.checkerboard == Even);
+  out.checkerboard = Odd;
+
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
+}
+
+// Hopping term from an Odd input (asserted) to an Even output on the half grid.
+template <class Impl>
+void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
+  conformable(in._grid, _cbgrid);    // verifies half grid
+  conformable(in._grid, out._grid);  // drops the cb check
+
+  assert(in.checkerboard == Odd);
+  out.checkerboard = Even;
+
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
+}
+
+// Single-direction operator; for plain Wilson this is exactly DhopDir.
+template <class Impl>
+void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+  DhopDir(in, out, dir, disp);
+}
+
+// Hop in one direction only (no dagger). (dir, disp) is translated into the
+// stencil point index and the index of the opposite-sign projector:
+// disp == 1 selects point dir with projector dir+4; any other disp selects
+// point dir+4 with projector dir.
+template <class Impl>
+void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+  int stencilPoint;
+  int projector;
+  if (disp == 1) {
+    stencilPoint = dir;
+    projector    = dir + 4;
+  } else {
+    stencilPoint = dir + 4;
+    projector    = dir;
+  }
+
+  DhopDirDisp(in, out, stencilPoint, projector, DaggerNo);
+};
+
+// Full-grid single-point hop: exchanges halos for `in`, then calls
+// Kernels::DhopDir on every outer site with stencil point `dirdisp` and
+// projector index `gamma`. `dag` is only used to build the compressor.
+template <class Impl>
+void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) {
+  Compressor compressor(dag);
+
+  Stencil.HaloExchange(in, compressor);
+
+  parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
+    Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
+  }
+} 
+/*Change starts*/
+// Dispatcher for the hopping term. With GRID_OMP defined and
+// WilsonKernelsStatic::Comms set to CommsAndCompute the overlapped-comms
+// variant is used; in every other case the serial variant runs.
+template <class Impl>
+void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+                                       DoubledGaugeField &U,
+                                       const FermionField &in,
+                                       FermionField &out, int dag) {
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
+    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+  else
+#endif 
+    DhopInternalSerial(st,lo,U,in,out,dag);
+
+}
+
+// Hopping term with communication and computation overlapped (GRID_OMP only;
+// asserts otherwise). Inside one OpenMP parallel region the first `ncomms`
+// threads call st.CommunicateThreaded() while the remaining threads each
+// apply the site kernel to a contiguous slice of the lattice.
+//
+// NOTE(review): several things here look like leftovers of a merge with the
+// serial routine and should be confirmed before relying on this path:
+//  - the compressor used for the gather is default-constructed, whereas the
+//    serial path builds it with `dag`;
+//  - `LLs` is never read;
+//  - a second `Compressor compressor(dag)` is declared inside the parallel
+//    region, shadowing the outer one, and is never used;
+//  - after the split compute/communicate section, every thread in the region
+//    goes on to run full-lattice parallel_for loops over the same `out`;
+//  - no CommsMerge call appears in this function.
+template <class Impl>
+void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+                                       DoubledGaugeField &U,
+                                       const FermionField &in,
+                                       FermionField &out, int dag) {
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+#ifdef GRID_OMP
+  Compressor compressor;
+  int len =  U._grid->oSites();
+  const int LLs =  1;
+
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommsMergeSHM(compressor);
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      // Compute thread: split `len` sites as evenly as possible over the
+      // nthreads-ncomms compute threads; the first `rem` threads take one extra.
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = len;
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+      // do the compute
+     if (dag == DaggerYes) {
+
+        for (int sss = myblock; sss < myblock+myn; ++sss) {
+         Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
+       }
+     } else {
+        for (int sss = myblock; sss < myblock+myn; ++sss) {
+         Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
+       }
+    } //else
+
+    } else {
+      // Communication thread.
+      st.CommunicateThreaded();
+    }
+
+  Compressor compressor(dag);
+
+  if (dag == DaggerYes) {
+    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
+      Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
+    }
+  } else {
+    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
+      Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
+    }
+  }
+
+  }  //pragma
+#else
+  assert(0);
+#endif
+};
+
+
+// Hopping term without comms/compute overlap: the halo exchange completes
+// first, then the site kernel (daggered or not) is applied to every outer site.
+template <class Impl>
+void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
+                                       DoubledGaugeField &U,
+                                       const FermionField &in,
+                                       FermionField &out, int dag) {
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+  Compressor compressor(dag);
+  st.HaloExchange(in, compressor);
+
+  if (dag == DaggerYes) {
+    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
+      Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
+    }
+  } else {
+    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
+      Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
+    }
+  }
+};
+/*Change ends */
+
+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially.
+ ******************************************************************************/
+// Contracts two propagators into the conserved current in direction mu.
+// q_out is zeroed, then the forward and backward site kernels are both applied
+// to q_out at every outer site.
+// NOTE(review): neither `curr_type` nor the local `g5` is read in this body.
+template <class Impl>
+void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                   PropagatorField &q_in_2,
+                                                   PropagatorField &q_out,
+                                                   Current curr_type,
+                                                   unsigned int mu)
+{
+    Gamma g5(Gamma::Algebra::Gamma5);
+    conformable(_grid, q_in_1._grid);
+    conformable(_grid, q_in_2._grid);
+    conformable(_grid, q_out._grid);
+    PropagatorField tmp1(_grid), tmp2(_grid);
+    q_out = zero;
+
+    // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
+    // Inefficient comms method but not performance critical.
+    tmp1 = Cshift(q_in_1, mu, 1);
+    tmp2 = Cshift(q_in_2, mu, 1);
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
+                                                 q_in_2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+        Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
+                                                 tmp2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+    }
+}
+
+
+// Sequential insertion of the conserved current in direction mu, restricted to
+// time slices tmin..tmax. The propagator is multiplied by lattice_cmplx and
+// shifted by +/-1 in mu; the forward and backward site kernels then write into
+// q_out under a per-site time-slice mask.
+// NOTE(review): `curr_type` is not read in this body, and LLt comes from
+// GridDefaultLatt() rather than from _grid -- confirm that is intended.
+template <class Impl>
+void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
+                                              PropagatorField &q_out,
+                                              Current curr_type,
+                                              unsigned int mu,
+                                              unsigned int tmin, 
+                                              unsigned int tmax,
+					      ComplexField &lattice_cmplx)
+{
+    conformable(_grid, q_in._grid);
+    conformable(_grid, q_out._grid);
+    PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
+    // The backward time window is shifted by one slice only when mu is the Tp direction.
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    unsigned int LLt    = GridDefaultLatt()[Tp];
+
+    q_out = zero;
+    LatticeInteger coords(_grid);
+    LatticeCoordinate(coords, Tp);
+
+    // Need q(x + mu) and q(x - mu).
+    tmp = Cshift(q_in, mu, 1);
+    tmpFwd = tmp*lattice_cmplx;
+    tmp = lattice_cmplx*q_in;
+    tmpBwd = Cshift(tmp, mu, -1);
+
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Compute the sequential conserved current insertion only if our simd
+        // object contains a timeslice we need.
+        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
+                             (coords._odata[sU] <= tmax));
+        Integer timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU], 
+                                                q_out._odata[sU], 
+                                                Umu, sU, mu, t_mask);
+        }
+
+        // Repeat for backward direction.
+        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
+                      (coords._odata[sU] <= (tmax + tshift)));
+
+	//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
+	unsigned int t0 = 0;
+	if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+
+        timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU], 
+                                                q_out._odata[sU], 
+                                                Umu, sU, mu, t_mask);
+        }
+    }
+
+
+}
+
+// Explicit instantiation of WilsonFermion for each implementation family; the
+// macros are defined outside this file.
+FermOpTemplateInstantiate(WilsonFermion);
+AdjointFermOpTemplateInstantiate(WilsonFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonFermion);
+GparityFermOpTemplateInstantiate(WilsonFermion);
+}
+}
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -0,0 +1,200 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonFermion.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_WILSON_FERMION_H
+#define GRID_QCD_WILSON_FERMION_H
+
+namespace Grid {
+
+namespace QCD {
+
+// Non-templated holder for the static data shared by every WilsonFermion<Impl>.
+class WilsonFermionStatic {
+ public:
+  static int HandOptDslash;  // these are a temporary hack
+  static int MortonOrder;
+  // Stencil geometry: entry i pairs directions[i] with displacements[i].
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+  // Number of stencil points passed to the stencil constructors.
+  static const int npoint = 8;
+};
+
+ // Serializable anisotropy parameters for WilsonFermion. The default is the
+ // isotropic setup: isAnisotropic false, t_direction = Nd-1, xi_0 = nu = 1.
+ // When isAnisotropic is set, WilsonFermion scales the links in every direction
+ // except t_direction by nu/xi_0.
+ struct WilsonAnisotropyCoefficients: Serializable
+ {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
+  bool, isAnisotropic,
+  int, t_direction,
+  double, xi_0,
+  double, nu);
+
+  WilsonAnisotropyCoefficients():
+    isAnisotropic(false), 
+    t_direction(Nd-1), 
+    xi_0(1.0), 
+    nu(1.0){}
+};
+
+// Wilson fermion operator on a four-dimensional grid. Holds the doubled gauge
+// field (full, even and odd), one stencil per source checkerboard, and the
+// diagonal coefficient diag_mass. Kernels come from WilsonKernels<Impl>.
+template <class Impl>
+class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef WilsonKernels<Impl> Kernels;
+
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  // Gauge and fermion fields share the same full and red-black grids.
+  GridBase *GaugeGrid(void) { return _grid; }
+  GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
+  GridBase *FermionGrid(void) { return _grid; }
+  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
+
+  // Scratch field; the constructor allocates it on the red-black grid.
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }
+
+  //////////////////////////////////////////////////////////////////
+  // override multiply; cut number routines if pass dagger argument
+  // and also make interface more uniformly consistent
+  //////////////////////////////////////////////////////////////////
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);
+
+  /////////////////////////////////////////////////////////
+  // half checkerboard operations
+  // could remain virtual so we  can derive Clover from Wilson base
+  /////////////////////////////////////////////////////////
+  void Meooe(const FermionField &in, FermionField &out);
+  void MeooeDag(const FermionField &in, FermionField &out);
+
+  // allow override for twisted mass and clover
+  virtual void Mooee(const FermionField &in, FermionField &out);
+  virtual void MooeeDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInv(const FermionField &in, FermionField &out);
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
+
+  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) ;
+
+  ////////////////////////
+  // Derivative interface
+  ////////////////////////
+  // Interface calls an internal routine
+  void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+  ///////////////////////////////////////////////////////////////
+  // non-hermitian hopping term; half cb or both
+  ///////////////////////////////////////////////////////////////
+  void Dhop(const FermionField &in, FermionField &out, int dag);
+  void DhopOE(const FermionField &in, FermionField &out, int dag);
+  void DhopEO(const FermionField &in, FermionField &out, int dag);
+
+  ///////////////////////////////////////////////////////////////
+  // Multigrid assistance; force term uses too
+  ///////////////////////////////////////////////////////////////
+  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
+  void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
+                   int gamma, int dag);
+
+  ///////////////////////////////////////////////////////////////
+  // Extra methods added by derived
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
+                     const FermionField &A, const FermionField &B, int dag);
+
+  // Dispatches to one of the two variants below.
+  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                    const FermionField &in, FermionField &out, int dag);
+
+  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                    const FermionField &in, FermionField &out, int dag);
+
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                    const FermionField &in, FermionField &out, int dag);
+
+  // Constructor
+  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                GridRedBlackCartesian &Hgrid, RealD _mass, 
+                const ImplParams &p = ImplParams(), 
+                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
+
+  // DoubleStore impl dependent
+  void ImportGauge(const GaugeField &_Umu);
+
+  ///////////////////////////////////////////////////////////////
+  // Data members require to support the functionality
+  ///////////////////////////////////////////////////////////////
+
+  //    protected:
+ public:
+  virtual RealD Mass(void) { return mass; }
+  virtual int   isTrivialEE(void) { return 1; };
+  RealD mass;
+  RealD diag_mass;  // set by the constructor; used by M, Mdag, Mooee and MooeeInv
+
+  // Declared before Lebesgue / LebesgueEvenOdd because the constructor
+  // initialises those from these pointers.
+  GridBase *_grid;
+  GridBase *_cbgrid;
+
+  // Defines the stencils for even and odd
+  StencilImpl Stencil;
+  StencilImpl StencilEven;
+  StencilImpl StencilOdd;
+
+  // Copy of the gauge field , with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;
+
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
+
+  WilsonAnisotropyCoefficients anisotropyCoeff;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                Current curr_type,
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in, 
+                             PropagatorField &q_out,
+                             Current curr_type, 
+                             unsigned int mu,
+                             unsigned int tmin, 
+                             unsigned int tmax,
+			     ComplexField &lattice_cmplx);
+};
+
+// Shorthands for the WilsonImplF and WilsonImplD instantiations.
+typedef WilsonFermion<WilsonImplF> WilsonFermionF;
+typedef WilsonFermion<WilsonImplD> WilsonFermionD;
+
+
+}
+}
+#endif
--- a/Grid/qcd/action/fermion/WilsonFermion5D.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.cc
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -0,0 +1,237 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonFermion5D.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_WILSON_FERMION_5D_H
+#define  GRID_QCD_WILSON_FERMION_5D_H
+
+#include <Grid/perfmon/Stat.h>
+
+namespace Grid {
+namespace QCD {
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // This is the 4d red-black case, appropriate to support generalised
+  // five-dimensional fermions such as Mobius, Zolotarev etc.
+  //
+  // parity = (x+y+z+t) mod 2;
+  //
+  // i.e. even-even contains the fifth-dimension hopping term.
+  //
+  // [DIFFERS from the original CPS red-black implementation, parity = (x+y+z+t+s) mod 2]
+  ////////////////////////////////////////////////////////////////////////////////
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // This is the 4d red-black case, appropriate to support generalised
+    // five-dimensional fermions such as Mobius, Zolotarev etc.
+    //
+    // parity = (x+y+z+t) mod 2;
+    //
+    // i.e. even-even contains the fifth-dimension hopping term.
+    //
+    // [DIFFERS from the original CPS red-black implementation, parity = (x+y+z+t+s) mod 2]
+    ////////////////////////////////////////////////////////////////////////////////
+
+    class WilsonFermion5DStatic {  // non-templated constants inherited by every WilsonFermion5D<Impl>
+    public:
+      // S-direction is INNERMOST and takes no part in the parity.
+      static const std::vector<int> directions;     // declared here; defined out of line
+      static const std::vector<int> displacements;  // declared here; defined out of line
+      static constexpr int npoint = 8;  // class-wide constant: no per-object storage, and objects stay copy-assignable
+    };
+
+    template<class Impl>
+    class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
+    { // Wilson operator whose fermion fields live on the five-dim grids and whose gauge field lives on the four-dim grids (see the Grid accessors below)
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+     typedef WilsonKernels<Impl> Kernels;
+     PmuStat stat;
+     // A FermionField member and a mutable-reference accessor to it
+     FermionField _tmp;
+     FermionField &tmp(void) { return _tmp; }
+     // Call counters and timers; presumably printed by Report() and reset by ZeroCounters() -- confirm in the .cc
+     void Report(void);
+     void ZeroCounters(void);
+     double DhopCalls;
+     double DhopCommTime;
+     double DhopComputeTime;
+     double DhopComputeTime2;
+     double DhopFaceTime;
+     double DhopTotalTime;
+     // Counters for the derivative path (presumably the DhopDeriv* routines -- confirm in the .cc)
+     double DerivCalls;
+     double DerivCommTime;
+     double DerivComputeTime;
+     double DerivDhopComputeTime;
+
+      ///////////////////////////////////////////////////////////////
+      // Implement the abstract base
+      ///////////////////////////////////////////////////////////////
+      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+
+      // Full-checkerboard operations: not implemented at this level, these stubs assert(0)
+      virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+      virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+
+      // Half-checkerboard operations: not implemented at this level, these stubs assert(0)
+      virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+
+      // These can be overridden by fancy 5d chiral action
+      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+
+      // Implement the non-hermitian hopping term; half checkerboard or both
+      // Implement s-diagonal DW
+      void DW    (const FermionField &in, FermionField &out,int dag);
+      void Dhop  (const FermionField &in, FermionField &out,int dag);
+      void DhopOE(const FermionField &in, FermionField &out,int dag);
+      void DhopEO(const FermionField &in, FermionField &out,int dag);
+
+      // add a DhopComm
+      // -- suboptimal interface will presently trigger multiple comms.
+    void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+    
+    ///////////////////////////////////////////////////////////////
+    // New methods added 
+    ///////////////////////////////////////////////////////////////
+    void DerivInternal(StencilImpl & st,
+		       DoubledGaugeField & U,
+		       GaugeField &mat,
+		       const FermionField &A,
+		       const FermionField &B,
+		       int dag);
+    
+    void DhopInternal(StencilImpl & st,
+		      LebesgueOrder &lo,
+		      DoubledGaugeField &U,
+		      const FermionField &in, 
+		      FermionField &out,
+		      int dag);
+
+    void DhopInternalOverlappedComms(StencilImpl & st,
+				     LebesgueOrder &lo,
+				     DoubledGaugeField &U,
+				     const FermionField &in, 
+				     FermionField &out,
+				     int dag);
+
+    void DhopInternalSerialComms(StencilImpl & st,
+				 LebesgueOrder &lo,
+				 DoubledGaugeField &U,
+				 const FermionField &in, 
+				 FermionField &out,
+				 int dag);
+    
+    // Constructor: takes the gauge field, the five-dim and four-dim grids (full and red-black), M5 and optional ImplParams
+    WilsonFermion5D(GaugeField &_Umu,
+		    GridCartesian         &FiveDimGrid,
+		    GridRedBlackCartesian &FiveDimRedBlackGrid,
+		    GridCartesian         &FourDimGrid,
+		    GridRedBlackCartesian &FourDimRedBlackGrid,
+		    double _M5,const ImplParams &p= ImplParams());
+    
+    // Alternative constructor taking an explicit simd argument; currently commented out
+    /*
+      WilsonFermion5D(int simd, 
+      GaugeField &_Umu,
+      GridCartesian         &FiveDimGrid,
+      GridRedBlackCartesian &FiveDimRedBlackGrid,
+      GridCartesian         &FourDimGrid,
+      double _M5,const ImplParams &p= ImplParams());
+    */
+    
+    // DoubleStore
+    void ImportGauge(const GaugeField &_Umu);
+    
+    ///////////////////////////////////////////////////////////////
+    // Data members required to support the functionality
+    ///////////////////////////////////////////////////////////////
+  public:
+    
+    // Add these to the support from Wilson
+    GridBase *_FourDimGrid;
+    GridBase *_FourDimRedBlackGrid;
+    GridBase *_FiveDimGrid;
+    GridBase *_FiveDimRedBlackGrid;
+    
+    double                        M5;
+    int Ls;
+    
+    //Defines the stencils for even and odd
+    StencilImpl Stencil; 
+    StencilImpl StencilEven; 
+    StencilImpl StencilOdd; 
+    
+    // Copy of the gauge field , with even and odd subsets
+    DoubledGaugeField Umu;
+    DoubledGaugeField UmuEven;
+    DoubledGaugeField UmuOdd;
+    
+    LebesgueOrder Lebesgue;
+    LebesgueOrder LebesgueEvenOdd;
+    
+    // Comms buffer
+    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities
+    ///////////////////////////////////////////////////////////////
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type, 
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in, 
+                             PropagatorField &q_out,
+                             Current curr_type, 
+                             unsigned int mu,
+                             unsigned int tmin, 
+                             unsigned int tmax,
+			     ComplexField &lattice_cmplx);
+  };
+
+}}
+
+#endif
--- a/Grid/qcd/action/fermion/WilsonKernels.cc
+++ b/Grid/qcd/action/fermion/WilsonKernels.cc
@@ -0,0 +1,455 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+namespace QCD {
+
+int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;       // initial value: OptGeneric
+int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;  // initial value: CommsAndCompute
+
+template <class Impl>
+WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};  // only forwards p to Base
+
+////////////////////////////////////////////
+// Generic implementation; move to different file?  The macros below use st, buf, in, U, sF, sU, tmp, chi, chi_p, Uchi, result, SE, ptype from the expanding scope.
+////////////////////////////////////////////
+// GENERIC_STENCIL_LEG: one leg, all cases. Local: project in._odata[SE->_offset] (permuting if SE->_permute); otherwise read buf[SE->_offset]; then multLink and Recon into result.
+#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    chi_p = &chi;						\
+    if (SE->_permute) {						\
+      spProj(tmp, in._odata[SE->_offset]);			\
+      permute(chi, tmp, ptype);					\
+    } else {							\
+      spProj(chi, in._odata[SE->_offset]);			\
+    }								\
+  } else {							\
+    chi_p = &buf[SE->_offset];					\
+  }								\
+  Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
+  Recon(result, Uchi);
+// GENERIC_STENCIL_LEG_INT: as above, but the leg contributes only when it is local or st.same_node[Dir] is set; any other leg is skipped.
+#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    chi_p = &chi;						\
+    if (SE->_permute) {						\
+      spProj(tmp, in._odata[SE->_offset]);			\
+      permute(chi, tmp, ptype);					\
+    } else {							\
+      spProj(chi, in._odata[SE->_offset]);			\
+    }								\
+  } else if ( st.same_node[Dir] ) {				\
+      chi_p = &buf[SE->_offset];				\
+  }								\
+  if (SE->_is_local || st.same_node[Dir] ) {			\
+    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
+    Recon(result, Uchi);					\
+  }
+// GENERIC_STENCIL_LEG_EXT: handles only legs that are neither local nor same_node; reads buf[SE->_offset] and increments nmu (from the expanding scope) per leg handled.
+#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+    chi_p = &buf[SE->_offset];					\
+    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
+    Recon(result, Uchi);					\
+    nmu++;							\
+  }
+// GENERIC_DHOPDIR_LEG: acts only when gamma == Dir; uses the SE already fetched by the caller and passes the caller's `dir` (not Dir) to multLink.
+#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
+  if (gamma == Dir) {						\
+    if (SE->_is_local && SE->_permute) {			\
+      spProj(tmp, in._odata[SE->_offset]);			\
+      permute(chi, tmp, ptype);					\
+    } else if (SE->_is_local) {					\
+      spProj(chi, in._odata[SE->_offset]);			\
+    } else {							\
+      chi = buf[SE->_offset];					\
+    }								\
+    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);	\
+    Recon(result, Uchi);					\
+  }
+
+  ////////////////////////////////////////////////////////////////////
+  // All legs kernels ; comms then compute
+  ////////////////////////////////////////////////////////////////////
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+					     SiteHalfSpinor *buf, int sF,
+					     int sU, const FermionField &in, FermionField &out)
+{ // All-legs kernel (dagger ordering): applies the eight stencil legs at fermion site sF and streams the sum to out._odata[sF]; `lo` is not referenced.
+  SiteHalfSpinor tmp;     // projection target when the leg needs permute()
+  SiteHalfSpinor chi;     // projected half spinor for local legs
+  SiteHalfSpinor *chi_p;  // points at chi (local leg) or buf[SE->_offset] (non-local leg)
+  SiteHalfSpinor Uchi;    // written by Impl::multLink for the current leg
+  SiteSpinor result;      // running sum over legs
+  StencilEntry *SE;       // stencil entry of the current leg
+  int ptype;              // passed to st.GetEntry and then to permute()
+  // Direction D is paired with spProjD; the first leg uses spReconXp (presumably assigning result), the rest use accumRecon*.
+  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
+  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
+  vstream(out._odata[sF], result);
+};
+
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+					  SiteHalfSpinor *buf, int sF,
+					  int sU, const FermionField &in, FermionField &out) 
+{ // All-legs kernel (non-dagger ordering): same structure as GenericDhopSiteDag, but each direction is paired with the opposite-sign projector; `lo` is not referenced.
+  SiteHalfSpinor tmp;     // projection target when the leg needs permute()
+  SiteHalfSpinor chi;     // projected half spinor for local legs
+  SiteHalfSpinor *chi_p;  // points at chi (local leg) or buf[SE->_offset] (non-local leg)
+  SiteHalfSpinor Uchi;    // written by Impl::multLink for the current leg
+  SiteSpinor result;      // running sum over legs
+  StencilEntry *SE;       // stencil entry of the current leg
+  int ptype;              // passed to st.GetEntry and then to permute()
+  // Xm uses spProjXp/spReconXp, ..., Tp uses spProjTm; the first leg uses spReconXp (presumably assigning result), the rest accumulate.
+  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
+  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
+  vstream(out._odata[sF], result);
+};
+  ////////////////////////////////////////////////////////////////////
+  // Interior kernels
+  ////////////////////////////////////////////////////////////////////
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+						SiteHalfSpinor *buf, int sF,
+						int sU, const FermionField &in, FermionField &out)
+{ // Interior kernel (dagger ordering): sums only legs that are local or flagged st.same_node, and streams the sum to out._odata[sF]; `lo` is not referenced.
+  SiteHalfSpinor tmp;     // projection target when the leg needs permute()
+  SiteHalfSpinor chi;     // projected half spinor for local legs
+  SiteHalfSpinor *chi_p;  // set only for legs that contribute
+  SiteHalfSpinor Uchi;    // written by Impl::multLink for the current leg
+  SiteSpinor result;      // running sum over contributing legs
+  StencilEntry *SE;       // stencil entry of the current leg
+  int ptype;              // passed to st.GetEntry and then to permute()
+  // Any leg may be skipped, so result starts from zero and every leg uses an accumRecon* reconstruct.
+  result=zero;
+  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
+  vstream(out._odata[sF], result);
+};
+
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+					     SiteHalfSpinor *buf, int sF,
+					     int sU, const FermionField &in, FermionField &out) 
+{ // Interior kernel (non-dagger ordering): sums only legs that are local or flagged st.same_node, and streams the sum to out._odata[sF]; `lo` is not referenced.
+  SiteHalfSpinor tmp;     // projection target when the leg needs permute()
+  SiteHalfSpinor chi;     // projected half spinor for local legs
+  SiteHalfSpinor *chi_p;  // set only for legs that contribute
+  SiteHalfSpinor Uchi;    // written by Impl::multLink for the current leg
+  SiteSpinor result;      // running sum over contributing legs
+  StencilEntry *SE;       // stencil entry of the current leg
+  int ptype;              // passed to st.GetEntry and then to permute()
+  result=zero;            // any leg may be skipped, so start from zero and accumulate
+  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
+  vstream(out._odata[sF], result);
+};
+////////////////////////////////////////////////////////////////////
+// Exterior kernels
+////////////////////////////////////////////////////////////////////
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+						SiteHalfSpinor *buf, int sF,
+						int sU, const FermionField &in, FermionField &out)
+{ // Exterior kernel (dagger ordering): sums only legs that are neither local nor same_node and adds that sum to the existing out._odata[sF]; `lo` and `in` are not referenced.
+  SiteHalfSpinor tmp;     // not used by GENERIC_STENCIL_LEG_EXT
+  SiteHalfSpinor chi;     // not used by GENERIC_STENCIL_LEG_EXT
+  SiteHalfSpinor *chi_p;  // points into buf for each handled leg
+  SiteHalfSpinor Uchi;    // written by Impl::multLink for the current leg
+  SiteSpinor result;      // running sum over handled legs
+  StencilEntry *SE;       // stencil entry of the current leg
+  int ptype;              // passed to st.GetEntry
+  int nmu=0;              // number of legs handled at this site
+  result=zero;
+  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
+  if ( nmu ) { 
+    out._odata[sF] = out._odata[sF] + result; 
+  }
+};
+
+template <class Impl>
+void WilsonKernels<Impl>::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+					     SiteHalfSpinor *buf, int sF,
+					     int sU, const FermionField &in, FermionField &out) 
+{ // Exterior kernel (non-dagger ordering): sums only legs that are neither local nor same_node and adds that sum to the existing out._odata[sF]; `lo` and `in` are not referenced.
+  SiteHalfSpinor tmp;     // not used by GENERIC_STENCIL_LEG_EXT
+  SiteHalfSpinor chi;     // not used by GENERIC_STENCIL_LEG_EXT
+  SiteHalfSpinor *chi_p;  // points into buf for each handled leg
+  SiteHalfSpinor Uchi;    // written by Impl::multLink for the current leg
+  SiteSpinor result;      // running sum over handled legs
+  StencilEntry *SE;       // stencil entry of the current leg
+  int ptype;              // passed to st.GetEntry
+  int nmu=0;              // number of legs handled at this site
+  result=zero;
+  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
+  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
+  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
+  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
+  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
+  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
+  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
+  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
+  if ( nmu ) { 
+    out._odata[sF] = out._odata[sF] + result; 
+  }
+};
+
+template <class Impl>
+void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
+					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
+  // Single-leg kernel: fetches the stencil entry for `dir`, applies the one leg whose direction equals `gamma`, and streams it to out._odata[sF].
+  SiteHalfSpinor tmp;
+  SiteHalfSpinor chi;
+  SiteSpinor result;
+  SiteHalfSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  // NOTE(review): if gamma equals none of Xp..Tm, no leg writes `result` before the vstream below -- confirm callers always pass a valid value.
+  SE = st.GetEntry(ptype, dir, sF);
+  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
+  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
+  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
+  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
+  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
+  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
+  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
+  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
+  vstream(out._odata[sF], result);
+}
+
+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially. Common to both 4D and 5D.
+ ******************************************************************************/
+// N.B. Functions below assume a -1/2 factor within U. The macros expand to expr -/+ Gamma::gmu[mu]*expr, so `expr` is evaluated twice.
+#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
+#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteFwd
+ * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_1 shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
+    const SitePropagator &q_in_1,
+    const SitePropagator &q_in_2,
+    SitePropagator       &q_out,
+    DoubledGaugeField    &U,
+    unsigned int          sU,
+    unsigned int          mu,
+    bool                  switch_sign)
+{
+    // Multiply q_in_1 by the link taken from U._odata[sU] in direction mu.
+    SitePropagator linked;
+    Impl::multLinkProp(linked, U._odata[sU], q_in_1, mu);
+
+    // Sandwich adj(q_in_2) between g5 and contract with the forward current.
+    Gamma gamma5(Gamma::Algebra::Gamma5);
+    SitePropagator contribution;
+    contribution = gamma5 * adj(q_in_2) * gamma5 * WilsonCurrentFwd(linked, mu);
+
+    // The forward term is added to q_out unless switch_sign flips it.
+    if (!switch_sign) { q_out += contribution; }
+    else              { q_out -= contribution; }
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteBwd
+ * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_2 shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
+    const SitePropagator &q_in_1,
+    const SitePropagator &q_in_2,
+    SitePropagator       &q_out,
+    DoubledGaugeField    &U,
+    unsigned int          sU,
+    unsigned int          mu,
+    bool                  switch_sign)
+{
+    // Multiply q_in_1 by the link from U._odata[sU] at direction index mu + Nd.
+    SitePropagator linked;
+    Impl::multLinkProp(linked, U._odata[sU], q_in_1, mu + Nd);
+
+    // Sandwich adj(q_in_2) between g5 and contract with the backward current.
+    Gamma gamma5(Gamma::Algebra::Gamma5);
+    SitePropagator contribution;
+    contribution = gamma5 * adj(q_in_2) * gamma5 * WilsonCurrentBwd(linked, mu);
+
+    // The backward term is subtracted from q_out unless switch_sign flips it.
+    if (!switch_sign) { q_out -= contribution; }
+    else              { q_out += contribution; }
+}
+
+// G-parity requires a more specialised implementation of the conserved-current
+// site contractions, which is not provided here.  NO_CURR_SITE(Impl) emits
+// explicit specialisations of ContractConservedCurrentSiteFwd/Bwd for Impl
+// whose bodies are assert(0).  Both stubs name their parameters in the same
+// order as the primary template: (..., U, sU, mu, switch_sign).
+#define NO_CURR_SITE(Impl) \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ assert(0); } \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ assert(0); }
+
+NO_CURR_SITE(GparityWilsonImplF);
+NO_CURR_SITE(GparityWilsonImplD);
+NO_CURR_SITE(GparityWilsonImplFH);
+NO_CURR_SITE(GparityWilsonImplDF);
+
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteFwd
+ * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in +ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(
+    const SitePropagator &q_in,
+    SitePropagator       &q_out,
+    DoubledGaugeField    &U,
+    unsigned int          sU,
+    unsigned int          mu,
+    vInteger              t_mask,
+    bool                  switch_sign)
+{
+    // Multiply q_in by the link taken from U._odata[sU] in direction mu, then
+    // apply the forward current insertion (WilsonCurrentFwd) in place.
+    SitePropagator term;
+    Impl::multLinkProp(term, U._odata[sU], q_in, mu);
+    term = WilsonCurrentFwd(term, mu);
+
+    // Zero any unwanted timeslice entries: predicatedWhere chooses between
+    // term and 0.*term according to t_mask.
+    term = predicatedWhere(t_mask, term, 0.*term);
+
+    // The forward term is added to q_out unless switch_sign asks for the
+    // opposite sign.
+    if (!switch_sign) { q_out += term; }
+    else              { q_out -= term; }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteBwd
+ * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in -ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(
+    const SitePropagator &q_in,
+    SitePropagator       &q_out,
+    DoubledGaugeField    &U,
+    unsigned int          sU,
+    unsigned int          mu,
+    vInteger              t_mask,
+    bool                  switch_sign)
+{
+    // Multiply q_in by the link from U._odata[sU] at direction index mu + Nd,
+    // then apply the backward current insertion (WilsonCurrentBwd) in place.
+    SitePropagator term;
+    Impl::multLinkProp(term, U._odata[sU], q_in, mu + Nd);
+    term = WilsonCurrentBwd(term, mu);
+
+    // Zero any unwanted timeslice entries: predicatedWhere chooses between
+    // term and 0.*term according to t_mask.
+    term = predicatedWhere(t_mask, term, 0.*term);
+
+    // The backward term is subtracted from q_out unless switch_sign asks for
+    // the opposite sign.
+    if (!switch_sign) { q_out -= term; }
+    else              { q_out += term; }
+}
+
+FermOpTemplateInstantiate(WilsonKernels);
+AdjointFermOpTemplateInstantiate(WilsonKernels);
+TwoIndexFermOpTemplateInstantiate(WilsonKernels);
+
+}}
+
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -0,0 +1,281 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_DHOP_H
+#define GRID_QCD_DHOP_H
+
+namespace Grid {
+namespace QCD {
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Helper routines that implement Wilson stencil for a single site.
+  // Common to both the WilsonFermion and WilsonFermion5D
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Static switches shared by every WilsonKernels<Impl>, which derives from
+// this class.
+class WilsonKernelsStatic { 
+ public:
+  // Values for Opt: DhopSite/DhopSiteDag switch on Opt to choose between the
+  // Generic*, Hand* and Asm* site kernels.
+  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
+  // Values for Comms. NOTE(review): Comms is not read anywhere in this header;
+  // presumably it selects overlapped vs. sequential communication - confirm
+  // at the use sites.
+  enum { CommsAndCompute, CommsThenCompute };
+  static int Opt;  
+  static int Comms;
+};
+ 
+template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
+ public:
+   
+  INHERIT_IMPL_TYPES(Impl);
+  typedef FermionOperator<Impl> Base;
+   
+public:
+
+  // Non-daggered Wilson hopping term for fundamental-representation, Nc == 3
+  // implementations. The kernel family is chosen at run time from
+  // WilsonKernelsStatic::Opt:
+  //   OptInlineAsm  - only compiled in when AVX512 or QPX is defined; the Asm
+  //                   kernel is called once and is handed Ls and Ns.
+  //   OptHandUnroll - a Hand* kernel is called for each of the Ns*Ls steps.
+  //   OptGeneric    - a Generic* kernel is called for each of the Ns*Ls steps.
+  // sF and sU are running site indices passed to the kernels: sF is advanced
+  // on every inner (Ls) step and sU on every outer (Ns) step.
+  // interior / exterior pick the combined kernel (both non-zero), the *Int
+  // kernel (interior only) or the *Ext kernel (exterior only). Requesting
+  // neither is a caller error and asserts on every path.
+  template <bool EnableBool = true>
+  typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
+  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+           int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
+  {
+    bgq_l1p_optimisation(1);
+    switch(Opt) {
+#if defined(AVX512) || defined (QPX)
+    case OptInlineAsm:
+      if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSite   (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+      else if (interior)     WilsonKernels<Impl>::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+      else if (exterior)     WilsonKernels<Impl>::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+      else assert(0);
+      break;
+#endif
+    case OptHandUnroll:
+      for (int site = 0; site < Ns; site++) {
+        for (int s = 0; s < Ls; s++) {
+          if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
+          else if (interior)     WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
+          else if (exterior)     WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
+          // Previously missing on this path only: reject "neither interior
+          // nor exterior" exactly as the other kernel families do.
+          else assert(0);
+          sF++;
+        }
+        sU++;
+      }
+      break;
+    case OptGeneric:
+      for (int site = 0; site < Ns; site++) {
+        for (int s = 0; s < Ls; s++) {
+          if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
+          else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
+          else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
+          else assert(0);
+          sF++;
+        }
+        sU++;
+      }
+      break;
+    default:
+      // Unknown Opt value, or OptInlineAsm without AVX512/QPX support built in.
+      assert(0);
+    }
+    bgq_l1p_optimisation(0);
+  }
+     
+  // Variant of DhopSite for implementations that are not fundamental, or are
+  // fundamental with Nc != 3: only the generic site kernels are used and
+  // WilsonKernelsStatic::Opt is not consulted.
+  template <bool EnableBool = true>
+  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
+  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+           int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
+    // sU steps once per outer iteration, sF once per inner iteration.
+    for (int site = 0; site < Ns; ++site, ++sU) {
+      for (int s = 0; s < Ls; ++s, ++sF) {
+        if (interior) {
+          if (exterior) WilsonKernels<Impl>::GenericDhopSite   (st,lo,U,buf,sF,sU,in,out);
+          else          WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
+        } else if (exterior) {
+          WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
+        } else {
+          assert(0);  // neither interior nor exterior requested
+        }
+      }
+    }
+  }
+     
+  // Daggered counterpart of DhopSite for fundamental-representation, Nc == 3
+  // implementations: the same dispatch on WilsonKernelsStatic::Opt, calling
+  // the *Dag kernels. interior / exterior select the combined, *Int or *Ext
+  // kernel; requesting neither asserts.
+  template <bool EnableBool = true>
+  typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
+  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+              int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
+  {
+    bgq_l1p_optimisation(1);
+    switch(Opt) {
+#if defined(AVX512) || defined (QPX)
+    case OptInlineAsm:
+      // The assembler kernels are handed Ls and Ns; no loop is run here.
+      if (interior) {
+        if (exterior) WilsonKernels<Impl>::AsmDhopSiteDag   (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+        else          WilsonKernels<Impl>::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+      } else if (exterior) {
+        WilsonKernels<Impl>::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+      } else {
+        assert(0);
+      }
+      break;
+#endif
+    case OptHandUnroll:
+      for (int site = 0; site < Ns; ++site, ++sU) {
+        for (int s = 0; s < Ls; ++s, ++sF) {
+          if (interior) {
+            if (exterior) WilsonKernels<Impl>::HandDhopSiteDag   (st,lo,U,buf,sF,sU,in,out);
+            else          WilsonKernels<Impl>::HandDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
+          } else if (exterior) {
+            WilsonKernels<Impl>::HandDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
+          } else {
+            assert(0);
+          }
+        }
+      }
+      break;
+    case OptGeneric:
+      for (int site = 0; site < Ns; ++site, ++sU) {
+        for (int s = 0; s < Ls; ++s, ++sF) {
+          if (interior) {
+            if (exterior) WilsonKernels<Impl>::GenericDhopSiteDag   (st,lo,U,buf,sF,sU,in,out);
+            else          WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
+          } else if (exterior) {
+            WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
+          } else {
+            assert(0);
+          }
+        }
+      }
+      break;
+    default:
+      assert(0);
+    }
+    bgq_l1p_optimisation(0);
+  }
+
+  // Variant of DhopSiteDag for implementations that are not fundamental, or
+  // are fundamental with Nc != 3: only the generic daggered site kernels are
+  // used and WilsonKernelsStatic::Opt is not consulted.
+  template <bool EnableBool = true>
+  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
+  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
+              int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
+    // sU steps once per outer iteration, sF once per inner iteration.
+    for (int site = 0; site < Ns; ++site, ++sU) {
+      for (int s = 0; s < Ls; ++s, ++sF) {
+        if (interior) {
+          if (exterior) WilsonKernels<Impl>::GenericDhopSiteDag   (st,lo,U,buf,sF,sU,in,out);
+          else          WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
+        } else if (exterior) {
+          WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
+        } else {
+          assert(0);  // neither interior nor exterior requested
+        }
+      }
+    }
+  }
+
+  // Single-site kernel taking an extra direction/displacement selector and a
+  // gamma selector. NOTE(review): the definition is outside this view, so the
+  // exact meaning of dirdisp and gamma should be confirmed there.
+  void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
+		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
+      
+  //////////////////////////////////////////////////////////////////////////////
+  // Utilities for inserting Wilson conserved current.
+  //////////////////////////////////////////////////////////////////////////////
+  // Contraction of two site propagators into q_out for direction mu, forward
+  // and backward variants. NOTE(review): the definitions are outside this
+  // view; confirm the exact operation and the effect of switch_sign there.
+  void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,
+                                       unsigned int mu,
+                                       bool switch_sign = false);
+  void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,
+                                       unsigned int mu,
+                                       bool switch_sign = false);
+  // Sequential-source insertion, forward link: multiplies q_in by the link of
+  // site sU in direction mu, applies WilsonCurrentFwd, masks with t_mask and
+  // accumulates into q_out (added by default, subtracted when switch_sign).
+  void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,
+                                  unsigned int mu,
+                                  vInteger t_mask,
+                                  bool switch_sign = false);
+  // Sequential-source insertion, backward link: as above but using link index
+  // mu + Nd and WilsonCurrentBwd, with the opposite accumulation sign
+  // (subtracted by default, added when switch_sign).
+  void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,
+                                  unsigned int mu,
+                                  vInteger t_mask,
+                                  bool switch_sign = false);
+
+private:
+     // Specialised variants
+  // Generic single-site kernels, called once per (sF, sU) pair by DhopSite /
+  // DhopSiteDag. Naming: no suffix = interior and exterior together,
+  // Int = interior only, Ext = exterior only, Dag = daggered operator.
+  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		       int sF, int sU, const FermionField &in, FermionField &out);
+      
+  void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			  int sF, int sU, const FermionField &in, FermionField &out);
+
+  void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			  int sF, int sU, const FermionField &in, FermionField &out);
+      
+  void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			     int sF, int sU, const FermionField &in, FermionField &out);
+
+  void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			  int sF, int sU, const FermionField &in, FermionField &out);
+      
+  void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			     int sF, int sU, const FermionField &in, FermionField &out);
+
+
+  // Inline-assembly kernels. Unlike the Generic*/Hand* kernels these receive
+  // Ls and Ns and are called once per DhopSite / DhopSiteDag invocation.
+  // Naming: no suffix = interior and exterior together, Int = interior only,
+  // Ext = exterior only, Dag = daggered operator.
+  void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
+
+  void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
+
+  void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
+
+  void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
+
+  void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
+
+  void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
+
+
+  // Hand-unrolled single-site kernels, called once per (sF, sU) pair when
+  // Opt == OptHandUnroll. Naming: no suffix = interior and exterior together,
+  // Int = interior only, Ext = exterior only, Dag = daggered operator.
+  void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		    int sF, int sU, const FermionField &in, FermionField &out);
+
+  void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		       int sF, int sU, const FermionField &in, FermionField &out);
+      
+  void HandDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		       int sF, int sU, const FermionField &in, FermionField &out);
+  
+  void HandDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			  int sF, int sU, const FermionField &in, FermionField &out);
+  
+  void HandDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		       int sF, int sU, const FermionField &in, FermionField &out);
+  
+  void HandDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			  int sF, int sU, const FermionField &in, FermionField &out);
+  
+public:
+
+  // Construct from implementation parameters; a default-constructed
+  // ImplParams is used when none are given. Declared here only.
+  WilsonKernels(const ImplParams &p = ImplParams());
+
+};
+    
+}}
+
+#endif
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -0,0 +1,127 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+namespace QCD {
+
+
+///////////////////////////////////////////////////////////
+// Default to no assembler implementation
+///////////////////////////////////////////////////////////
+// Primary-template fallback: there is no assembler AsmDhopSite for a generic
+// Impl, so reaching this definition trips an assertion.
+template<class Impl>
+void WilsonKernels<Impl>::AsmDhopSite(StencilImpl &st, LebesgueOrder &lo,
+                                      DoubledGaugeField &U, SiteHalfSpinor *buf,
+                                      int ss, int ssU, int Ls, int Ns,
+                                      const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+
+// Primary-template fallback: there is no assembler AsmDhopSiteDag for a
+// generic Impl, so reaching this definition trips an assertion.
+template<class Impl>
+void WilsonKernels<Impl>::AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
+                                         DoubledGaugeField &U, SiteHalfSpinor *buf,
+                                         int ss, int ssU, int Ls, int Ns,
+                                         const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+
+// Primary-template fallback: there is no assembler AsmDhopSiteInt for a
+// generic Impl, so reaching this definition trips an assertion.
+template<class Impl>
+void WilsonKernels<Impl>::AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo,
+                                         DoubledGaugeField &U, SiteHalfSpinor *buf,
+                                         int ss, int ssU, int Ls, int Ns,
+                                         const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+
+// Primary-template fallback: there is no assembler AsmDhopSiteDagInt for a
+// generic Impl, so reaching this definition trips an assertion.
+template<class Impl>
+void WilsonKernels<Impl>::AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo,
+                                            DoubledGaugeField &U, SiteHalfSpinor *buf,
+                                            int ss, int ssU, int Ls, int Ns,
+                                            const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+
+// Primary-template fallback: there is no assembler AsmDhopSiteExt for a
+// generic Impl, so reaching this definition trips an assertion.
+template<class Impl>
+void WilsonKernels<Impl>::AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo,
+                                         DoubledGaugeField &U, SiteHalfSpinor *buf,
+                                         int ss, int ssU, int Ls, int Ns,
+                                         const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+
+// Primary-template fallback: there is no assembler AsmDhopSiteDagExt for a
+// generic Impl, so reaching this definition trips an assertion.
+template<class Impl>
+void WilsonKernels<Impl>::AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo,
+                                            DoubledGaugeField &U, SiteHalfSpinor *buf,
+                                            int ss, int ssU, int Ls, int Ns,
+                                            const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+
+#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
+#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
+
+// INSTANTIATE_ASM(A): explicitly instantiates the six assembler entry points
+// (AsmDhopSite, AsmDhopSiteDag and their Int / Ext variants) of
+// WilsonKernels<A>. The definition is one logical line; every physical line
+// must keep its trailing backslash.
+#define INSTANTIATE_ASM(A)\
+template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+ \
+template void WilsonKernels<A>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+ \
+template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+ \
+template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+
+// Instantiate the assembler entry points for each supported implementation.
+// NOTE(review): the F / D suffixes presumably denote single / double
+// precision - confirm against the Impl typedefs.
+INSTANTIATE_ASM(WilsonImplF);
+INSTANTIATE_ASM(WilsonImplD);
+INSTANTIATE_ASM(ZWilsonImplF);
+INSTANTIATE_ASM(ZWilsonImplD);
+INSTANTIATE_ASM(GparityWilsonImplF);
+INSTANTIATE_ASM(GparityWilsonImplD);
+INSTANTIATE_ASM(DomainWallVec5dImplF);
+INSTANTIATE_ASM(DomainWallVec5dImplD);
+INSTANTIATE_ASM(ZDomainWallVec5dImplF);
+INSTANTIATE_ASM(ZDomainWallVec5dImplD);
+
+// Same set of implementations with the FH / DF suffixed variants.
+// NOTE(review): presumably mixed-precision variants - confirm.
+INSTANTIATE_ASM(WilsonImplFH);
+INSTANTIATE_ASM(WilsonImplDF);
+INSTANTIATE_ASM(ZWilsonImplFH);
+INSTANTIATE_ASM(ZWilsonImplDF);
+INSTANTIATE_ASM(GparityWilsonImplFH);
+INSTANTIATE_ASM(GparityWilsonImplDF);
+INSTANTIATE_ASM(DomainWallVec5dImplFH);
+INSTANTIATE_ASM(DomainWallVec5dImplDF);
+INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
+INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
+
+}}
+
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
@@ -0,0 +1,650 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+#if defined(AVX512) 
+    ///////////////////////////////////////////////////////////
+    // If we are AVX512 specialise the single precision routine
+    ///////////////////////////////////////////////////////////
+#include <simd/Intel512wilson.h>
+#include <simd/Intel512single.h>
+    
+// Two-entry sign table for the single-precision kernels; filled by
+// setupSigns through the signInitF initialiser below.
+static Vector<vComplexF> signsF;
+
+  // Replace `signs` with a two-element vector whose entry 0 is set by vrsign
+  // and entry 1 by visign. Always returns 1 so the call can initialise a
+  // static int.
+  template<typename vtype>
+  int setupSigns(Vector<vtype>& signs ){
+    Vector<vtype> table(2);
+    vrsign(table[0]);
+    visign(table[1]);
+    signs = table;
+    return 1;
+  }
+
+  // Forces setupSigns(signsF) to run during static initialisation.
+  static int signInitF = setupSigns(signsF);
+
+// MAYBEPERM(A,perm): execute statement A only when perm is non-zero.
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+// MULT_2SPIN(ptr,pf): forwards to MULT_ADDSUB_2SPIN for the kernels below.
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+// COMPLEX_SIGNS(isigns): declares a local pointer `isigns` to the first
+// entry of the signsF table.
+#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];  
+  
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+// Explicit specialisations of AsmDhopSite. Each signature has no body of its
+// own: the body is the textual include of WilsonKernelsAsmBody.h that follows
+// it, compiled with KERNEL_DAG undefined and INTERIOR_AND_EXTERIOR defined
+// (presumably read by that body - it is not visible here).
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+// XYZT vectorised, undag kernel, interior only: AsmDhopSiteInt
+// specialisations, each taking its body from the include that follows the
+// signature, compiled with only INTERIOR defined.
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+
+// XYZT vectorised, undag kernel, exterior only: AsmDhopSiteExt
+// specialisations, each taking its body from the include that follows the
+// signature, compiled with only EXTERIOR defined.
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+// AsmDhopSiteDag specialisations: same included body as the undag kernels,
+// now compiled with KERNEL_DAG and INTERIOR_AND_EXTERIOR defined.
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+// XYZT vectorised, dag kernel, interior only: AsmDhopSiteDagInt
+// specialisations, compiled with KERNEL_DAG (still defined from the dag
+// section header above) and only INTERIOR defined.
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+// XYZT vectorised, dag kernel, exterior only: AsmDhopSiteDagExt
+// specialisations, compiled with KERNEL_DAG (still defined from the dag
+// section header above) and only EXTERIOR defined.
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+// Switch the helper macros for the Ls-vectorised kernels that follow:
+// MAYBEPERM now expands to nothing and MULT_2SPIN forwards to
+// MULT_ADDSUB_2SPIN_LS.
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+// AsmDhopSite specialisations for the DomainWallVec5d implementations. Each
+// body is the include that follows the signature, compiled with KERNEL_DAG
+// undefined and INTERIOR_AND_EXTERIOR defined.
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+// Ls vectorised, undag kernel, interior only: AsmDhopSiteInt specialisations
+// for the DomainWallVec5d implementations, compiled with only INTERIOR
+// defined.
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+// Ls vectorised, undag kernel, exterior only: AsmDhopSiteExt specialisations
+// for the DomainWallVec5d implementations, compiled with only EXTERIOR
+// defined. MULT_2SPIN is redefined here to forward to
+// MULT_ADDSUB_2SPIN_LSNOPF; that definition stays in force for whatever is
+// compiled after this group until MULT_2SPIN is redefined again.
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+// AsmDhopSiteDag specialisations for the DomainWallVec5d implementations,
+// compiled with KERNEL_DAG and INTERIOR_AND_EXTERIOR defined.
+// NOTE(review): MULT_2SPIN is not redefined here, so it is still the
+// MULT_ADDSUB_2SPIN_LSNOPF variant set for the preceding Ext kernels rather
+// than MULT_ADDSUB_2SPIN_LS as used by the undag full/interior kernels.
+// Confirm this is intended.
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR // selects the "Pre comms kernel" macro set inside WilsonKernelsAsmBody.h
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR // selects the "Post comms kernel" macro set inside WilsonKernelsAsmBody.h
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef COMPLEX_SIGNS // drop the bindings used by the preceding kernels before the double-precision ones are installed
+#undef MAYBEPERM
+#undef MULT_2SPIN
+	
+
+
+///////////////////////////////////////////////////////////
+// If we are AVX512 specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#include <simd/Intel512double.h>
+    
+static Vector<vComplexD> signsD; // sign table that COMPLEX_SIGNS below points the kernels at
+static int signInitD = setupSigns(signsD); // the initialiser calls setupSigns(signsD) during static initialisation; presumably signInitD exists only to trigger that call -- TODO confirm
+    
+#define MAYBEPERM(A,perm) if (perm) { A ; } // executes A only when perm is non-zero
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];  // declares isigns as a pointer to the first entry of signsD
+
+
+#define INTERIOR_AND_EXTERIOR    
+#undef  INTERIOR
+#undef  EXTERIOR
+  
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG // WilsonKernelsAsmBody.h then maps legs 0-3 to the XM..TM macros and legs 4-7 to the XP..TP macros
+#define INTERIOR_AND_EXTERIOR // selects the "Comms then compute kernel" macro set inside WilsonKernelsAsmBody.h
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR // selects the "Pre comms kernel" macro set inside WilsonKernelsAsmBody.h
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR // selects the "Post comms kernel" macro set inside WilsonKernelsAsmBody.h
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG // WilsonKernelsAsmBody.h then maps legs 0-3 to the XP..TP macros and legs 4-7 to the XM..TM macros
+#define INTERIOR_AND_EXTERIOR // selects the "Comms then compute kernel" macro set inside WilsonKernelsAsmBody.h
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR // selects the "Pre comms kernel" macro set inside WilsonKernelsAsmBody.h
+#undef EXTERIOR
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR // selects the "Post comms kernel" macro set inside WilsonKernelsAsmBody.h
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) // expands to nothing for the Ls vectorised kernels that follow
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG // WilsonKernelsAsmBody.h then maps legs 0-3 to the XM..TM macros and legs 4-7 to the XP..TP macros
+#define INTERIOR_AND_EXTERIOR // selects the "Comms then compute kernel" macro set inside WilsonKernelsAsmBody.h
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR // selects the "Pre comms kernel" macro set inside WilsonKernelsAsmBody.h
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR // selects the "Post comms kernel" macro set inside WilsonKernelsAsmBody.h
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) // NOTE(review): this binding is not changed again before the closing "#undef MULT_2SPIN" of this precision section, so the dag kernels that follow are built with it too -- confirm intended
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG // WilsonKernelsAsmBody.h then maps legs 0-3 to the XP..TP macros and legs 4-7 to the XM..TM macros
+#define INTERIOR_AND_EXTERIOR // selects the "Comms then compute kernel" macro set inside WilsonKernelsAsmBody.h
+#undef INTERIOR
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR // selects the "Pre comms kernel" macro set inside WilsonKernelsAsmBody.h
+#undef EXTERIOR
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR // selects the "Post comms kernel" macro set inside WilsonKernelsAsmBody.h
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included header supplies the brace-delimited function body
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+template<> void 
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+template<> void 
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
+#undef COMPLEX_SIGNS // remove the double-precision bindings installed for the kernels above
+#undef MAYBEPERM
+#undef MULT_2SPIN
+
+#endif //AVX512
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -0,0 +1,196 @@
+#ifdef KERNEL_DAG // dagger build: legs 0-3 use the XP..TP projection/reconstruction macros, legs 4-7 the XM..TM ones
+#define DIR0_PROJMEM(base) XP_PROJMEM(base);
+#define DIR1_PROJMEM(base) YP_PROJMEM(base);
+#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
+#define DIR3_PROJMEM(base) TP_PROJMEM(base);
+#define DIR4_PROJMEM(base) XM_PROJMEM(base);
+#define DIR5_PROJMEM(base) YM_PROJMEM(base);
+#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
+#define DIR7_PROJMEM(base) TM_PROJMEM(base);
+#define DIR0_RECON   XP_RECON // leg 0 is the only one using the non-ACCUM form; legs 1-7 use *_RECON_ACCUM
+#define DIR1_RECON   YP_RECON_ACCUM
+#define DIR2_RECON   ZP_RECON_ACCUM
+#define DIR3_RECON   TP_RECON_ACCUM
+#define DIR4_RECON   XM_RECON_ACCUM
+#define DIR5_RECON   YM_RECON_ACCUM
+#define DIR6_RECON   ZM_RECON_ACCUM
+#define DIR7_RECON   TM_RECON_ACCUM
+#else // non-dagger build: the same table with the P and M families exchanged
+#define DIR0_PROJMEM(base) XM_PROJMEM(base);
+#define DIR1_PROJMEM(base) YM_PROJMEM(base);
+#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
+#define DIR3_PROJMEM(base) TM_PROJMEM(base);
+#define DIR4_PROJMEM(base) XP_PROJMEM(base);
+#define DIR5_PROJMEM(base) YP_PROJMEM(base);
+#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
+#define DIR7_PROJMEM(base) TP_PROJMEM(base);
+#define DIR0_RECON   XM_RECON // leg 0 is the only one using the non-ACCUM form; legs 1-7 use *_RECON_ACCUM
+#define DIR1_RECON   YM_RECON_ACCUM
+#define DIR2_RECON   ZM_RECON_ACCUM
+#define DIR3_RECON   TM_RECON_ACCUM
+#define DIR4_RECON   XP_RECON_ACCUM
+#define DIR5_RECON   YP_RECON_ACCUM
+#define DIR6_RECON   ZP_RECON_ACCUM
+#define DIR7_RECON   TP_RECON_ACCUM
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Comms then compute kernel: every leg is multiplied and reconstructed; local legs are projected, the others loaded with LOAD_CHI
+////////////////////////////////////////////////////////////////////////////////
+#ifdef INTERIOR_AND_EXTERIOR
+// ASM_LEG consumes the base/local/perm fetched by the preceding leg and calls GetInfo for NxtDir before the multiply; ASM_LEG_XP first fetches the info for its own Dir
+#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+      basep = st.GetPFInfo(nent,plocal); nent++;			\
+      if ( local ) {							\
+	LOAD64(%r10,isigns);						\
+	PROJ(base);							\
+	MAYBEPERM(PERMUTE_DIR,perm);					\
+      } else {								\
+	LOAD_CHI(base);							\
+      }									\
+      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
+      PREFETCH_CHIMU(base);						\
+      MULT_2SPIN_DIR_PF(Dir,basep);					\
+      LOAD64(%r10,isigns);						\
+      RECON;								\
+
+#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
+  PF_GAUGE(Xp);								\
+  PREFETCH1_CHIMU(base);						\
+  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) 
+// RESULT forwards both arguments to SAVE_RESULT unconditionally in this variant
+#define RESULT(base,basep) SAVE_RESULT(base,basep);
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Pre comms kernel -- prefetch like normal because it is mostly right
+////////////////////////////////////////////////////////////////////////////////
+#ifdef INTERIOR
+// ASM_LEG multiplies and reconstructs only when the leg is local or st.same_node[Dir] is set; other legs are skipped, but the NxtDir lookup and prefetch still run
+#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+      basep = st.GetPFInfo(nent,plocal); nent++;			\
+      if ( local ) {							\
+	LOAD64(%r10,isigns);						\
+	PROJ(base);							\
+	MAYBEPERM(PERMUTE_DIR,perm);					\
+      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\
+      if ( local || st.same_node[Dir] ) {				\
+	MULT_2SPIN_DIR_PF(Dir,basep);					\
+	LOAD64(%r10,isigns);						\
+	RECON;								\
+      }									\
+      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
+      PREFETCH_CHIMU(base);						\
+
+#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
+  PF_GAUGE(Xp);								\
+  PREFETCH1_CHIMU(base);						\
+  { ZERO_PSI; }								\
+  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) 
+// ZERO_PSI above runs before the first leg because legs may be skipped in this variant; RESULT still forwards to SAVE_RESULT unconditionally
+#define RESULT(base,basep) SAVE_RESULT(base,basep);
+
+#endif
+////////////////////////////////////////////////////////////////////////////////
+// Post comms kernel
+////////////////////////////////////////////////////////////////////////////////
+#ifdef EXTERIOR
+// ASM_LEG handles only legs that are neither local nor flagged in st.same_node[Dir] and counts them in nmu; each leg fetches its own Dir, NxtDir/PERMUTE_DIR/PROJ are unused
+// ASM_LEG_XP is the same leg preceded by nmu=0 and ZERO_PSI; RESULT invokes ADD_RESULT only when nmu is non-zero and ignores its basep argument
+#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
+  if((!local)&&(!st.same_node[Dir]) ) {					\
+    LOAD_CHI(base);							\
+    MULT_2SPIN_DIR_PF(Dir,base);					\
+    LOAD64(%r10,isigns);						\
+    RECON;								\
+    nmu++;								\
+  }									
+
+#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  nmu=0;								\
+  { ZERO_PSI;}								\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
+  if((!local)&&(!st.same_node[Dir]) ) {					\
+    LOAD_CHI(base);							\
+    MULT_2SPIN_DIR_PF(Dir,base);					\
+    LOAD64(%r10,isigns);						\
+    RECON;								\
+    nmu++;								\
+  }
+
+#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
+
+#endif
+{ // Shared function body: the including file writes the signature, then #includes this header
+  int nmu; // written and read only by the EXTERIOR macro set in this header (ASM_LEG_XP sets it to 0); left uninitialised otherwise
+  int local,perm, ptype; // outputs of st.GetInfo for the leg currently held in base
+  uint64_t base;
+  uint64_t basep; // value returned by st.GetPFInfo; passed on as the second argument of MULT_2SPIN_DIR_PF / RESULT
+  const uint64_t plocal =(uint64_t) & in._odata[0]; // address of the first element of the input field, handed to GetInfo/GetPFInfo
+
+  COMPLEX_SIGNS(isigns);
+  MASK_REGS;
+  int nmax=U._grid->oSites();
+  for(int site=0;site<Ns;site++) { // Ns iterations; ssU advances by one at the end of each
+#ifndef EXTERIOR
+    int sU =lo.Reorder(ssU);
+    int ssn=ssU+1;     if(ssn>=nmax) ssn=0; // following site index, wrapped to 0 at nmax
+    int sUn=lo.Reorder(ssn);
+    LOCK_GAUGE(0);
+#else
+    int sU =ssU; // EXTERIOR build uses ssU directly, without lo.Reorder
+    int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
+    int sUn=ssn;
+#endif
+    for(int s=0;s<Ls;s++) {
+      ss =sU*Ls+s; // overwrites the ss parameter
+      ssn=sUn*Ls+s; 
+      int  ent=ss*8;// 2*Ndim
+      int nent=ssn*8; // same indexing for the following site; consumed by st.GetPFInfo
+
+   ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
+      ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
+      ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
+      ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
+
+      ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
+      ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
+      ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
+      ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
+
+#ifdef EXTERIOR
+      if (nmu==0) break; // no leg was processed for this s: leave the s loop, skipping the remaining s values as well
+      //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
+#endif
+      base = (uint64_t) &out._odata[ss];
+      basep= st.GetPFInfo(nent,plocal); nent++;
+      RESULT(base,basep);
+    }
+    ssU++;
+    UNLOCK_GAUGE(0); // NOTE(review): runs in every variant although LOCK_GAUGE(0) is compiled only when EXTERIOR is not defined -- confirm the macros tolerate this
+  }
+}
+
+#undef DIR0_PROJMEM // remove every helper macro defined by this header so that it can be included again under a different KERNEL_DAG / INTERIOR / EXTERIOR configuration
+#undef DIR1_PROJMEM
+#undef DIR2_PROJMEM
+#undef DIR3_PROJMEM
+#undef DIR4_PROJMEM
+#undef DIR5_PROJMEM
+#undef DIR6_PROJMEM
+#undef DIR7_PROJMEM
+#undef DIR0_RECON
+#undef DIR1_RECON
+#undef DIR2_RECON
+#undef DIR3_RECON
+#undef DIR4_RECON
+#undef DIR5_RECON
+#undef DIR6_RECON
+#undef DIR7_RECON
+#undef ASM_LEG
+#undef ASM_LEG_XP
+#undef RESULT
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
@@ -0,0 +1,161 @@
+{ // Brace-delimited function body with the eight legs written out by hand; NOTE(review): no #include of this ".ab" file is visible in this chunk
+  int locala,perma, ptypea; // GetInfo outputs for the leg held in basea
+  int localb,permb, ptypeb; // GetInfo outputs for the leg held in baseb
+  uint64_t basea, baseb; // the two slots alternate: one leg is processed while the other holds the info for the following leg
+  const uint64_t plocal =(uint64_t) & in._odata[0];
+
+  //  vComplexF isigns[2] = { signs[0], signs[1] };
+  vComplexF *isigns = &signs[0]; // the vComplexF table `signs` is referenced directly here rather than through a COMPLEX_SIGNS macro
+
+  MASK_REGS;
+
+  for(int site=0;site<Ns;site++) {
+  int sU=lo.Reorder(ssU);  
+  for(int s=0;s<Ls;s++) {
+  ss=sU*Ls+s;
+  ////////////////////////////////
+  // Xp
+  ////////////////////////////////
+  int ent=ss*8;// 2*Ndim
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; // info for the Yp leg is fetched before the Xp leg is processed
+
+  if ( locala ) {
+    LOAD64(%r10,isigns);
+    XM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+  }
+  LOAD64(%r10,isigns);
+  XM_RECON;
+
+  ////////////////////////////////
+  // Yp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zp
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Xm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    XP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  XP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Ym
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tm
+  ////////////////////////////////
+  basea = (uint64_t)&out._odata[ss]; // basea now holds the output address; it is the second argument of MULT_2SPIN_DIR_PFTM below
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); // ent equals ss*8+8 here (eight increments above), i.e. the first entry after this ss; the result is handed to SAVE_RESULT
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TP_RECON_ACCUM;
+
+  SAVE_RESULT(&out._odata[ss],baseb);
+
+  } 
+  ssU++;
+  }
+}
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
@@ -0,0 +1,187 @@
+{ // Kernel body with no signature of its own: st, lo, in, out, signs, ss, ssU, Ls and Ns are used but not declared here.
+  int locala,perma, ptypea; // three sets (a,b,c) of st.GetInfo outputs, refilled in rotation ahead of the leg that reads them
+  int localb,permb, ptypeb;
+  int localc,permc, ptypec;
+  uint64_t basea, baseb, basec;
+  uint64_t basex; // written once below; only the commented-out prefetch before SAVE_RESULT would read it
+  const uint64_t plocal =(uint64_t) & in._odata[0];
+
+  //  vComplexF isigns[2] = { signs[0], signs[1] };
+  vComplexF *isigns = &signs[0];
+
+  MASK_REGS;
+
+  for(int site=0;site<Ns;site++) {
+  int sU=lo.Reorder(ssU);
+
+  for(int s=0;s<Ls;s++) {
+  ss     =sU*Ls+s;
+
+  ////////////////////////////////
+  // Xp (leg reads set a; entries ent+0,+1,+2 are looked up here into sets a,b,c)
+  ////////////////////////////////
+  int ent=ss*8;// 2*Ndim entries per ss; ent is advanced after each st.GetInfo call
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+
+  basex = basea;
+
+  label(FX(XP) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);
+    XM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+  }
+  LOAD64(%r10,isigns);
+  XM_RECON;
+
+  ////////////////////////////////
+  // Yp (leg reads set b; set a is refilled with entry ent+3, which the Tp leg reads)
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; // NOTE(review): point argument is Xp although the Tp leg consumes this entry - confirm
+  PREFETCH_CHIMU(basea);
+  label(FX(YP) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zp (leg reads set c; set b is refilled with entry ent+4, which the Xm leg reads)
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; // NOTE(review): point argument is Yp although the Xm leg consumes this entry - confirm
+  PREFETCH_CHIMU(baseb);
+  label(FX(ZP) );
+  if ( localc ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZM_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR1,permc);
+  } else { 
+    LOAD_CHI(basec);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tp (leg reads set a; set c is refilled with entry ent+5, which the Ym leg reads)
+  ////////////////////////////////
+  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; // NOTE(review): point argument is Xp although the Ym leg consumes this entry - confirm
+  PREFETCH_CHIMU(basec);
+  label(FX(TP) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR0,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Xm (leg reads set b; set a is refilled with entry ent+6, which the Zm leg reads)
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; // NOTE(review): point argument is Yp although the Zm leg consumes this entry - confirm
+  PREFETCH_CHIMU(basea);
+  label(FX(XM) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    XP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR3,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  XP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Ym (leg reads set c; set b is refilled with entry ent+7, which the Tm leg reads)
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; // NOTE(review): point argument is Xp although the Tm leg consumes this entry - confirm
+  PREFETCH_CHIMU(baseb);
+  label(FX(YM) );
+  if ( localc ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YP_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR2,permc);
+  } else { 
+    LOAD_CHI(basec);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zm (leg reads set a; set c is refilled using entry index ss*8+8)
+  ////////////////////////////////
+  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; // NOTE(review): ent is ss*8+8 here, past this ss's 8 entries; basec is then only passed to PREFETCH_CHIMU and MULT_2SPIN_DIR_PFTM - confirm bounds
+  PREFETCH_CHIMU(basec);
+  label(FX(ZM) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tm (leg reads set b; basea is repointed at the output site out._odata[ss])
+  ////////////////////////////////
+  basea = (uint64_t)&out._odata[ss];
+  PREFETCH_CHIMU(basea);
+  label(FX(TM) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TP_RECON_ACCUM;
+
+  //  PREFETCH_CHIMU(basex);
+  label(FX(SAV) );
+  SAVE_RESULT(&out._odata[ss]);
+  
+  }
+  ssU++;
+  }
+}
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
@@ -0,0 +1,150 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+#if defined(QPX) 
+
+    ///////////////////////////////////////////////////////////
+    // QPX builds only. Single precision specialisations come first; each function header below is completed by the #include that follows it.
+    ///////////////////////////////////////////////////////////
+
+#include <simd/IBM_qpx.h>
+#include <simd/IBM_qpx_single.h>
+  // XYZT group: MAYBEPERM executes A only when perm is non-zero.
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
+#define COMPLEX_SIGNS(isigns) 
+
+#define INTERIOR_AND_EXTERIOR    
+#undef  INTERIOR
+#undef  EXTERIOR
+  // NOTE(review): INTERIOR_AND_EXTERIOR, COMPLEX_SIGNS and KERNEL_DAG are never #undef'd before the #endif - confirm that is intended.
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    // Ls group: MAYBEPERM expands to nothing and MULT_2SPIN maps to the _LS variant.
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#undef MAYBEPERM
+#undef MULT_2SPIN
+	
+///////////////////////////////////////////////////////////
+// DP routines (double precision): the same four specialisations, for the ...ImplD types
+///////////////////////////////////////////////////////////
+
+#include <simd/IBM_qpx_double.h>
+    
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
+
+/////////////////////////////////////////////////////////////////
+// XYZT Vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+/////////////////////////////////////////////////////////////////
+      
+
+/////////////////////////////////////////////////////////////////
+// XYZT Vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+/////////////////////////////////////////////////////////////////
+
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#define MAYBEPERM(A,B) 
+#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+/////////////////////////////////////////////////////////////////
+				    
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+/////////////////////////////////////////////////////////////////
+	
+#undef MAYBEPERM
+#undef MULT_2SPIN
+
+#endif 
--- a/Grid/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHand.cc
@@ -0,0 +1,631 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+#define REGISTER
+// LOAD_CHIMU: copy the 12 components ref()(s)(c), s=0..3, c=0..2, of in._odata[offset] into Chimu_sc ('in' and 'offset' come from the expansion site).
+#define LOAD_CHIMU \
+  {const SiteSpinor & ref (in._odata[offset]);	\
+    Chimu_00=ref()(0)(0);\
+    Chimu_01=ref()(0)(1);\
+    Chimu_02=ref()(0)(2);\
+    Chimu_10=ref()(1)(0);\
+    Chimu_11=ref()(1)(1);\
+    Chimu_12=ref()(1)(2);\
+    Chimu_20=ref()(2)(0);\
+    Chimu_21=ref()(2)(1);\
+    Chimu_22=ref()(2)(2);\
+    Chimu_30=ref()(3)(0);\
+    Chimu_31=ref()(3)(1);\
+    Chimu_32=ref()(3)(2);}
+// LOAD_CHI: copy the 6 components ref()(s)(c), s=0..1, c=0..2, of buf[offset] into Chi_sc ('buf' and 'offset' come from the expansion site).
+#define LOAD_CHI\
+  {const SiteHalfSpinor &ref(buf[offset]);	\
+    Chi_00 = ref()(0)(0);\
+    Chi_01 = ref()(0)(1);\
+    Chi_02 = ref()(0)(2);\
+    Chi_10 = ref()(1)(0);\
+    Chi_11 = ref()(1)(1);\
+    Chi_12 = ref()(1)(2);}
+
+// MULT_2SPIN(A): UChi_s(i) = sum_j U._odata[sU](A)(i,j) * Chi_s(j) for s=0,1. To splat or not to splat depends on the implementation (Impl::loadLinkElement).
+#define MULT_2SPIN(A)\
+  {auto & ref(U._odata[sU](A));			\
+   Impl::loadLinkElement(U_00,ref()(0,0));	\
+   Impl::loadLinkElement(U_10,ref()(1,0));	\
+   Impl::loadLinkElement(U_20,ref()(2,0));	\
+   Impl::loadLinkElement(U_01,ref()(0,1));	\
+   Impl::loadLinkElement(U_11,ref()(1,1));	\
+   Impl::loadLinkElement(U_21,ref()(2,1));	\
+    UChi_00 = U_00*Chi_00;\
+    UChi_10 = U_00*Chi_10;\
+    UChi_01 = U_10*Chi_00;\
+    UChi_11 = U_10*Chi_10;\
+    UChi_02 = U_20*Chi_00;\
+    UChi_12 = U_20*Chi_10;\
+    UChi_00+= U_01*Chi_01;\
+    UChi_10+= U_01*Chi_11;\
+    UChi_01+= U_11*Chi_01;\
+    UChi_11+= U_11*Chi_11;\
+    UChi_02+= U_21*Chi_01;\
+    UChi_12+= U_21*Chi_11;\
+    Impl::loadLinkElement(U_00,ref()(0,2));	/* U_00,U_10,U_20 are reused to hold column 2 */ \
+    Impl::loadLinkElement(U_10,ref()(1,2));	\
+    Impl::loadLinkElement(U_20,ref()(2,2));	\
+    UChi_00+= U_00*Chi_02;\
+    UChi_10+= U_00*Chi_12;\
+    UChi_01+= U_10*Chi_02;\
+    UChi_11+= U_10*Chi_12;\
+    UChi_02+= U_20*Chi_02;\
+    UChi_12+= U_20*Chi_12;}
+
+
+#define PERMUTE_DIR(dir) /* calls permute<dir>(x,x) on each of the six Chi_sc, i.e. in place */	\
+      permute##dir(Chi_00,Chi_00);\
+      permute##dir(Chi_01,Chi_01);\
+      permute##dir(Chi_02,Chi_02);\
+      permute##dir(Chi_10,Chi_10);\
+      permute##dir(Chi_11,Chi_11);\
+      permute##dir(Chi_12,Chi_12);
+
+//      XP_PROJ: hspin(0)=fspin(0)+timesI(fspin(3));   (hspin(s) is Chi_s*, fspin(s) is Chimu_s*)
+//               hspin(1)=fspin(1)+timesI(fspin(2));
+#define XP_PROJ \
+    Chi_00 = Chimu_00+timesI(Chimu_30);\
+    Chi_01 = Chimu_01+timesI(Chimu_31);\
+    Chi_02 = Chimu_02+timesI(Chimu_32);\
+    Chi_10 = Chimu_10+timesI(Chimu_20);\
+    Chi_11 = Chimu_11+timesI(Chimu_21);\
+    Chi_12 = Chimu_12+timesI(Chimu_22);
+// YP_PROJ: hspin(0)=fspin(0)-fspin(3); hspin(1)=fspin(1)+fspin(2);
+#define YP_PROJ \
+    Chi_00 = Chimu_00-Chimu_30;\
+    Chi_01 = Chimu_01-Chimu_31;\
+    Chi_02 = Chimu_02-Chimu_32;\
+    Chi_10 = Chimu_10+Chimu_20;\
+    Chi_11 = Chimu_11+Chimu_21;\
+    Chi_12 = Chimu_12+Chimu_22;
+// ZP_PROJ: hspin(0)=fspin(0)+timesI(fspin(2)); hspin(1)=fspin(1)-timesI(fspin(3));
+#define ZP_PROJ \
+  Chi_00 = Chimu_00+timesI(Chimu_20);		\
+  Chi_01 = Chimu_01+timesI(Chimu_21);		\
+  Chi_02 = Chimu_02+timesI(Chimu_22);		\
+  Chi_10 = Chimu_10-timesI(Chimu_30);		\
+  Chi_11 = Chimu_11-timesI(Chimu_31);		\
+  Chi_12 = Chimu_12-timesI(Chimu_32);
+// TP_PROJ: hspin(0)=fspin(0)+fspin(2); hspin(1)=fspin(1)+fspin(3);
+#define TP_PROJ \
+  Chi_00 = Chimu_00+Chimu_20;		\
+  Chi_01 = Chimu_01+Chimu_21;		\
+  Chi_02 = Chimu_02+Chimu_22;		\
+  Chi_10 = Chimu_10+Chimu_30;		\
+  Chi_11 = Chimu_11+Chimu_31;		\
+  Chi_12 = Chimu_12+Chimu_32;
+// The *M_PROJ macros below are the *P_PROJ macros above with the sign of the second term flipped.
+
+//      XM_PROJ: hspin(0)=fspin(0)-timesI(fspin(3));
+//               hspin(1)=fspin(1)-timesI(fspin(2));
+#define XM_PROJ \
+    Chi_00 = Chimu_00-timesI(Chimu_30);\
+    Chi_01 = Chimu_01-timesI(Chimu_31);\
+    Chi_02 = Chimu_02-timesI(Chimu_32);\
+    Chi_10 = Chimu_10-timesI(Chimu_20);\
+    Chi_11 = Chimu_11-timesI(Chimu_21);\
+    Chi_12 = Chimu_12-timesI(Chimu_22);
+// YM_PROJ: hspin(0)=fspin(0)+fspin(3); hspin(1)=fspin(1)-fspin(2);
+#define YM_PROJ \
+    Chi_00 = Chimu_00+Chimu_30;\
+    Chi_01 = Chimu_01+Chimu_31;\
+    Chi_02 = Chimu_02+Chimu_32;\
+    Chi_10 = Chimu_10-Chimu_20;\
+    Chi_11 = Chimu_11-Chimu_21;\
+    Chi_12 = Chimu_12-Chimu_22;
+// ZM_PROJ: hspin(0)=fspin(0)-timesI(fspin(2)); hspin(1)=fspin(1)+timesI(fspin(3));
+#define ZM_PROJ \
+  Chi_00 = Chimu_00-timesI(Chimu_20);		\
+  Chi_01 = Chimu_01-timesI(Chimu_21);		\
+  Chi_02 = Chimu_02-timesI(Chimu_22);		\
+  Chi_10 = Chimu_10+timesI(Chimu_30);		\
+  Chi_11 = Chimu_11+timesI(Chimu_31);		\
+  Chi_12 = Chimu_12+timesI(Chimu_32);
+// TM_PROJ: hspin(0)=fspin(0)-fspin(2); hspin(1)=fspin(1)-fspin(3);
+#define TM_PROJ \
+  Chi_00 = Chimu_00-Chimu_20;		\
+  Chi_01 = Chimu_01-Chimu_21;		\
+  Chi_02 = Chimu_02-Chimu_22;		\
+  Chi_10 = Chimu_10-Chimu_30;		\
+  Chi_11 = Chimu_11-Chimu_31;		\
+  Chi_12 = Chimu_12-Chimu_32;
+
+//      XP_RECON (here hspin(s) is UChi_s*, fspin(s) is result_s*): fspin(0)=hspin(0);
+//      fspin(1)=hspin(1);
+//      fspin(2)=timesMinusI(hspin(1));
+//      fspin(3)=timesMinusI(hspin(0));
+#define XP_RECON\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesMinusI(UChi_10);\
+  result_21 = timesMinusI(UChi_11);\
+  result_22 = timesMinusI(UChi_12);\
+  result_30 = timesMinusI(UChi_00);\
+  result_31 = timesMinusI(UChi_01);\
+  result_32 = timesMinusI(UChi_02);
+// Every *_ACCUM macro adds hspin(0),hspin(1) to fspin(0),fspin(1). XP: fspin(2)-=timesI(hspin(1)); fspin(3)-=timesI(hspin(0));
+#define XP_RECON_ACCUM\
+  result_00+=UChi_00;\
+  result_01+=UChi_01;\
+  result_02+=UChi_02;\
+  result_10+=UChi_10;\
+  result_11+=UChi_11;\
+  result_12+=UChi_12;\
+  result_20-=timesI(UChi_10);\
+  result_21-=timesI(UChi_11);\
+  result_22-=timesI(UChi_12);\
+  result_30-=timesI(UChi_00);\
+  result_31-=timesI(UChi_01);\
+  result_32-=timesI(UChi_02);
+// XM_RECON (assigns, does not accumulate): fspin(2)=timesI(hspin(1)); fspin(3)=timesI(hspin(0));
+#define XM_RECON\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesI(UChi_10);\
+  result_21 = timesI(UChi_11);\
+  result_22 = timesI(UChi_12);\
+  result_30 = timesI(UChi_00);\
+  result_31 = timesI(UChi_01);\
+  result_32 = timesI(UChi_02);
+// XM_RECON_ACCUM: fspin(2)+=timesI(hspin(1)); fspin(3)+=timesI(hspin(0));
+#define XM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_10);\
+  result_21+= timesI(UChi_11);\
+  result_22+= timesI(UChi_12);\
+  result_30+= timesI(UChi_00);\
+  result_31+= timesI(UChi_01);\
+  result_32+= timesI(UChi_02);
+// YP_RECON_ACCUM: fspin(2)+=hspin(1); fspin(3)-=hspin(0);
+#define YP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_10;\
+  result_21+= UChi_11;\
+  result_22+= UChi_12;\
+  result_30-= UChi_00;\
+  result_31-= UChi_01;\
+  result_32-= UChi_02;
+// YM_RECON_ACCUM: fspin(2)-=hspin(1); fspin(3)+=hspin(0);
+#define YM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_10;\
+  result_21-= UChi_11;\
+  result_22-= UChi_12;\
+  result_30+= UChi_00;\
+  result_31+= UChi_01;\
+  result_32+= UChi_02;
+// ZP_RECON_ACCUM: fspin(2)-=timesI(hspin(0)); fspin(3)+=timesI(hspin(1));
+#define ZP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= timesI(UChi_00);			\
+  result_21-= timesI(UChi_01);			\
+  result_22-= timesI(UChi_02);			\
+  result_30+= timesI(UChi_10);			\
+  result_31+= timesI(UChi_11);			\
+  result_32+= timesI(UChi_12);
+// ZM_RECON_ACCUM: fspin(2)+=timesI(hspin(0)); fspin(3)-=timesI(hspin(1));
+#define ZM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_00);			\
+  result_21+= timesI(UChi_01);			\
+  result_22+= timesI(UChi_02);			\
+  result_30-= timesI(UChi_10);			\
+  result_31-= timesI(UChi_11);			\
+  result_32-= timesI(UChi_12);
+// TP_RECON_ACCUM: fspin(2)+=hspin(0); fspin(3)+=hspin(1);
+#define TP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_00;			\
+  result_21+= UChi_01;			\
+  result_22+= UChi_02;			\
+  result_30+= UChi_10;			\
+  result_31+= UChi_11;			\
+  result_32+= UChi_12;
+// TM_RECON_ACCUM: fspin(2)-=hspin(0); fspin(3)-=hspin(1);
+#define TM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_00;	\
+  result_21-= UChi_01;	\
+  result_22-= UChi_02;	\
+  result_30-= UChi_10;	\
+  result_31-= UChi_11;	\
+  result_32-= UChi_12;
+
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) /* one stencil leg; uses SE, offset, local, perm, ptype, st, ss from the expansion site */	\
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				\
+    LOAD_CHIMU;					\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else {					\
+    LOAD_CHI;					\
+  }						\
+  MULT_2SPIN(DIR);				\
+  RECON;					
+// _INT: as HAND_STENCIL_LEG, except that a non-local leg is loaded, multiplied and reconstructed only when st.same_node[DIR] is set.
+#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				\
+    LOAD_CHIMU;					\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else if ( st.same_node[DIR] ) {		\
+    LOAD_CHI;					\
+  }						\
+  if (local || st.same_node[DIR] ) {		\
+    MULT_2SPIN(DIR);				\
+    RECON;					\
+  }
+// _EXT: handles only legs that are neither local nor same_node and increments nmu for each; PROJ and PERM are accepted but unused.
+#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+    LOAD_CHI;					\
+    MULT_2SPIN(DIR);				\
+    RECON;					\
+    nmu++;					\
+  }
+
+#define HAND_RESULT(ss) /* overwrite out._odata[ss] with the 12 result_sc via vstream */				\
+  {						\
+    SiteSpinor & ref (out._odata[ss]);		\
+    vstream(ref()(0)(0),result_00);		\
+    vstream(ref()(0)(1),result_01);		\
+    vstream(ref()(0)(2),result_02);		\
+    vstream(ref()(1)(0),result_10);		\
+    vstream(ref()(1)(1),result_11);		\
+    vstream(ref()(1)(2),result_12);		\
+    vstream(ref()(2)(0),result_20);		\
+    vstream(ref()(2)(1),result_21);		\
+    vstream(ref()(2)(2),result_22);		\
+    vstream(ref()(3)(0),result_30);		\
+    vstream(ref()(3)(1),result_31);		\
+    vstream(ref()(3)(2),result_32);		\
+  }
+// HAND_RESULT_EXT: adds result_sc onto the existing out._odata[ss] (+=, no vstream), and only when nmu is non-zero.
+#define HAND_RESULT_EXT(ss)			\
+  if (nmu){					\
+    SiteSpinor & ref (out._odata[ss]);		\
+    ref()(0)(0)+=result_00;		\
+    ref()(0)(1)+=result_01;		\
+    ref()(0)(2)+=result_02;		\
+    ref()(1)(0)+=result_10;		\
+    ref()(1)(1)+=result_11;		\
+    ref()(1)(2)+=result_12;		\
+    ref()(2)(0)+=result_20;		\
+    ref()(2)(1)+=result_21;		\
+    ref()(2)(2)+=result_22;		\
+    ref()(3)(0)+=result_30;		\
+    ref()(3)(1)+=result_31;		\
+    ref()(3)(2)+=result_32;		\
+  }
+
+
+#define HAND_DECLARATIONS(a) /* declares the 30 Simd working variables; the argument a is not used */			\
+  Simd result_00;				\
+  Simd result_01;				\
+  Simd result_02;				\
+  Simd result_10;				\
+  Simd result_11;				\
+  Simd result_12;				\
+  Simd result_20;				\
+  Simd result_21;				\
+  Simd result_22;				\
+  Simd result_30;				\
+  Simd result_31;				\
+  Simd result_32;				\
+  Simd Chi_00;					\
+  Simd Chi_01;					\
+  Simd Chi_02;					\
+  Simd Chi_10;					\
+  Simd Chi_11;					\
+  Simd Chi_12;					\
+  Simd UChi_00;					\
+  Simd UChi_01;					\
+  Simd UChi_02;					\
+  Simd UChi_10;					\
+  Simd UChi_11;					\
+  Simd UChi_12;					\
+  Simd U_00;					\
+  Simd U_10;					\
+  Simd U_20;					\
+  Simd U_01;					\
+  Simd U_11;					\
+  Simd U_21;
+// ZERO_RESULT: assign 'zero' (must be visible at the expansion site) to all 12 result_sc.
+#define ZERO_RESULT				\
+  result_00=zero;				\
+  result_01=zero;				\
+  result_02=zero;				\
+  result_10=zero;				\
+  result_11=zero;				\
+  result_12=zero;				\
+  result_20=zero;				\
+  result_21=zero;				\
+  result_22=zero;				\
+  result_30=zero;				\
+  result_31=zero;				\
+  result_32=zero;			
+// Chimu_sc are not separate variables: rows 0-1 alias Chi_*, rows 2-3 alias UChi_*, so LOAD_CHIMU overwrites both sets and *_PROJ updates Chi in place.
+#define Chimu_00 Chi_00
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 UChi_00
+#define Chimu_21 UChi_01
+#define Chimu_22 UChi_02
+#define Chimu_30 UChi_10
+#define Chimu_31 UChi_11
+#define Chimu_32 UChi_12
+
+namespace Grid {
+namespace QCD {
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{ // Hand-unrolled kernel for one site: runs all eight stencil legs, then overwrites out._odata[ss] with the sum.
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Working variables (result_*, Chi_*, UChi_*, U_*) shared by every macro below.
+  HAND_DECLARATIONS(ignore);
+  // offset/local/perm/ptype and SE are assigned inside HAND_STENCIL_LEG, so these names must not change.
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  // The first leg uses XM_RECON (plain assignment) to initialise result_*; the remaining seven accumulate.
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{ // Same eight legs as HandDhopSite, with the P and M projector/reconstruct macros exchanged on every leg.
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Working variables (result_*, Chi_*, UChi_*, U_*) shared by every macro below.
+  HAND_DECLARATIONS(ignore);
+  // SE and offset/local/perm/ptype are assigned inside HAND_STENCIL_LEG, so these names must not change.
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  // The first leg uses XP_RECON (plain assignment) to initialise result_*; the remaining seven accumulate.
+  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
+  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{ // Interior variant: only legs that are local or have st.same_node[DIR] set contribute; the result always overwrites out._odata[ss].
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Working variables (result_*, Chi_*, UChi_*, U_*) shared by every macro below.
+  HAND_DECLARATIONS(ignore);
+  // offset/local/perm/ptype and SE are assigned inside HAND_STENCIL_LEG_INT; result_* starts from zero because any leg may be skipped.
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{ // Interior variant of HandDhopSiteDag: same legs as HandDhopSiteInt with the P and M macros exchanged.
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Working variables (result_*, Chi_*, UChi_*, U_*) shared by every macro below.
+  HAND_DECLARATIONS(ignore);
+  // SE and offset/local/perm/ptype are assigned inside HAND_STENCIL_LEG_INT; result_* starts from zero because any leg may be skipped.
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{ // Exterior variant: only legs that are neither local nor same_node contribute, and their sum is added onto out._odata[ss].
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Working variables (result_*, Chi_*, UChi_*, U_*) shared by every macro below.
+  HAND_DECLARATIONS(ignore);
+  // HAND_STENCIL_LEG_EXT assigns SE and offset and reads ptype; local and perm are declared but not touched by it.
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  int nmu=0; // number of legs actually processed; HAND_RESULT_EXT writes nothing when it stays 0
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT_EXT(ss);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{ // Exterior variant of HandDhopSiteDag: same legs as HandDhopSiteExt with the P and M macros exchanged.
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Working variables (result_*, Chi_*, UChi_*, U_*) shared by every macro below.
+  HAND_DECLARATIONS(ignore);
+  // HAND_STENCIL_LEG_EXT assigns SE and offset and reads ptype; local and perm are declared but not touched by it.
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  int nmu=0; // number of legs actually processed; HAND_RESULT_EXT writes nothing when it stays 0
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT_EXT(ss);
+}
+
+////////////// Explicit instantiation of the six Hand* kernels for each implementation type listed below /////////////////////
+// NOTE(review): the macro already ends in ';', so each 'INSTANTIATE_THEM(X);' leaves one extra empty declaration - harmless, may warn under -pedantic.
+#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+					     int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						int ss,int sU,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionField &in, FermionField &out); 
+
+INSTANTIATE_THEM(WilsonImplF);
+INSTANTIATE_THEM(WilsonImplD);
+INSTANTIATE_THEM(ZWilsonImplF);
+INSTANTIATE_THEM(ZWilsonImplD);
+INSTANTIATE_THEM(DomainWallVec5dImplF);
+INSTANTIATE_THEM(DomainWallVec5dImplD);
+INSTANTIATE_THEM(ZDomainWallVec5dImplF);
+INSTANTIATE_THEM(ZDomainWallVec5dImplD);
+INSTANTIATE_THEM(WilsonImplFH);
+INSTANTIATE_THEM(WilsonImplDF);
+INSTANTIATE_THEM(ZWilsonImplFH);
+INSTANTIATE_THEM(ZWilsonImplDF);
+INSTANTIATE_THEM(DomainWallVec5dImplFH);
+INSTANTIATE_THEM(DomainWallVec5dImplDF);
+INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
+INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
+
+}}
--- a/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
@@ -0,0 +1,878 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+#define REGISTER /* expands to nothing; not referenced anywhere else in this file */
+
+#define LOAD_CHIMU_BODY(F)	/* copy the 12 components ref(F)(s)(c), s=0..3, c=0..2, into Chimu_sc; a variable named ref must be in scope */	\
+  Chimu_00=ref(F)(0)(0);			\
+  Chimu_01=ref(F)(0)(1);			\
+  Chimu_02=ref(F)(0)(2);			\
+  Chimu_10=ref(F)(1)(0);			\
+  Chimu_11=ref(F)(1)(1);			\
+  Chimu_12=ref(F)(1)(2);			\
+  Chimu_20=ref(F)(2)(0);			\
+  Chimu_21=ref(F)(2)(1);			\
+  Chimu_22=ref(F)(2)(2);			\
+  Chimu_30=ref(F)(3)(0);			\
+  Chimu_31=ref(F)(3)(1);			\
+  Chimu_32=ref(F)(3)(2)
+
+#define LOAD_CHIMU(DIR,F,PERM)	/* plain load of the SiteSpinor at in._odata[offset]; DIR and PERM are accepted but unused */	\
+  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
+
+#define LOAD_CHI_BODY(F)	/* copy the 6 components ref(F)(s)(c), s=0..1, c=0..2, into Chi_sc; a variable named ref must be in scope */	\
+    Chi_00 = ref(F)(0)(0);\
+    Chi_01 = ref(F)(0)(1);\
+    Chi_02 = ref(F)(0)(2);\
+    Chi_10 = ref(F)(1)(0);\
+    Chi_11 = ref(F)(1)(1);\
+    Chi_12 = ref(F)(1)(2)
+
+#define LOAD_CHI(DIR,F,PERM)	/* plain load of the SiteHalfSpinor at buf[offset]; DIR and PERM are accepted but unused */	\
+  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
+
+
+//G-parity implementations using in-place intrinsic ops (permute##PERM / exchange##PERM)
+
+//Notation below: 0/1 is the first index of ref; l/h presumably the low/high half of the SIMD word -- confirm
+//0l 0h , 1h 1l -> 0l 1h 0h,1l
+//0h,1l -> 1l,0h
+//if( (distance == 1 && !perm) || (distance == -1 && perm) )
+//Pulled fermion through forwards face, GPBC on upper component
+//Need 0= 0l 1h   1= 1l 0h
+//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
+//Pulled fermion through backwards face, GPBC on lower component
+//Need 0= 1l 0h   1= 0l 1h
+
+//1l 1h -> 1h 1l   (permute of ref(1) into tmp1)
+//0l 0h , 1h 1l -> 0l 1h 0h,1l   (exchange of ref(0) with tmp1; INTO takes the first output, tmp2)
+#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)	/* F is unused; tmp1..tmp3 are scratch and get overwritten */		\
+  permute##PERM(tmp1, ref(1)(S)(C));				\
+  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
+  INTO = tmp2;
+
+//0l 0h -> 0h 0l   (permute of ref(0) into tmp1)
+//1l 1h, 0h 0l -> 1l 0h, 1h 0l   (exchange of ref(1) with tmp1; INTO takes the first output, tmp2)
+#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)	/* F is unused; tmp1..tmp3 are scratch and get overwritten */		\
+  permute##PERM(tmp1, ref(0)(S)(C));				\
+  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
+  INTO = tmp2;
+
+
+
+
+#define LOAD_CHI_SETUP(DIR,F)	/* sets g, direction, distance, sl and inplace_twist for leg DIR; reads SE, so SE must already be set */	\
+  g = F;	/* default: load component F unchanged */							\
+  direction = st._directions[DIR];				\
+  distance = st._distances[DIR];				\
+  sl = st._grid->_simd_layout[direction];			\
+  inplace_twist = 0;						\
+  if(SE->_around_the_world && this->Params.twists[DIR % 4]){	/* entry is flagged around-the-world and the twist is enabled for DIR % 4 */	\
+    if(sl == 1){	/* simd layout is 1 in this direction: read the other F (0<->1) */		\
+      g = (F+1) % 2;							\
+    }else{	/* otherwise ask the loaders to take the permute/exchange path */	\
+      inplace_twist = 1;						\
+    }									\
+  }  
+
+#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)	/* G-parity load of the 12 Chimu_sc words from in._odata[offset] */		\
+  { const SiteSpinor &ref(in._odata[offset]);				\
+    LOAD_CHI_SETUP(DIR,F);						\
+    if(!inplace_twist){	/* plain load of component g chosen by LOAD_CHI_SETUP */	\
+      LOAD_CHIMU_BODY(g);						\
+    }else{	/* each word is built from both ref(0) and ref(1); only U_xx serve as scratch because Chimu_2x/3x are #defined to UChi_xx */	\
+      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
+	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
+	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
+      }else{	/* the remaining F/distance/perm combinations use the opposite pairing */	\
+	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
+      } \
+    } \
+  }
+
+
+#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)	/* G-parity load of the 6 Chi_sc words from buf[offset] */			\
+  { const SiteHalfSpinor &ref(buf[offset]);				\
+    LOAD_CHI_SETUP(DIR,F);						\
+    if(!inplace_twist){	/* plain load of component g chosen by LOAD_CHI_SETUP */	\
+      LOAD_CHI_BODY(g);							\
+    }else{	/* scratch here is U_xx and UChi_xx; MULT_2SPIN_BODY, which follows in the stencil-leg macros, overwrites all of them */	\
+      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
+	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
+	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
+	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
+      }else{	/* the remaining F/distance/perm combinations use the opposite pairing */	\
+	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
+	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
+      }									\
+    }									\
+  }
+
+
+#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) /* name passed as LOAD_CHI_IMPL by the G-parity specialisations */
+#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) /* name passed as LOAD_CHIMU_IMPL by the G-parity specialisations */
+
+// UChi_sc = sum_k ref()(c,k) * Chi_sk for s=0,1. To splat or not to splat depends on the implementation (Impl::loadLinkElement)
+#define MULT_2SPIN_BODY /* needs ref (the link matrix) and Impl in scope; overwrites U_00..U_21 and UChi_00..UChi_12 */ \
+  Impl::loadLinkElement(U_00,ref()(0,0));	\
+  Impl::loadLinkElement(U_10,ref()(1,0));	\
+  Impl::loadLinkElement(U_20,ref()(2,0));	\
+  Impl::loadLinkElement(U_01,ref()(0,1));	\
+  Impl::loadLinkElement(U_11,ref()(1,1));	\
+  Impl::loadLinkElement(U_21,ref()(2,1));	\
+  UChi_00 = U_00*Chi_00;			\
+  UChi_10 = U_00*Chi_10;			\
+  UChi_01 = U_10*Chi_00;			\
+  UChi_11 = U_10*Chi_10;			\
+  UChi_02 = U_20*Chi_00;			\
+  UChi_12 = U_20*Chi_10;			\
+  UChi_00+= U_01*Chi_01;			\
+  UChi_10+= U_01*Chi_11;			\
+  UChi_01+= U_11*Chi_01;			\
+  UChi_11+= U_11*Chi_11;			\
+  UChi_02+= U_21*Chi_01;			\
+  UChi_12+= U_21*Chi_11;			\
+  Impl::loadLinkElement(U_00,ref()(0,2));	/* from here U_00/U_10/U_20 hold column 2, not column 0 */	\
+  Impl::loadLinkElement(U_10,ref()(1,2));	\
+  Impl::loadLinkElement(U_20,ref()(2,2));	\
+  UChi_00+= U_00*Chi_02;			\
+  UChi_10+= U_00*Chi_12;			\
+  UChi_01+= U_10*Chi_02;			\
+  UChi_11+= U_10*Chi_12;			\
+  UChi_02+= U_20*Chi_02;			\
+  UChi_12+= U_20*Chi_12
+
+
+#define MULT_2SPIN(A,F)	/* link taken from U._odata[sU](A); F is unused */				\
+  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
+
+#define MULT_2SPIN_GPARITY(A,F)	/* link taken from U._odata[sU](F)(A), i.e. with an extra leading F index */			\
+  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
+
+
+#define PERMUTE_DIR(dir)	/* apply permute<dir> in place to each of the six Chi_sc words */		\
+      permute##dir(Chi_00,Chi_00);\
+      permute##dir(Chi_01,Chi_01);\
+      permute##dir(Chi_02,Chi_02);\
+      permute##dir(Chi_10,Chi_10);\
+      permute##dir(Chi_11,Chi_11);\
+      permute##dir(Chi_12,Chi_12);
+
+//      XP: hspin(0)=fspin(0)+timesI(fspin(3));   (Chi_sc is hspin(s), Chimu_sc is fspin(s), one line per c)
+//          hspin(1)=fspin(1)+timesI(fspin(2));
+#define XP_PROJ \
+    Chi_00 = Chimu_00+timesI(Chimu_30);\
+    Chi_01 = Chimu_01+timesI(Chimu_31);\
+    Chi_02 = Chimu_02+timesI(Chimu_32);\
+    Chi_10 = Chimu_10+timesI(Chimu_20);\
+    Chi_11 = Chimu_11+timesI(Chimu_21);\
+    Chi_12 = Chimu_12+timesI(Chimu_22);
+
+#define YP_PROJ /* hspin(0)=fspin(0)-fspin(3); hspin(1)=fspin(1)+fspin(2) */ \
+    Chi_00 = Chimu_00-Chimu_30;\
+    Chi_01 = Chimu_01-Chimu_31;\
+    Chi_02 = Chimu_02-Chimu_32;\
+    Chi_10 = Chimu_10+Chimu_20;\
+    Chi_11 = Chimu_11+Chimu_21;\
+    Chi_12 = Chimu_12+Chimu_22;
+
+#define ZP_PROJ /* hspin(0)=fspin(0)+timesI(fspin(2)); hspin(1)=fspin(1)-timesI(fspin(3)) */ \
+  Chi_00 = Chimu_00+timesI(Chimu_20);		\
+  Chi_01 = Chimu_01+timesI(Chimu_21);		\
+  Chi_02 = Chimu_02+timesI(Chimu_22);		\
+  Chi_10 = Chimu_10-timesI(Chimu_30);		\
+  Chi_11 = Chimu_11-timesI(Chimu_31);		\
+  Chi_12 = Chimu_12-timesI(Chimu_32);
+
+#define TP_PROJ /* hspin(0)=fspin(0)+fspin(2); hspin(1)=fspin(1)+fspin(3) */ \
+  Chi_00 = Chimu_00+Chimu_20;		\
+  Chi_01 = Chimu_01+Chimu_21;		\
+  Chi_02 = Chimu_02+Chimu_22;		\
+  Chi_10 = Chimu_10+Chimu_30;		\
+  Chi_11 = Chimu_11+Chimu_31;		\
+  Chi_12 = Chimu_12+Chimu_32;
+
+
+//      XM: hspin(0)=fspin(0)-timesI(fspin(3));   (Chi_sc is hspin(s), Chimu_sc is fspin(s), one line per c)
+//          hspin(1)=fspin(1)-timesI(fspin(2));
+#define XM_PROJ \
+    Chi_00 = Chimu_00-timesI(Chimu_30);\
+    Chi_01 = Chimu_01-timesI(Chimu_31);\
+    Chi_02 = Chimu_02-timesI(Chimu_32);\
+    Chi_10 = Chimu_10-timesI(Chimu_20);\
+    Chi_11 = Chimu_11-timesI(Chimu_21);\
+    Chi_12 = Chimu_12-timesI(Chimu_22);
+
+#define YM_PROJ /* hspin(0)=fspin(0)+fspin(3); hspin(1)=fspin(1)-fspin(2) */ \
+    Chi_00 = Chimu_00+Chimu_30;\
+    Chi_01 = Chimu_01+Chimu_31;\
+    Chi_02 = Chimu_02+Chimu_32;\
+    Chi_10 = Chimu_10-Chimu_20;\
+    Chi_11 = Chimu_11-Chimu_21;\
+    Chi_12 = Chimu_12-Chimu_22;
+
+#define ZM_PROJ /* hspin(0)=fspin(0)-timesI(fspin(2)); hspin(1)=fspin(1)+timesI(fspin(3)) */ \
+  Chi_00 = Chimu_00-timesI(Chimu_20);		\
+  Chi_01 = Chimu_01-timesI(Chimu_21);		\
+  Chi_02 = Chimu_02-timesI(Chimu_22);		\
+  Chi_10 = Chimu_10+timesI(Chimu_30);		\
+  Chi_11 = Chimu_11+timesI(Chimu_31);		\
+  Chi_12 = Chimu_12+timesI(Chimu_32);
+
+#define TM_PROJ /* hspin(0)=fspin(0)-fspin(2); hspin(1)=fspin(1)-fspin(3) */ \
+  Chi_00 = Chimu_00-Chimu_20;		\
+  Chi_01 = Chimu_01-Chimu_21;		\
+  Chi_02 = Chimu_02-Chimu_22;		\
+  Chi_10 = Chimu_10-Chimu_30;		\
+  Chi_11 = Chimu_11-Chimu_31;		\
+  Chi_12 = Chimu_12-Chimu_32;
+
+//      XP_RECON assigns (does not accumulate): fspin(0)=hspin(0);   with result_sc as fspin(s), UChi_sc as hspin(s)
+//      fspin(1)=hspin(1);
+//      fspin(2)=timesMinusI(hspin(1));
+//      fspin(3)=timesMinusI(hspin(0));
+#define XP_RECON\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesMinusI(UChi_10);\
+  result_21 = timesMinusI(UChi_11);\
+  result_22 = timesMinusI(UChi_12);\
+  result_30 = timesMinusI(UChi_00);\
+  result_31 = timesMinusI(UChi_01);\
+  result_32 = timesMinusI(UChi_02);
+
+#define XP_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(1)); fspin(3)-=timesI(hspin(0)) */\
+  result_00+=UChi_00;\
+  result_01+=UChi_01;\
+  result_02+=UChi_02;\
+  result_10+=UChi_10;\
+  result_11+=UChi_11;\
+  result_12+=UChi_12;\
+  result_20-=timesI(UChi_10);\
+  result_21-=timesI(UChi_11);\
+  result_22-=timesI(UChi_12);\
+  result_30-=timesI(UChi_00);\
+  result_31-=timesI(UChi_01);\
+  result_32-=timesI(UChi_02);
+
+#define XM_RECON /* assigns (no accumulate): fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesI(hspin(1)); fspin(3)=timesI(hspin(0)) */\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesI(UChi_10);\
+  result_21 = timesI(UChi_11);\
+  result_22 = timesI(UChi_12);\
+  result_30 = timesI(UChi_00);\
+  result_31 = timesI(UChi_01);\
+  result_32 = timesI(UChi_02);
+
+#define XM_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(1)); fspin(3)+=timesI(hspin(0)) */\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_10);\
+  result_21+= timesI(UChi_11);\
+  result_22+= timesI(UChi_12);\
+  result_30+= timesI(UChi_00);\
+  result_31+= timesI(UChi_01);\
+  result_32+= timesI(UChi_02);
+
+#define YP_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(1); fspin(3)-=hspin(0) */\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_10;\
+  result_21+= UChi_11;\
+  result_22+= UChi_12;\
+  result_30-= UChi_00;\
+  result_31-= UChi_01;\
+  result_32-= UChi_02;
+
+#define YM_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(1); fspin(3)+=hspin(0) */\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_10;\
+  result_21-= UChi_11;\
+  result_22-= UChi_12;\
+  result_30+= UChi_00;\
+  result_31+= UChi_01;\
+  result_32+= UChi_02;
+
+#define ZP_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(0)); fspin(3)+=timesI(hspin(1)) */\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= timesI(UChi_00);			\
+  result_21-= timesI(UChi_01);			\
+  result_22-= timesI(UChi_02);			\
+  result_30+= timesI(UChi_10);			\
+  result_31+= timesI(UChi_11);			\
+  result_32+= timesI(UChi_12);
+
+#define ZM_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(0)); fspin(3)-=timesI(hspin(1)) */\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_00);			\
+  result_21+= timesI(UChi_01);			\
+  result_22+= timesI(UChi_02);			\
+  result_30-= timesI(UChi_10);			\
+  result_31-= timesI(UChi_11);			\
+  result_32-= timesI(UChi_12);
+
+#define TP_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(0); fspin(3)+=hspin(1) */\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_00;			\
+  result_21+= UChi_01;			\
+  result_22+= UChi_02;			\
+  result_30+= UChi_10;			\
+  result_31+= UChi_11;			\
+  result_32+= UChi_12;
+
+#define TM_RECON_ACCUM /* fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(0); fspin(3)-=hspin(1) */\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_00;	\
+  result_21-= UChi_01;	\
+  result_22-= UChi_02;	\
+  result_30-= UChi_10;	\
+  result_31-= UChi_11;	\
+  result_32-= UChi_12;
+
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) /* one leg, always executed: load, link multiply, RECON */ \
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {	/* local entry: load the full spinor from in, project with PROJ, permute if flagged */	\
+    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else {	/* non-local entry: load the half spinor from buf; no PROJ or permute on this path */	\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
+  }						\
+  MULT_2SPIN_IMPL(DIR,F);			\
+  RECON;					
+
+
+#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	/* as HAND_STENCIL_LEG, but the leg is skipped unless local or st.same_node[DIR] */	\
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				\
+    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else if ( st.same_node[DIR] ) {		\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
+  }						\
+  if (local || st.same_node[DIR] ) {	/* only multiply and reconstruct when one of the loads above ran */	\
+    MULT_2SPIN_IMPL(DIR,F);			\
+    RECON;					\
+  }
+
+#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	/* handles exactly the legs HAND_STENCIL_LEG_INT skips; PROJ and LOAD_CHIMU_IMPL are unused */	\
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
+    MULT_2SPIN_IMPL(DIR,F);			\
+    RECON;					\
+    nmu++;	/* count contributing legs; HAND_RESULT_EXT tests this */				\
+  }
+
+#define HAND_RESULT(ss,F)	/* vstream all 12 result_sc words into out._odata[ss], component F (F may be empty) */		\
+  {						\
+    SiteSpinor & ref (out._odata[ss]);		\
+    vstream(ref(F)(0)(0),result_00);		\
+    vstream(ref(F)(0)(1),result_01);		\
+    vstream(ref(F)(0)(2),result_02);		\
+    vstream(ref(F)(1)(0),result_10);		\
+    vstream(ref(F)(1)(1),result_11);		\
+    vstream(ref(F)(1)(2),result_12);		\
+    vstream(ref(F)(2)(0),result_20);		\
+    vstream(ref(F)(2)(1),result_21);		\
+    vstream(ref(F)(2)(2),result_22);		\
+    vstream(ref(F)(3)(0),result_30);		\
+    vstream(ref(F)(3)(1),result_31);		\
+    vstream(ref(F)(3)(2),result_32);		\
+  }
+
+#define HAND_RESULT_EXT(ss,F)	/* add result_sc into the existing out._odata[ss], component F; expands to a bare if, so out is untouched when nmu==0 */		\
+  if (nmu){					\
+    SiteSpinor & ref (out._odata[ss]);		\
+    ref(F)(0)(0)+=result_00;		\
+    ref(F)(0)(1)+=result_01;		\
+    ref(F)(0)(2)+=result_02;		\
+    ref(F)(1)(0)+=result_10;		\
+    ref(F)(1)(1)+=result_11;		\
+    ref(F)(1)(2)+=result_12;		\
+    ref(F)(2)(0)+=result_20;		\
+    ref(F)(2)(1)+=result_21;		\
+    ref(F)(2)(2)+=result_22;		\
+    ref(F)(3)(0)+=result_30;		\
+    ref(F)(3)(1)+=result_31;		\
+    ref(F)(3)(2)+=result_32;		\
+  }
+
+
+#define HAND_DECLARATIONS(a)	/* declares the 30 Simd working variables used by the macros above; the argument a is ignored */		\
+  Simd result_00;				\
+  Simd result_01;				\
+  Simd result_02;				\
+  Simd result_10;				\
+  Simd result_11;				\
+  Simd result_12;				\
+  Simd result_20;				\
+  Simd result_21;				\
+  Simd result_22;				\
+  Simd result_30;				\
+  Simd result_31;				\
+  Simd result_32;				\
+  Simd Chi_00;					\
+  Simd Chi_01;					\
+  Simd Chi_02;					\
+  Simd Chi_10;					\
+  Simd Chi_11;					\
+  Simd Chi_12;					\
+  Simd UChi_00;					\
+  Simd UChi_01;					\
+  Simd UChi_02;					\
+  Simd UChi_10;					\
+  Simd UChi_11;					\
+  Simd UChi_12;					\
+  Simd U_00;	/* only six U words: MULT_2SPIN_BODY reloads U_00/U_10/U_20 for the third column */				\
+  Simd U_10;					\
+  Simd U_20;					\
+  Simd U_01;					\
+  Simd U_11;					\
+  Simd U_21;
+
+#define ZERO_RESULT	/* set all 12 result_sc to zero; the name zero is not defined in this file and must come from the included headers */			\
+  result_00=zero;				\
+  result_01=zero;				\
+  result_02=zero;				\
+  result_10=zero;				\
+  result_11=zero;				\
+  result_12=zero;				\
+  result_20=zero;				\
+  result_21=zero;				\
+  result_22=zero;				\
+  result_30=zero;				\
+  result_31=zero;				\
+  result_32=zero;			
+
+#define Chimu_00 Chi_00 /* no separate Chimu variables are declared: spins 0,1 share storage with Chi ... */
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 UChi_00 /* ... and spins 2,3 share storage with UChi, so a full-spinor load clobbers UChi_xx */
+#define Chimu_21 UChi_01
+#define Chimu_22 UChi_02
+#define Chimu_30 UChi_10
+#define Chimu_31 UChi_11
+#define Chimu_32 UChi_12
+
+namespace Grid {
+namespace QCD {
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// Generic hand-unrolled kernel for site ss. Permute index per direction: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S; // S and V are not used in this function
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype; // ptype goes to st.GetEntry() uninitialised -- presumably an output argument; confirm
+  StencilEntry *SE;
+
+#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) /* first leg uses XM_RECON (assignment), so result_sc needs no prior zeroing */ \
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+
+  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); // empty F: components are accessed as ref()(s)(c); the lo parameter is never used
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{ // Same eight legs as HandDhopSite with every P projector/reconstruction swapped for M and vice versa
+  typedef typename Simd::scalar_type S; // S and V are not used in this function
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype; // ptype goes to st.GetEntry() uninitialised -- presumably an output argument; confirm
+
+#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) /* first leg uses XP_RECON (assignment), so result_sc needs no prior zeroing */ \
+  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+
+  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); // empty F: components are accessed as ref()(s)(c); the lo parameter is never used
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// Only legs that are local or st.same_node[DIR] contribute. Permute index: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S; // S and V are not used in this function
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype; // ptype goes to st.GetEntry() uninitialised -- presumably an output argument; confirm
+  StencilEntry *SE;
+
+#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) /* any leg may be skipped, hence ZERO_RESULT and *_ACCUM on every leg */ \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+
+  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); // out is always written (vstream), even if every leg was skipped
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{ // As HandDhopSiteInt with the P and M projectors/reconstructions swapped
+  typedef typename Simd::scalar_type S; // S and V are not used in this function
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype; // ptype goes to st.GetEntry() uninitialised -- presumably an output argument; confirm
+
+#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	/* any leg may be skipped, hence ZERO_RESULT and *_ACCUM on every leg */			\
+  ZERO_RESULT;							\
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_RESULT(ss,F)
+  
+  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); // out is always written (vstream), even if every leg was skipped
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// Only legs that are neither local nor st.same_node[DIR] contribute. Permute index: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S; // S and V are not used in this function
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype; // ptype goes to st.GetEntry() uninitialised -- presumably an output argument; confirm
+  StencilEntry *SE;
+  int nmu=0; // number of legs that contributed; incremented by HAND_STENCIL_LEG_EXT
+
+#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) /* zero, accumulate the qualifying legs, then add into out only if nmu != 0 */ \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT_EXT(ss,F)
+
+  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); // adds to the existing contents of out (+=) rather than overwriting them
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{ // As HandDhopSiteExt with the P and M projectors/reconstructions swapped
+  typedef typename Simd::scalar_type S; // S and V are not used in this function
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype; // ptype goes to st.GetEntry() uninitialised -- presumably an output argument; confirm
+  int nmu=0; // number of legs that contributed; incremented by HAND_STENCIL_LEG_EXT
+
+#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) /* zero, accumulate the qualifying legs, then add into out only if nmu != 0 */ \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT_EXT(ss,F)
+
+  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); // adds to the existing contents of out (+=) rather than overwriting them
+}
+
+#define HAND_SPECIALISE_GPARITY(IMPL)	/* full specialisations of the six kernels for IMPL: each runs the HAND_DOP_SITE* body twice, with F=0 then F=1, using the *_GPARITY loaders */	\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+				    int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;	/* MULT_2SPIN_BODY calls Impl::loadLinkElement, so the name Impl must exist here */	\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; /* the last five are written by LOAD_CHI_SETUP */ \
+    StencilEntry *SE;							\
+    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+					    int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
+    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+						     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
+    StencilEntry *SE;							\
+    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); /* each pass begins with ZERO_RESULT */ \
+    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+							     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+						     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    StencilEntry *SE;							\
+    int nmu=0;								\
+    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    nmu = 0;	/* reset the leg counter so the F=1 write-back is decided by the F=1 legs alone */	\
+    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+							     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
+    int nmu=0;								\
+    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    nmu = 0;	/* reset the leg counter so the F=1 write-back is decided by the F=1 legs alone */	\
+    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }
+
+
+HAND_SPECIALISE_GPARITY(GparityWilsonImplF); // one set of six specialised kernels per G-parity Impl
+HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
+
+
+
+
+
+
+
+
+
+
+  
+////////////// Explicit instantiation of the six hand kernels; in this file it is applied only to the G-parity impls specialised above /////////////////////
+
+#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+					     int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						int ss,int sU,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionField &in, FermionField &out); 
+
+INSTANTIATE_THEM(GparityWilsonImplF); // same four Impl names that HAND_SPECIALISE_GPARITY is applied to above
+INSTANTIATE_THEM(GparityWilsonImplD);
+INSTANTIATE_THEM(GparityWilsonImplFH);
+INSTANTIATE_THEM(GparityWilsonImplDF);
+}}
--- a/Grid/qcd/action/fermion/WilsonTMFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.cc
@@ -0,0 +1,99 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
+
+namespace Grid {
+namespace QCD {
+
+    /*
+     * BF sequence
+     *
+      void bfmbase<Float>::MooeeInv(Fermion_t psi, 
+			       Fermion_t chi, 
+			      int dag, int cb)
+
+    double m    = this->mass;
+    double tm   = this->twistedmass;
+    double mtil = 4.0+this->mass;
+
+    double sq = mtil*mtil + tm*tm;
+
+    double a = mtil/sq;
+    double b = -tm /sq;
+    if(dag) b=-b;
+    axpibg5x(chi,psi,a,b);
+
+      void bfmbase<Float>::Mooee(Fermion_t psi, 
+			   Fermion_t chi, 
+			   int dag,int cb)
+    double a = 4.0+this->mass;
+    double b = this->twistedmass;
+    if(dag) b=-b;
+    axpibg5x(chi,psi,a,b);
+    */
+
+  template<class Impl>
+  void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+    out.checkerboard = in.checkerboard;  // result carries the input's checkerboard tag
+    RealD diag  = 4.0+this->mass;        // first coefficient handed to axpibg5x
+    RealD twist = this->mu;              // second coefficient; MooeeDag passes the negated value
+    axpibg5x(out,in,diag,twist);
+  }
+  template<class Impl>
+  void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+    out.checkerboard = in.checkerboard;  // result carries the input's checkerboard tag
+    RealD diag  = 4.0+this->mass;        // first coefficient handed to axpibg5x
+    RealD twist = -this->mu;             // second coefficient; opposite sign to Mooee
+    axpibg5x(out,in,diag,twist);
+  }
+  template<class Impl>
+  void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+    RealD tm   = this->mu;                 // twisted-mass parameter
+    RealD mtil = 4.0+this->mass;
+    RealD sq   = mtil*mtil+tm*tm;
+    RealD a    = mtil/sq;                  // a and b follow the BFM MooeeInv sequence quoted above
+    RealD b    = -tm /sq;                  // MooeeInvDag uses the opposite sign
+    out.checkerboard = in.checkerboard;    // tag the result like Mooee/MooeeDag do (replaces the unused local m)
+    axpibg5x(out,in,a,b);
+  }
+  template<class Impl>
+  void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+    RealD tm   = this->mu;                 // twisted-mass parameter
+    RealD mtil = 4.0+this->mass;
+    RealD sq   = mtil*mtil+tm*tm;
+    RealD a    = mtil/sq;                  // same a as MooeeInv
+    RealD b    = tm /sq;                   // opposite sign to MooeeInv (the BFM note flips b when dag is set)
+    out.checkerboard = in.checkerboard;    // tag the result like Mooee/MooeeDag do (replaces the unused local m)
+    axpibg5x(out,in,a,b);
+  }
+
+  FermOpTemplateInstantiate(WilsonTMFermion);
+
+}
+}
--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -0,0 +1,77 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_WILSON_TM_FERMION_H
+#define  GRID_QCD_WILSON_TM_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonFermion.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // Wilson fermion operator extended with a twisted-mass parameter mu.
+    // Only the Mooee* family is re-declared here; the rest is inherited from WilsonFermion<Impl>.
+    template<class Impl>
+    class WilsonTMFermion : public WilsonFermion<Impl>
+    {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+    public:
+
+      virtual void Instantiatable(void) {}
+
+      // Forwards the gauge field, both grids, the mass and the implementation
+      // parameters to the WilsonFermion<Impl> base and stores the twisted mass.
+      WilsonTMFermion(GaugeField            &_Umu,
+                      GridCartesian         &Fgrid,
+                      GridRedBlackCartesian &Hgrid,
+                      RealD                  _mass,
+                      RealD                  _mu,
+                      const ImplParams      &p = ImplParams())
+        : WilsonFermion<Impl>(_Umu,
+                              Fgrid, Hgrid,
+                              _mass, p),
+          mu(_mu)
+      {
+      }
+
+      // Re-declared so twisted mass (and clover) can override them; bodies live in WilsonTMFermion.cc
+      virtual void Mooee      (const FermionField &in, FermionField &out);
+      virtual void MooeeDag   (const FermionField &in, FermionField &out);
+      virtual void MooeeInv   (const FermionField &in, FermionField &out);
+      virtual void MooeeInvDag(const FermionField &in, FermionField &out);
+
+    private:
+      RealD mu; // twisted-mass parameter
+
+    };
+
+}}
+
+#endif
--- a/Grid/qcd/action/fermion/ZMobiusFermion.h
+++ b/Grid/qcd/action/fermion/ZMobiusFermion.h
@@ -0,0 +1,79 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/MobiusFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_ZMOBIUS_FERMION_H
+#define  GRID_QCD_ZMOBIUS_FERMION_H
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    // CayleyFermion5D variant whose gamma coefficients are supplied by the caller.
+    template<class Impl>
+    class ZMobiusFermion : public CayleyFermion5D<Impl>
+    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+    public:
+      virtual void   Instantiatable(void) {};
+      // Constructor.
+      //   gamma : caller-supplied coefficients; the first Ls entries are used and
+      //           std::out_of_range is thrown if fewer than Ls are provided
+      //   b, c  : forwarded unchanged to SetCoefficientsInternal
+      ZMobiusFermion(GaugeField &_Umu,
+		     GridCartesian         &FiveDimGrid,
+		     GridRedBlackCartesian &FiveDimRedBlackGrid,
+		     GridCartesian         &FourDimGrid,
+		     GridRedBlackCartesian &FourDimRedBlackGrid,
+		     RealD _mass,RealD _M5,
+		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
+      
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)
+
+      {
+	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+	// Convert the first Ls entries to Coeff_t; at() bounds-checks instead of reading past a short gamma
+	std::vector<Coeff_t> zgamma(this->Ls);
+	for(int s=0;s<this->Ls;s++){
+	  zgamma[s] = gamma.at(s);
+	}
+	// Call base setter (first argument is fixed at 1.0)
+	this->SetCoefficientsInternal(1.0,zgamma,b,c);
+      }
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/fermion/g5HermitianLinop.h
+++ b/Grid/qcd/action/fermion/g5HermitianLinop.h
@@ -0,0 +1,121 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/g5HermitianLinop.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef G5_HERMITIAN_LINOP
+#define G5_HERMITIAN_LINOP
+
+namespace Grid {
+  namespace QCD {
+
+////////////////////////////////////////////////////////////////////
+// Wrap a matrix M, exposing G5R5*M (resp. gamma5*M) through the Hermitian operator interface
+////////////////////////////////////////////////////////////////////
+// Applies the wrapped matrix and then G5R5; Op and AdjOp both route to HermOp.
+template<class Matrix,class Field>
+class Gamma5R5HermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;   // wrapped matrix, held by reference
+public:
+  Gamma5R5HermitianLinearOperator(Matrix &Mat) : _Mat(Mat) {}
+
+  // out = G5R5 applied to (M in)
+  void HermOp(const Field &in, Field &out) {
+    Field MatIn(in._grid);
+    _Mat.M(in, MatIn);
+    G5R5(out, MatIn);
+  }
+  void Op   (const Field &in, Field &out) { HermOp(in, out); }
+  void AdjOp(const Field &in, Field &out) { HermOp(in, out); }
+
+  // out = G5R5 applied to (Mdiag in)
+  void OpDiag(const Field &in, Field &out) {
+    Field DiagIn(in._grid);
+    _Mat.Mdiag(in, DiagIn);
+    G5R5(out, DiagIn);
+  }
+  // out = G5R5 applied to (Mdir in) for the given direction and displacement
+  void OpDir(const Field &in, Field &out, int dir, int disp) {
+    Field DirIn(in._grid);
+    _Mat.Mdir(in, DirIn, dir, disp);
+    G5R5(out, DirIn);
+  }
+
+  // Runs HermOp, then sets n1 = real(innerProduct(in,out)) and n2 = real(innerProduct(out,out)).
+  void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2) {
+    HermOp(in, out);
+    ComplexD inner;
+    inner = innerProduct(in, out);
+    n1 = real(inner);
+    inner = innerProduct(out, out);
+    n2 = real(inner);
+  }
+};
+
+
+// Applies the wrapped matrix and then multiplies by g5; Op and AdjOp both route to HermOp.
+template<class Matrix,class Field>
+class Gamma5HermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;   // wrapped matrix, held by reference
+  Gamma g5;       // constructed from Gamma::Algebra::Gamma5
+public:
+  Gamma5HermitianLinearOperator(Matrix &Mat) : _Mat(Mat), g5(Gamma::Algebra::Gamma5) {}
+
+  // out = g5 * (M in)
+  void HermOp(const Field &in, Field &out) {
+    Field MatIn(in._grid);
+    _Mat.M(in, MatIn);
+    out = g5 * MatIn;
+  }
+  void Op   (const Field &in, Field &out) { HermOp(in, out); }
+  void AdjOp(const Field &in, Field &out) { HermOp(in, out); }
+
+  // out = g5 * (Mdiag in)
+  void OpDiag(const Field &in, Field &out) {
+    Field DiagIn(in._grid);
+    _Mat.Mdiag(in, DiagIn);
+    out = g5 * DiagIn;
+  }
+  // out = g5 * (Mdir in) for the given direction and displacement
+  void OpDir(const Field &in, Field &out, int dir, int disp) {
+    Field DirIn(in._grid);
+    _Mat.Mdir(in, DirIn, dir, disp);
+    out = g5 * DirIn;
+  }
+
+  // Runs HermOp, then sets n1 = real(innerProduct(in,out)) and n2 = real(innerProduct(out,out)).
+  void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2) {
+    HermOp(in, out);
+    ComplexD inner;
+    inner = innerProduct(in, out);
+    n1 = real(inner);
+    inner = innerProduct(out, out);
+    n2 = real(inner);
+  }
+};
+
+
+}}
+#endif
--- a/Grid/qcd/action/gauge/Gauge.h
+++ b/Grid/qcd/action/gauge/Gauge.h
@@ -0,0 +1,70 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/Gauge_aggregate.h
+
+Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_GAUGE_H
+#define GRID_QCD_GAUGE_H
+
+#include <Grid/qcd/action/gauge/GaugeImplementations.h>
+#include <Grid/qcd/utils/WilsonLoops.h>
+#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
+#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
+
+namespace Grid {
+namespace QCD {
+
+using WilsonGaugeActionR       = WilsonGaugeAction<PeriodicGimplR>;  // R/F/D suffix mirrors the Gimpl suffix
+using WilsonGaugeActionF       = WilsonGaugeAction<PeriodicGimplF>;
+using WilsonGaugeActionD       = WilsonGaugeAction<PeriodicGimplD>;
+using PlaqPlusRectangleActionR = PlaqPlusRectangleAction<PeriodicGimplR>;
+using PlaqPlusRectangleActionF = PlaqPlusRectangleAction<PeriodicGimplF>;
+using PlaqPlusRectangleActionD = PlaqPlusRectangleAction<PeriodicGimplD>;
+using IwasakiGaugeActionR      = IwasakiGaugeAction<PeriodicGimplR>;
+using IwasakiGaugeActionF      = IwasakiGaugeAction<PeriodicGimplF>;
+using IwasakiGaugeActionD      = IwasakiGaugeAction<PeriodicGimplD>;
+using SymanzikGaugeActionR     = SymanzikGaugeAction<PeriodicGimplR>;
+using SymanzikGaugeActionF     = SymanzikGaugeAction<PeriodicGimplF>;
+using SymanzikGaugeActionD     = SymanzikGaugeAction<PeriodicGimplD>;
+
+// The same four action templates over the ConjugateGimpl* implementations
+using ConjugateWilsonGaugeActionR       = WilsonGaugeAction<ConjugateGimplR>;
+using ConjugateWilsonGaugeActionF       = WilsonGaugeAction<ConjugateGimplF>;
+using ConjugateWilsonGaugeActionD       = WilsonGaugeAction<ConjugateGimplD>;
+using ConjugatePlaqPlusRectangleActionR = PlaqPlusRectangleAction<ConjugateGimplR>;
+using ConjugatePlaqPlusRectangleActionF = PlaqPlusRectangleAction<ConjugateGimplF>;
+using ConjugatePlaqPlusRectangleActionD = PlaqPlusRectangleAction<ConjugateGimplD>;
+using ConjugateIwasakiGaugeActionR      = IwasakiGaugeAction<ConjugateGimplR>;
+using ConjugateIwasakiGaugeActionF      = IwasakiGaugeAction<ConjugateGimplF>;
+using ConjugateIwasakiGaugeActionD      = IwasakiGaugeAction<ConjugateGimplD>;
+using ConjugateSymanzikGaugeActionR     = SymanzikGaugeAction<ConjugateGimplR>;
+using ConjugateSymanzikGaugeActionF     = SymanzikGaugeAction<ConjugateGimplF>;
+using ConjugateSymanzikGaugeActionD     = SymanzikGaugeAction<ConjugateGimplD>;
+
+}}
+
+
+#endif
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@@ -0,0 +1,153 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/GaugeImpl.h
+
+Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_GAUGE_IMPL_TYPES_H
+#define GRID_GAUGE_IMPL_TYPES_H
+
+namespace Grid {
+namespace QCD {
+
+////////////////////////////////////////////////////////////////////////
+// Implementation dependent gauge types
+////////////////////////////////////////////////////////////////////////
+// INHERIT_GIMPL_TYPES: re-export GImpl's Simd, link, field and site typedefs into the enclosing scope under Gauge* names.
+#define INHERIT_GIMPL_TYPES(GImpl)                  \
+  typedef typename GImpl::Simd Simd;                \
+  typedef typename GImpl::LinkField GaugeLinkField; \
+  typedef typename GImpl::Field GaugeField;         \
+  typedef typename GImpl::ComplexField ComplexField;\
+  typedef typename GImpl::SiteField SiteGaugeField; \
+  typedef typename GImpl::SiteComplex SiteComplex;  \
+  typedef typename GImpl::SiteLink SiteGaugeLink;
+// INHERIT_FIELD_TYPES: re-export Impl's Simd, ComplexField, SiteField and Field typedefs under their own names.
+#define INHERIT_FIELD_TYPES(Impl)		    \
+  typedef typename Impl::Simd Simd;		    \
+  typedef typename Impl::ComplexField ComplexField; \
+  typedef typename Impl::SiteField SiteField;	    \
+  typedef typename Impl::Field Field;
+
+// hardcodes the exponential approximation in the template
+template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
+public:
+  typedef S Simd;
+  // Site tensor shapes: a scalar, one Nrepresentation x Nrepresentation link matrix, and an Nd-vector of link matrices
+  template <typename vtype> using iImplScalar     = iScalar<iScalar<iScalar<vtype> > >;
+  template <typename vtype> using iImplGaugeLink  = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+  template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
+  // The site shapes instantiated on Simd
+  typedef iImplScalar<Simd>     SiteComplex;
+  typedef iImplGaugeLink<Simd>  SiteLink;
+  typedef iImplGaugeField<Simd> SiteField;
+  // Lattice fields built from the site types above
+  typedef Lattice<SiteComplex> ComplexField;
+  typedef Lattice<SiteLink>    LinkField; 
+  typedef Lattice<SiteField>   Field;
+
+  // Guido: we can probably separate the types from the HMC functions
+  // this will create 2 kind of implementations
+  // probably confusing the users
+  // Now keeping only one class
+
+  // AddLink: adds W into component mu of U at every outer site (loop marked PARALLEL_FOR_LOOP).
+  // Move this elsewhere? FIXME
+  static inline void AddLink(Field &U, LinkField &W,
+                                  int mu) { // U[mu] += W
+    PARALLEL_FOR_LOOP
+    for (auto ss = 0; ss < U._grid->oSites(); ss++) {
+      U._odata[ss]._internal[mu] =
+          U._odata[ss]._internal[mu] + W._odata[ss]._internal;
+    }
+  }
+
+  ///////////////////////////////////////////////////////////
+  // Move these to another class
+  // HMC auxiliary functions
+  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) { // fills each of the Nd components of P
+    // specific for SU gauge fields
+    LinkField Pmu(P._grid);
+    Pmu = zero;
+    for (int mu = 0; mu < Nd; mu++) {
+      SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
+      PokeIndex<LorentzIndex>(P, Pmu, mu);
+    }
+  }
+  // projectForce: returns Ta(P)
+  static inline Field projectForce(Field &P) { return Ta(P); }
+  // update_field: per outer site and direction, U <- ProjectOnGroup(Exponentiate(P, ep, Nexp) * U)
+  static inline void update_field(Field& P, Field& U, double ep){
+    //static std::chrono::duration<double> diff;
+
+    //auto start = std::chrono::high_resolution_clock::now();
+    parallel_for(int ss=0;ss<P._grid->oSites();ss++){
+      for (int mu = 0; mu < Nd; mu++) 
+        U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]);
+    }
+    
+    //auto end = std::chrono::high_resolution_clock::now();
+   // diff += end - start;
+   // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
+  }
+  // FieldSquareNorm: real part of the lattice sum, over all mu, of trace(Umu * Umu)
+  static inline RealD FieldSquareNorm(Field& U){
+    LatticeComplex Hloc(U._grid);
+    Hloc = zero;
+    for (int mu = 0; mu < Nd; mu++) {
+      auto Umu = PeekIndex<LorentzIndex>(U, mu);
+      Hloc += trace(Umu * Umu);
+    }
+    Complex Hsum = sum(Hloc);
+    return Hsum.real();
+  }
+  // Hot/Tepid/Cold start configurations delegate to SU<Nc> (NOTE(review): Nc, not Nrepresentation as in generate_momenta - confirm intended)
+  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
+    SU<Nc>::HotConfiguration(pRNG, U);
+  }
+
+  static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
+    SU<Nc>::TepidConfiguration(pRNG, U);
+  }
+
+  static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
+    SU<Nc>::ColdConfiguration(pRNG, U);
+  }
+};
+
+
+using GimplTypesR = GaugeImplTypes<vComplex, Nc>;   // Nrepresentation = Nc
+using GimplTypesF = GaugeImplTypes<vComplexF, Nc>;
+using GimplTypesD = GaugeImplTypes<vComplexD, Nc>;
+// Nrepresentation = SU<Nc>::AdjointDimension
+using GimplAdjointTypesR = GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension>;
+using GimplAdjointTypesF = GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension>;
+using GimplAdjointTypesD = GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension>;
+
+
+} // QCD
+} // Grid
+
+#endif // GRID_GAUGE_IMPL_TYPES_H
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -0,0 +1,148 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/GaugeImplementations.h
+
+Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_GAUGE_IMPLEMENTATIONS_H
+#define GRID_QCD_GAUGE_IMPLEMENTATIONS_H
+
+#include "GaugeImplTypes.h"
+
+namespace Grid {
+namespace QCD {
+
+// Composition with smeared link, bc's etc.. probably need multiple inheritance
+// Variable precision "S" and variable Nc
+template <class GimplTypes> class PeriodicGaugeImpl : public GimplTypes {
+public:
+  INHERIT_GIMPL_TYPES(GimplTypes);
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Support needed for the assembly of loops including all boundary condition
+  // effects such as conjugate bcs
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // CovShiftForward / CovShiftBackward forward unchanged to the PeriodicBC helpers.
+  template <class covariant>
+  static inline Lattice<covariant>
+  CovShiftForward(const GaugeLinkField &Link, int mu,
+                  const Lattice<covariant> &field) {
+    return PeriodicBC::CovShiftForward(Link, mu, field);
+  }
+
+  template <class covariant>
+  static inline Lattice<covariant>
+  CovShiftBackward(const GaugeLinkField &Link, int mu,
+                   const Lattice<covariant> &field) {
+    return PeriodicBC::CovShiftBackward(Link, mu, field);
+  }
+  static inline GaugeLinkField
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
+    return Cshift(adj(Link), mu, -1);  // adjoint of the link, circularly shifted by -1 in mu
+  }
+  static inline GaugeLinkField
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
+    return Link;  // the link itself; mu is not used
+  }
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
+    return Cshift(Link, mu, 1);  // plain circular shift by +1 in mu
+  }
+  // Always true for this implementation (ConjugateGaugeImpl returns false).
+  static inline bool isPeriodicGaugeField(void) { return true; }
+};
+
+// Composition with smeared link, bc's etc.. probably need multiple inheritance
+// Variable precision "S" and variable Nc
+template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
+public:
+  INHERIT_GIMPL_TYPES(GimplTypes);
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Support needed for the assembly of loops including all boundary condition
+  // effects such as Gparity.
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  template <class covariant>
+  static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
+                                            const Lattice<covariant> &field) {
+    return ConjugateBC::CovShiftForward(Link, mu, field);  // forwards to the ConjugateBC helper
+  }
+
+  template <class covariant>
+  static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
+                                             const Lattice<covariant> &field) {
+    return ConjugateBC::CovShiftBackward(Link, mu, field);  // forwards to the ConjugateBC helper
+  }
+  // Adjoint of Link, complex-conjugated where the mu coordinate equals Lmu, then circularly shifted by -1 in mu.
+  static inline GaugeLinkField
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
+    GridBase *grid = Link._grid;
+    int Lmu = grid->GlobalDimensions()[mu] - 1;  // largest global coordinate in direction mu
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    GaugeLinkField tmp(grid);
+    tmp = adj(Link);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return Cshift(tmp, mu, -1); // moves towards positive mu
+  }
+  static inline GaugeLinkField
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
+    return Link;  // the link itself; mu is not used
+  }
+  // Link circularly shifted by +1 in mu, then complex-conjugated where the mu coordinate equals Lmu.
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
+    GridBase *grid = Link._grid;
+    int Lmu = grid->GlobalDimensions()[mu] - 1;  // largest global coordinate in direction mu
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    GaugeLinkField tmp(grid);
+    tmp = Cshift(Link, mu, 1);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return tmp;
+  }
+  // Always false for this implementation (PeriodicGaugeImpl returns true).
+  static inline bool isPeriodicGaugeField(void) { return false; }
+};
+
+using PeriodicGimplR = PeriodicGaugeImpl<GimplTypesR>; // Real.. whichever prec
+using PeriodicGimplF = PeriodicGaugeImpl<GimplTypesF>; // Float
+using PeriodicGimplD = PeriodicGaugeImpl<GimplTypesD>; // Double
+// Periodic implementations over the adjoint-dimension types
+using PeriodicGimplAdjR = PeriodicGaugeImpl<GimplAdjointTypesR>; // Real.. whichever prec
+using PeriodicGimplAdjF = PeriodicGaugeImpl<GimplAdjointTypesF>; // Float
+using PeriodicGimplAdjD = PeriodicGaugeImpl<GimplAdjointTypesD>; // Double
+// Conjugate implementations over the Nc types
+using ConjugateGimplR = ConjugateGaugeImpl<GimplTypesR>; // Real.. whichever prec
+using ConjugateGimplF = ConjugateGaugeImpl<GimplTypesF>; // Float
+using ConjugateGimplD = ConjugateGaugeImpl<GimplTypesD>; // Double
+
+
+}
+}
+
+#endif
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@@ -0,0 +1,417 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/action/gauge/Photon.h
+ 
+ Copyright (C) 2015
+ 
+ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PHOTON_ACTION_H
+#define QCD_PHOTON_ACTION_H
+
+namespace Grid{
+namespace QCD{
+  // Type bundle for a QED gauge field built on the SIMD type S.
+  // A link is S wrapped in three iScalar levels; the full field is an
+  // Nd-component iVector whose entries are S wrapped in two iScalar levels.
+  template <class S>
+  class QedGimpl
+  {
+  public:
+    using Simd = S;
+    
+    // Per-site tensor layouts, parameterised on the vector (SIMD) type.
+    template <typename vtype>
+    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
+    template <typename vtype>
+    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
+    
+    // Site-level objects for this SIMD type.
+    using SiteLink    = iImplGaugeLink<Simd>;
+    using SiteField   = iImplGaugeField<Simd>;
+    using SiteComplex = SiteField;
+    
+    // Lattice-wide containers of the site objects above.
+    using LinkField    = Lattice<SiteLink>;
+    using Field        = Lattice<SiteField>;
+    using ComplexField = Field;
+  };
+  
+  // QedGimpl over the vComplex SIMD type.
+  using QedGimplR = QedGimpl<vComplex>;
+  
+  // Photon field helper templated on a gauge implementation Gimpl.
+  // Declares the propagator and stochastic-field routines that are defined
+  // out-of-class further down in this header.
+  template<class Gimpl>
+  class Photon
+  {
+  public:
+    INHERIT_GIMPL_TYPES(Gimpl);
+    // Gauge selector. NOTE(review): gauge_ is stored by the constructors but
+    // none of the method bodies in this header read it - confirm intent.
+    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
+    // Scheme selector; zmSub, MomentumSpacePropagator, StochasticWeight and
+    // StochasticField all switch on it.
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);
+  public:
+    // Constructors. When omitted, improvements defaults to an empty vector
+    // and G0 to a built-in constant (see the definitions below).
+    Photon(Gauge gauge, ZmScheme zmScheme);
+    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);
+    Photon(Gauge gauge, ZmScheme zmScheme, Real G0);
+    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
+    virtual ~Photon(void) = default;
+    // Forward FFT of in, multiply by the momentum-space propagator, then
+    // backward FFT into out.
+    void FreePropagator(const GaugeField &in, GaugeField &out);
+    // out = in * (scheme-dependent momentum-space kernel).
+    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
+    // Fills weight with the scheme-dependent factor that the three-argument
+    // StochasticField multiplies the random numbers by.
+    void StochasticWeight(GaugeLinkField &weight);
+    // Computes the weight internally, then calls the overload below.
+    void StochasticField(GaugeField &out, GridParallelRNG &rng);
+    // Draws weighted random numbers per direction, backward-FFTs them and
+    // stores the real part in out.
+    void StochasticField(GaugeField &out, GridParallelRNG &rng,
+                         const GaugeLinkField &weight);
+    // Sets every Lorentz component of out to 1.
+    void UnitField(GaugeField &out);
+  private:
+    // Kernel helpers used by the public routines above.
+    void infVolPropagator(GaugeLinkField &out);
+    void invKHatSquared(GaugeLinkField &out);
+    void zmSub(GaugeLinkField &out);
+  private:
+    Gauge    gauge_;
+    ZmScheme zmScheme_;
+    // Coefficients used by zmSub in the qedL branch.
+    std::vector<Real>  improvement_;
+    // Value written at the origin site by infVolPropagator.
+    Real     G0_;
+  };
+
+  // Photon over the QedGimplR type bundle.
+  using PhotonR = Photon<QedGimplR>;
+  
+  // Photon with no improvement coefficients and the built-in G0 constant;
+  // forwards to the four-argument constructor.
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
+  : Photon(gauge, zmScheme, std::vector<Real>(),
+           0.15493339023106021408483720810737508876916113364521)
+  {}
+
+  // Photon with caller-supplied improvement coefficients and the built-in
+  // G0 constant; forwards to the four-argument constructor.
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
+                        std::vector<Real> improvements)
+  : Photon(gauge, zmScheme, improvements,
+           0.15493339023106021408483720810737508876916113364521)
+  {}
+
+  // Photon with no improvement coefficients and a caller-supplied G0;
+  // forwards to the four-argument constructor.
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
+  : Photon(gauge, zmScheme, std::vector<Real>(), G0)
+  {}
+
+  // Fully specified Photon: stores the gauge, the scheme, the improvement
+  // coefficients and G0 exactly as given.
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
+                        std::vector<Real> improvements, Real G0)
+  : gauge_(gauge),
+    zmScheme_(zmScheme),
+    improvement_(improvements),
+    G0_(G0)
+  {}
+
+  // Applies the photon propagator to in and stores the result in out:
+  //   1. forward FFT of in over all dimensions           -> in_k
+  //   2. multiply by the momentum-space propagator       -> prop_k
+  //   3. backward FFT of prop_k over all dimensions      -> out
+  template<class Gimpl>
+  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
+  {
+    FFT theFFT(in._grid);
+    
+    GaugeField in_k(in._grid);
+    GaugeField prop_k(in._grid);
+    
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    // MomentumSpacePropagator is declared as (input, output): the
+    // transformed source in_k is the input and prop_k receives the product.
+    MomentumSpacePropagator(in_k,prop_k);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+  }
+
+  // Fills out with the forward FFT of a position-space kernel equal to
+  // 1/(4 pi^2 x^2) away from the origin and G0_ at the origin. Along each
+  // direction, coordinates x >= L/2 are replaced by x - L before squaring.
+  template<class Gimpl>
+  void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
+  {
+    auto               *cgrid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd     = cgrid->_ndimension;
+    std::vector<int>   &dims  = cgrid->_fdimensions;
+    std::vector<int>   origin(nd,0);
+    LatticeReal        coord(cgrid);
+    GaugeLinkField     unity(cgrid);
+    TComplex           placeholder = Complex(1.0,0.0);
+    TComplex           originValue = Complex(G0_,0.0);
+    FFT                fft(cgrid);
+    
+    unity = Complex(1.0,0.0);
+    out   = zero;
+    // Accumulate 4 pi^2 x^2 one direction at a time.
+    for(int mu = 0; mu < nd; mu++)
+    {
+      Real halfExtent = dims[mu]/2.0;
+      
+      LatticeCoordinate(coord,mu);
+      coord = where(coord < halfExtent, coord, coord-double(dims[mu]));
+      out   = out + toComplex(4*M_PI*M_PI*coord*coord);
+    }
+    // Write 1 at the origin before inverting, then overwrite that site
+    // with G0_ once the inverse has been taken.
+    pokeSite(placeholder, out, origin);
+    out = unity/out;
+    pokeSite(originValue, out, origin);
+    fft.FFT_all_dim(out, out, FFT::forward);
+  }
+  
+  // Fills out with 1/khat^2, where khat_mu = 2 sin(pi n_mu / L_mu) is built
+  // from the site coordinate n_mu and khat^2 sums khat_mu^2 over all
+  // directions. The site with all coordinates zero is set to 0.
+  template<class Gimpl>
+  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
+  {
+    GridBase           *lattice = out._grid;
+    const unsigned int nd       = lattice->_ndimension;
+    std::vector<int>   &dims    = lattice->_fdimensions;
+    std::vector<int>   zeroMode(nd,0);
+    GaugeLinkField     khat(lattice), unity(lattice);
+    TComplex           placeholder = Complex(1.0,0.0);
+    TComplex           nought      = Complex(0.0,0.0);
+    
+    unity = Complex(1.0,0.0);
+    out   = zero;
+    for(int mu = 0; mu < nd; mu++)
+    {
+      Real twoPiOverL = M_PI*2./dims[mu];
+      
+      LatticeCoordinate(khat,mu);
+      khat = 2.*sin(.5*twoPiOverL*khat);
+      out  = out + khat*khat;
+    }
+    // Write 1 at the zero mode before inverting, then zero that site
+    // once the inverse has been taken.
+    pokeSite(placeholder, out, zeroMode);
+    out = unity/out;
+    pokeSite(nought, out, zeroMode);
+  }
+  
+  // Applies the scheme selected by zmScheme_ to the field out, in place.
+  //  - qedTL: the site with all coordinates zero is set to 0.
+  //  - qedL : for every direction except the last, coordinates >= L/2 are
+  //           shifted by -L and their squares summed into spNrm. Sites with
+  //           spNrm == 0 are set to 0; then, for each index i of
+  //           improvement_, sites with spNrm == i+1 are scaled by
+  //           sqrt(improvement_[i] + 1).
+  //  - any other value: out is left untouched.
+  template<class Gimpl>
+  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
+  {
+    GridBase           *grid = out._grid;
+    const unsigned int nd    = grid->_ndimension;
+    std::vector<int>   &l    = grid->_fdimensions;
+    
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      {
+        std::vector<int> zm(nd,0);
+        TComplex         Tzero = Complex(0.0,0.0);
+        
+        pokeSite(Tzero, out, zm);
+        
+        break;
+      }
+      case ZmScheme::qedL:
+      {
+        LatticeInteger spNrm(grid), coor(grid);
+        
+        spNrm = zero;
+        for(int d = 0; d < grid->_ndimension - 1; d++)
+        {
+          LatticeCoordinate(coor,d);
+          coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
+          spNrm = spNrm + coor*coor;
+        }
+        out = where(spNrm == Integer(0), 0.*out, out);
+
+        // IR improvement
+        for(unsigned int i = 0; i < improvement_.size(); i++)
+        {
+          Real f = sqrt(improvement_[i]+1);
+          out = where(spNrm == Integer(i+1), f*out, out);
+        }
+        // Explicit break: do not rely on falling through into default.
+        break;
+      }
+      default:
+        break;
+    }
+  }
+
+  // Multiplies in by a scheme-dependent momentum-space kernel and writes
+  // the product to out.
+  //  - qedTL / qedL: kernel = invKHatSquared followed by zmSub.
+  //  - qedInf      : kernel = infVolPropagator.
+  // NOTE(review): for any other scheme the kernel is never assigned before
+  // the multiplication - confirm whether that case can occur.
+  template<class Gimpl>
+  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
+                                               GaugeField &out)
+  {
+    GridBase       *lattice = out._grid;
+    LatticeComplex kernel(lattice);
+    
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      case ZmScheme::qedL:
+        invKHatSquared(kernel);
+        zmSub(kernel);
+        break;
+      case ZmScheme::qedInf:
+        infVolPropagator(kernel);
+        break;
+      default:
+        break;
+    }
+    
+    out = in*kernel;
+  }
+  
+  // Fills weight with the scheme-dependent factor used to scale the random
+  // numbers in StochasticField.
+  //  - qedTL / qedL: sqrt(volume) * sqrt(invKHatSquared), then zmSub, where
+  //                  volume is the product of the _fdimensions entries.
+  //  - qedInf      : sqrt(real(infVolPropagator)).
+  //  - otherwise   : weight is left untouched.
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
+  {
+    auto               *cgrid = dynamic_cast<GridCartesian *>(weight._grid);
+    const unsigned int nd     = cgrid->_ndimension;
+    std::vector<int>   dims   = cgrid->_fdimensions;
+    
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      case ZmScheme::qedL:
+      {
+        // Product of all lattice extents.
+        Integer volume = 1;
+        for(int d = 0; d < nd; d++)
+        {
+          volume = volume * dims[d];
+        }
+        invKHatSquared(weight);
+        weight = sqrt(volume)*sqrt(weight);
+        zmSub(weight);
+        break;
+      }
+      case ZmScheme::qedInf:
+      {
+        infVolPropagator(weight);
+        weight = sqrt(real(weight));
+        break;
+      }
+      default:
+        break;
+    }
+  }
+  
+  // Convenience overload: computes the stochastic weight on out's grid and
+  // forwards to the three-argument StochasticField.
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  {
+    auto           *cgrid = dynamic_cast<GridCartesian *>(out._grid);
+    GaugeLinkField w(cgrid);
+    
+    StochasticWeight(w);
+    StochasticField(out, rng, w);
+  }
+  
+  // Generates a stochastic field in out from the given weight.
+  // For every direction mu a random link field is drawn, multiplied by
+  // weight and stored as component mu of a momentum-space field:
+  //  - qedTL / qedL: gaussian numbers.
+  //  - qedInf      : bernoulli numbers r mapped to 2*r - (1 + i).
+  // The result is backward-FFT'd over all dimensions and its real part is
+  // written to out.
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+                                      const GaugeLinkField &weight)
+  {
+    auto               *cgrid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd     = cgrid->_ndimension;
+    GaugeLinkField     noise(cgrid);
+    GaugeField         fieldK(cgrid);
+    FFT                fft(cgrid);
+    
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      case ZmScheme::qedL:
+      {
+        for(int mu = 0; mu < nd; mu++)
+        {
+          gaussian(rng, noise);
+          noise = weight*noise;
+          pokeLorentz(fieldK, noise, mu);
+        }
+        break;
+      }
+      case ZmScheme::qedInf:
+      {
+        Complex offset(1., 1.); // This needs to be a GaugeLink element?
+        for(int mu = 0; mu < nd; mu++)
+        {
+          bernoulli(rng, noise);
+          noise = weight*(2.*noise - offset);
+          pokeLorentz(fieldK, noise, mu);
+        }
+        break;
+      }
+      default:
+        break;
+    }
+
+    fft.FFT_all_dim(out, fieldK, FFT::backward);
+    
+    out = real(out);
+  }
+
+  // Sets every Lorentz component of out to the constant 1, then keeps the
+  // real part.
+  template<class Gimpl>
+  void Photon<Gimpl>::UnitField(GaugeField &out)
+  {
+    auto               *cgrid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd     = cgrid->_ndimension;
+    GaugeLinkField     unity(cgrid);
+    
+    unity = Complex(1.0,0.0);
+
+    for(int mu = 0; mu < nd; mu++)
+    {
+      pokeLorentz(out, unity, mu);
+    }
+    
+    out = real(out);
+  }
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
+//                                                            const GaugeField &in)
+//  {
+//    
+//    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
+//    
+//    GridBase *grid = out._grid;
+//    LatticeInteger     coor(grid);
+//    GaugeField zz(grid); zz=zero;
+//    
+//    // xyzt
+//    for(int d = 0; d < grid->_ndimension-1;d++){
+//      LatticeCoordinate(coor,d);
+//      out = where(coor==Integer(0),zz,out);
+//    }
+//  }
+//  
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
+//                                                             const GaugeField &in)
+//  {
+//    
+//    // what type LatticeComplex
+//    GridBase *grid = out._grid;
+//    int nd = grid->_ndimension;
+//    
+//    typedef typename GaugeField::vector_type vector_type;
+//    typedef typename GaugeField::scalar_type ScalComplex;
+//    typedef Lattice<iSinglet<vector_type> > LatComplex;
+//    
+//    std::vector<int> latt_size   = grid->_fdimensions;
+//    
+//    LatComplex denom(grid); denom= zero;
+//    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
+//    LatComplex   kmu(grid);
+//    
+//    ScalComplex ci(0.0,1.0);
+//    // momphase = n * 2pi / L
+//    for(int mu=0;mu<Nd;mu++) {
+//      
+//      LatticeCoordinate(kmu,mu);
+//      
+//      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+//      
+//      kmu = TwoPiL * kmu ;
+//      
+//      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
+//    }
+//    std::vector<int> zero_mode(nd,0);
+//    TComplexD Tone = ComplexD(1.0,0.0);
+//    TComplexD Tzero= ComplexD(0.0,0.0);
+//    
+//    pokeSite(Tone,denom,zero_mode);
+//    
+//    denom= one/denom;
+//    
+//    pokeSite(Tzero,denom,zero_mode);
+//    
+//    out = zero;
+//    out = in*denom;
+//  };
+  
+}}
+#endif
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@@ -0,0 +1,152 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/gauge/PlaqPlusRectangleAction.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PLAQ_PLUS_RECTANGLE_ACTION_H
+#define QCD_PLAQ_PLUS_RECTANGLE_ACTION_H
+
+namespace Grid{
+  namespace QCD{
+    
+    ////////////////////////////////////////////////////////////////////////
+    // PlaqPlusRectangleAction
+    ////////////////////////////////////////////////////////////////////////
+    // Gauge action combining the average plaquette and the average rectangle
+    // with independent coefficients c_plaq and c_rect.
+    template<class Gimpl>
+    class PlaqPlusRectangleAction : public Action<typename Gimpl::GaugeField> {
+    public:
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+
+    private:
+      RealD c_plaq; // coefficient of the plaquette term
+      RealD c_rect; // coefficient of the rectangle term
+
+    public:
+      // b is the plaquette coefficient, c the rectangle coefficient.
+      PlaqPlusRectangleAction(RealD b, RealD c) : c_plaq(b), c_rect(c) {}
+
+      virtual std::string action_name() { return "PlaqPlusRectangleAction"; }
+
+      // noop as no pseudoferms
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {}
+
+      // One log line per coefficient, each prefixed with the action name.
+      virtual std::string LogParameters() {
+        std::stringstream out;
+        out << GridLogMessage << "["<<action_name() <<"] c_plaq: " << c_plaq << std::endl;
+        out << GridLogMessage << "["<<action_name() <<"] c_rect: " << c_rect << std::endl;
+        return out.str();
+      }
+
+      // Action value: plaquette term plus rectangle term, each scaled by
+      // Nd*(Nd-1) and the global site count (the plaquette term by a
+      // further factor 0.5).
+      virtual RealD S(const GaugeField &U) {
+        RealD vol = U._grid->gSites();
+
+        RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+        RealD rect = WilsonLoops<Gimpl>::avgRectangle(U);
+
+        return c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5
+              +c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol;
+      }
+
+      // Derivative of the action with respect to the gauge field, written
+      // into dSdU one Lorentz direction at a time.
+      virtual void deriv(const GaugeField &Umu, GaugeField & dSdU) {
+        //extend Ta to include Lorentz indexes
+        RealD plaqFactor = c_plaq/RealD(Nc)*0.5;
+        RealD rectFactor = c_rect/RealD(Nc)*0.5;
+
+        GridBase *grid = Umu._grid;
+
+        // Per-direction links and their RectStapleDouble partners.
+        std::vector<GaugeLinkField> U (Nd,grid);
+        std::vector<GaugeLinkField> U2(Nd,grid);
+
+        for (int mu = 0; mu < Nd; mu++) {
+          U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+          WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
+        }
+
+        GaugeLinkField force(grid);
+        GaugeLinkField staple(grid);
+
+        for (int mu = 0; mu < Nd; mu++) {
+          // Plaquette contribution: staple in direction mu.
+          WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
+          force = Ta(U[mu]*staple)*plaqFactor;
+
+          // Rectangle contribution, reusing the same staple buffer.
+          WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
+          force = force + Ta(U[mu]*staple)*rectFactor;
+
+          PokeIndex<LorentzIndex>(dSdU, force, mu);
+        }
+      }
+
+    };
+
+    // Convenience for common physically defined cases.
+    //
+    // RBC c1 parameterisation is not really RBC but don't have good
+    // reference and we are happy to change name if prior use of this plaq coeff
+    // parameterisation is made known to us. 
+    // PlaqPlusRectangleAction parameterised by (beta, c1):
+    // plaquette coefficient beta*(1 - 8*c1), rectangle coefficient beta*c1.
+    template<class Gimpl>
+    class RBCGaugeAction : public PlaqPlusRectangleAction<Gimpl> {
+    public:
+      INHERIT_GIMPL_TYPES(Gimpl);
+
+      RBCGaugeAction(RealD beta, RealD c1)
+        : PlaqPlusRectangleAction<Gimpl>(beta*(1.0-8.0*c1), beta*c1) {}
+
+      std::string action_name() override { return "RBCGaugeAction"; }
+    };
+
+    // RBCGaugeAction with c1 fixed to -0.331.
+    template<class Gimpl>
+    class IwasakiGaugeAction : public RBCGaugeAction<Gimpl> {
+    public:
+      INHERIT_GIMPL_TYPES(Gimpl);
+
+      IwasakiGaugeAction(RealD beta)
+        : RBCGaugeAction<Gimpl>(beta,-0.331) {}
+
+      std::string action_name() override { return "IwasakiGaugeAction"; }
+    };
+
+    // RBCGaugeAction with c1 fixed to -1/12.
+    template<class Gimpl>
+    class SymanzikGaugeAction : public RBCGaugeAction<Gimpl> {
+    public:
+      INHERIT_GIMPL_TYPES(Gimpl);
+
+      SymanzikGaugeAction(RealD beta)
+        : RBCGaugeAction<Gimpl>(beta,-1.0/12.0) {}
+
+      std::string action_name() override { return "SymanzikGaugeAction"; }
+    };
+
+    // RBCGaugeAction with c1 fixed to -1.4067.
+    template<class Gimpl>
+    class DBW2GaugeAction : public RBCGaugeAction<Gimpl> {
+    public:
+      INHERIT_GIMPL_TYPES(Gimpl);
+
+      DBW2GaugeAction(RealD beta)
+        : RBCGaugeAction<Gimpl>(beta,-1.4067) {}
+
+      std::string action_name() override { return "DBW2GaugeAction"; }
+    };
+
+  }
+}
+
+#endif
--- a/Grid/qcd/action/gauge/WilsonGaugeAction.h
+++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h
@@ -0,0 +1,95 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_WILSON_GAUGE_ACTION_H
+#define QCD_WILSON_GAUGE_ACTION_H
+
+namespace Grid {
+namespace QCD {
+
+////////////////////////////////////////////////////////////////////////
+// Wilson Gauge Action .. should I template the Nc etc..
+////////////////////////////////////////////////////////////////////////
+// Plaquette-only gauge action with a single coupling beta.
+template <class Gimpl>
+class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  /////////////////////////// constructors
+  explicit WilsonGaugeAction(RealD beta_) : beta(beta_) {}
+
+  virtual std::string action_name() { return "WilsonGaugeAction"; }
+
+  // Single log line reporting beta.
+  virtual std::string LogParameters() {
+    std::stringstream out;
+    out << GridLogMessage << "[WilsonGaugeAction] Beta: " << beta << std::endl;
+    return out.str();
+  }
+
+  // noop as no pseudoferms
+  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {}
+
+  // Action value: beta * (1 - average plaquette) * Nd*(Nd-1)/2 * global
+  // site count.
+  virtual RealD S(const GaugeField &U) {
+    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    RealD vol = U._grid->gSites();
+    return beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5;
+  }
+
+  // Derivative of the action with respect to the gauge field, written into
+  // dSdU one Lorentz direction at a time.
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
+    // not optimal implementation FIXME
+    // extend Ta to include Lorentz indexes
+
+    RealD prefactor = 0.5 * beta / RealD(Nc);
+
+    GaugeLinkField link(U._grid);
+    GaugeLinkField force(U._grid);
+    for (int mu = 0; mu < Nd; mu++) {
+      link = PeekIndex<LorentzIndex>(U, mu);
+
+      // Staple in direction mu; force doubles as the staple buffer.
+      WilsonLoops<Gimpl>::Staple(force, U, mu);
+      force = Ta(link * force) * prefactor;
+
+      PokeIndex<LorentzIndex>(dSdU, force, mu);
+    }
+  }
+
+ private:
+  RealD beta;
+};
+
+
+
+}
+}
+
+#endif
--- a/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+++ b/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
@@ -0,0 +1,145 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
+#define QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
+
+namespace Grid{
+  namespace QCD{
+
+    // Base even odd HMC on the normal Mee based schur decomposition.
+    //
+    //     M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
+    //         (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
+    //
+    // Determinant is det of middle factor
+    // This assumes Mee is indept of U.
+    //
+    // Extends SchurDiagMooeeOperator with the gauge-field derivative (force)
+    // of the preconditioned operator (MpcDeriv) and of its dagger
+    // (MpcDagDeriv). Both routines build an even and an odd checkerboard
+    // force, merge them into Force and negate the result.
+    template<class Impl>
+    class SchurDifferentiableOperator :  public SchurDiagMooeeOperator<FermionOperator<Impl>,typename Impl::FermionField> 
+      {
+      public:
+      INHERIT_IMPL_TYPES(Impl);
+
+        typedef FermionOperator<Impl> Matrix;
+
+        SchurDifferentiableOperator (Matrix &Mat) : SchurDiagMooeeOperator<Matrix,FermionField>(Mat) {};
+
+        // Force from the un-daggered operator.
+        // U and V must both live on the fermion red-black grid with Odd
+        // checkerboard; Force is overwritten.
+        void MpcDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
+        
+          GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
+
+          FermionField tmp1(fcbgrid);
+          FermionField tmp2(fcbgrid);
+
+          conformable(fcbgrid,U._grid);
+          conformable(fcbgrid,V._grid);
+
+          // Assert the checkerboard?? or code for either
+          assert(U.checkerboard==Odd);
+          assert(V.checkerboard==U.checkerboard);
+
+          // NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
+          // it is not conformable with the HMC force field
+          // Case: Ls vectorised fields
+          // INHERIT FROM THE Force field instead
+          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
+          GaugeField ForceO(forcecb);
+          GaugeField ForceE(forcecb);
+
+
+          //  X^dag Der_oe MeeInv Meo Y
+          // Use Mooee as nontrivial but gauge field indept
+          this->_Mat.Meooe   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
+          this->_Mat.MooeeInv(tmp1,tmp2);   // even->even 
+          this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerNo);
+          //  Accumulate X^dag M_oe MeeInv Der_eo Y
+          this->_Mat.MeooeDag   (U,tmp1);    // even->odd -- implicit -0.5 factor to be applied
+          this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even 
+          this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerNo);
+          
+          assert(ForceE.checkerboard==Even);
+          assert(ForceO.checkerboard==Odd);
+
+          // Merge both checkerboards into the full force, then flip the sign.
+          setCheckerboard(Force,ForceE); 
+          setCheckerboard(Force,ForceO);
+          Force=-Force;
+
+          delete forcecb;
+        }
+
+
+        // Force from the daggered operator.
+        // Same preconditions as MpcDeriv: U and V must both live on the
+        // fermion red-black grid with Odd checkerboard; Force is overwritten.
+        void MpcDagDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
+        
+          GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
+
+          FermionField tmp1(fcbgrid);
+          FermionField tmp2(fcbgrid);
+
+          conformable(fcbgrid,U._grid);
+          conformable(fcbgrid,V._grid);
+
+          // Assert the checkerboard?? or code for either
+          // Both inputs are checked, matching MpcDeriv.
+          assert(V.checkerboard==Odd);
+          assert(U.checkerboard==V.checkerboard);
+
+          // NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
+          // it is not conformable with the HMC force field
+          // INHERIT FROM THE Force field instead
+          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
+          GaugeField ForceO(forcecb);
+          GaugeField ForceE(forcecb);
+
+          //  X^dag Der_oe MeeInv Meo Y
+          // Use Mooee as nontrivial but gauge field indept
+          this->_Mat.MeooeDag   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
+          this->_Mat.MooeeInvDag(tmp1,tmp2);   // even->even 
+          this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
+          
+          //  Accumulate X^dag M_oe MeeInv Der_eo Y
+          this->_Mat.Meooe   (U,tmp1);    // even->odd -- implicit -0.5 factor to be applied
+          this->_Mat.MooeeInv(tmp1,tmp2); // even->even 
+          this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
+
+          assert(ForceE.checkerboard==Even);
+          assert(ForceO.checkerboard==Odd);
+
+          // Merge both checkerboards into the full force, then flip the sign.
+          setCheckerboard(Force,ForceE); 
+          setCheckerboard(Force,ForceO);
+          Force=-Force;
+
+          delete forcecb;
+        }
+
+    };
+
+  }
+}
+#endif
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -0,0 +1,264 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+/////////////////////////////////////////////////////////////////
+// Implementation of exact one flavour algorithm (EOFA)         //
+// using fermion classes defined in:                           //
+//    Grid/qcd/action/fermion/DomainWallEOFAFermion.h (Shamir) //
+//    Grid/qcd/action/fermion/MobiusEOFAFermion.h (Mobius)     //
+// arXiv: 1403.1683, 1706.05843                                //
+/////////////////////////////////////////////////////////////////
+
+#ifndef QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
+#define QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
+
+namespace Grid{
+namespace QCD{
+
+  ///////////////////////////////////////////////////////////////
+  // Exact one flavour implementation of DWF determinant ratio //
+  ///////////////////////////////////////////////////////////////
+
+  template<class Impl>
+  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
+  {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+      typedef OneFlavourRationalParams Params;
+      Params param;                    // copy of the rational-approximation settings given to the constructor
+      MultiShiftFunction PowerNegHalf; // norm/residues/poles of the x^(-1/2) approximation, filled in the constructor
+
+    private:
+      bool use_heatbath_forecasting;  // if true, refresh() seeds each pole solve with a ChronoForecast guess
+      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
+      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
+      SchurRedBlackDiagMooeeSolve<FermionField> Solver;
+      FermionField Phi; // the pseudofermion field for this trajectory
+
+    public:
+      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, AbstractEOFAFermion<Impl>& _Rop,
+        OperatorFunction<FermionField>& S, Params& p, bool use_fc=false) : param(p), use_heatbath_forecasting(use_fc),
+        Lop(_Lop), Rop(_Rop), Solver(S), Phi(_Lop.FermionGrid())
+      { // the initialiser list above follows member declaration order, which is the order C++ really uses
+        AlgRemez remez(param.lo, param.hi, param.precision);
+
+        // MdagM^(+- 1/2)
+        std::cout << GridLogMessage << "Generating degree " << param.degree << " for x^(-1/2)" << std::endl;
+        remez.generateApprox(param.degree, 1, 2);
+        PowerNegHalf.Init(remez, param.tolerance, true);
+      };
+
+      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
+
+      virtual std::string LogParameters() {
+        std::stringstream sstream;
+        sstream << GridLogMessage << "[" << action_name() << "] Low            :" << param.lo << std::endl;
+        sstream << GridLogMessage << "[" << action_name() << "] High           :" << param.hi << std::endl;
+        sstream << GridLogMessage << "[" << action_name() << "] Max iterations :" << param.MaxIter << std::endl;
+        sstream << GridLogMessage << "[" << action_name() << "] Tolerance      :" << param.tolerance << std::endl;
+        sstream << GridLogMessage << "[" << action_name() << "] Degree         :" << param.degree << std::endl;
+        sstream << GridLogMessage << "[" << action_name() << "] Precision      :" << param.precision << std::endl;
+        return sstream.str();
+      }
+
+      // Spin projection: for every s in [0,Ls) write P_{+} in (sign == 1) or P_{-} in (any other sign) to out
+      void spProj(const FermionField& in, FermionField& out, int sign, int Ls)
+      {
+        if(sign == 1){ for(int s=0; s<Ls; ++s){ axpby_ssp_pplus(out, 0.0, in, 1.0, in, s, s); } }
+        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
+      }
+
+      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
+      // We generate a Gaussian noise vector \eta, and then compute
+      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
+      // using a rational approximation to the inverse square root
+      virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
+      {
+        Lop.ImportGauge(U);
+        Rop.ImportGauge(U);
+
+        FermionField eta         (Lop.FermionGrid());
+        FermionField CG_src      (Lop.FermionGrid());
+        FermionField CG_soln     (Lop.FermionGrid());
+        FermionField Forecast_src(Lop.FermionGrid());
+        std::vector<FermionField> tmp(2, Lop.FermionGrid());
+
+        // Use chronological inverter to forecast solutions across poles
+        std::vector<FermionField> prev_solns;
+        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
+        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
+
+        // Seed with Gaussian noise vector (var = 0.5)
+        RealD scale = std::sqrt(0.5);
+        gaussian(pRNG,eta);
+        eta = eta * scale;
+        printf("Heatbath source vector: <\\eta|\\eta> = %1.15e\n", norm2(eta));
+
+        // \Phi = ( \alpha_{0} + \sum_{l=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta,  \gamma_{l} = 1/(1+pole_{l})
+        RealD N(PowerNegHalf.norm);
+        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
+        Phi = eta * N;
+
+        // LH terms (each pole l is weighted by \alpha_{l} \gamma_{l}^{2} k in the loop below):
+        // \Phi = \Phi + k \sum_{l=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
+        //          - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
+        RealD gamma_l(0.0);
+        spProj(eta, tmp[0], -1, Lop.Ls);
+        Lop.Omega(tmp[0], tmp[1], -1, 0);
+        G5R5(CG_src, tmp[1]);
+        tmp[1] = zero;
+        for(int k=0; k<param.degree; ++k){
+          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
+          Lop.RefreshShiftCoefficients(-gamma_l);
+          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
+            Lop.Mdag(CG_src, Forecast_src);
+            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
+            Solver(Lop, CG_src, CG_soln);
+            prev_solns.push_back(CG_soln);
+          } else {
+            CG_soln = zero; // Just use zero as the initial guess
+            Solver(Lop, CG_src, CG_soln);
+          }
+          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
+          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
+        }
+        Lop.Omega(tmp[1], tmp[0], -1, 1);
+        spProj(tmp[0], tmp[1], -1, Lop.Ls);
+        Phi = Phi + tmp[1];
+
+        // RH terms (each pole l is weighted by \alpha_{l} \gamma_{l}^{2} k in the loop below):
+        // \Phi = \Phi - k \sum_{l=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
+        //          + \gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
+        spProj(eta, tmp[0], 1, Rop.Ls);
+        Rop.Omega(tmp[0], tmp[1], 1, 0);
+        G5R5(CG_src, tmp[1]);
+        tmp[1] = zero;
+        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
+        for(int k=0; k<param.degree; ++k){
+          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
+          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
+          if(use_heatbath_forecasting){
+            Rop.Mdag(CG_src, Forecast_src);
+            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
+            Solver(Rop, CG_src, CG_soln);
+            prev_solns.push_back(CG_soln);
+          } else {
+            CG_soln = zero;
+            Solver(Rop, CG_src, CG_soln);
+          }
+          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
+          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
+        }
+        Rop.Omega(tmp[1], tmp[0], 1, 1);
+        spProj(tmp[0], tmp[1], 1, Rop.Ls);
+        Phi = Phi + tmp[1];
+
+        // Reset shift coefficients for energy and force evals
+        Lop.RefreshShiftCoefficients(0.0);
+        Rop.RefreshShiftCoefficients(-1.0);
+      };
+
+      // EOFA action: see Eqn. (10) of arXiv:1706.05843
+      virtual RealD S(const GaugeField& U)
+      {
+        Lop.ImportGauge(U);
+        Rop.ImportGauge(U);
+
+        FermionField spProj_Phi(Lop.FermionGrid());
+        std::vector<FermionField> tmp(2, Lop.FermionGrid());
+
+        // S = <\Phi|\Phi>
+        RealD action(norm2(Phi));
+
+        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
+        spProj(Phi, spProj_Phi, -1, Lop.Ls);
+        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
+        G5R5(tmp[1], tmp[0]);
+        tmp[0] = zero;
+        Solver(Lop, tmp[1], tmp[0]);
+        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
+        Lop.Omega(tmp[1], tmp[0], -1, 1);
+        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
+
+        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
+        spProj(Phi, spProj_Phi, 1, Rop.Ls);
+        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
+        G5R5(tmp[1], tmp[0]);
+        tmp[0] = zero;
+        Solver(Rop, tmp[1], tmp[0]);
+        Rop.Dtilde(tmp[0], tmp[1]); // transform back from the Cayley preconditioned system, as for the LH term
+        Rop.Omega(tmp[1], tmp[0], 1, 1);
+        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
+
+        return action;
+      };
+
+      // EOFA pseudofermion force: see Eqns. (34)-(36) of arXiv:1706.05843
+      virtual void deriv(const GaugeField& U, GaugeField& dSdU)
+      {
+        Lop.ImportGauge(U);
+        Rop.ImportGauge(U);
+
+        FermionField spProj_Phi      (Lop.FermionGrid());
+        FermionField Omega_spProj_Phi(Lop.FermionGrid());
+        FermionField CG_src          (Lop.FermionGrid());
+        FermionField Chi             (Lop.FermionGrid());
+        FermionField g5_R5_Chi       (Lop.FermionGrid());
+
+        GaugeField force(Lop.GaugeGrid());
+
+        // LH: dSdU = k \chi_{L}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{L}
+        //     \chi_{L} = H(mf)^{-1} \Omega_{-} P_{-} \Phi
+        spProj(Phi, spProj_Phi, -1, Lop.Ls);
+        Lop.Omega(spProj_Phi, Omega_spProj_Phi, -1, 0);
+        G5R5(CG_src, Omega_spProj_Phi);
+        spProj_Phi = zero; // reused as the (zero) initial guess and solution buffer for the solve
+        Solver(Lop, CG_src, spProj_Phi);
+        Lop.Dtilde(spProj_Phi, Chi);
+        G5R5(g5_R5_Chi, Chi);
+        Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
+        dSdU = Lop.k * force;
+
+        // RH: dSdU = dSdU - k \chi_{R}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{R}
+        //     \chi_{R} = ( H(mb) - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \Phi
+        spProj(Phi, spProj_Phi, 1, Rop.Ls);
+        Rop.Omega(spProj_Phi, Omega_spProj_Phi, 1, 0);
+        G5R5(CG_src, Omega_spProj_Phi);
+        spProj_Phi = zero; // reused as the (zero) initial guess and solution buffer for the solve
+        Solver(Rop, CG_src, spProj_Phi);
+        Rop.Dtilde(spProj_Phi, Chi);
+        G5R5(g5_R5_Chi, Chi);
+        Rop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo); // RH term: differentiate with the RH operator, like every step above
+        dSdU = dSdU - Rop.k * force;
+      };
+  };
+}}
+
+#endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@@ -0,0 +1,228 @@
+
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
+
+namespace Grid {
+namespace QCD {
+
+///////////////////////////////////////
+// One flavour rational
+///////////////////////////////////////
+
+// S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
+//
+// Here, M is some operator
+// N and D make up the rational polynomial approximation
+//
+
+template <class Impl>
+class OneFlavourEvenOddRationalPseudoFermionAction
+    : public Action<typename Impl::GaugeField> {
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+
+  typedef OneFlavourRationalParams Params;
+  Params param;  // copy of the rational-approximation settings given to the constructor
+
+  MultiShiftFunction PowerHalf;
+  MultiShiftFunction PowerNegHalf;
+  MultiShiftFunction PowerQuarter;
+  MultiShiftFunction PowerNegQuarter;
+
+ private:
+  FermionOperator<Impl> &FermOp;  // the basic operator
+
+  // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us
+  // historically
+  // and hasenbusch works better
+
+  FermionField PhiEven;  // the pseudo fermion field for this trajectory
+  FermionField PhiOdd;   // the pseudo fermion field for this trajectory
+
+ public:
+  OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl> &Op,
+                                               Params &p)
+      : param(p),  // listed in member declaration order, which is the order C++ really uses
+        FermOp(Op),
+        PhiEven(Op.FermionRedBlackGrid()),
+        PhiOdd(Op.FermionRedBlackGrid()) {
+    AlgRemez remez(param.lo, param.hi, param.precision);
+
+    // MdagM^(+- 1/2)
+    std::cout << GridLogMessage << "Generating degree " << param.degree
+              << " for x^(1/2)" << std::endl;
+    remez.generateApprox(param.degree, 1, 2);
+    PowerHalf.Init(remez, param.tolerance, false);
+    PowerNegHalf.Init(remez, param.tolerance, true);
+
+    // MdagM^(+- 1/4)
+    std::cout << GridLogMessage << "Generating degree " << param.degree
+              << " for x^(1/4)" << std::endl;
+    remez.generateApprox(param.degree, 1, 4);
+    PowerQuarter.Init(remez, param.tolerance, false);
+    PowerNegQuarter.Init(remez, param.tolerance, true);
+  };
+
+  virtual std::string action_name(){return "OneFlavourEvenOddRationalPseudoFermionAction";}
+
+  virtual std::string LogParameters(){
+    std::stringstream sstream;
+    sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
+    sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
+    sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
+    sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
+    sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
+    sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
+    return sstream.str();
+  }
+  
+  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
+    //        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
+    // Phi = MpcdagMpc^{1/4} eta
+    //
+    // P(eta) = e^{- eta^dag eta}
+    //
+    // e^{x^2/2 sig^2} => sig^2 = 0.5.
+    //
+    // So eta should be of width sig = 1/sqrt(2).
+
+    RealD scale = std::sqrt(0.5);
+
+    FermionField eta(FermOp.FermionGrid());
+    FermionField etaOdd(FermOp.FermionRedBlackGrid());
+    // No even-site noise field is kept: PhiEven is set to zero below and never sourced from eta.
+
+    gaussian(pRNG, eta);
+    eta = eta * scale;
+
+    // The noise is still drawn on the full grid; only its odd half feeds the solve.
+    pickCheckerboard(Odd, etaOdd, eta);
+
+    FermOp.ImportGauge(U);
+
+    // multishift CG
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerQuarter);
+    msCG(Mpc, etaOdd, PhiOdd);
+
+    //////////////////////////////////////////////////////
+    // FIXME : Clover term not yet..
+    //////////////////////////////////////////////////////
+
+    assert(FermOp.ConstEE() == 1);
+    PhiEven = zero;
+  };
+
+  //////////////////////////////////////////////////////
+  // S = phi^dag (Mdag M)^-1/2 phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+    FermOp.ImportGauge(U);
+
+    FermionField Y(FermOp.FermionRedBlackGrid());
+
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);
+
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,
+                                                   PowerNegQuarter);
+    // Y = (MpcdagMpc)^{-1/4} PhiOdd, so |Y|^2 is the action
+    msCG(Mpc, PhiOdd, Y);
+
+    RealD action = norm2(Y);
+    std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 "
+                                   "solve or -1/2 solve faster??? "
+              << action << std::endl;
+
+    return action;
+  };
+
+  //////////////////////////////////////////////////////
+  // Need
+  // dS_f/dU = chi^dag   d[N/D]  chi
+  //
+  // N/D is expressed as partial fraction expansion:
+  //
+  //           a0 + \sum_k ak/(M^dagM + bk)
+  //
+  // d[N/D] is then
+  //
+  //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M +
+  //          bk]^{-1}
+  //
+  // Need
+  //       Mf Phi_k = [MdagM+bk]^{-1} Phi
+  //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
+  //
+  // With these building blocks
+  //
+  //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf
+  //       Phi_k
+  //        S    = innerprodReal(Phi,Mf Phi);
+  //////////////////////////////////////////////////////
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
+    const int Npole = PowerNegHalf.poles.size();
+
+    std::vector<FermionField> MPhi_k(Npole, FermOp.FermionRedBlackGrid());
+
+    FermionField X(FermOp.FermionRedBlackGrid());
+    FermionField Y(FermOp.FermionRedBlackGrid());
+
+    GaugeField tmp(FermOp.GaugeGrid());
+
+    FermOp.ImportGauge(U);
+
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);
+
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerNegHalf);
+    // One shifted solution per pole of the x^(-1/2) approximation
+    msCG(Mpc, PhiOdd, MPhi_k);
+
+    dSdU = zero;
+    for (int k = 0; k < Npole; k++) {
+      RealD ak = PowerNegHalf.residues[k];
+
+      X = MPhi_k[k];
+      // Y = Mpc X, then add both derivative terms for this pole, each weighted by ak
+      Mpc.Mpc(X, Y);
+      Mpc.MpcDeriv(tmp, Y, X);
+      dSdU = dSdU + ak * tmp;
+      Mpc.MpcDagDeriv(tmp, X, Y);
+      dSdU = dSdU + ak * tmp;
+    }
+
+    // dSdU = Ta(dSdU);
+  };
+};
+}
+}
+
+#endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -0,0 +1,281 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // One flavour rational
+    ///////////////////////////////////////
+
+    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    //
+    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
+    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
+  
+    template<class Impl>
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef OneFlavourRationalParams Params;
+      Params param; // copy of the rational-approximation settings given to the constructor
+
+      MultiShiftFunction PowerHalf   ;
+      MultiShiftFunction PowerNegHalf;
+      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegQuarter;
+
+    private:
+     
+      FermionOperator<Impl> & NumOp;// the basic operator
+      FermionOperator<Impl> & DenOp;// the basic operator
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+
+    public:
+      // The initialiser list below follows member declaration order, which is the order C++ really uses.
+      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
+                                                        FermionOperator<Impl>  &_DenOp,
+                                                        Params & p
+                                                        ) :
+      param(p),
+      NumOp(_NumOp),
+      DenOp(_DenOp),
+      PhiEven(_NumOp.FermionRedBlackGrid()),
+      PhiOdd (_NumOp.FermionRedBlackGrid())
+      {
+        AlgRemez remez(param.lo,param.hi,param.precision);
+
+        // MdagM^(+- 1/2)
+        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+        remez.generateApprox(param.degree,1,2);
+        PowerHalf.Init(remez,param.tolerance,false);
+        PowerNegHalf.Init(remez,param.tolerance,true);
+
+        // MdagM^(+- 1/4)
+        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+        remez.generateApprox(param.degree,1,4);
+        PowerQuarter.Init(remez,param.tolerance,false);
+        PowerNegQuarter.Init(remez,param.tolerance,true);
+      };
+
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
+
+      virtual std::string LogParameters(){
+        std::stringstream sstream;
+        sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
+        sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
+        sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
+        sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
+        sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
+        sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
+        return sstream.str();
+      }
+      
+      
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+
+        // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
+        //
+        // P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
+        //        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
+        //
+        // Phi =  (VdagV)^-1/4 (MdagM)^{1/4} eta
+        //
+        // P(eta) = e^{- eta^dag eta}
+        //
+        // e^{x^2/2 sig^2} => sig^2 = 0.5.
+        //
+        // So eta should be of width sig = 1/sqrt(2).
+
+        RealD scale = std::sqrt(0.5);
+
+        FermionField eta(NumOp.FermionGrid());
+        FermionField etaOdd (NumOp.FermionRedBlackGrid());
+        // No even-site noise field is kept: PhiEven is set to zero below and never sourced from eta.
+        FermionField     tmp(NumOp.FermionRedBlackGrid());
+
+        gaussian(pRNG,eta);     eta=eta*scale;
+
+        // The noise is still drawn on the full grid; only its odd half feeds the solves.
+        pickCheckerboard(Odd,etaOdd,eta);
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+
+        // MdagM^1/4 eta
+        SchurDifferentiableOperator<Impl> MdagM(DenOp);
+        ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
+        msCG_M(MdagM,etaOdd,tmp);
+
+        // VdagV^-1/4 MdagM^1/4 eta
+        SchurDifferentiableOperator<Impl> VdagV(NumOp);
+        ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
+        msCG_V(VdagV,tmp,PhiOdd);
+
+        assert(NumOp.ConstEE() == 1);
+        assert(DenOp.ConstEE() == 1);
+        PhiEven = zero;
+        
+      };
+
+      //////////////////////////////////////////////////////
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        FermionField X(NumOp.FermionRedBlackGrid());
+        FermionField Y(NumOp.FermionRedBlackGrid());
+
+        // VdagV^1/4 Phi
+        SchurDifferentiableOperator<Impl> VdagV(NumOp);
+        ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+        msCG_V(VdagV,PhiOdd,X);
+
+        // MdagM^-1/4 VdagV^1/4 Phi
+        SchurDifferentiableOperator<Impl> MdagM(DenOp);
+        ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
+        msCG_M(MdagM,X,Y);
+
+        //  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
+        RealD action = norm2(Y);
+
+        return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars field
+      // N and D make up the rational polynomial of the M term; P and Q make up the rational polynomial of the V term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
+      //  
+
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+        const int n_f  = PowerNegHalf.poles.size();
+        const int n_pv = PowerQuarter.poles.size();
+
+        std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
+        std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
+        std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
+
+        FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
+        FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
+        FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
+        FermionField           Y(NumOp.FermionRedBlackGrid());
+
+        GaugeField   tmp(NumOp.GaugeGrid());
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        SchurDifferentiableOperator<Impl> VdagV(NumOp);
+        SchurDifferentiableOperator<Impl> MdagM(DenOp);
+
+        ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+        ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+
+        msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
+        msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
+        msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+
+        RealD ak;
+
+        dSdU = zero;
+
+        // With these building blocks  
+        //  
+        //       dS/dU = 
+        //                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+        //             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+        //                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+
+        //(1)
+        for(int k=0;k<n_f;k++){
+          ak = PowerNegHalf.residues[k];
+          MdagM.Mpc(MfMpvPhi_k[k],Y);
+          MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
+          MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
+        }
+        
+        //(2)
+        //(3)
+        for(int k=0;k<n_pv;k++){
+
+          ak = PowerQuarter.residues[k];
+          
+          VdagV.Mpc(MpvPhi_k[k],Y);
+          VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
+          VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
+          
+          VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+          VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
+          VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
+
+        }
+
+        //dSdU = Ta(dSdU);
+
+      };
+    };
+  }
+}
+
+
+#endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRational.h
@@ -0,0 +1,213 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/OneFlavourRational.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // One flavour rational
+    ///////////////////////////////////////
+
+    // S_f = chi^dag *  N(M^dag*M)/D(M^dag*M) * chi
+    //
+    // Here, M is some operator 
+    // N and D makeup the rat. poly 
+    //
+  
+    template<class Impl>
+    class OneFlavourRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef OneFlavourRationalParams Params;
+      Params param;
+
+      MultiShiftFunction PowerHalf   ;
+      MultiShiftFunction PowerNegHalf;
+      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegQuarter;
+
+    private:
+     
+      FermionOperator<Impl> & FermOp;// the basic operator
+
+      // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us historically
+      // and hasenbusch works better
+
+      FermionField Phi; // the pseudo fermion field for this trajectory
+
+    public:
+
+      // Builds the four rational (partial-fraction) approximations x^(+-1/2) and
+      // x^(+-1/4) from the Remez bounds and precision in `p`, and sizes the
+      // pseudofermion field on Op's fermion grid.
+      //   Op : the fermion operator M; stored by reference, so it must outlive this action.
+      //   p  : rational approximation parameters; copied into `param`.
+      // The initialiser list is written in member declaration order (param, FermOp, Phi),
+      // which is the order C++ constructs members in however the list is spelled.
+      OneFlavourRationalPseudoFermionAction(FermionOperator<Impl>  &Op,
+					    Params & p
+					    ) : param(p), FermOp(Op), Phi(Op.FermionGrid())
+      {
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	// MdagM^(+- 1/2): a single Remez fit feeds both functions.
+	// NOTE(review): the last Init argument presumably selects the inverse power -- confirm in MultiShiftFunction.
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+	remez.generateApprox(param.degree,1,2);
+	PowerHalf.Init(remez,param.tolerance,false);
+	PowerNegHalf.Init(remez,param.tolerance,true);
+
+	// MdagM^(+- 1/4)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+	remez.generateApprox(param.degree,1,4);
+	PowerQuarter.Init(remez,param.tolerance,false);
+	PowerNegQuarter.Init(remez,param.tolerance,true);
+      };
+
+      // Name of this action; LogParameters() uses it as the bracketed tag on each line.
+      virtual std::string action_name(){
+	return std::string("OneFlavourRationalPseudoFermionAction");
+      }
+
+      // Formats the rational-approximation parameters, one per line; every line
+      // starts with the logger header followed by the bracketed action name.
+      virtual std::string LogParameters(){
+	const std::string name = action_name();
+	std::stringstream report;
+	report << GridLogMessage << "["<<name<<"] Low            :" << param.lo        << std::endl;
+	report << GridLogMessage << "["<<name<<"] High           :" << param.hi        << std::endl;
+	report << GridLogMessage << "["<<name<<"] Max iterations :" << param.MaxIter   << std::endl;
+	report << GridLogMessage << "["<<name<<"] Tolerance      :" << param.tolerance << std::endl;
+	report << GridLogMessage << "["<<name<<"] Degree         :" << param.degree    << std::endl;
+	report << GridLogMessage << "["<<name<<"] Precision      :" << param.precision << std::endl;
+	return report.str();
+      }
+
+
+      
+      // Draws a new pseudofermion field Phi for this trajectory.
+      //
+      //   P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
+      //          = e^{- phi^dag (MdagM)^-1/4 (MdagM)^-1/4 phi}
+      //   Phi    = (MdagM)^{1/4} eta
+      //   P(eta) = e^{- eta^dag eta}
+      //
+      // e^{-x^2/(2 sig^2)} => sig^2 = 0.5, so eta should be of width sig = 1/sqrt(2);
+      // the rescale at the end supplies that factor.
+      // NOTE(review): this assumes gaussian() produces unit-width noise -- confirm.
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	FermionField noise(FermOp.FermionGrid());
+	gaussian(pRNG,noise);
+
+	FermOp.ImportGauge(U);
+
+	// multishift CG applies the x^(1/4) approximation of MdagM to the noise
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> HermOp(FermOp);
+	ConjugateGradientMultiShift<FermionField> QuarterSolver(param.MaxIter,PowerQuarter);
+	QuarterSolver(HermOp,noise,Phi);
+
+	RealD invSqrt2 = std::sqrt(0.5);
+	Phi=Phi*invSqrt2;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag (Mdag M)^-1/2 phi
+      //////////////////////////////////////////////////////
+      // Evaluates S = phi^dag (MdagM)^-1/2 phi on gauge field U as the squared
+      // norm of the x^(-1/4) approximation of MdagM applied to Phi, and logs the value.
+      virtual RealD S(const GaugeField &U) {
+
+	FermOp.ImportGauge(U);
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> HermOp(FermOp);
+	ConjugateGradientMultiShift<FermionField> NegQuarterSolver(param.MaxIter,PowerNegQuarter);
+
+	FermionField NegQuarterPhi(FermOp.FermionGrid());
+	NegQuarterSolver(HermOp,Phi,NegQuarterPhi);
+
+	RealD action = norm2(NegQuarterPhi);
+	std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
+	return action;
+      };
+
+      //////////////////////////////////////////////////////
+      // Need
+      // dS_f/dU = chi^dag   d[N/D]  chi
+      //
+      // N/D is expressed as partial fraction expansion:
+      //
+      //           a0 + \sum_k ak/(M^dagM + bk)
+      //
+      // d[N/D] is then
+      //
+      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
+      //
+      // Need
+      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
+      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
+      //
+      // With these building blocks
+      //
+      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
+      //        S    = innerprodReal(Phi,Mf Phi);
+      //////////////////////////////////////////////////////
+      // Accumulates into dSdU the force of the x^(-1/2) rational approximation.
+      // One multishift solve yields MPhi_k = [MdagM+bk]^{-1} Phi for every pole; each
+      // pole then adds ak times the two MDeriv terms built from (MPhi_k, M MPhi_k).
+      //   U    : gauge field, imported into FermOp before the solve.
+      //   dSdU : output; zeroed here and then accumulated.
+      // The Ta() projection is left commented out at the end of the routine.
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	const int Npole = PowerNegHalf.poles.size();
+
+	std::vector<FermionField> MPhi_k (Npole,FermOp.FermionGrid());
+
+	FermionField Y(FermOp.FermionGrid());
+
+	GaugeField   tmp(FermOp.GaugeGrid());
+
+	FermOp.ImportGauge(U);
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(FermOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegHalf);
+
+	msCG(MdagMOp,Phi,MPhi_k);
+
+	dSdU = zero;
+	for(int k=0;k<Npole;k++){
+
+	  RealD ak = PowerNegHalf.residues[k];
+
+	  // Refer to the solver output in place: copying the whole field into a
+	  // scratch X on every pole is unnecessary, as nothing below modifies it.
+	  FermionField &X = MPhi_k[k];
+
+	  FermOp.M(X,Y);
+
+	  FermOp.MDeriv(tmp , Y, X,DaggerNo );  dSdU=dSdU+ak*tmp;
+	  FermOp.MDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+ak*tmp;
+
+	}
+
+	//dSdU = Ta(dSdU);
+
+      };
+    };
+  }
+}
+
+
+#endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@@ -0,0 +1,267 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // One flavour rational ratio
+    ///////////////////////////////////////
+
+    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    //
+    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
+    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
+  
+    template<class Impl>
+    class OneFlavourRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef OneFlavourRationalParams Params;
+      Params param;
+
+      MultiShiftFunction PowerHalf   ;
+      MultiShiftFunction PowerNegHalf;
+      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegQuarter;
+
+    private:
+     
+      FermionOperator<Impl> & NumOp;// the basic operator
+      FermionOperator<Impl> & DenOp;// the basic operator
+      FermionField Phi; // the pseudo fermion field for this trajectory
+
+    public:
+
+      // Builds the four rational (partial-fraction) approximations x^(+-1/2) and
+      // x^(+-1/4) from the Remez bounds and precision in `p`, and sizes the
+      // pseudofermion field on the numerator operator's fermion grid.
+      //   _NumOp : operator V (used as VdagV); stored by reference, must outlive this action.
+      //   _DenOp : operator M (used as MdagM); stored by reference, must outlive this action.
+      //   p      : rational approximation parameters; copied into `param`.
+      // The initialiser list is written in member declaration order (param, NumOp, DenOp, Phi),
+      // which is the order C++ constructs members in however the list is spelled.
+      OneFlavourRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
+					    FermionOperator<Impl>  &_DenOp,
+					    Params & p
+					    ) : param(p), NumOp(_NumOp), DenOp(_DenOp), Phi(_NumOp.FermionGrid())
+      {
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	// MdagM^(+- 1/2): a single Remez fit feeds both functions.
+	// NOTE(review): the last Init argument presumably selects the inverse power -- confirm in MultiShiftFunction.
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+	remez.generateApprox(param.degree,1,2);
+	PowerHalf.Init(remez,param.tolerance,false);
+	PowerNegHalf.Init(remez,param.tolerance,true);
+
+	// MdagM^(+- 1/4)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+	remez.generateApprox(param.degree,1,4);
+	PowerQuarter.Init(remez,param.tolerance,false);
+	PowerNegQuarter.Init(remez,param.tolerance,true);
+      };
+
+      // Name of this action; LogParameters() uses it as the bracketed tag on each line.
+      virtual std::string action_name(){
+	return std::string("OneFlavourRatioRationalPseudoFermionAction");
+      }
+      
+      // Formats the rational-approximation parameters, one per line; every line
+      // starts with the logger header followed by the bracketed action name.
+      virtual std::string LogParameters(){
+	const std::string name = action_name();
+	std::stringstream report;
+	report << GridLogMessage << "["<<name<<"] Low            :" << param.lo        << std::endl;
+	report << GridLogMessage << "["<<name<<"] High           :" << param.hi        << std::endl;
+	report << GridLogMessage << "["<<name<<"] Max iterations :" << param.MaxIter   << std::endl;
+	report << GridLogMessage << "["<<name<<"] Tolerance      :" << param.tolerance << std::endl;
+	report << GridLogMessage << "["<<name<<"] Degree         :" << param.degree    << std::endl;
+	report << GridLogMessage << "["<<name<<"] Precision      :" << param.precision << std::endl;
+	return report.str();
+      }
+      
+
+      // Draws a new pseudofermion field Phi for this trajectory.
+      //
+      //   S_f    = phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi
+      //   P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4 (VdagV)^1/4 phi}
+      //   Phi    = (VdagV)^-1/4 (MdagM)^1/4 eta
+      //   P(eta) = e^{- eta^dag eta}
+      //
+      // e^{-x^2/(2 sig^2)} => sig^2 = 0.5, so eta should be of width sig = 1/sqrt(2);
+      // the rescale at the end supplies that factor.
+      // NOTE(review): this assumes gaussian() produces unit-width noise -- confirm.
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	FermionField MquarterNoise(NumOp.FermionGrid());
+	FermionField noise(NumOp.FermionGrid());
+
+	gaussian(pRNG,noise);
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	// MdagM^1/4 eta
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> DenHermOp(DenOp);
+	ConjugateGradientMultiShift<FermionField> QuarterSolver(param.MaxIter,PowerQuarter);
+	QuarterSolver(DenHermOp,noise,MquarterNoise);
+
+	// VdagV^-1/4 MdagM^1/4 eta
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> NumHermOp(NumOp);
+	ConjugateGradientMultiShift<FermionField> NegQuarterSolver(param.MaxIter,PowerNegQuarter);
+	NegQuarterSolver(NumHermOp,MquarterNoise,Phi);
+
+	RealD invSqrt2 = std::sqrt(0.5);
+	Phi=Phi*invSqrt2;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //////////////////////////////////////////////////////
+      // Evaluates the action on gauge field U as the squared norm of
+      // (MdagM)^-1/4 (VdagV)^1/4 Phi, each power being a multishift rational solve.
+      virtual RealD S(const GaugeField &U) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	FermionField VquarterPhi(NumOp.FermionGrid());
+	FermionField MnegQuarterVquarterPhi(NumOp.FermionGrid());
+
+	// VdagV^1/4 Phi
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> NumHermOp(NumOp);
+	ConjugateGradientMultiShift<FermionField> QuarterSolver(param.MaxIter,PowerQuarter);
+	QuarterSolver(NumHermOp,Phi,VquarterPhi);
+
+	// MdagM^-1/4 VdagV^1/4 Phi
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> DenHermOp(DenOp);
+	ConjugateGradientMultiShift<FermionField> NegQuarterSolver(param.MaxIter,PowerNegQuarter);
+	NegQuarterSolver(DenHermOp,VquarterPhi,MnegQuarterVquarterPhi);
+
+	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
+	RealD action = norm2(MnegQuarterVquarterPhi);
+	return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars field
+      // N and D make up the rational polynomial of the M term, and P and Q make up the rational polynomial of the V term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvPhi
+      //  
+
+      // Accumulates into dSdU the force of the ratio action. Three chained multishift
+      // solves produce the per-pole vectors, then two loops add the MDeriv terms
+      // weighted by the partial-fraction residues.
+      //   U    : gauge field, imported into both NumOp and DenOp before solving.
+      //   dSdU : output; zeroed here and then accumulated.
+      // The Ta() projection is left commented out at the end of the routine.
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	// Pole counts of the two approximations: x^(-1/2) acts on MdagM, x^(1/4) on VdagV.
+	const int n_f  = PowerNegHalf.poles.size();
+	const int n_pv = PowerQuarter.poles.size();
+
+	// Per-pole solver outputs, one field per shift.
+	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionGrid());
+	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid());
+	std::vector<FermionField> MfMpvPhi_k   (n_f,NumOp.FermionGrid());
+
+	// Summed solver outputs; each feeds the next solve as its source.
+	FermionField      MpvPhi(NumOp.FermionGrid());
+	FermionField    MfMpvPhi(NumOp.FermionGrid());
+	FermionField MpvMfMpvPhi(NumOp.FermionGrid());
+	FermionField           Y(NumOp.FermionGrid()); // scratch, overwritten before every use
+
+	GaugeField   tmp(NumOp.GaugeGrid());
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+
+	// Phi -> MpvPhi -> MfMpvPhi -> MpvMfMpvPhi; the order is fixed by these data dependencies.
+	// MpvMfMpvPhi itself is not read afterwards, only its per-pole vectors are.
+	msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi);
+	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
+	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+
+	RealD ak;
+
+	dSdU = zero;
+
+	// With these building blocks  
+	//  
+	//       dS/dU = 
+	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+	//
+	// NOTE(review): the code below adds +ak*tmp; the minus sign in the formula is
+	// presumably absorbed by the MDeriv convention -- confirm against FermionOperator.
+
+	//(1) denominator operator, residues of the x^(-1/2) approximation
+	for(int k=0;k<n_f;k++){
+	  ak = PowerNegHalf.residues[k];
+	  DenOp.M(MfMpvPhi_k[k],Y);
+	  DenOp.MDeriv(tmp , MfMpvPhi_k[k], Y,DaggerYes );  dSdU=dSdU+ak*tmp;
+	  DenOp.MDeriv(tmp , Y, MfMpvPhi_k[k], DaggerNo );  dSdU=dSdU+ak*tmp;
+	}
+	
+	//(2)
+	//(3) numerator operator, residues of the x^(1/4) approximation; both terms share one loop
+	for(int k=0;k<n_pv;k++){
+
+          ak = PowerQuarter.residues[k];
+	  
+	  // term (2): Y = V MpvPhi_k paired with MpvMfMpvPhi_k
+	  NumOp.M(MpvPhi_k[k],Y);
+	  NumOp.MDeriv(tmp,MpvMfMpvPhi_k[k],Y,DaggerYes); dSdU=dSdU+ak*tmp;
+	  NumOp.MDeriv(tmp,Y,MpvMfMpvPhi_k[k],DaggerNo);  dSdU=dSdU+ak*tmp;     
+	  
+	  // term (3): Y is reused, now Y = V MpvMfMpvPhi_k paired with MpvPhi_k
+	  NumOp.M(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+	  NumOp.MDeriv(tmp,Y, MpvPhi_k[k], DaggerNo); dSdU=dSdU+ak*tmp;
+	  NumOp.MDeriv(tmp,MpvPhi_k[k], Y,DaggerYes); dSdU=dSdU+ak*tmp;
+
+	}
+
+	//dSdU = Ta(dSdU);
+
+      };
+    };
+  }
+}
+
+
+#endif
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@@ -0,0 +1,43 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/pseudofermion/PseudoFermion_aggregate.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_AGGREGATE_H
+#define QCD_PSEUDOFERMION_AGGREGATE_H
+
+#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
+
+#endif
--- a/Grid/qcd/action/pseudofermion/TwoFlavour.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavour.h
@@ -0,0 +1,160 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/pseudofermion/TwoFlavour.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_H
+
+namespace Grid {
+namespace QCD {
+
+////////////////////////////////////////////////////////////////////////
+// Two flavour pseudofermion action for any dop
+////////////////////////////////////////////////////////////////////////
+template <class Impl>
+class TwoFlavourPseudoFermionAction : public Action<typename Impl::GaugeField> {
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+
+ private:
+  FermionOperator<Impl> &FermOp;  // the basic operator
+
+  OperatorFunction<FermionField> &DerivativeSolver;
+
+  OperatorFunction<FermionField> &ActionSolver;
+
+  FermionField Phi;  // the pseudo fermion field for this trajectory
+
+ public:
+  /////////////////////////////////////////////////
+  // Pass in required objects.
+  /////////////////////////////////////////////////
+  // Stores references to the fermion operator and the two solvers (all three must
+  // outlive this action) and sizes Phi on the operator's fermion grid.
+  //   Op : the fermion operator M.
+  //   DS : solver used by deriv().
+  //   AS : solver used by S().
+  TwoFlavourPseudoFermionAction(FermionOperator<Impl> &Op,
+                                OperatorFunction<FermionField> &DS,
+                                OperatorFunction<FermionField> &AS)
+      : FermOp(Op), DerivativeSolver(DS), ActionSolver(AS), Phi(Op.FermionGrid())
+  {
+  }
+
+
+  // Name of this action; LogParameters() uses it as the bracketed tag.
+  virtual std::string action_name(){
+    return std::string("TwoFlavourPseudoFermionAction");
+  }
+
+  // This action has nothing tunable: returns one log line saying so, prefixed
+  // with the logger header and the bracketed action name.
+  virtual std::string LogParameters(){
+    const std::string name = action_name();
+    std::stringstream report;
+    report << GridLogMessage << "[" << name << "] has no parameters" << std::endl;
+    return report.str();
+  }
+  
+  //////////////////////////////////////////////////////////////////////////////////////
+  // Push the gauge field in to the dops. Assume any BC's and smearing already applied
+  //////////////////////////////////////////////////////////////////////////////////////
+  // Draws a new pseudofermion field: Phi = Mdag eta, rescaled by 1/sqrt(2).
+  //
+  //   P(phi) = e^{- phi^dag (MdagM)^-1 phi}
+  //   Phi    = Mdag eta
+  //   P(eta) = e^{- eta^dag eta}
+  //
+  // e^{-x^2/(2 sig^2)} => sig^2 = 0.5, so eta should be of width sig = 1/sqrt(2),
+  // hence the multiplication by 0.707....
+  // NOTE(review): this assumes gaussian() produces unit-width noise -- confirm.
+  //
+  // Chroma has this scale factor: two_flavor_monomial_w.h
+  // IroIro: does not use this scale. It is absorbed by a change of vars
+  //         in the Phi integral, and thus is only an irrelevant prefactor for
+  //         the partition function.
+  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+    FermionField noise(FermOp.FermionGrid());
+    gaussian(pRNG, noise);
+
+    FermOp.ImportGauge(U);
+    FermOp.Mdag(noise, Phi);
+
+    RealD invSqrt2 = std::sqrt(0.5);
+    Phi = Phi * invSqrt2;
+  };
+
+  //////////////////////////////////////////////////////
+  // S = phi^dag (Mdag M)^-1 phi
+  //////////////////////////////////////////////////////
+  // Evaluates S = phi^dag (Mdag M)^-1 phi on gauge field U and logs it.
+  // ActionSolver, started from a zero guess, gives Sol = (MdagM)^-1 Phi; the action
+  // is then the squared norm of Op applied to Sol.
+  virtual RealD S(const GaugeField &U) {
+    FermOp.ImportGauge(U);
+    MdagMLinearOperator<FermionOperator<Impl>, FermionField> HermOp(FermOp);
+
+    FermionField Sol(FermOp.FermionGrid());
+    FermionField OpSol(FermOp.FermionGrid());
+
+    Sol = zero;
+    ActionSolver(HermOp, Phi, Sol);
+    HermOp.Op(Sol, OpSol);
+
+    RealD action = norm2(OpSol);
+    std::cout << GridLogMessage << "Pseudofermion action " << action << std::endl;
+    return action;
+  };
+
+  //////////////////////////////////////////////////////
+  // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
+  //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag
+  //       (Mdag)^-1 phi
+  //
+  //       = - Ydag dM X  - Xdag dMdag Y
+  //
+  // 
+  //////////////////////////////////////////////////////
+  // Writes the fermion force into dSdU as the sum of two MDeriv terms built from
+  // Sol = (MdagM)^-1 phi (from DerivativeSolver, zero initial guess) and OpSol = Op(Sol).
+  //   U    : gauge field, imported into FermOp first.
+  //   dSdU : output; overwritten.
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
+    FermOp.ImportGauge(U);
+    MdagMLinearOperator<FermionOperator<Impl>, FermionField> HermOp(FermOp);
+
+    FermionField Sol(FermOp.FermionGrid());
+    FermionField OpSol(FermOp.FermionGrid());
+    GaugeField term(FermOp.GaugeGrid());
+
+    Sol = zero;
+    DerivativeSolver(HermOp, Phi, Sol);  // Sol   = (MdagM)^-1 phi
+    HermOp.Op(Sol, OpSol);               // OpSol = M Sol = (Mdag)^-1 phi
+
+    // By our conventions this is really U dS/dU: nothing is differentiated wrt Udag
+    // here, so one must take dSdU - adj(dSdU) and left multiply by mom to get dS/dt.
+    FermOp.MDeriv(term, OpSol, Sol, DaggerNo);
+    dSdU = term;
+    FermOp.MDeriv(term, Sol, OpSol, DaggerYes);
+    dSdU = dSdU + term;
+
+    // The traceless antihermitian component is not taken here.
+  };
+};
+}
+}
+
+#endif
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@@ -0,0 +1,189 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_H
+
+namespace Grid {
+  namespace QCD {
+
+    ////////////////////////////////////////////////////////////////////////
+    // Two flavour pseudofermion action for any EO prec dop
+    ////////////////////////////////////////////////////////////////////////
+    template <class Impl>
+    class TwoFlavourEvenOddPseudoFermionAction
+      : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+    private:
+      FermionOperator<Impl> &FermOp;  // the basic operator
+
+      OperatorFunction<FermionField> &DerivativeSolver;
+      OperatorFunction<FermionField> &ActionSolver;
+
+      FermionField PhiOdd;   // the pseudo fermion field for this trajectory
+      FermionField PhiEven;  // the pseudo fermion field for this trajectory
+
+    public:
+      /////////////////////////////////////////////////
+      // Pass in required objects.
+      /////////////////////////////////////////////////
+      // Stores references to the fermion operator and the two solvers (all three
+      // must outlive this action) and sizes both checkerboard pseudofermion fields
+      // on the operator's red-black grid.
+      //   Op : the fermion operator.
+      //   DS : solver used by deriv().
+      //   AS : solver used by S().
+      // The initialiser list is written in member declaration order (PhiOdd before
+      // PhiEven), which is the order C++ constructs members in however the list is spelled.
+      TwoFlavourEvenOddPseudoFermionAction(FermionOperator<Impl> &Op,
+					   OperatorFunction<FermionField> &DS,
+					   OperatorFunction<FermionField> &AS)
+	: FermOp(Op),
+	  DerivativeSolver(DS),
+	  ActionSolver(AS),
+	  PhiOdd(Op.FermionRedBlackGrid()),
+	  PhiEven(Op.FermionRedBlackGrid())
+      {};
+  
+      // Name of this action; LogParameters() uses it as the bracketed tag.
+      virtual std::string action_name(){
+	return std::string("TwoFlavourEvenOddPseudoFermionAction");
+      }
+      
+      // This action has nothing tunable: returns one log line saying so, prefixed
+      // with the logger header and the bracketed action name.
+      virtual std::string LogParameters(){
+	const std::string name = action_name();
+	std::stringstream report;
+	report << GridLogMessage << "["<<name<<"] has no parameters" << std::endl;
+	return report.str();
+      }
+
+
+      //////////////////////////////////////////////////////////////////////////////////////
+      // Push the gauge field in to the dops. Assume any BC's and smearing already applied
+      //////////////////////////////////////////////////////////////////////////////////////
+      // Draws new pseudofermion fields on both checkerboards from one full-grid
+      // Gaussian noise field.
+      //
+      //   P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
+      //   Phi    = MpcDag eta
+      //   P(eta) = e^{- eta^dag eta}
+      //
+      // e^{-x^2/(2 sig^2)} => sig^2 = 0.5, hence the final rescale by sqrt(0.5).
+      // NOTE(review): this assumes gaussian() produces unit-width noise -- confirm.
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	FermionField noise    (FermOp.FermionGrid());
+	FermionField noiseEven(FermOp.FermionRedBlackGrid());
+	FermionField noiseOdd (FermOp.FermionRedBlackGrid());
+
+	gaussian(pRNG,noise);
+	pickCheckerboard(Even,noiseEven,noise);
+	pickCheckerboard(Odd,noiseOdd,noise);
+
+	FermOp.ImportGauge(U);
+	SchurDifferentiableOperator<Impl> SchurOp(FermOp);
+
+	// odd sites: preconditioned operator; even sites: the Mooee block
+	SchurOp.MpcDag(noiseOdd,PhiOdd);
+	FermOp.MooeeDag(noiseEven,PhiEven);
+
+	RealD invSqrt2 = std::sqrt(0.5);
+	PhiOdd =PhiOdd*invSqrt2;
+	PhiEven=PhiEven*invSqrt2;
+
+      };
+  
+      //////////////////////////////////////////////////////
+      // S = phi^dag (Mdag M)^-1 phi  (odd)
+      //   + phi^dag (Mdag M)^-1 phi  (even)
+      //////////////////////////////////////////////////////
+      // Evaluates the action on gauge field U as the sum of an odd-checkerboard
+      // term (ActionSolver on the Schur operator, zero initial guess) and an
+      // even-checkerboard term (MooeeInvDag applied to PhiEven), and logs the total.
+      virtual RealD S(const GaugeField &U) {
+
+	FermOp.ImportGauge(U);
+	SchurDifferentiableOperator<Impl> SchurOp(FermOp);
+
+	FermionField Sol(FermOp.FermionRedBlackGrid());
+	FermionField Work(FermOp.FermionRedBlackGrid());
+
+	// odd part
+	Sol=zero;
+	ActionSolver(SchurOp,PhiOdd,Sol);
+	SchurOp.Op(Sol,Work);
+	RealD action = norm2(Work);
+
+	// The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
+	// Only really clover term that creates this.
+	FermOp.MooeeInvDag(PhiEven,Work);   // Work is reused for the even part
+	action = action + norm2(Work);
+
+	std::cout << GridLogMessage << "Pseudofermion EO action "<<action<<std::endl;
+	return action;
+      };
+
+      //////////////////////////////////////////////////////
+      //
+      // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
+      //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag (Mdag)^-1 phi
+      //
+      //       = - Ydag dM X  - Xdag dMdag Y
+      //
+      //////////////////////////////////////////////////////
+      // Writes into dSdU the force from the odd-checkerboard (Schur preconditioned)
+      // part of the action only: the sum of MpcDeriv and MpcDagDeriv built from
+      // X (DerivativeSolver applied to PhiOdd, zero initial guess) and Y = Mpc X.
+      // The even-even contribution is not computed; the assert below guards that omission.
+      //   U    : gauge field, imported into FermOp first.
+      //   dSdU : output; overwritten.
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+	FermOp.ImportGauge(U);
+
+	FermionField X(FermOp.FermionRedBlackGrid());
+	FermionField Y(FermOp.FermionRedBlackGrid());
+	GaugeField tmp(FermOp.GaugeGrid());
+
+	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+
+	// Our conventions really make this UdSdU; We do not differentiate wrt Udag here.
+	// So must take dSdU - adj(dSdU) and left multiply by mom to get dS/dt.
+
+	X=zero;
+	DerivativeSolver(Mpc,PhiOdd,X);
+	Mpc.Mpc(X,Y);
+  Mpc.MpcDeriv(tmp , Y, X );    dSdU=tmp;
+  Mpc.MpcDagDeriv(tmp , X, Y);  dSdU=dSdU+tmp;
+
+	// Treat the EE case. (MdagM)^-1 = Minv Minvdag
+	// Deriv defaults to zero.
+	// (disabled code, kept for reference)
+	//        FermOp.MooeeInvDag(PhiOdd,Y);
+	//      FermOp.MooeeInv(Y,X);
+	//	FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
+	//  FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
+
+	// NOTE(review): presumably ConstEE()==1 means the even-even block does not depend
+	// on the gauge field, so dropping its derivative is exact -- confirm in FermionOperator.
+	assert(FermOp.ConstEE() == 1);
+
+	// Second copy of the same disabled even-even code.
+	// NOTE(review): it reads PhiOdd, whereas S() applies MooeeInvDag to PhiEven, and it
+	// assigns dSdU=tmp, which would discard the odd term above; check both before re-enabling.
+	/*
+	  FermOp.MooeeInvDag(PhiOdd,Y);
+	  FermOp.MooeeInv(Y,X);
+	  FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
+	  FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
+	*/
+	
+	//dSdU = Ta(dSdU);
+
+      };
+
+    };
+    
+  }
+}
+
+#endif
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -0,0 +1,209 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // Two flavour ratio
+    ///////////////////////////////////////
+    template<class Impl>
+    class TwoFlavourEvenOddRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+      // Action S = phi^dag V (Mdag M)^-1 Vdag phi, evaluated through the Schur (Mpc/Vpc) operators.
+    private:
+      FermionOperator<Impl> & NumOp;// numerator operator (V in the comments below)
+      FermionOperator<Impl> & DenOp;// denominator operator (M in the comments below)
+
+      OperatorFunction<FermionField> &DerivativeSolver; // solver invoked by deriv()
+      OperatorFunction<FermionField> &ActionSolver;     // solver invoked by refresh() and S()
+
+      FermionField PhiOdd;   // odd-checkerboard pseudo fermion field for this trajectory
+      FermionField PhiEven;  // even-checkerboard pseudo fermion field for this trajectory
+
+    public:
+      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
+                                                FermionOperator<Impl>  &_DenOp,
+                                                OperatorFunction<FermionField> & DS,
+                                                OperatorFunction<FermionField> & AS) :
+      NumOp(_NumOp),
+      DenOp(_DenOp),
+      DerivativeSolver(DS),
+      ActionSolver(AS),
+      PhiOdd(_NumOp.FermionRedBlackGrid()),   // initialisers follow declaration order: PhiOdd, then PhiEven
+      PhiEven(_NumOp.FermionRedBlackGrid())
+        {
+          conformable(_NumOp.FermionGrid(), _DenOp.FermionGrid());
+          conformable(_NumOp.FermionRedBlackGrid(), _DenOp.FermionRedBlackGrid());
+          conformable(_NumOp.GaugeGrid(), _DenOp.GaugeGrid());
+          conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
+        };
+
+      virtual std::string action_name(){return "TwoFlavourEvenOddRatioPseudoFermionAction";}
+
+      virtual std::string LogParameters(){
+        std::stringstream sstream;
+        sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
+        return sstream.str();
+      }
+
+      // Draws Gaussian noise eta and rebuilds PhiOdd / PhiEven from it.
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+
+        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+        //
+        // NumOp == V
+        // DenOp == M
+        //
+        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
+        //
+        // P(eta_o) = e^{- eta_o^dag eta_o}
+        //
+        // e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+        //
+        RealD scale = std::sqrt(0.5);
+
+        FermionField eta    (NumOp.FermionGrid());
+        FermionField etaOdd (NumOp.FermionRedBlackGrid());
+        FermionField etaEven(NumOp.FermionRedBlackGrid());
+        FermionField tmp    (NumOp.FermionRedBlackGrid());
+
+        gaussian(pRNG,eta);
+
+        pickCheckerboard(Even,etaEven,eta);
+        pickCheckerboard(Odd,etaOdd,eta);
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        SchurDifferentiableOperator<Impl> Mpc(DenOp);
+        SchurDifferentiableOperator<Impl> Vpc(NumOp);
+
+        // Odd det factors
+        Mpc.MpcDag(etaOdd,PhiOdd);      // PhiOdd = Mpcdag eta_o
+        tmp=zero;                       // zero initial guess for the solver
+        ActionSolver(Vpc,PhiOdd,tmp);   // presumably tmp = (Vpcdag Vpc)^-1 Mpcdag eta_o -- confirm solver convention
+        Vpc.Mpc(tmp,PhiOdd);            // PhiOdd = Vpc tmp
+
+        // Even det factors
+        DenOp.MooeeDag(etaEven,tmp);
+        NumOp.MooeeInvDag(tmp,PhiEven);
+
+        PhiOdd =PhiOdd*scale;
+        PhiEven=PhiEven*scale;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag V (Mdag M)^-1 Vdag phi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        SchurDifferentiableOperator<Impl> Mpc(DenOp);
+        SchurDifferentiableOperator<Impl> Vpc(NumOp);
+
+        FermionField X(NumOp.FermionRedBlackGrid());
+        FermionField Y(NumOp.FermionRedBlackGrid());
+
+        Vpc.MpcDag(PhiOdd,Y);           // Y= Vdag phi
+        X=zero;
+        ActionSolver(Mpc,Y,X);          // X= (MdagM)^-1 Vdag phi
+        //Mpc.Mpc(X,Y);                   // Y=  Mdag^-1 Vdag phi
+        // Odd contribution is taken as Re(Ydag X) rather than the commented-out norm2 form
+        RealD action = real(innerProduct(Y,X));
+
+        //RealD action = norm2(Y);
+
+        // The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
+        // Only really clover term that creates this. Leave the EE portion as a future to do to make most
+        // rapid progress on DWF for now.
+        //
+        NumOp.MooeeDag(PhiEven,X);
+        DenOp.MooeeInvDag(X,Y);
+        action = action + norm2(Y);
+
+        return action;
+      };
+
+      //////////////////////////////////////////////////////
+      // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
+      //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
+      //       + phi^dag V (Mdag M)^-1 dV^dag  phi
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        SchurDifferentiableOperator<Impl> Mpc(DenOp);
+        SchurDifferentiableOperator<Impl> Vpc(NumOp);
+
+        FermionField  X(NumOp.FermionRedBlackGrid());
+        FermionField  Y(NumOp.FermionRedBlackGrid());
+
+        // This assignment is necessary to be compliant with the HMC grids
+        GaugeField force(dSdU._grid);
+
+        //Y=Vdag phi
+        //X = (Mdag M)^-1 V^dag phi
+        //Y = (Mdag)^-1 V^dag  phi
+        Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi
+        X=zero;
+        DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi
+        Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi
+
+        // phi^dag V (Mdag M)^-1 dV^dag  phi
+        Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force;
+
+        // phi^dag dV (Mdag M)^-1 V^dag  phi
+        Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force;
+
+        //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
+        //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
+        Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force;
+        Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force;
+
+        // FIXME No force contribution from EvenEven assumed here
+        // Needs a fix for clover.
+        assert(NumOp.ConstEE() == 1);
+        assert(DenOp.ConstEE() == 1);
+
+        dSdU = -dSdU;   // overall sign flip of the accumulated terms
+
+      };
+    };
+  }
+}
+#endif
--- a/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
@@ -0,0 +1,173 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // Two flavour ratio
+    ///////////////////////////////////////
+    template<class Impl>
+    class TwoFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+    private:
+      FermionOperator<Impl> & NumOp; // V: numerator operator
+      FermionOperator<Impl> & DenOp; // M: denominator operator
+
+      OperatorFunction<FermionField> &DerivativeSolver; // inversion used in deriv()
+      OperatorFunction<FermionField> &ActionSolver;     // inversion used in refresh() and S()
+
+      FermionField Phi; // pseudofermion field, rebuilt by refresh()
+
+    public:
+      TwoFlavourRatioPseudoFermionAction(FermionOperator<Impl> &_NumOp, FermionOperator<Impl> &_DenOp,
+                                         OperatorFunction<FermionField> &DS, OperatorFunction<FermionField> &AS)
+        : NumOp(_NumOp), DenOp(_DenOp),
+          DerivativeSolver(DS), ActionSolver(AS),
+          Phi(_NumOp.FermionGrid()) {}
+
+      virtual std::string action_name() { return "TwoFlavourRatioPseudoFermionAction"; }
+
+      virtual std::string LogParameters() {
+        std::stringstream msg;
+        msg << GridLogMessage << "[" << action_name() << "] has no parameters" << std::endl;
+        return msg.str();
+      }
+
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+
+        // Target distribution: P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
+        //
+        //   NumOp == V
+        //   DenOp == M
+        //
+        // Change of variables: phi = Vdag^{-1} Mdag eta, i.e. eta = Mdag^{-1} Vdag phi
+        //
+        // so that P(eta) = e^{- eta^dag eta}
+        //
+        // Matching e^{-x^2/(2 sig^2)} gives sig^2 = 0.5.
+        //
+        // So eta should be of width sig = 1/sqrt(2): the result is multiplied by 0.707....
+        //
+        RealD sigma = std::sqrt(0.5);
+
+        FermionField noise(NumOp.FermionGrid());
+        FermionField work (NumOp.FermionGrid());
+
+        gaussian(pRNG, noise);
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        // Note: this hard codes normal equations type solvers; alternate implementation needed for
+        // non-herm style solvers.
+        MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagVOp(NumOp);
+
+        DenOp.Mdag(noise, Phi);            // Phi = Mdag eta
+        work = zero;
+        ActionSolver(VdagVOp, Phi, work);  // (VdagV)^-1 Mdag eta = V^-1 Vdag^-1 Mdag eta
+        NumOp.M(work, Phi);                // Phi = Vdag^-1 Mdag eta
+
+        Phi = Phi * sigma;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag V (Mdag M)^-1 Vdag phi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
+
+        FermionField X(NumOp.FermionGrid());
+        FermionField Y(NumOp.FermionGrid());
+
+        NumOp.Mdag(Phi, Y);             // Y = Vdag phi
+        X = zero;
+        ActionSolver(MdagMOp, Y, X);    // X = (MdagM)^-1 Vdag phi
+        DenOp.M(X, Y);                  // Y = Mdag^-1 Vdag phi
+
+        // The action is the squared norm of Y.
+        return norm2(Y);
+
+      };
+
+      //////////////////////////////////////////////////////
+      // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
+      //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
+      //       + phi^dag V (Mdag M)^-1 dV^dag  phi
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+        NumOp.ImportGauge(U);
+        DenOp.ImportGauge(U);
+
+        MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
+
+        GaugeField    term(NumOp.GaugeGrid());   // scratch for each derivative contribution
+        FermionField  X(NumOp.FermionGrid());
+        FermionField  Y(NumOp.FermionGrid());
+
+        // Three stages, reusing Y:
+        //   Y = Vdag phi
+        //   X = (Mdag M)^-1 V^dag phi
+        //   Y = (Mdag)^-1 V^dag  phi
+        NumOp.Mdag(Phi, Y);
+        X = zero;
+        DerivativeSolver(MdagMOp, Y, X);
+        DenOp.M(X, Y);
+
+        // phi^dag V (Mdag M)^-1 dV^dag  phi
+        NumOp.MDeriv(term, X, Phi, DaggerYes);
+        dSdU = term;
+        // phi^dag dV (Mdag M)^-1 V^dag  phi
+        NumOp.MDeriv(term, Phi, X, DaggerNo);
+        dSdU = dSdU + term;
+
+        //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
+        //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
+        DenOp.MDeriv(term, Y, X, DaggerNo);
+        dSdU = dSdU - term;
+        DenOp.MDeriv(term, X, Y, DaggerYes);
+        dSdU = dSdU - term;
+        dSdU *= -1.0;   // overall sign flip of the accumulated sum
+        //dSdU = - Ta(dSdU);
+      };
+    };
+  }
+}
+#endif
--- a/Grid/qcd/action/scalar/Scalar.h
+++ b/Grid/qcd/action/scalar/Scalar.h
@@ -0,0 +1,50 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/scalar/Scalar.h
+
+Copyright (C) 2017
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_SCALAR_H
+#define GRID_QCD_SCALAR_H
+
+#include <Grid/qcd/action/scalar/ScalarImpl.h>
+#include <Grid/qcd/action/scalar/ScalarAction.h>
+#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
+
+namespace Grid {
+namespace QCD {
+  // Convenience aliases: each suffix (R/F/D) selects the ScalarImpl* type with the same suffix.
+  using ScalarActionR = ScalarAction<ScalarImplR>;
+  using ScalarActionF = ScalarAction<ScalarImplF>;
+  using ScalarActionD = ScalarAction<ScalarImplD>;
+
+  template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
+
+}  // namespace QCD
+}  // namespace Grid
+
+#endif  // GRID_QCD_SCALAR_H
--- a/Grid/qcd/action/scalar/ScalarAction.h
+++ b/Grid/qcd/action/scalar/ScalarAction.h
@@ -0,0 +1,83 @@
+/*************************************************************************************
+
+  Grid physics library, www.github.com/paboyle/Grid
+
+  Source file: ./lib/qcd/action/scalar/ScalarAction.h
+
+  Copyright (C) 2015
+
+  Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+  Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+  Author: neo <cossu@post.kek.jp>
+  Author: paboyle <paboyle@ph.ed.ac.uk>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+  See the full license in the file "LICENSE" in the top level distribution
+directory
+  *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef SCALAR_ACTION_H
+#define SCALAR_ACTION_H
+
+namespace Grid {
+  // FIXME drop the QCD namespace everywhere here
+
+template <class Impl>
+class ScalarAction : public QCD::Action<typename Impl::Field> {
+ public:
+    INHERIT_FIELD_TYPES(Impl);
+
+ private:
+    RealD mass_square;  // coefficient of the quadratic term (logged as "mass_square")
+    RealD lambda;       // coefficient of the quartic term (logged as "lambda")
+
+ public:
+    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
+
+    virtual std::string LogParameters() {
+      std::stringstream msg;
+      msg << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
+      msg << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
+      return msg.str();
+    }
+    virtual std::string action_name() { return "ScalarAction"; }
+    // Nothing to refresh: this action carries no pseudofermion fields.
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+    // S = (mass_square/2 + Nd) * sumphisquared + (lambda/24) * sumphifourth + sumphider
+    virtual RealD S(const Field &p) {
+      const RealD quadratic = mass_square * 0.5 + QCD::Nd;
+      return quadratic * ScalarObs<Impl>::sumphisquared(p) +
+             (lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) + ScalarObs<Impl>::sumphider(p);
+    }
+    // force = (mass_square + 2 Nd) p + (lambda/6) p^2 p - sum over mu of both unit Cshifts of p
+    virtual void deriv(const Field &p, Field &force) {
+      Field phi_sq(p._grid);
+      ScalarObs<Impl>::phisquared(phi_sq, p);
+
+      // hop accumulates minus the Cshift of p by -1 and +1 along every direction
+      Field hop(p._grid);
+      hop = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
+      for (int mu = 1; mu < QCD::Nd; ++mu) hop -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+      force = (mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * phi_sq * p + hop;
+    }
+};
+
+
+
+}  // namespace Grid
+
+#endif // SCALAR_ACTION_H
--- a/Grid/qcd/action/scalar/ScalarImpl.h
+++ b/Grid/qcd/action/scalar/ScalarImpl.h
@@ -0,0 +1,263 @@
+#ifndef SCALAR_IMPL
+#define SCALAR_IMPL
+
+
+namespace Grid {
+  //namespace QCD {
+
+template <class S>
+class ScalarImplTypes {
+ public:
+    typedef S Simd;
+    // Site tensor: the SIMD type wrapped in three nested iScalar levels.
+    template <typename vtype>
+    using iImplField = iScalar<iScalar<iScalar<vtype> > >;
+
+    typedef iImplField<Simd> SiteField;
+    typedef SiteField        SitePropagator;
+    typedef SiteField        SiteComplex;
+
+    typedef Lattice<SiteField> Field;
+    typedef Field              ComplexField;
+    typedef Field              FermionField;
+    typedef Field              PropagatorField;
+    // Fills P by calling gaussian() with pRNG.
+    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
+      gaussian(pRNG, P);
+    }
+    // Returns P unchanged (by value).
+    static inline Field projectForce(Field& P){return P;}
+    // Advances the field in place: U <- U + ep * P.
+    static inline void update_field(Field& P, Field& U, double ep) {
+      U += P*ep;
+    }
+    // Returns -sum(trace(U*U)) / 2.
+    static inline RealD FieldSquareNorm(Field& U) {
+      return (- sum(trace(U*U))/2.0);
+    }
+    // Hot and tepid starts both fill U via gaussian().
+    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
+      gaussian(pRNG, U);
+    }
+
+    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
+      gaussian(pRNG, U);
+    }
+    // Cold start sets U to 1.0; pRNG is not used.
+    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
+      U = 1.0;
+    }
+    // out = 1 / (m^2 + sum_mu khat_mu^2), khat_mu = 2 sin(pi x_mu / L_mu); x_mu is what LatticeCoordinate writes.
+    static void MomentumSpacePropagator(Field &out, RealD m)
+    {
+      GridBase               *grid = out._grid;
+      Field                  kmu(grid), one(grid);
+      const unsigned int     nd    = grid->_ndimension;
+      const std::vector<int> &l    = grid->_fdimensions;   // read-only view of the lattice extents
+
+      one = Complex(1.0,0.0);
+      out = m*m;
+      for(unsigned int mu = 0; mu < nd; mu++)   // index has the same signedness as nd
+      {
+        Real twoPiL = M_PI*2./l[mu];
+
+        LatticeCoordinate(kmu,static_cast<int>(mu));
+        kmu = 2.*sin(.5*twoPiL*kmu);
+        out = out + kmu*kmu;
+      }
+      out = one/out;
+    }
+    // out = backward FFT of (forward FFT of in) * momKernel, over all dimensions.
+    static void FreePropagator(const Field &in, Field &out,
+                               const Field &momKernel)
+    {
+      FFT   fft((GridCartesian *)in._grid);
+      Field inFT(in._grid);
+
+      fft.FFT_all_dim(inFT, in, FFT::forward);
+      inFT = inFT*momKernel;
+      fft.FFT_all_dim(out, inFT, FFT::backward);
+    }
+    // Convenience overload: builds the kernel with MomentumSpacePropagator(m) first.
+    static void FreePropagator(const Field &in, Field &out, RealD m)
+    {
+      Field momKernel(in._grid);
+
+      MomentumSpacePropagator(momKernel, m);
+      FreePropagator(in, out, momKernel);
+    }
+
+  };
+
+  #ifdef  USE_FFT_ACCELERATION
+  #ifndef FFT_MASS
+  #error  "USE_FFT_ACCELERATION is defined but not FFT_MASS"
+  #endif
+  #endif
+  
+  template <class S, unsigned int N>
+  class ScalarAdjMatrixImplTypes {
+  public:
+    using Simd  = S;
+    using Group = QCD::SU<N>;
+
+    template <typename vtype>
+    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
+    template <typename vtype>
+    using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
+
+    using SiteField      = iImplField<Simd>;
+    using SitePropagator = SiteField;
+    using SiteComplex    = iImplComplex<Simd>;
+
+    using Field           = Lattice<SiteField>;
+    using ComplexField    = Lattice<SiteComplex>;
+    using FermionField    = Field;
+    using PropagatorField = Field;
+    // Accumulates sum_mu khat_mu^2 into `out` (khat_mu = 2 sin(pi x_mu / L_mu)); `out` is not reset here.
+    static void MomentaSquare(ComplexField &out)
+    {
+      GridBase *grid = out._grid;
+      const std::vector<int> &dims = grid->FullDimensions();
+      ComplexField khat(grid);
+
+      for (int mu = 0; mu < grid->Nd(); mu++)
+      {
+        Real two_pi_over_L = M_PI * 2.0 / dims[mu];
+        LatticeCoordinate(khat, mu);
+        khat = 2.0 * sin(0.5 * two_pi_over_L * khat);
+        out += khat * khat;
+      }
+    }
+    // out = 1 / (m^2 + MomentaSquare)
+    static void MomentumSpacePropagator(ComplexField &out, RealD m)
+    {
+      GridBase *grid = out._grid;
+      ComplexField unity(grid);
+      unity = Complex(1.0, 0.0);
+      out = m * m;
+      MomentaSquare(out);
+      out = unity / out;
+    }
+
+    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
+    {
+#ifndef USE_FFT_ACCELERATION
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
+#else
+      // Accelerated variant: scale the noise by sqrt(khat^2 + FFT_MASS^2) in momentum space.
+      Field noise(P._grid), noise_p(P._grid);
+      ComplexField weight(P._grid);
+      weight = zero;
+      RealD mass = FFT_MASS;
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, noise);
+
+      FFT theFFT((GridCartesian*)P._grid);
+      theFFT.FFT_all_dim(noise_p, noise, FFT::forward);
+      MomentaSquare(weight);
+      weight += mass * mass;
+      weight = sqrt(weight);
+      noise_p *= weight;
+      theFFT.FFT_all_dim(P, noise_p, FFT::backward);
+
+#endif //USE_FFT_ACCELERATION
+    }
+
+    static inline Field projectForce(Field& P) {return P;}
+
+    static inline void update_field(Field &P, Field &U, double ep)
+    {
+#ifndef USE_FFT_ACCELERATION
+      const double start = usecond();
+      U += P * ep;
+      const double stop = usecond();
+      const double elapsed_s = (stop - start) / 1e6;
+      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << elapsed_s << std::endl;
+#else
+      // FFT transform P(x) -> P(p)
+      // divide by (M^2+p^2)  M external parameter (how to pass?)
+      // P'(p) = P(p)/(M^2+p^2)
+      // Transform back -> P'(x)
+      // U += P'(x)*ep
+
+      Field mom_p(U._grid), mom_x(U._grid);
+      static ComplexField p2(U._grid);  // kernel cache: constructed from the first call's grid only
+      RealD mass = FFT_MASS;
+
+      FFT theFFT((GridCartesian*)U._grid);
+      theFFT.FFT_all_dim(mom_p, P, FFT::forward);
+
+      static bool first_call = true;
+      if (first_call)
+      {
+        // avoid recomputing
+        MomentumSpacePropagator(p2, mass);
+        first_call = false;
+      }
+      mom_p *= p2;
+      theFFT.FFT_all_dim(mom_x, mom_p, FFT::backward);
+      U += mom_x * ep;
+
+#endif //USE_FFT_ACCELERATION
+    }
+
+    static inline RealD FieldSquareNorm(Field &U)
+    {
+#ifndef USE_FFT_ACCELERATION
+      return (TensorRemove(sum(trace(U * U))).real());
+#else
+      // In case of Fourier acceleration we have to:
+      // compute U(p)*U(p)/(M^2+p^2))   Parseval theorem
+      // 1 FFT needed U(x) -> U(p)
+      // M to be passed
+
+      FFT theFFT((GridCartesian*)U._grid);
+      Field U_p(U._grid);
+
+      theFFT.FFT_all_dim(U_p, U, FFT::forward);
+      RealD mass = FFT_MASS;
+      ComplexField kernel(U._grid);
+      MomentumSpacePropagator(kernel, mass);
+      Field weighted = U_p * kernel;
+      // from the definition of the DFT we need to divide by the volume
+      return (-TensorRemove(sum(trace(adj(U_p) * weighted))).real() / U._grid->gSites());
+#endif //USE_FFT_ACCELERATION
+    }
+
+    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
+    }
+
+    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
+    }
+
+    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
+      U = zero;
+    }
+
+  };
+
+
+
+
+  using ScalarImplR  = ScalarImplTypes<vReal>;
+  using ScalarImplF  = ScalarImplTypes<vRealF>;
+  using ScalarImplD  = ScalarImplTypes<vRealD>;
+  using ScalarImplCR = ScalarImplTypes<vComplex>;
+  using ScalarImplCF = ScalarImplTypes<vComplexF>;
+  using ScalarImplCD = ScalarImplTypes<vComplexD>;
+
+  // Hardcoding here the size of the matrices
+  using ScalarAdjImplR = ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc>;
+  using ScalarAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc>;
+  using ScalarAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc>;
+
+  template <int Colours> using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,  Colours>;
+  template <int Colours> using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours>;
+  template <int Colours> using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours>;
+
+  //}
+}
+
+#endif
--- a/Grid/qcd/action/scalar/ScalarInteractionAction.h
+++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h
@@ -0,0 +1,208 @@
+/*************************************************************************************
+
+  Grid physics library, www.github.com/paboyle/Grid
+
+  Source file: ./lib/qcd/action/scalar/ScalarInteractionAction.h
+
+  Copyright (C) 2015
+
+  Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+  See the full license in the file "LICENSE" in the top level distribution
+directory
+  *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef SCALAR_INT_ACTION_H
+#define SCALAR_INT_ACTION_H
+
+// Note: this action can completely absorb the ScalarAction for real float fields
+// use the scalarObjs to generalise the structure
+
+namespace Grid
+{
+// FIXME drop the QCD namespace everywhere here
+
+// Action for a scalar field made of site-local quadratic and quartic terms
+// plus a nearest-neighbour coupling, the latter read through a
+// CartesianStencil over the 2 * Ndim unit displacements.
+//
+// Template parameters:
+//   Impl - supplies the field types (via INHERIT_FIELD_TYPES) and
+//          Impl::Group::Dimension, stored here as N.
+//   Ndim - number of lattice dimensions; S() and deriv() assert that the
+//          grid of the field they are given has exactly this many.
+template <class Impl, int Ndim>
+class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
+{
+public:
+  INHERIT_FIELD_TYPES(Impl);
+
+private:
+  RealD mass_square;
+  RealD lambda;
+  RealD g;
+  const unsigned int N = Impl::Group::Dimension;
+
+  typedef typename Field::vector_object vobj;
+  typedef CartesianStencil<vobj, vobj> Stencil;
+
+  SimpleCompressor<vobj> compressor;
+  int npoint = 2 * Ndim;
+  std::vector<int> directions;    // lattice direction of each stencil point
+  std::vector<int> displacements; // displacement (+1 or -1) of each stencil point
+
+public:
+  // ms   - value stored as mass_square
+  // l    - quartic coupling, stored as lambda
+  // gval - coupling g; S() and deriv() scale their result by N / g
+  //
+  // Stencil points [0, Ndim) are the +1 neighbours and points
+  // [Ndim, 2 * Ndim) the -1 neighbours along each direction.
+  // The initializers are listed in declaration order, which is the order
+  // in which the members are actually initialised.
+  ScalarInteractionAction(RealD ms, RealD l, RealD gval)
+      : mass_square(ms), lambda(l), g(gval),
+        directions(2 * Ndim, 0), displacements(2 * Ndim, 0)
+  {
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      directions[mu] = mu;
+      directions[mu + Ndim] = mu;
+      displacements[mu] = 1;
+      displacements[mu + Ndim] = -1;
+    }
+  }
+
+  // Returns the three couplings formatted one per line for the log.
+  virtual std::string LogParameters()
+  {
+    std::stringstream sstream;
+    sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
+    return sstream.str();
+  }
+
+  virtual std::string action_name() { return "ScalarAction"; }
+
+  // Nothing to refresh for this action.
+  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+
+  // Evaluates the action on the field p.
+  //
+  // Per site this accumulates
+  //   (2 * Ndim + mass_square) * p^2 - lambda * p^4
+  //   - sum_mu [ p(x + mu) * p(x) + p(x) * p(x + mu) ]
+  // using only the +1 stencil points, and returns
+  //   -Re(sum(trace(.))) * N / g.
+  virtual RealD S(const Field &p)
+  {
+    assert(p._grid->Nd() == Ndim);
+    // NOTE(review): function-local static, so the stencil is built once from
+    // the grid of the first call and then reused by every later call and by
+    // every object of this template instantiation.
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    phiStencil.HaloExchange(p, compressor);
+    Field action(p._grid), phisquared(p._grid);
+    phisquared = p * p;
+    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      // Stencil version of: action -= Cshift(p, mu, +1) * p + p * Cshift(p, mu, +1)
+      parallel_for(int i = 0; i < p._grid->oSites(); i++)
+      {
+        int permute_type;
+        StencilEntry *SE;
+        vobj temp2;
+        const vobj *temp, *t_p;
+
+        SE = phiStencil.GetEntry(permute_type, mu, i);
+        t_p = &p._odata[i];
+        if (SE->_is_local)
+        {
+          temp = &p._odata[SE->_offset];
+          if (SE->_permute)
+          {
+            permute(temp2, *temp, permute_type);
+            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
+          }
+          else
+          {
+            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
+          }
+        }
+        else
+        {
+          // Neighbour is not local: read it from the stencil's comm buffer.
+          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
+        }
+      }
+    }
+    // NB the trace in the algebra is normalised to 1/2
+    // minus sign coming from the antihermitian fields
+    return -(TensorRemove(sum(trace(action)))).real() * N / g;
+  }
+
+  // Computes the force for the field p into `force`:
+  //   force = [ (2 * Ndim + mass_square) * p - 2 * lambda * p^3
+  //             - sum over all 2 * Ndim stencil neighbours of p ] * N / g
+  // Timings and flop estimates are written to GridLogIntegrator on every call.
+  virtual void deriv(const Field &p, Field &force)
+  {
+    double t0 = usecond();
+    assert(p._grid->Nd() == Ndim);
+    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
+    double interm_t = usecond();
+
+    // move this outside
+    // NOTE(review): function-local static, built once from the grid of the
+    // first call and reused afterwards (see the same construct in S()).
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+
+    phiStencil.HaloExchange(p, compressor);
+    double halo_t = usecond();
+    int chunk = 128;
+    // Stencil version of:
+    //   for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+
+    // inverting the order of the loops slows down the code(! g++ 7)
+    // cannot try to reduce the number of  force writes by factor npoint...
+    // use cache blocking
+    for (int point = 0; point < npoint; point++)
+    {
+#pragma omp parallel
+      {
+        int permute_type;
+        StencilEntry *SE;
+        const vobj *temp;
+
+#pragma omp for schedule(static, chunk)
+        for (int i = 0; i < p._grid->oSites(); i++)
+        {
+          SE = phiStencil.GetEntry(permute_type, point, i);
+          // prefetch next p?
+
+          if (SE->_is_local)
+          {
+            temp = &p._odata[SE->_offset];
+
+            if (SE->_permute)
+            {
+              vobj temp2;
+              permute(temp2, *temp, permute_type);
+              force._odata[i] -= temp2;
+            }
+            else
+            {
+              force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
+            }
+          }
+          else
+          {
+            // Neighbour is not local: read it from the stencil's comm buffer.
+            force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+          }
+        }
+      }
+    }
+    force *= N / g;
+
+    double t1 = usecond();
+    double total_time = (t1 - t0) / 1e6;
+    double interm_time = (interm_t - t0) / 1e6;
+    double halo_time = (halo_t - interm_t) / 1e6;
+    double stencil_time = (t1 - halo_t) / 1e6;
+    std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
+    std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
+    std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
+    std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
+    // Evaluate the estimates in double so that the product of the site count
+    // and the per-site cost cannot wrap in integer arithmetic on large
+    // lattices.
+    const double sites = static_cast<double>(p._grid->gSites());
+    const double nc = static_cast<double>(N);
+    double flops = sites * (14 * nc * nc * nc + 18 * nc * nc + 2);
+    double flops_no_stencil = sites * (14 * nc * nc * nc + 6 * nc * nc + 2);
+    double Gflops = flops / (total_time * 1e9);
+    double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
+    std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
+    std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
+  }
+};
+
+} // namespace Grid
+
+#endif // SCALAR_INT_ACTION_H