Getting closer on the GPU port, but will start deleting 5th dim vectorised variants

for code maintainability
2025-07-21 23:27:06 +01:00 · 2019-06-04 11:53:44 +01:00
parent 7b59ab5bd7
commit ade4a126da
26 changed files with 8125 additions and 477 deletions
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -574,12 +574,12 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
    dee[Ls-1] += delta_d;
  }  

-  int inv=1;
-  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
-  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
+  //  int inv=1;
+  //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
+  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
 }

-
+#if 0
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
 						 Vector<iSinglet<Simd> > & Matp,
@@ -658,7 +658,7 @@ void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
      Matm[LLs*s2+s1] = Vm;
    }}
 }
-
+#endif

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DInstantiation.cc
@@ -1,45 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/CayleyFermion5D.h>
-#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
-			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
-			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// FIXME: Break these out to parallel make accelerate
-FermOpTemplateInstantiate(CayleyFermion5D);
-GparityFermOpTemplateInstantiate(CayleyFermion5D);
-
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -0,0 +1,239 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+NAMESPACE_BEGIN(Grid);
+
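+// In M5D: Pminus (spProj5m) acts on the forward neighbour, s+1;
+// Pplus (spProj5p) on the backward neighbour, s-1. M5Ddag swaps the two projectors.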
+// M5D (cache variant): applies the fifth-dimension stencil to each block of Ls
+// consecutive outer sites:
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
+// where s+1 wraps to 0 at s==Ls-1 and s-1 wraps to Ls-1 at s==0.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
+			       const FermionField &phi_i, 
+			       FermionField &chi_i,
+			       Vector<Coeff_t> &lower,
+			       Vector<Coeff_t> &diag,
+			       Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard(); // output takes the input's checkerboard
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+  int Ls =this->Ls;
+
+  // 10 = 3 complex mult + 2 complex add
+  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
+  M5Dcalls++;
+  M5Dtime-=usecond(); // paired with the += at the end: accumulates elapsed time
+
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
+    // Scratch spinor declared once per block (as M5Ddag does) rather than re-copied from psi[0] for every s.
+    auto tmp = psi[0];
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) {                  // first slice: backward neighbour wraps to Ls-1
+	spProj5m(tmp,psi[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+	spProj5p(tmp,psi[ss+Ls-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else if ( s==(Ls-1)) {       // last slice: forward neighbour wraps to 0
+	spProj5m(tmp,psi[ss+0]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+	spProj5p(tmp,psi[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else {                       // interior slice: no wrap
+	spProj5m(tmp,psi[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+	spProj5p(tmp,psi[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      }
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// M5Ddag (cache variant): same fifth-dimension stencil as M5D with the two projectors exchanged:
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
+// where s+1 and s-1 wrap cyclically inside each block of Ls consecutive outer sites.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
+				  const FermionField &phi_i, 
+				  FermionField &chi_i,
+				  Vector<Coeff_t> &lower,
+				  Vector<Coeff_t> &diag,
+				  Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard(); // output takes the input's checkerboard
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  int Ls=this->Ls;
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;          // note: shares the M5Dcalls/M5Dtime counters with M5D
+  M5Dtime-=usecond();  // paired with the += at the end: accumulates elapsed time
+
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
+    auto tmp = psi[0]; // scratch spinor, overwritten by every projection below
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) { // first slice: backward neighbour wraps to Ls-1
+	spProj5p(tmp,psi[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+	spProj5m(tmp,psi[ss+Ls-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else if ( s==(Ls-1)) { // last slice: forward neighbour wraps to 0
+	spProj5p(tmp,psi[ss+0]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+	spProj5m(tmp,psi[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else { // interior slice: no wrap
+	spProj5p(tmp,psi[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+	spProj5m(tmp,psi[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      }
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// MooeeInv (cache variant): for every block of Ls consecutive outer sites, builds chi from psi and
+// then updates chi in place through further sweeps over s, using the member coefficient
+// vectors lee, leem, dee, ueem and uee (filled elsewhere; not visible in this file).
+// The sweeps are order-dependent: each one reads chi values written by the previous one.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard(); // output takes the input's checkerboard
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  int Ls=this->Ls;
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond(); // paired with the += at the end: accumulates elapsed time
+
+  thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
+    auto tmp = psi[0]; // scratch spinor, overwritten by every projection below
+
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+    // Apply (L^{\prime})^{-1}
+    chi[ss]=psi[ss]; // chi[0]=psi[0]
+    for(int s=1;s<Ls;s++){
+      spProj5p(tmp,chi[ss+s-1]);  
+      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp; // uses the already-updated chi[s-1]
+    }
+    // L_m^{-1} 
+    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp,chi[ss+s]);    
+      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp; // only the last slice is modified here
+    }
+    // U_m^{-1} D^{-1}
+    for (int s=0;s<Ls-1;s++){
+      // Chi[s] + 1/d chi[s] 
+      spProj5p(tmp,chi[ss+Ls-1]); 
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1]; // last slice is scaled after the loop that reads it unscaled
+      
+    // Apply U^{-1}
+    for (int s=Ls-2;s>=0;s--){ // descending s: each step reads the chi[s+1] just written
+      spProj5m(tmp,chi[ss+s+1]);  
+      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
+    }
+  });
+
+  MooeeInvTime+=usecond();
+}
+
+// MooeeInvDag (cache variant): daggered counterpart of MooeeInv. For every block of Ls consecutive
+// outer sites chi is built from psi and then updated in place by order-dependent sweeps that use
+// the conjugated member coefficients uee, ueem, dee, leem and lee, with spProj5p and spProj5m
+// exchanged relative to MooeeInv.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  int Ls=this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  // Was psi==psi, which can never fail; check that the output view carries the input's checkerboard.
+  assert(chi.Checkerboard() == psi.Checkerboard());
+
+  MooeeInvCalls++;         // note: shares the MooeeInvCalls/MooeeInvTime counters with MooeeInv
+  MooeeInvTime-=usecond(); // paired with the += at the end: accumulates elapsed time
+
+  thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
+    auto tmp = psi[0]; // scratch spinor, overwritten by every projection below
+    // Apply (U^{\prime})^{-dagger}
+    chi[ss]=psi[ss];
+    for (int s=1;s<Ls;s++){
+      spProj5m(tmp,chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s]-conjugate(uee[s-1])*tmp; // uses the already-updated chi[s-1]
+    }
+    // U_m^{-\dagger} 
+    for (int s=0;s<Ls-1;s++){
+      spProj5p(tmp,chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - conjugate(ueem[s])*tmp; // only the last slice is modified here
+    }
+
+    // L_m^{-\dagger} D^{-dagger}
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(tmp,chi[ss+Ls-1]);
+      chi[ss+s] = conjugate(1.0/dee[s])*chi[ss+s]-conjugate(leem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= conjugate(1.0/dee[Ls-1])*chi[ss+Ls-1]; // last slice is scaled after the loop that reads it unscaled
+  
+    // Apply L^{-dagger}
+    for (int s=Ls-2;s>=0;s--){ // descending s: each step reads the chi[s+1] just written
+      spProj5p(tmp,chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - conjugate(lee[s])*tmp;
+    }
+  });
+
+  MooeeInvTime+=usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h
@@ -0,0 +1,284 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+
+NAMESPACE_BEGIN(Grid);
+
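+// In M5D: Pminus (spProj5m) acts on the forward neighbour, s+1;
+// Pplus (spProj5p) on the backward neighbour, s-1. M5Ddag swaps the two projectors.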
+// M5D (GPU variant): one accelerator_loopN index per (SIMD lane, block of Ls outer sites). For each s:
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[(s+1)%Ls]) + lower[s]*spProj5p(psi[(s+Ls-1)%Ls])
+// computed on a single lane pulled out with extractLane and written back with insertLane.
+// The coefficient vectors are read through raw pointers (presumably so the loop body can use them on the device - confirm).
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
+				const FermionField &phi_i, 
+				FermionField &chi_i,
+				Vector<Coeff_t> &lower,
+				Vector<Coeff_t> &diag,
+				Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard(); // output takes the input's checkerboard
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  Coeff_t *lower_v = &lower[0];
+  Coeff_t *diag_v  = &diag[0];
+  Coeff_t *upper_v = &upper[0];
+  int Ls =this->Ls;
+  assert(phi.Checkerboard() == psi.Checkerboard());
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites4d = nsimd * grid->oSites() / Ls; // number of (lane, Ls-block) pairs
+  // 10 = 3 complex mult + 2 complex add
+  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
+  M5Dcalls++;
+  M5Dtime-=usecond(); // paired with the += at the end: accumulates elapsed time
+  
+  accelerator_loopN( sss, sites4d ,{
+    uint64_t lane = sss % nsimd;        // SIMD lane handled by this index
+    uint64_t ss   = Ls * (sss / nsimd); // first outer site of the block handled by this index
+    for(int s=0;s<Ls;s++){
+      auto res = extractLane(lane,phi[ss+s]);
+      res = diag_v[s]*res;
+      
+      auto tmp = extractLane(lane,psi[ss+(s+1)%Ls]); // forward neighbour, cyclic in s
+      spProj5m(tmp,tmp);
+      res += upper_v[s]*tmp;
+      
+      tmp = extractLane(lane,psi[ss+(s+Ls-1)%Ls]); // backward neighbour, cyclic in s
+      spProj5p(tmp,tmp);
+      res += lower_v[s]*tmp;
+      insertLane(lane,chi[ss+s],res);
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// M5Ddag (GPU variant): same lane-by-lane stencil as the GPU M5D with the two projectors exchanged:
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[(s+1)%Ls]) + lower[s]*spProj5m(psi[(s+Ls-1)%Ls])
+// One accelerator_loopN index handles one SIMD lane of one block of Ls outer sites.
+// The coefficient vectors are read through raw pointers (presumably so the loop body can use them on the device - confirm).
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
+				   const FermionField &phi_i, 
+				   FermionField &chi_i,
+				   Vector<Coeff_t> &lower,
+				   Vector<Coeff_t> &diag,
+				   Vector<Coeff_t> &upper)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard(); // output takes the input's checkerboard
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  Coeff_t *lower_v = &lower[0];
+  Coeff_t *diag_v  = &diag[0];
+  Coeff_t *upper_v = &upper[0];
+  int Ls =this->Ls;
+  assert(phi.Checkerboard() == psi.Checkerboard());
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites4d = nsimd * grid->oSites() / Ls; // number of (lane, Ls-block) pairs
+  // 10 = 3 complex mult + 2 complex add
+  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
+  M5Dcalls++;          // note: shares the M5Dcalls/M5Dtime counters with M5D
+  M5Dtime-=usecond();  // paired with the += at the end: accumulates elapsed time
+
+  accelerator_loopN( sss, sites4d ,{
+    uint64_t lane = sss % nsimd;        // SIMD lane handled by this index
+    uint64_t ss   = Ls * (sss / nsimd); // first outer site of the block handled by this index
+    for(int s=0;s<Ls;s++){
+      auto res = extractLane(lane,phi[ss+s]);
+      res = diag_v[s]*res;
+      
+      auto tmp = extractLane(lane,psi[ss+(s+1)%Ls]); // forward neighbour, cyclic in s
+      spProj5p(tmp,tmp);
+      res += upper_v[s]*tmp;
+      
+      tmp = extractLane(lane,psi[ss+(s+Ls-1)%Ls]); // backward neighbour, cyclic in s
+      spProj5m(tmp,tmp);
+      res += lower_v[s]*tmp;
+      insertLane(lane,chi[ss+s],res);
+    }
+  });
+  M5Dtime+=usecond();
+}
+
+// MooeeInv (GPU variant): one accelerator_loopN index per (SIMD lane, block of Ls outer sites).
+// A forward sweep over s builds chi from psi using lee_v while accumulating the leem_v-weighted
+// spProj5m terms in acc; the last slice is then corrected by acc and scaled by 1/dee_v[Ls-1], and a
+// backward sweep finishes the remaining slices using dee_v, uee_v and ueem_v.
+// NOTE(review): lee_v[Ls-2] is read unconditionally, so Ls >= 2 appears to be assumed - confirm.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard(); // output takes the input's checkerboard
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  Coeff_t *lee_v  = &lee[0];
+  Coeff_t *leem_v = &leem[0];
+  Coeff_t *uee_v  = &uee[0];
+  Coeff_t *ueem_v = &ueem[0];
+  Coeff_t *dee_v  = &dee[0];
+  int Ls=this->Ls;
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites4d = nsimd * grid->oSites() / Ls; // number of (lane, Ls-block) pairs
+  typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond(); // paired with the += at the end: accumulates elapsed time
+  
+  accelerator_loopN( sss, sites4d ,{
+    uint64_t lane = sss % nsimd;        // SIMD lane handled by this index
+    uint64_t ss   = Ls * (sss / nsimd); // first outer site of the block handled by this index
+    ScalarSiteSpinor res, tmp, acc;     // res: current slice, tmp: projection carried to the next slice, acc: running sum
+    
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = extractLane(lane,psi[ss]);
+    spProj5m(tmp,res);
+    acc = leem_v[0]*tmp;
+    spProj5p(tmp,res); // tmp now holds the spProj5p projection consumed by slice s=1
+    insertLane(lane,chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = extractLane(lane,psi[ss+s]);
+      res -= lee_v[s-1]*tmp;
+      spProj5m(tmp,res);
+      acc += leem_v[s]*tmp;
+      spProj5p(tmp,res); // carried into the next iteration
+      insertLane(lane,chi[ss+s],res);
+    }
+    res = extractLane(lane,psi[ss+Ls-1]);
+    res = res - lee_v[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    res = (1.0/dee_v[Ls-1])*res;
+    insertLane(lane,chi[ss+Ls-1],res);
+    spProj5p(acc,res); // acc is reused: fixed projection of the last slice for the whole backward sweep
+    spProj5m(tmp,res);
+    for (int s=Ls-2;s>=0;s--){
+      res = extractLane(lane,chi[ss+s]); // reads back the value stored by the forward sweep
+      res = (1.0/dee_v[s])*res - uee_v[s]*tmp - ueem_v[s]*acc;
+      spProj5m(tmp,res); // carried into the next (lower s) iteration
+      insertLane(lane,chi[ss+s],res);
+    }
+  });
+  
+  MooeeInvTime+=usecond();
+}
+
+// MooeeInvDag (GPU variant): daggered counterpart of the GPU MooeeInv, one accelerator_loopN index
+// per (SIMD lane, block of Ls outer sites). The forward sweep uses conjugate(uee_v) and accumulates
+// conjugate(ueem_v)-weighted spProj5p terms in acc; the last slice is corrected by acc and scaled by
+// conjugate(1/dee_v[Ls-1]); the backward sweep uses conjugate(lee_v) and conjugate(leem_v).
+// NOTE(review): uee_v[Ls-2] is read unconditionally, so Ls >= 2 appears to be assumed - confirm.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard()=psi_i.Checkerboard(); // output takes the input's checkerboard
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  Coeff_t *lee_v  = &lee[0];
+  Coeff_t *leem_v = &leem[0];
+  Coeff_t *uee_v  = &uee[0];
+  Coeff_t *ueem_v = &ueem[0];
+  Coeff_t *dee_v  = &dee[0];
+  int Ls=this->Ls;
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites4d = nsimd * grid->oSites() / Ls; // number of (lane, Ls-block) pairs
+  typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
+  MooeeInvCalls++;         // note: shares the MooeeInvCalls/MooeeInvTime counters with MooeeInv
+  MooeeInvTime-=usecond(); // paired with the += at the end: accumulates elapsed time
+  
+  accelerator_loopN( sss, sites4d ,{
+    uint64_t lane = sss % nsimd;        // SIMD lane handled by this index
+    uint64_t ss   = Ls * (sss / nsimd); // first outer site of the block handled by this index
+    ScalarSiteSpinor res, tmp, acc;     // res: current slice, tmp: projection carried to the next slice, acc: running sum
+    
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
+    res = extractLane(lane,psi[ss]);
+    spProj5p(tmp,res);
+    acc = conjugate(ueem_v[0])*tmp;
+    spProj5m(tmp,res); // tmp now holds the spProj5m projection consumed by slice s=1
+    insertLane(lane,chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = extractLane(lane,psi[ss+s]);
+      res -= conjugate(uee_v[s-1])*tmp;
+      spProj5p(tmp,res);
+      acc += conjugate(ueem_v[s])*tmp;
+      spProj5m(tmp,res); // carried into the next iteration
+      insertLane(lane,chi[ss+s],res);
+    }
+    res = extractLane(lane,psi[ss+Ls-1]);
+    res = res - conjugate(uee_v[Ls-2])*tmp - acc;
+    
+    // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
+    res = conjugate(1.0/dee_v[Ls-1])*res;
+    insertLane(lane,chi[ss+Ls-1],res);
+    spProj5m(acc,res); // acc is reused: fixed projection of the last slice for the whole backward sweep
+    spProj5p(tmp,res);
+    for (int s=Ls-2;s>=0;s--){
+      res = extractLane(lane,chi[ss+s]); // reads back the value stored by the forward sweep
+      res = conjugate(1.0/dee_v[s])*res - conjugate(lee_v[s])*tmp - conjugate(leem_v[s])*acc;
+      spProj5p(tmp,res); // carried into the next (lower s) iteration
+      insertLane(lane,chi[ss+s],res);
+    }
+  });
+  
+  MooeeInvTime+=usecond();
+}
+
+// Compiled only when CAYLEY_DPERP_GPU is defined: one INSTANTIATE_DPERP per implementation type (the macro is defined outside the visible part of this patch; presumably it emits explicit instantiations of the routines above - confirm).
+#ifdef CAYLEY_DPERP_GPU
+INSTANTIATE_DPERP(WilsonImplF);
+INSTANTIATE_DPERP(WilsonImplD);
+INSTANTIATE_DPERP(GparityWilsonImplF);
+INSTANTIATE_DPERP(GparityWilsonImplD);
+INSTANTIATE_DPERP(ZWilsonImplF);
+INSTANTIATE_DPERP(ZWilsonImplD);
+INSTANTIATE_DPERP(WilsonImplFH);
+INSTANTIATE_DPERP(WilsonImplDF);
+INSTANTIATE_DPERP(GparityWilsonImplFH);
+INSTANTIATE_DPERP(GparityWilsonImplDF);
+INSTANTIATE_DPERP(ZWilsonImplFH);
+INSTANTIATE_DPERP(ZWilsonImplDF);
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
@@ -0,0 +1,831 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+template<class Impl> // MooeeInvDag (Ls-vectorised variant): thin wrapper that delegates to MooeeInternal
+void
+CayleyFermion5D<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; // unused local; presumably a compile-time guard tying this variant to LsVectorised implementations - confirm
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes); // MooeeInternal is defined outside this view; called with the dagger and inverse flags both set
+}
+  
+template<class Impl> // MooeeInv (Ls-vectorised variant): thin wrapper that delegates to MooeeInternal
+void
+CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; // unused local; presumably a compile-time guard tying this variant to LsVectorised implementations - confirm
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes); // MooeeInternal is defined outside this view; called with inverse set and dagger not set
+}
+// M5D (Ls-vectorised variant): each block holds LLs = grid->_rdimensions[0] vector sites and the
+// code asserts Ls/LLs == Simd::Nsimd(). The upper/lower/diag coefficients are first repacked into
+// LLs SIMD words; a neighbour index that wraps around the block has its lanes rotated with tRotate.
+// The compiled (#else) path addresses colours 0..2 explicitly, which is why it asserts Nc==3.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
+			   const FermionField &phi_i, 
+			   FermionField &chi_i,
+			   Vector<Coeff_t> &lower,
+			   Vector<Coeff_t> &diag,
+			   Vector<Coeff_t> &upper)
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; // unused local; presumably a compile-time guard for LsVectorised implementations - confirm
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  const int nsimd= Simd::Nsimd();
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+  assert(Ls/LLs==nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  for(int o=0;o<LLs;o++){ // outer
+    for(int i=0;i<nsimd;i++){ //inner
+      int s  = o+i*LLs;     // fifth-dimension index whose coefficient goes into ...
+      int ss = o*nsimd+i;   // ... scalar slot i of packed word o
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+  M5Dcalls++;
+  M5Dtime-=usecond(); // paired with the += at the end: accumulates elapsed time
+
+  assert(Nc==3);
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
+#if 0 // disabled: this generic projector-based version is never compiled
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5m(hp,psi[ss+vp]);
+      spProj5p(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	
+      hp=0.5*hp;
+      hm=0.5*hm;
+
+      spRecon5m(fp,hp);
+      spRecon5p(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v];
+      chi[ss+v] = chi[ss+v]     +u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+#else
+    for(int v=0;v<LLs;v++){
+      
+      vprefetch(psi[ss+v+LLs]);
+
+      int vp= (v==LLs-1) ? 0     : v+1; // forward neighbour within the block, cyclic
+      int vm= (v==0    ) ? LLs-1 : v-1; // backward neighbour within the block, cyclic
+	
+      Simd hp_00 = psi[ss+vp]()(2)(0); 
+      Simd hp_01 = psi[ss+vp]()(2)(1); 
+      Simd hp_02 = psi[ss+vp]()(2)(2); 
+      Simd hp_10 = psi[ss+vp]()(3)(0); 
+      Simd hp_11 = psi[ss+vp]()(3)(1); 
+      Simd hp_12 = psi[ss+vp]()(3)(2); 
+	
+      Simd hm_00 = psi[ss+vm]()(0)(0); 
+      Simd hm_01 = psi[ss+vm]()(0)(1); 
+      Simd hm_02 = psi[ss+vm]()(0)(2); 
+      Simd hm_10 = psi[ss+vm]()(1)(0); 
+      Simd hm_11 = psi[ss+vm]()(1)(1); 
+      Simd hm_12 = psi[ss+vm]()(1)(2); 
+
+      if ( vp<=v ) { // forward index wrapped: rotate the lanes of spin components 2,3 (tRotate<2>; exact lane shift defined by Optimization::Rotate - confirm)
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+      if ( vm>=v ) { // backward index wrapped: rotate the lanes of spin components 0,1 by the complementary amount
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Can force these to real arithmetic and save 2x.
+      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
+      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
+      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+      vstream(chi[ss+v]()(0)(0),p_00);
+      vstream(chi[ss+v]()(0)(1),p_01);
+      vstream(chi[ss+v]()(0)(2),p_02);
+      vstream(chi[ss+v]()(1)(0),p_10);
+      vstream(chi[ss+v]()(1)(1),p_11);
+      vstream(chi[ss+v]()(1)(2),p_12);
+      vstream(chi[ss+v]()(2)(0),p_20);
+      vstream(chi[ss+v]()(2)(1),p_21);
+      vstream(chi[ss+v]()(2)(2),p_22);
+      vstream(chi[ss+v]()(3)(0),p_30);
+      vstream(chi[ss+v]()(3)(1),p_31);
+      vstream(chi[ss+v]()(3)(2),p_32);
+
+    }
+#endif
+  });
+  M5Dtime+=usecond();
+}
+
+// M5Ddag (Ls-vectorised variant): same layout and coefficient repacking as the vectorised M5D, with
+// the spin halves exchanged: the forward neighbour (vp) contributes spin components 0,1 weighted
+// by u, and the backward neighbour (vm) contributes spin components 2,3 weighted by l.
+// A neighbour index that wraps around the block of LLs vector sites has its lanes rotated with tRotate.
+template<class Impl>  
+void
+CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
+			      const FermionField &phi_i, 
+			      FermionField &chi_i,
+			      Vector<Coeff_t> &lower,
+			      Vector<Coeff_t> &diag,
+			      Vector<Coeff_t> &upper)
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; // unused local; presumably a compile-time guard for LsVectorised implementations - confirm
+  chi_i.Checkerboard()=psi_i.Checkerboard();
+  GridBase *grid=psi_i.Grid();
+  auto psi=psi_i.View();
+  auto phi=phi_i.View();
+  auto chi=chi_i.View();
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  int nsimd= Simd::Nsimd();
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+  assert(Ls/LLs==nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  for(int o=0;o<LLs;o++){ // outer
+    for(int i=0;i<nsimd;i++){ //inner
+      int s  = o+i*LLs;     // fifth-dimension index whose coefficient goes into ...
+      int ss = o*nsimd+i;   // ... scalar slot i of packed word o
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+  M5Dcalls++;          // note: shares the M5Dcalls/M5Dtime counters with M5D
+  M5Dtime-=usecond();  // paired with the += at the end: accumulates elapsed time
+  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
+#if 0 // disabled: this generic projector-based version is never compiled
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5p(hp,psi[ss+vp]);
+      spProj5m(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+      
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5p(fp,hp);
+      spRecon5m(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+#else
+    for(int v=0;v<LLs;v++){
+
+      vprefetch(psi[ss+v+LLs]);
+
+      int vp= (v==LLs-1) ? 0     : v+1; // forward neighbour within the block, cyclic
+      int vm= (v==0    ) ? LLs-1 : v-1; // backward neighbour within the block, cyclic
+	
+      Simd hp_00 = psi[ss+vp]()(0)(0); 
+      Simd hp_01 = psi[ss+vp]()(0)(1); 
+      Simd hp_02 = psi[ss+vp]()(0)(2); 
+      Simd hp_10 = psi[ss+vp]()(1)(0); 
+      Simd hp_11 = psi[ss+vp]()(1)(1); 
+      Simd hp_12 = psi[ss+vp]()(1)(2); 
+	
+      Simd hm_00 = psi[ss+vm]()(2)(0); 
+      Simd hm_01 = psi[ss+vm]()(2)(1); 
+      Simd hm_02 = psi[ss+vm]()(2)(2); 
+      Simd hm_10 = psi[ss+vm]()(3)(0); 
+      Simd hm_11 = psi[ss+vm]()(3)(1); 
+      Simd hm_12 = psi[ss+vm]()(3)(2); 
+
+      if ( vp<=v ) { // forward index wrapped: rotate the lanes of spin components 0,1 (tRotate<2>; exact lane shift defined by Optimization::Rotate - confirm)
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+      if ( vm>=v ) { // backward index wrapped: rotate the lanes of spin components 2,3 by the complementary amount
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
+      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
+      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+
+      vstream(chi[ss+v]()(0)(0),p_00);
+      vstream(chi[ss+v]()(0)(1),p_01);
+      vstream(chi[ss+v]()(0)(2),p_02);
+      vstream(chi[ss+v]()(1)(0),p_10);
+      vstream(chi[ss+v]()(1)(1),p_11);
+      vstream(chi[ss+v]()(1)(2),p_12);
+      vstream(chi[ss+v]()(2)(0),p_20);
+      vstream(chi[ss+v]()(2)(1),p_21);
+      vstream(chi[ss+v]()(2)(2),p_22);
+      vstream(chi[ss+v]()(3)(0),p_30);
+      vstream(chi[ss+v]()(3)(1),p_31);
+      vstream(chi[ss+v]()(3)(2),p_32);
+    }
+#endif
+  });
+  M5Dtime+=usecond();
+}
+
+
+#ifdef AVX512 
+#include <simd/Intel512common.h>
+#include <simd/Intel512avx.h>
+#include <simd/Intel512single.h>
+#endif 
+
+template<class Impl> // One-site dense multiply along s: chi[s1] accumulates Mat[LLs*(s2+lane*LLs)+s1] * (lane of psi[s2]) over all s2 and SIMD lanes.
+void
+CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField &chi_i,
+					int LLs, int site, // slices of this site are addressed as s+LLs*site for s in [0,LLs)
+					Vector<iSinglet<Simd> > &Matp, // coefficients applied to spin components 0 and 1
+					Vector<iSinglet<Simd> > &Matm) // coefficients applied to spin components 2 and 3
+{
+  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; // NOTE(review): presumably a compile-time guard on LsVectorised&&EnableBool - confirm EnableIf semantics
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+#ifndef AVX512
+  { // portable path written with the SIMD helper calls (vbroadcast / real_madd / vstream)
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0;s1<LLs;s1++){ // s1: output slice
+      for(int s2=0;s2<LLs;s2++){ // s2: input slice
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+	  int s=s2+l*LLs; // matrix row index built from the input slice and the lane
+	  int lex=s2+LLs*site; // location of input slice s2 for this site
+	
+	  if ( s2==0 && l==0) { // first term for this s1: reset both accumulators
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+	
+	  for(int sp=0;sp<2;sp++){ // spin components 0,1 of psi[lex] go into BcastP
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l); // presumably replicates lane l across the vector - confirm vbroadcast
+	    }}
+	  for(int sp=0;sp<2;sp++){ // spin components 2,3 of psi[lex] go into BcastM
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	    }}
+
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
+	      SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
+	    }}
+
+	}}
+      { // all (s2,lane) terms summed: write output slice s1
+	int lex = s1+LLs*site;
+	for(int sp=0;sp<2;sp++){
+	  for(int co=0;co<Nc;co++){
+	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+
+  }
+#else
+  { // AVX512 path: Chi_* and BCAST* below name fixed zmm registers used by the asm statements
+    // pointers
+    //  MASK_REGS;
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+#define BCAST0   %%zmm13
+#define BCAST1   %%zmm14
+#define BCAST2   %%zmm15
+#define BCAST3   %%zmm16
+#define BCAST4   %%zmm17
+#define BCAST5   %%zmm18
+#define BCAST6   %%zmm19
+#define BCAST7   %%zmm20
+#define BCAST8   %%zmm21
+#define BCAST9   %%zmm22
+#define BCAST10  %%zmm23
+#define BCAST11  %%zmm24
+
+    int incr=LLs*LLs*sizeof(iSinglet<Simd>); // byte step of LLs*LLs matrix entries, i.e. from row s2+l*LLs to row s2+(l+1)*LLs
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	int lex=s2+LLs*site;
+	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t)&psi[lex]; // advanced by one scalar per lane at the bottom of the lane loop
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	  if ( (s2+l)==0 ) { // same condition as s2==0 && l==0 above: first term uses the VMULMEM form instead of VMADDMEM
+	    asm ( // NOTE(review): no clobber list is given although zmm registers are written and reused by later asm statements
+		 VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
+		 VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
+		 VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
+		 VBCASTCDUP(0,%2,BCAST0)   
+		 VBCASTCDUP(1,%2,BCAST1)   
+		 VBCASTCDUP(2,%2,BCAST2)   
+		 VBCASTCDUP(3,%2,BCAST3)   
+		 VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
+		 VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
+		 VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
+		 VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
+		 VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
+		 VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
+		 VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
+		 VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
+		 VMULMEM (0,%1,BCAST8,Chi_22)         
+		 VMULMEM (0,%1,BCAST9,Chi_30)
+		 VMULMEM (0,%1,BCAST10,Chi_31)       
+		 VMULMEM (0,%1,BCAST11,Chi_32)
+		 : : "r" (a0), "r" (a1), "r" (a2)  );
+	  } else { 
+	    asm ( // subsequent terms: operand %0 (Matp) feeds Chi_00..Chi_12, operand %1 (Matm) feeds Chi_20..Chi_32
+		 VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
+		 VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
+		 VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
+		 VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
+		 VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
+		 VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
+		 VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
+		 VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
+		 VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
+		 VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
+		 VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
+		 VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
+		 : : "r" (a0), "r" (a1), "r" (a2)  );
+	  }
+	  a0 = a0+incr; // next lane: matrix row index grows by LLs
+	  a1 = a1+incr;
+	a2 = a2+sizeof(typename Simd::scalar_type); // next lane: step one scalar inside psi[lex]
+	}}
+      { // store the twelve accumulator registers to output slice s1
+	int lexa = s1+LLs*site;
+	asm (
+	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
+	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
+	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
+	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+      }
+    }
+  }
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+#endif
+};
+
+// Z-mobius version
+//
+// One-site dense multiply along s for complex matrix coefficients:
+//   chi[s1] = sum over (s2,lane) of Mat[LLs*(s2+lane*LLs)+s1] * (lane of psi[s2])
+// Matp is applied to spin components 0,1 and Matm to spin components 2,3.
+// The portable path uses operator* / operator+ on the SIMD types; the AVX512
+// path keeps the twelve accumulators in zmm0..zmm11 across asm statements.
+//
+// Every helper macro defined for the AVX512 path is #undef'd again before the
+// function ends, so that short names such as N, Mp and Mm do not leak into
+// code that follows in this header.
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField &chi_i,
+					 int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
+{
+  EnableIf<Impl::LsVectorised,int> sfinae=0;
+#ifndef AVX512
+  {
+    auto psi = psi_i.View();
+    auto chi = chi_i.View();
+
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0;s1<LLs;s1++){ // s1: output slice
+      for(int s2=0;s2<LLs;s2++){ // s2: input slice
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+	  int s=s2+l*LLs; // matrix row index built from the input slice and the lane
+	  int lex=s2+LLs*site;
+	
+	  if ( s2==0 && l==0) { // first term for this s1: reset both accumulators
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+	
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	    }}
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	    }}
+
+	  for(int sp=0;sp<2;sp++){
+	    for(int co=0;co<Nc;co++){
+	      SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
+	      SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
+	    }}
+
+
+	}}
+      { // all (s2,lane) terms summed: write output slice s1
+	int lex = s1+LLs*site;
+	for(int sp=0;sp<2;sp++){
+	  for(int co=0;co<Nc;co++){
+	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+
+  }
+#else
+  {
+    auto psi = psi_i.View();
+    auto chi = chi_i.View();
+    // pointers
+    //  MASK_REGS;
+    // Chi_* (single %) are used inside the operand-less asm statements below;
+    // pChi_* (double %) name the same registers for the asm that has operands.
+#define Chi_00 %zmm0
+#define Chi_01 %zmm1
+#define Chi_02 %zmm2
+#define Chi_10 %zmm3
+#define Chi_11 %zmm4
+#define Chi_12 %zmm5
+#define Chi_20 %zmm6
+#define Chi_21 %zmm7
+#define Chi_22 %zmm8
+#define Chi_30 %zmm9
+#define Chi_31 %zmm10
+#define Chi_32 %zmm11
+#define pChi_00 %%zmm0
+#define pChi_01 %%zmm1
+#define pChi_02 %%zmm2
+#define pChi_10 %%zmm3
+#define pChi_11 %%zmm4
+#define pChi_12 %%zmm5
+#define pChi_20 %%zmm6
+#define pChi_21 %%zmm7
+#define pChi_22 %%zmm8
+#define pChi_30 %%zmm9
+#define pChi_31 %%zmm10
+#define pChi_32 %%zmm11
+
+#define BCAST_00   %zmm12
+#define  SHUF_00   %zmm13
+#define BCAST_01   %zmm14
+#define  SHUF_01   %zmm15
+#define BCAST_02   %zmm16
+#define  SHUF_02   %zmm17
+#define BCAST_10   %zmm18
+#define  SHUF_10   %zmm19
+#define BCAST_11   %zmm20
+#define  SHUF_11   %zmm21
+#define BCAST_12   %zmm22
+#define  SHUF_12   %zmm23
+
+#define Mp  %zmm24
+#define Mps %zmm25
+#define Mm  %zmm26
+#define Mms %zmm27
+#define N 8
+    int incr=LLs*LLs*sizeof(iSinglet<Simd>); // byte step of LLs*LLs matrix entries, i.e. from row s2+l*LLs to row s2+(l+1)*LLs
+    for(int s1=0;s1<LLs;s1++){ 
+      for(int s2=0;s2<LLs;s2++){ 
+	int lex=s2+LLs*site;
+	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t)&psi[lex];
+	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	  if ( (s2+l)==0 ) { // first term for this s1: the VMULIDUP form is used instead of VMADDSUBIDUP
+	    LOAD64(%r8,a0);
+	    LOAD64(%r9,a1);
+	    LOAD64(%r10,a2);
+	    asm (
+		 VLOAD(0,%r8,Mp)// i r
+		 VLOAD(0,%r9,Mm)
+		 VSHUF(Mp,Mps)  // r i 
+		 VSHUF(Mm,Mms)
+		 VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
+		 VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
+
+		 VMULIDUP(0*N,%r10,Mps,Chi_00)
+		 VMULIDUP(1*N,%r10,Mps,Chi_01)
+		 VMULIDUP(2*N,%r10,Mps,Chi_02)
+		 VMULIDUP(3*N,%r10,Mps,Chi_10)
+		 VMULIDUP(4*N,%r10,Mps,Chi_11)
+		 VMULIDUP(5*N,%r10,Mps,Chi_12)
+
+		 VMULIDUP(6*N ,%r10,Mms,Chi_20)
+		 VMULIDUP(7*N ,%r10,Mms,Chi_21)
+		 VMULIDUP(8*N ,%r10,Mms,Chi_22)
+		 VMULIDUP(9*N ,%r10,Mms,Chi_30)
+		 VMULIDUP(10*N,%r10,Mms,Chi_31)
+		 VMULIDUP(11*N,%r10,Mms,Chi_32)
+
+		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
+		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
+		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+		 VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
+		 VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
+		 VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
+		 VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
+		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+		 );
+	  } else { 
+	    LOAD64(%r8,a0);
+	    LOAD64(%r9,a1);
+	    LOAD64(%r10,a2);
+	    asm (
+		 VLOAD(0,%r8,Mp)
+		 VSHUF(Mp,Mps)
+
+		 VLOAD(0,%r9,Mm)
+		 VSHUF(Mm,Mms)
+
+		 VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
+		 VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
+		 VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
+		 VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
+		 VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
+		 VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
+
+		 VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
+		 VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
+		 VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
+		 VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
+		 VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
+		 VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
+
+		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
+		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
+		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+		 VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
+		 VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
+		 VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
+		 VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
+		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+		 );
+	  }
+	  a0 = a0+incr; // next lane: matrix row index grows by LLs
+	  a1 = a1+incr;
+	a2 = a2+sizeof(typename Simd::scalar_type); // next lane: step one scalar inside psi[lex]
+	}}
+      {
+	int lexa = s1+LLs*site;
+	/*
+	  SiteSpinor tmp;
+	  asm (
+	  VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	  VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	  VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	  VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	  : : "r" ((uint64_t)&tmp) : "memory" );
+	*/
+
+	asm (
+	     VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	     VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	     VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	     VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+	//      if ( 1 || (site==0) ) { 
+	//	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
+	//      }
+      }
+    }
+  }
+  // Undefine exactly the helper macros defined above.
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef pChi_00
+#undef pChi_01
+#undef pChi_02
+#undef pChi_10
+#undef pChi_11
+#undef pChi_12
+#undef pChi_20
+#undef pChi_21
+#undef pChi_22
+#undef pChi_30
+#undef pChi_31
+#undef pChi_32
+
+#undef BCAST_00
+#undef SHUF_00
+#undef BCAST_01
+#undef SHUF_01
+#undef BCAST_02
+#undef SHUF_02
+#undef BCAST_10
+#undef SHUF_10
+#undef BCAST_11
+#undef SHUF_11
+#undef BCAST_12
+#undef SHUF_12
+
+#undef Mp
+#undef Mps
+#undef Mm
+#undef Mms
+#undef N
+
+#endif
+};
+
+
+template<class Impl>
+void
+CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  // Picks the pair of s-direction matrices matching (dag,inv) and applies them
+  // site by site: the member matrices MatpInv/MatmInv (or their Dag variants)
+  // when inv is set, otherwise a pair computed on the spot.
+  EnableIf<Impl::LsVectorised,int> sfinae=0;
+  chi.Checkerboard()=psi.Checkerboard();
+
+  int Ls=this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs;
+
+  // Scratch storage, only filled for the non-inverse case.
+  Vector<iSinglet<Simd> >  scratchP;
+  Vector<iSinglet<Simd> >  scratchM;
+  Vector<iSinglet<Simd> >  *useP = nullptr;
+  Vector<iSinglet<Simd> >  *useM = nullptr;
+
+  if ( !inv ) {
+    MooeeInternalCompute(dag,inv,scratchP,scratchM);
+    useP = &scratchP;
+    useM = &scratchM;
+  } else if ( dag ) {
+    useP = &MatpInvDag;
+    useM = &MatmInvDag;
+  } else {
+    useP = &MatpInv;
+    useM = &MatmInv;
+  }
+  assert(useP->size()==Ls*LLs);
+
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
+  // Complex coefficients take the Z kernel, real coefficients the plain one.
+  if ( !switcheroo<Coeff_t>::iscomplex() ) {
+    thread_loop( (auto site=0;site<vol;site++),{
+      MooeeInternalAsm(psi,chi,LLs,site,*useP,*useM);
+    });
+  } else {
+    thread_loop( (auto site=0;site<vol;site++),{
+      MooeeInternalZAsm(psi,chi,LLs,site,*useP,*useM);
+    });
+  }
+  MooeeInvTime+=usecond();
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -0,0 +1,321 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl> // Thin wrapper: sets the coefficients through SetCoefficientsZolotarev using zolo_hi = 1/scale.
+void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
+{
+  SetCoefficientsZolotarev(1.0/scale,zdata); // note the argument order differs from this function's (zdata,scale)
+}
+template<class Impl> // Fills the coefficient tables Beta, cc, cc_d, sqrt_cc, Aee and See from zdata->beta, zolo_hi, mass and M5, then logs them.
+void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
+{
+  // How to check Ls matches??
+  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  int Ls = this->Ls;
+  assert(zdata->db==Ls);// Beta has Ls coeffs
+
+  R=(1+this->mass)/(1-this->mass); // added to the last diagonal entry Aee[Ls-1] below
+
+  Beta.resize(Ls);
+  cc.resize(Ls);
+  cc_d.resize(Ls);
+  sqrt_cc.resize(Ls);
+  for(int i=0; i < Ls ; i++){
+    Beta[i] = zdata -> beta[i];
+    cc[i] = 1.0/Beta[i]; // cc is the element-wise reciprocal of Beta
+    cc_d[i]=std::sqrt(cc[i]);
+  }
+    
+  cc_d[Ls-1]=1.0; // deliberately overrides the sqrt value stored for the last slice
+  for(int i=0; i < Ls-1 ; i++){
+    sqrt_cc[i]= std::sqrt(cc[i]*cc[i+1]);
+  }    
+  sqrt_cc[Ls-2]=std::sqrt(cc[Ls-2]); // overrides the loop value for the last pair; NOTE(review): index Ls-2 assumes Ls>=2 - confirm against callers
+
+
+  ZoloHiInv =1.0/zolo_hi;
+  dw_diag = (4.0-this->M5)*ZoloHiInv;
+    
+  See.resize(Ls);
+  Aee.resize(Ls);
+  int sign=1; // alternates +1,-1,... with s
+  for(int s=0;s<Ls;s++){
+    Aee[s] = sign * Beta[s] * dw_diag;
+    sign   = - sign;
+  }
+  Aee[Ls-1] += R;
+    
+  See[0] = Aee[0];
+  for(int s=1;s<Ls;s++){
+    See[s] = Aee[s] - 1.0/See[s-1]; // recursion producing the values MooeeInv later divides by
+  }
+  for(int s=0;s<Ls;s++){
+    std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
+  }
+}
+
+
+
+template<class Impl> // Applies the operator to psi, writing chi slice by slice from DW(psi) and neighbouring slices of psi; returns norm2(chi).
+RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
+{
+  int Ls = this->Ls;
+
+  FermionField D(psi.Grid()); // holds the result of DW applied to psi
+
+  this->DW(psi,D,DaggerNo); 
+
+  int sign=1; // alternates +1,-1,... with s
+  for(int s=0;s<Ls;s++){
+    if ( s==0 ) { // first slice: only the coupling to slice s+1 exists
+      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
+    } else if ( s==(Ls-1) ){ // last slice: no cc factor or sign, couples to s-1 and adds R*psi
+      RealD R=(1.0+mass)/(1.0-mass); // local value; same formula as the one used in SetCoefficientsZolotarev
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
+      ag5xpby_ssp(chi,R,psi,1.0,chi,s,s); // accumulates into the slice written on the previous line
+    } else { // interior slice: couples to both s+1 and s-1
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
+      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+    }
+    sign=-sign; 
+  }
+  return norm2(chi);
+}
+template<class Impl> // Adjoint application; forwards to M because the operator is treated as hermitian (see comments below).
+RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
+{
+  // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
+  // The rest of matrix is symmetric.
+  // Can ignore "dag"
+  return M(psi,chi); // returns the same norm2(chi) that M reports
+}
+template<class Impl>
+void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  // Single-direction hop first; every s-slice of the result is then rescaled in place.
+  this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+  for(int s=0;s<Ls;s++){
+    // The sign runs +1,-1,+1,... with s; the last slice skips both cc[s] and the sign.
+    const int sign = (s%2==0) ? 1 : -1;
+    if ( s!=last ){
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
+    } else {
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
+    }
+  }
+}
+template<class Impl>
+void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+{
+  // Apply 4d dslash between checkerboards: an Odd source goes through DhopEO,
+  // any other source through DhopOE. (Dslash on diagonal. g5 Dslash is hermitian.)
+  const bool oddSource = ( psi.Checkerboard() == Odd );
+  if ( oddSource ) {
+    this->DhopEO(psi,chi,DaggerNo);
+  } else {
+    this->DhopOE(psi,chi,DaggerNo);
+  }
+
+  // Rescale each s-slice in place; the factor's sign flips from slice to slice
+  // and the last slice uses Beta[s]*ZoloHiInv without cc[s] or the sign.
+  const int Ls = this->Ls;
+  int flip = 1;
+  for(int s=0;s<Ls;s++,flip=-flip){
+    if ( s<(Ls-1) ){
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*flip*ZoloHiInv,chi,0.0,chi,s,s);
+    } else {
+      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
+    }
+  }
+}
+template<class Impl> // Adjoint of the checkerboard-hopping term; implemented by calling Meooe unchanged.
+void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+{
+  this->Meooe(psi,chi); // no separate dagger code path exists for this operator
+}
+template<class Impl> // Same slice structure as M, but with psi in place of DW(psi) and dw_diag in place of ZoloHiInv.
+void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+{
+  int Ls = this->Ls;
+
+  int sign=1; // alternates +1,-1,... with s
+  for(int s=0;s<Ls;s++){
+    if ( s==0 ) { // first slice: only the coupling to slice s+1 exists
+      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
+    } else if ( s==(Ls-1) ){
+      // Drop the CC here.
+      double R=(1+mass)/(1-mass); // local value; same formula as the one used in SetCoefficientsZolotarev
+      ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
+      ag5xpby_ssp(chi,R,psi,1.0,chi,s,s); // accumulates into the slice written on the previous line
+    } else { // interior slice: couples to both s+1 and s-1
+      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
+      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
+    }
+    sign=-sign; 
+  }
+}
+
+template<class Impl> // Adjoint of Mooee; implemented by calling Mooee unchanged.
+void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+{
+  this->Mooee(psi,chi); // no separate dagger code path exists for this operator
+}
+template<class Impl> // Applies the inverse of Mooee in three in-place sweeps over the s-slices (Linv, Dinv, Uinv) using cc_d and See.
+void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  int Ls = this->Ls;
+
+  // Apply Linv
+  axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); // slice 0 seeds the forward sweep
+  for(int s=1;s<Ls;s++){ // ascending s: each slice reads the already-updated slice s-1 of chi
+    axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
+  }
+  // Apply Dinv
+  for(int s=0;s<Ls;s++){
+    ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
+  }
+  // Apply Uinv = (Linv)^T
+  axpby_ssp(chi,1.0/cc_d[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // slice Ls-1 seeds the backward sweep
+  for(int s=Ls-2;s>=0;s--){ // descending s: each slice reads the already-updated slice s+1 of chi
+    axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
+  }
+}
+template<class Impl> // Adjoint of MooeeInv; implemented by calling MooeeInv unchanged.
+void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInv(psi,chi); // no separate dagger code path exists for this operator
+}
+
+// force terms; five routines; default to Dhop on diagonal
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  // Build a slice-wise rescaled copy of U, then hand it to DhopDeriv.
+  // The dag argument is not consulted: DaggerNo is always passed on.
+  FermionField scaledU(V.Grid());
+
+  const int nSlices = this->Ls;
+  for(int s=0, sign=1; s<nSlices; s++, sign=-sign){
+    // Alternating sign and cc[s] apply to every slice except the last one.
+    if ( s!=(nSlices-1) ){
+      ag5xpby_ssp(scaledU,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      ag5xpby_ssp(scaledU,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDeriv(mat,scaledU,V,DaggerNo); 
+};
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  // Build a slice-wise rescaled copy of U, then hand it to DhopDerivOE.
+  // The dag argument is not consulted: DaggerNo is always passed on.
+  FermionField scaledU(V.Grid());
+
+  const int nSlices = this->Ls;
+  for(int s=0, sign=1; s<nSlices; s++, sign=-sign){
+    // Alternating sign and cc[s] apply to every slice except the last one.
+    if ( s!=(nSlices-1) ){
+      ag5xpby_ssp(scaledU,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      ag5xpby_ssp(scaledU,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDerivOE(mat,scaledU,V,DaggerNo); 
+};
+template<class Impl>
+void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  // Build a slice-wise rescaled copy of U, then hand it to DhopDerivEO.
+  // The dag argument is not consulted: DaggerNo is always passed on.
+  FermionField scaledU(V.Grid());
+
+  const int nSlices = this->Ls;
+  for(int s=0, sign=1; s<nSlices; s++, sign=-sign){
+    // Alternating sign and cc[s] apply to every slice except the last one.
+    if ( s!=(nSlices-1) ){
+      ag5xpby_ssp(scaledU,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+    } else {
+      ag5xpby_ssp(scaledU,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
+    }
+  }
+  this->DhopDerivEO(mat,scaledU,V,DaggerNo); 
+};
+    
+// Constructors
+template<class Impl> // Constructor: forwards the gauge field, grids, M5 and parameters to WilsonFermion5D and stores the mass.
+ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
+							     GaugeField &_Umu,
+							     GridCartesian         &FiveDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
+							     RealD _mass,RealD M5,const ImplParams &p) :
+  WilsonFermion5D<Impl>(_Umu,
+			FiveDimGrid, FiveDimRedBlackGrid,
+			FourDimGrid, FourDimRedBlackGrid,M5,p),
+  mass(_mass)
+{
+  int Ls = this->Ls; // set by the base-class constructor above
+  assert((Ls&0x1)==1); // Odd Ls required
+}
+
+    template<class Impl> // Extracts the 4d field from the last s-slice (Ls-1) of a 5d solution.
+    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      int Ls = this->Ls;
+      conformable(solution5d.Grid(),this->FermionGrid()); // 5d input must live on the fermion grid
+      conformable(exported4d.Grid(),this->GaugeGrid()); // 4d output must live on the gauge grid
+      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); // NOTE(review): both trailing arguments are Ls-1 - confirm their meaning against ExtractSlice
+    }
+    template<class Impl> // Builds a 5d source from a 4d one: insert at slice Ls-1 of a zero field, multiply by Gamma5, then apply Dminus.
+    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      int Ls = this->Ls;
+      conformable(imported5d.Grid(),this->FermionGrid()); // 5d output must live on the fermion grid
+      conformable(input4d.Grid()   ,this->GaugeGrid()); // 4d input must live on the gauge grid
+      FermionField tmp(this->FermionGrid());
+      tmp=Zero(); // all other slices stay zero
+      InsertSlice(input4d, tmp, Ls-1, Ls-1); // NOTE(review): both trailing arguments are Ls-1 - confirm their meaning against InsertSlice
+      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
+      this->Dminus(tmp,imported5d); // final result is written to imported5d
+    }
+
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
@@ -0,0 +1,433 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl> // Constructor: forwards everything to AbstractEOFAFermion with the trailing constants 1.0 and 0.0, then sets the coefficients.
+DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
+						   GaugeField            &_Umu,
+						   GridCartesian         &FiveDimGrid,
+						   GridRedBlackCartesian &FiveDimRedBlackGrid,
+						   GridCartesian         &FourDimGrid,
+						   GridRedBlackCartesian &FourDimRedBlackGrid,
+						   RealD _mq1, RealD _mq2, RealD _mq3,
+						   RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
+  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
+			    _shift, _pm, _M5, 1.0, 0.0, p)
+{
+  RealD eps = 1.0;
+  Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls); // approximation data sized by Ls
+  assert(zdata->n == this->Ls);
+
+  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
+  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
+
+  Approx::zolotarev_free(zdata); // zdata is not used after the coefficients have been set
+}
+
+/***************************************************************
+ * Additional EOFA operators only called outside the inverter.
+ * Since speed is not essential, simple axpby-style
+ * implementations should be fine.
+ ***************************************************************/
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+{
+  // Din starts as zero and receives at most one slice copied from psi;
+  // which slices are involved depends on (sign,dag). Any other combination
+  // leaves Din zero.
+  const int last = this->Ls - 1;
+
+  Din = Zero();
+  if(sign == 1){
+    if(dag == 0){ axpby_ssp(Din, 0.0, psi, 1.0, psi, last, 0); }
+    else if(dag == 1){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, last); }
+  } else if(sign == -1){
+    // Both dag settings use the same slice pair here.
+    if((dag == 0) || (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
+  }
+}
+
+// Dtilde is the identity for DWF: chi is assigned a copy of psi.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
+
+// DtildeInv is the identity for DWF: chi is assigned a copy of psi.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
+
+/*****************************************************************************************************/
+
+template<class Impl> // Applies the operator to psi in four steps (Meooe5D, DW, axpby, M5D) and returns norm2(chi).
+RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+{
+  FermionField Din(psi.Grid()); // intermediate: result of Meooe5D applied to psi
+
+  this->Meooe5D(psi, Din);
+  this->DW(Din, chi, DaggerNo);
+  axpby(chi, 1.0, 1.0, chi, psi); // presumably chi = 1.0*chi + 1.0*psi - confirm axpby argument order
+  this->M5D(psi, chi); // the two-argument M5D passes chi as both second input and output of the six-argument overload
+  return(norm2(chi));
+}
+
+template<class Impl> // Adjoint counterpart of M: DW with DaggerYes, MeooeDag5D, M5Ddag, then axpby; returns norm2(chi).
+RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+{
+  FermionField Din(psi.Grid()); // intermediate: result of the daggered DW applied to psi
+
+  this->DW(psi, Din, DaggerYes);
+  this->MeooeDag5D(Din, chi);
+  this->M5Ddag(psi, chi); // the two-argument M5Ddag passes chi as both second input and output of the six-argument overload
+  axpby(chi, 1.0, 1.0, chi, psi); // presumably chi = 1.0*chi + 1.0*psi - confirm axpby argument order
+  return(norm2(chi));
+}
+
+/********************************************************************
+ * Performance critical fermion operators called inside the inverter
+ ********************************************************************/
+
+template<class Impl> // Builds the diag/upper/lower coefficient vectors (including the EOFA shift) and calls the six-argument M5D.
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+{
+  int   Ls    = this->Ls;
+  int   pm    = this->pm;
+  RealD shift = this->shift;
+  RealD mq1   = this->mq1;
+  RealD mq2   = this->mq2;
+  RealD mq3   = this->mq3;
+
+  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+  Coeff_t shiftp(0.0), shiftm(0.0);
+  if(shift != 0.0){
+    if(pm == 1){ shiftp = shift*(mq3-mq2); } // pm==1 sets only shiftp
+    else{ shiftm = -shift*(mq3-mq2); } // any other pm sets only shiftm, with opposite sign
+  }
+
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; // only the last upper entry carries the mass and shift
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp; // only the first lower entry carries the mass and shift
+
+#if(0)
+  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
+  for(int i=0; i<diag.size(); ++i){
+    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
+  }
+  for(int i=0; i<upper.size(); ++i){
+    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
+  }
+  for(int i=0; i<lower.size(); ++i){
+    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
+  }
+#endif
+
+  this->M5D(psi, chi, chi, lower, diag, upper); // chi is passed as both the second input and the output
+}
+
+template<class Impl> // Adjoint counterpart of M5D: same vectors with shiftp/shiftm swapped between upper and lower, passed to the six-argument M5Ddag.
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+{
+  int   Ls    = this->Ls;
+  int   pm    = this->pm;
+  RealD shift = this->shift;
+  RealD mq1   = this->mq1;
+  RealD mq2   = this->mq2;
+  RealD mq3   = this->mq3;
+
+  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
+  Coeff_t shiftp(0.0), shiftm(0.0);
+  if(shift != 0.0){
+    if(pm == 1){ shiftp = shift*(mq3-mq2); } // pm==1 sets only shiftp
+    else{ shiftm = -shift*(mq3-mq2); } // any other pm sets only shiftm, with opposite sign
+  }
+
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; // M5D uses shiftm in this position
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm; // M5D uses shiftp in this position
+
+#if(0)
+  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
+  for(int i=0; i<diag.size(); ++i){
+    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
+  }
+  for(int i=0; i<upper.size(); ++i){
+    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
+  }
+  for(int i=0; i<lower.size(); ++i){
+    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
+  }
+#endif
+
+  this->M5Ddag(psi, chi, chi, lower, diag, upper); // chi is passed as both the second input and the output
+}
+
+// half checkerboard operations
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+{
+  // Diagonal comes from bee, both off-diagonals from -cee, except that the
+  // last upper entry is dm and the first lower entry is dp.
+  const int Ls = this->Ls;
+
+  Vector<Coeff_t> offDiag(Ls);
+  for(int s=0; s<Ls; s++){ offDiag[s] = -this->cee[s]; }
+
+  Vector<Coeff_t> upper = offDiag;
+  Vector<Coeff_t> lower = offDiag;
+  upper[Ls-1] = this->dm;
+  lower[0]    = this->dp;
+
+  Vector<Coeff_t> diag = this->bee;
+  this->M5D(psi, psi, chi, lower, diag, upper);
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+{
+  // Diagonal comes from bee, both off-diagonals from -cee, except that the
+  // last upper entry is dp and the first lower entry is dm (swapped relative
+  // to Mooee).
+  const int Ls = this->Ls;
+
+  Vector<Coeff_t> offDiag(Ls);
+  for(int s=0; s<Ls; s++){ offDiag[s] = -this->cee[s]; }
+
+  Vector<Coeff_t> upper = offDiag;
+  Vector<Coeff_t> lower = offDiag;
+  upper[Ls-1] = this->dp;
+  lower[0]    = this->dm;
+
+  Vector<Coeff_t> diag = this->bee;
+  this->M5Ddag(psi, psi, chi, lower, diag, upper);
+}
+
+/****************************************************************************************/
+
+//Zolo
+// Fill every coefficient table of this operator:
+//   - the Cayley-form vectors bs, cs, aee, aeo, bee, beo, cee, ceo (each of length Ls),
+//   - the EOFA corner terms dp and dm (which one receives the shift depends on pm),
+//   - the LDU factors lee, leem, uee, ueem and dee of the 5th-dimension matrix,
+// and finally the dense inverse matrices via MooeeInternalCompute.
+// NOTE: the arguments zolo_hi, gamma, b and c are not read anywhere in this body.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
+{
+  int   Ls    = this->Ls;
+  int   pm    = this->pm;
+  RealD mq1   = this->mq1;
+  RealD mq2   = this->mq2;
+  RealD mq3   = this->mq3;
+  RealD shift = this->shift;
+
+  ////////////////////////////////////////////////////////
+  // Constants for the preconditioned matrix Cayley form
+  ////////////////////////////////////////////////////////
+  this->bs.resize(Ls);
+  this->cs.resize(Ls);
+  this->aee.resize(Ls);
+  this->aeo.resize(Ls);
+  this->bee.resize(Ls);
+  this->beo.resize(Ls);
+  this->cee.resize(Ls);
+  this->ceo.resize(Ls);
+
+  // Every slice gets the same diagonal (4 - M5 + 1) and unit hopping coefficient.
+  for(int i=0; i<Ls; ++i){
+    this->bee[i] = 4.0 - this->M5 + 1.0;
+    this->cee[i] = 1.0;
+  }
+
+  // aee mirrors cee; aeo is resized above but not assigned in this function.
+  for(int i=0; i<Ls; ++i){
+    this->aee[i] = this->cee[i];
+    this->bs[i] = this->beo[i] = 1.0;
+    this->cs[i] = this->ceo[i] = 0.0;
+  }
+
+  //////////////////////////////////////////
+  // EOFA shift terms
+  //////////////////////////////////////////
+  // pm == +1 adds the shift to dp, pm == -1 subtracts it from dm,
+  // any other value leaves both corner terms unshifted.
+  if(pm == 1){
+    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
+    this->dm = mq1*this->cee[Ls-1];
+  } else if(this->pm == -1) {
+    this->dp = mq1*this->cee[0];
+    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
+  } else {
+    this->dp = mq1*this->cee[0];
+    this->dm = mq1*this->cee[Ls-1];
+  }
+
+  //////////////////////////////////////////
+  // LDU decomposition of eeoo
+  //////////////////////////////////////////
+  // dee carries one extra entry: dee[Ls-1] and dee[Ls] are two different
+  // values for the last slice (built with dm and dp respectively, see below).
+  this->dee.resize(Ls+1);
+  this->lee.resize(Ls);
+  this->leem.resize(Ls);
+  this->uee.resize(Ls);
+  this->ueem.resize(Ls);
+
+  for(int i=0; i<Ls; ++i){
+
+    if(i < Ls-1){
+
+      this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
+
+      // leem[i] = dm/bee[i] * prod_{j<i} aee[j]/bee[j]
+      this->leem[i] = this->dm/this->bee[i];
+      for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
+
+      this->dee[i] = this->bee[i];
+
+      this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
+
+      // ueem[i] = dp/bee[0] * prod_{1<=j<=i} cee[j]/bee[j]
+      this->ueem[i] = this->dp / this->bee[0];
+      for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
+
+    } else {
+
+      // Last slice: no off-diagonal factors; its diagonal entries are set in the block below.
+      this->lee[i]  = 0.0;
+      this->leem[i] = 0.0;
+      this->uee[i]  = 0.0;
+      this->ueem[i] = 0.0;
+
+    }
+  }
+
+  {
+    // delta_d = 1/bee[0] * prod_{1<=j<=Ls-2} cee[j]/bee[j]
+    Coeff_t delta_d = 1.0 / this->bee[0];
+    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
+    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
+    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
+  }
+
+  // Precompute the dense inverse matrices (plain and daggered); MooeeInternalCompute
+  // returns without touching them when the grid is not vectorised in the 5th direction.
+  int inv = 1;
+  this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
+  this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
+}
+
+// Recompute Cayley-form coefficients for different shift
+// Store new_shift and rebuild all coefficient tables from a freshly generated
+// approximation (Approx::higham with Ls poles), exactly as a call to
+// SetCoefficientsTanh(zdata, 1.0, 0.0) does.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+{
+  this->shift = new_shift;
+  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
+  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
+  // The approximation data is only needed while the coefficients are being set;
+  // release it so that repeated shift refreshes do not leak one allocation per call.
+  Approx::zolotarev_free(zdata);
+}
+
+// Build the dense Ls x Ls 5th-dimension matrices and pack them into SIMD vectors.
+//   dag  : non-zero -> take the adjoint of both matrices
+//   inv  : non-zero -> invert both matrices first
+//   Matp : output, resized to Ls*LLs, built from Pplus
+//   Matm : output, resized to Ls*LLs, built from Pminus
+// Returns immediately, leaving Matp/Matm untouched, when LLs == Ls.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
+						       Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+{
+  int Ls = this->Ls;
+
+  GridBase* grid = this->FermionRedBlackGrid();
+  int LLs = grid->_rdimensions[0];
+
+  if(LLs == Ls){ return; } // Not vectorised in 5th direction
+
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+
+  // Both matrices share the diagonal bee.
+  for(int s=0; s<Ls; s++){
+    Pplus(s,s)  = this->bee[s];
+    Pminus(s,s) = this->bee[s];
+  }
+
+  // Pminus: super-diagonal -cee[s].
+  for(int s=0; s<Ls-1; s++){
+    Pminus(s,s+1) = -this->cee[s];
+  }
+
+  // Pplus: sub-diagonal -cee[s+1].
+  for(int s=0; s<Ls-1; s++){
+    Pplus(s+1,s) = -this->cee[s+1];
+  }
+
+  // Wrap-around corner entries.
+  Pplus (0,Ls-1) = this->dp;
+  Pminus(Ls-1,0) = this->dm;
+
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
+
+#if(0)
+  std::cout << GridLogMessage << "Pplus:" << std::endl;
+  for(int s=0; s<Ls; ++s){
+    for(int ss=0; ss<Ls; ++ss){
+      std::cout << Pplus(s,ss) << "\t";
+    }
+    std::cout << std::endl;
+  }
+  std::cout << GridLogMessage << "Pminus:" << std::endl;
+  for(int s=0; s<Ls; ++s){
+    for(int ss=0; ss<Ls; ++ss){
+      std::cout << Pminus(s,ss) << "\t";
+    }
+    std::cout << std::endl;
+  }
+#endif
+
+  if(inv) {
+    PplusMat  = Pplus.inverse();
+    PminusMat = Pminus.inverse();
+  } else {
+    PplusMat  = Pplus;
+    PminusMat = Pminus;
+  }
+
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd = Simd::Nsimd();
+  Matp.resize(Ls*LLs);
+  Matm.resize(Ls*LLs);
+
+  // Packing: lane l of Matp[LLs*s2+s1] holds PplusMat(l*LLs+s1, s2) (likewise for Matm),
+  // written through a scalar pointer aliased onto the SIMD word.
+  for(int s2=0; s2<Ls; s2++){
+    for(int s1=0; s1<LLs; s1++){
+      int istride = LLs;
+      int ostride = 1;
+      Simd Vp;
+      Simd Vm;
+      scalar_type *sp = (scalar_type*) &Vp;
+      scalar_type *sm = (scalar_type*) &Vm;
+      for(int l=0; l<Nsimd; l++){
+	if(switcheroo<Coeff_t>::iscomplex()) {
+	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
+	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
+	} else {
+	  // if real
+	  // The real part is stored in both the real and the imaginary slot of the lane.
+	  scalar_type tmp;
+	  tmp = PplusMat (l*istride+s1*ostride,s2);
+	  sp[l] = scalar_type(tmp.real(),tmp.real());
+	  tmp = PminusMat(l*istride+s1*ostride,s2);
+	  sm[l] = scalar_type(tmp.real(),tmp.real());
+	}
+      }
+      Matp[LLs*s2+s1] = Vp;
+      Matm[LLs*s2+s1] = Vm;
+    }}
+}
+
+// Instantiation through project macros; presumably these expand to explicit template
+// instantiations for the supported Impl types -- confirm against the macro definitions.
+FermOpTemplateInstantiate(DomainWallEOFAFermion);
+GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
@@ -0,0 +1,255 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+// Pminus forwards
+// Pplus  backwards..
+// For every block of Ls consecutive sites compute
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s_up]) + lower[s]*spProj5p(psi[s_dn])
+// where the neighbour slices wrap around: slice Ls-1 uses slice 0 as s_up and
+// slice 0 uses slice Ls-1 as s_dn. Also updates the M5Dcalls / M5Dtime counters.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
+				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  int Ls = this->Ls;
+  GridBase* grid = psi_i.Grid();
+  auto phi = phi_i.View();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // ss steps over blocks of Ls sites
+    for(int s=0; s<Ls; s++){
+      // Neighbour slices. The s==0 case takes precedence over the s==Ls-1 case.
+      const int s_up = ((s!=0) && (s==(Ls-1))) ? 0 : s+1;
+      const int s_dn = (s==0) ? Ls-1 : s-1;
+
+      auto tmp = psi[0];
+      spProj5m(tmp, psi[ss+s_up]);
+      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+      spProj5p(tmp, psi[ss+s_dn]);
+      chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Daggered counterpart of M5D: for every block of Ls consecutive sites compute
+//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s_up]) + lower[s]*spProj5m(psi[s_dn])
+// with the same wrap-around neighbour rule (slice Ls-1 -> slice 0, slice 0 -> slice Ls-1).
+// Also updates the M5Dcalls / M5Dtime counters.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
+					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  int Ls = this->Ls;
+
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // ss steps over blocks of Ls sites
+    auto tmp = psi[0];
+    for(int s=0; s<Ls; s++){
+      // Neighbour slices. The s==0 case takes precedence over the s==Ls-1 case.
+      const int s_up = ((s!=0) && (s==(Ls-1))) ? 0 : s+1;
+      const int s_dn = (s==0) ? Ls-1 : s-1;
+
+      spProj5p(tmp, psi[ss+s_up]);
+      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+      spProj5m(tmp, psi[ss+s_dn]);
+      chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Apply the inverse of the 5th-dimension matrix to psi, writing chi, using the
+// precomputed LDU factors (lee, leem, dee, ueem, uee). Each block of Ls consecutive
+// sites is processed independently by four sequential sweeps. Also updates the
+// MooeeInvCalls / MooeeInvTime counters.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi=psi_i.View();
+  auto chi=chi_i.View();
+  int Ls = this->Ls;
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
+
+    auto tmp1 = psi[0];
+    auto tmp2 = psi[0];
+
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
+    // Apply (L^{\prime})^{-1}
+    // Forward sweep: each slice uses the already-updated previous slice of chi.
+    chi[ss] = psi[ss]; // chi[0]=psi[0]
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp1, chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
+    }
+
+    // L_m^{-1}
+    // Only the last slice is modified here; it accumulates contributions from all others.
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp1, chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
+    }
+
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp1, chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
+    }
+    // tmp1 still holds spProj5p of the last slice from the loop above.
+    // NOTE(review): that relies on the loop having run at least once, i.e. Ls >= 2.
+    spProj5m(tmp2, chi[ss+Ls-1]);
+    chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
+
+    // Apply U^{-1}
+    // Backward sweep: each slice uses the already-updated next slice of chi.
+    for(int s=Ls-2; s>=0; s--){
+      spProj5m(tmp1, chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Apply the adjoint of the inverse 5th-dimension matrix to psi, writing chi.
+// Uses complex-conjugated copies of the LDU factors (uee, dee, lee, ueem, leem)
+// and the same four-sweep structure as MooeeInv with the roles of the upper and
+// lower factors (and of spProj5p / spProj5m) exchanged. Each block of Ls
+// consecutive sites is processed independently. Also updates the
+// MooeeInvCalls / MooeeInvTime counters.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  int Ls = this->Ls;
+
+  // Input and output must live on the same checkerboard (the original check
+  // compared psi with itself and could never fail).
+  assert(chi.Checkerboard() == psi.Checkerboard());
+
+  Vector<Coeff_t> ueec(Ls);
+  Vector<Coeff_t> deec(Ls+1);
+  Vector<Coeff_t> leec(Ls);
+  Vector<Coeff_t> ueemc(Ls);
+  Vector<Coeff_t> leemc(Ls);
+
+  // Conjugate the first Ls entries of every table; dee has one extra entry handled below.
+  for(int s=0; s<Ls; s++){
+    ueec[s]  = conjugate(this->uee[s]);
+    deec[s]  = conjugate(this->dee[s]);
+    leec[s]  = conjugate(this->lee[s]);
+    ueemc[s] = conjugate(this->ueem[s]);
+    leemc[s] = conjugate(this->leem[s]);
+  }
+  deec[Ls] = conjugate(this->dee[Ls]);
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
+
+    auto tmp1 = psi[0];
+    auto tmp2 = psi[0];
+
+    // Apply (U^{\prime})^{-dagger}
+    // Forward sweep: each slice uses the already-updated previous slice of chi.
+    chi[ss] = psi[ss];
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp1, chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
+    }
+
+    // U_m^{-\dagger}
+    // Only the last slice is modified here.
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp1, chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
+    }
+
+    // L_m^{-\dagger} D^{-dagger}
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp1, chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
+    }
+    // tmp1 still holds spProj5m of the last slice from the loop above.
+    // NOTE(review): that relies on the loop having run at least once, i.e. Ls >= 2.
+    spProj5p(tmp2, chi[ss+Ls-1]);
+    chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
+
+    // Apply L^{-dagger}
+    // Backward sweep: each slice uses the already-updated next slice of chi.
+    for(int s=Ls-2; s>=0; s--){
+      spProj5p(tmp1, chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Only active when DOMAIN_WALL_EOFA_DPERP_CACHE is defined. Presumably each macro
+// call instantiates the routines of this file for one Impl type -- confirm against
+// the definition of INSTANTIATE_DPERP_DWF_EOFA.
+#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
+
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
+
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
+
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
@@ -0,0 +1,613 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+/*
+ * Dense matrix versions of routines
+ */
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  // Daggered inverse: forward to the dense-matrix kernel with both flags set.
+  const int dag = DaggerYes;
+  const int inv = InverseYes;
+  this->MooeeInternal(psi, chi, dag, inv);
+}
+
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  // Plain (non-daggered) inverse: forward to the dense-matrix kernel.
+  const int dag = DaggerNo;
+  const int inv = InverseYes;
+  this->MooeeInternal(psi, chi, dag, inv);
+}
+
+// M5D for layouts in which the 5th dimension is spread over SIMD lanes
+// (asserts Ls/LLs == nsimd, where LLs = grid->_rdimensions[0]).
+// For each block of LLs consecutive sites, the upper two spin components of the
+// result get diag*phi + lower*(spin 0,1 of the previous slice) and the lower two get
+// diag*phi + upper*(spin 2,3 of the next slice); neighbours that wrap around the block
+// are lane-rotated. The kernel is unrolled for three colours (asserts Nc == 3).
+// Also updates the M5Dcalls / M5Dtime counters.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
+				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  int Ls  = this->Ls;
+  int LLs = grid->_rdimensions[0];
+  const int nsimd = Simd::Nsimd();
+
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs == nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type* u_p = (scalar_type*) &u[0];
+  scalar_type* l_p = (scalar_type*) &l[0];
+  scalar_type* d_p = (scalar_type*) &d[0];
+
+  // Repack the per-slice coefficients: lane i of u[o] / l[o] / d[o] receives
+  // the coefficient of 5th-dimension slice s = o + i*LLs.
+  for(int o=0;o<LLs;o++){ // outer
+    for(int i=0;i<nsimd;i++){ //inner
+      int s  = o + i*LLs;
+      int ss = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  assert(Nc == 3);
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
+
+#if 0
+
+    // Disabled reference version written with the projector / reconstruct helpers.
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0; v<LLs; v++){
+
+      int vp = (v+1)%LLs;
+      int vm = (v+LLs-1)%LLs;
+
+      spProj5m(hp, psi[ss+vp]);
+      spProj5p(hm, psi[ss+vm]);
+
+      if (vp <= v){ rotate(hp, hp, 1); }
+      if (vm >= v){ rotate(hm, hm, nsimd-1); }
+
+      hp = 0.5*hp;
+      hm = 0.5*hm;
+
+      spRecon5m(fp, hp);
+      spRecon5p(fm, hm);
+
+      chi[ss+v] = d[v]*phi[ss+v];
+      chi[ss+v] = chi[ss+v] + u[v]*fp;
+      chi[ss+v] = chi[ss+v] + l[v]*fm;
+
+    }
+
+#else
+
+    for(int v=0; v<LLs; v++){
+
+      vprefetch(psi[ss+v+LLs]);
+
+      // Next / previous entry inside the block, wrapping at the ends.
+      int vp = (v==LLs-1) ? 0     : v+1;
+      int vm = (v==0)     ? LLs-1 : v-1;
+
+      // hp: spin components 2,3 of the next entry.
+      Simd hp_00 = psi[ss+vp]()(2)(0);
+      Simd hp_01 = psi[ss+vp]()(2)(1);
+      Simd hp_02 = psi[ss+vp]()(2)(2);
+      Simd hp_10 = psi[ss+vp]()(3)(0);
+      Simd hp_11 = psi[ss+vp]()(3)(1);
+      Simd hp_12 = psi[ss+vp]()(3)(2);
+
+      // hm: spin components 0,1 of the previous entry.
+      Simd hm_00 = psi[ss+vm]()(0)(0);
+      Simd hm_01 = psi[ss+vm]()(0)(1);
+      Simd hm_02 = psi[ss+vm]()(0)(2);
+      Simd hm_10 = psi[ss+vm]()(1)(0);
+      Simd hm_11 = psi[ss+vm]()(1)(1);
+      Simd hm_12 = psi[ss+vm]()(1)(2);
+
+      // Wrapped "next" neighbour: rotate lanes. Presumably tRotate<2> shifts by one
+      // complex element (two real words), matching rotate(hp, hp, 1) in the disabled
+      // branch above -- confirm against Optimization::Rotate.
+      if(vp <= v){
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+
+      // Wrapped "previous" neighbour: rotate the other way (compare rotate(hm, hm, nsimd-1)
+      // in the disabled branch above).
+      if(vm >= v){
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Can force these to real arithmetic and save 2x.
+      // Spin 0,1 combine with the lower coefficient, spin 2,3 with the upper coefficient.
+      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+
+      vstream(chi[ss+v]()(0)(0), p_00);
+      vstream(chi[ss+v]()(0)(1), p_01);
+      vstream(chi[ss+v]()(0)(2), p_02);
+      vstream(chi[ss+v]()(1)(0), p_10);
+      vstream(chi[ss+v]()(1)(1), p_11);
+      vstream(chi[ss+v]()(1)(2), p_12);
+      vstream(chi[ss+v]()(2)(0), p_20);
+      vstream(chi[ss+v]()(2)(1), p_21);
+      vstream(chi[ss+v]()(2)(2), p_22);
+      vstream(chi[ss+v]()(3)(0), p_30);
+      vstream(chi[ss+v]()(3)(1), p_31);
+      vstream(chi[ss+v]()(3)(2), p_32);
+    }
+
+#endif
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Daggered counterpart of M5D for layouts in which the 5th dimension is spread over
+// SIMD lanes (asserts Ls/LLs == nsimd, where LLs = grid->_rdimensions[0]).
+// Compared with M5D the spin halves are exchanged: spin 0,1 of the result combine
+// diag*phi with upper*(spin 0,1 of the next entry) and spin 2,3 combine diag*phi with
+// lower*(spin 2,3 of the previous entry). Also updates the M5Dcalls / M5Dtime counters.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
+					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  int Ls  = this->Ls;
+  int LLs = grid->_rdimensions[0];
+  int nsimd = Simd::Nsimd();
+
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs == nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type* u_p = (scalar_type*) &u[0];
+  scalar_type* l_p = (scalar_type*) &l[0];
+  scalar_type* d_p = (scalar_type*) &d[0];
+
+  // Repack the per-slice coefficients: lane i of u[o] / l[o] / d[o] receives
+  // the coefficient of 5th-dimension slice s = o + i*LLs.
+  for(int o=0; o<LLs; o++){ // outer
+    for(int i=0; i<nsimd; i++){ //inner
+      int s  = o + i*LLs;
+      int ss = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
+
+#if 0
+
+    // Disabled reference version written with the projector / reconstruct helpers.
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0; v<LLs; v++){
+
+      int vp = (v+1)%LLs;
+      int vm = (v+LLs-1)%LLs;
+
+      spProj5p(hp, psi[ss+vp]);
+      spProj5m(hm, psi[ss+vm]);
+
+      if(vp <= v){ rotate(hp, hp, 1); }
+      if(vm >= v){ rotate(hm, hm, nsimd-1); }
+
+      hp = hp*0.5;
+      hm = hm*0.5;
+      spRecon5p(fp, hp);
+      spRecon5m(fm, hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+    }
+
+#else
+
+    for(int v=0; v<LLs; v++){
+
+      vprefetch(psi[ss+v+LLs]);
+
+      // Next / previous entry inside the block, wrapping at the ends.
+      int vp = (v == LLs-1) ? 0     : v+1;
+      int vm = (v == 0    ) ? LLs-1 : v-1;
+
+      // hp: spin components 0,1 of the next entry.
+      Simd hp_00 = psi[ss+vp]()(0)(0);
+      Simd hp_01 = psi[ss+vp]()(0)(1);
+      Simd hp_02 = psi[ss+vp]()(0)(2);
+      Simd hp_10 = psi[ss+vp]()(1)(0);
+      Simd hp_11 = psi[ss+vp]()(1)(1);
+      Simd hp_12 = psi[ss+vp]()(1)(2);
+
+      // hm: spin components 2,3 of the previous entry.
+      Simd hm_00 = psi[ss+vm]()(2)(0);
+      Simd hm_01 = psi[ss+vm]()(2)(1);
+      Simd hm_02 = psi[ss+vm]()(2)(2);
+      Simd hm_10 = psi[ss+vm]()(3)(0);
+      Simd hm_11 = psi[ss+vm]()(3)(1);
+      Simd hm_12 = psi[ss+vm]()(3)(2);
+
+      // Wrapped "next" neighbour: rotate lanes. Presumably tRotate<2> shifts by one
+      // complex element (two real words), matching rotate(hp, hp, 1) in the disabled
+      // branch above -- confirm against Optimization::Rotate.
+      if (vp <= v){
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+
+      // Wrapped "previous" neighbour: rotate the other way (compare rotate(hm, hm, nsimd-1)
+      // in the disabled branch above).
+      if(vm >= v){
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Spin 0,1 combine with the upper coefficient, spin 2,3 with the lower coefficient.
+      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+
+      vstream(chi[ss+v]()(0)(0), p_00);
+      vstream(chi[ss+v]()(0)(1), p_01);
+      vstream(chi[ss+v]()(0)(2), p_02);
+      vstream(chi[ss+v]()(1)(0), p_10);
+      vstream(chi[ss+v]()(1)(1), p_11);
+      vstream(chi[ss+v]()(1)(2), p_12);
+      vstream(chi[ss+v]()(2)(0), p_20);
+      vstream(chi[ss+v]()(2)(1), p_21);
+      vstream(chi[ss+v]()(2)(2), p_22);
+      vstream(chi[ss+v]()(3)(0), p_30);
+      vstream(chi[ss+v]()(3)(1), p_31);
+      vstream(chi[ss+v]()(3)(2), p_32);
+    }
+#endif
+
+  });
+
+  this->M5Dtime += usecond();
+}
+
+#ifdef AVX512
+#include<simd/Intel512common.h>
+#include<simd/Intel512avx.h>
+#include<simd/Intel512single.h>
+#endif
+
+// Apply the packed dense matrices to the LLs entries belonging to one site index.
+//   psi_i, chi_i : input / output fields
+//   LLs          : number of entries per site block (entries are addressed as s + LLs*site)
+//   site         : index of the block to process
+//   Matp, Matm   : packed matrices; Matp acts on spin components 0,1 and Matm on 2,3
+// Two implementations are selected at compile time: a portable one built on
+// vbroadcast / real_madd, and an inline-assembly one when AVX512 is defined.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
+						   int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+{
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+#ifndef AVX512
+  {
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0; s1<LLs; s1++){
+
+      for(int s2=0; s2<LLs; s2++){
+	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
+
+	  // s is the matrix column built from entry s2 and lane l.
+	  int s = s2 + l*LLs;
+	  int lex = s2 + LLs*site;
+
+	  // Reset the accumulators at the start of every output row s1.
+	  if( s2==0 && l==0 ){
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+
+	  // Broadcast lane l of spin components 0,1 ...
+	  for(int sp=0; sp<2;  sp++){
+	    for(int co=0; co<Nc; co++){
+	      vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
+	    }}
+
+	  // ... and of spin components 2,3.
+	  for(int sp=0; sp<2;  sp++){
+	    for(int co=0; co<Nc; co++){
+	      vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
+	    }}
+
+	  for(int sp=0; sp<2;  sp++){
+	    for(int co=0; co<Nc; co++){
+	      SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
+	      SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
+	    }}
+	}}
+
+      // Write the finished row: P accumulators to spin 0,1 and M accumulators to spin 2,3.
+      {
+	int lex = s1 + LLs*site;
+	for(int sp=0; sp<2;  sp++){
+	  for(int co=0; co<Nc; co++){
+	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+
+  }
+#else
+  {
+    // pointers
+    //  MASK_REGS;
+    // Register assignment for the twelve accumulators and the twelve broadcast temporaries.
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+#define BCAST0  %%zmm13
+#define BCAST1  %%zmm14
+#define BCAST2  %%zmm15
+#define BCAST3  %%zmm16
+#define BCAST4  %%zmm17
+#define BCAST5  %%zmm18
+#define BCAST6  %%zmm19
+#define BCAST7  %%zmm20
+#define BCAST8  %%zmm21
+#define BCAST9  %%zmm22
+#define BCAST10 %%zmm23
+#define BCAST11 %%zmm24
+
+    // Byte distance between the matrix entries used by consecutive lanes
+    // (LLs*LLs elements, i.e. index LLs*(s2 + l*LLs) + s1 for lane l).
+    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
+    for(int s1=0; s1<LLs; s1++){
+
+      for(int s2=0; s2<LLs; s2++){
+
+	int lex = s2 + LLs*site;
+	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t) &psi[lex];
+
+	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
+	  // The very first term of each row uses the VMULMEM form, all later terms the
+	  // VMADDMEM form; presumably multiply-to-initialise vs. multiply-accumulate --
+	  // confirm against the macro definitions in the Intel512 headers.
+	  if((s2+l)==0) {
+	    asm(
+		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
+		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
+		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
+		VBCASTCDUP(0,%2,BCAST0)
+		VBCASTCDUP(1,%2,BCAST1)
+		VBCASTCDUP(2,%2,BCAST2)
+		VBCASTCDUP(3,%2,BCAST3)
+		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
+		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
+		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
+		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
+		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
+		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
+		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
+		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
+		VMULMEM(0,%1,BCAST8,Chi_22)
+		VMULMEM(0,%1,BCAST9,Chi_30)
+		VMULMEM(0,%1,BCAST10,Chi_31)
+		VMULMEM(0,%1,BCAST11,Chi_32)
+		: : "r" (a0), "r" (a1), "r" (a2)                            );
+	  } else {
+	    asm(
+		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
+		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
+		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
+		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
+		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
+		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
+		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
+		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
+		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
+		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
+		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
+		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
+		: : "r" (a0), "r" (a1), "r" (a2)                            );
+	  }
+	  // Advance to the next lane: next matrix entry and next scalar inside psi[lex].
+	  a0 = a0 + incr;
+	  a1 = a1 + incr;
+                        a2 = a2 + sizeof(typename Simd::scalar_type);
+	}
+      }
+
+      // Store the twelve accumulators into chi for row s1.
+      {
+	int lexa = s1+LLs*site;
+	asm (
+	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
+	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
+	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
+	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+      }
+    }
+  }
+
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+#endif
+};
+
+// Z-mobius version
+// Placeholder for the complex-coefficient kernel: none of the arguments is used.
+// Prints an error message to std::cout and terminates the program with exit(-1).
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
+						    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+{
+  const char* message = "Error: zMobius not implemented for EOFA";
+  std::cout << message << std::endl;
+  exit(-1);
+}
+
+// Apply the dense 5th-dimension matrix, or its inverse and/or adjoint, to psi and
+// write the result to chi.
+//   dag : non-zero selects the adjoint
+//   inv : non-zero selects the inverse; the cached MatpInv/MatmInv (or their Dag
+//         variants) are used. Otherwise the matrices are built on the fly with
+//         MooeeInternalCompute.
+// The per-site work is done by MooeeInternalZAsm for complex coefficients and by
+// MooeeInternalAsm otherwise. Also updates the MooeeInvCalls / MooeeInvTime counters.
+template<class Impl>
+void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+{
+  chi.Checkerboard() = psi.Checkerboard();
+  int Ls  = this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs;
+
+  // Local storage, only filled in the non-inverse case.
+  Vector<iSinglet<Simd> > Matp;
+  Vector<iSinglet<Simd> > Matm;
+  Vector<iSinglet<Simd> > *_Matp = nullptr;
+  Vector<iSinglet<Simd> > *_Matm = nullptr;
+
+  if(inv){
+    // Inverse matrices were precomputed when the coefficients were set.
+    if(dag){
+      _Matp = &this->MatpInvDag;
+      _Matm = &this->MatmInvDag;
+    } else {
+      _Matp = &this->MatpInv;
+      _Matm = &this->MatmInv;
+    }
+  } else {
+    MooeeInternalCompute(dag, inv, Matp, Matm);
+    _Matp = &Matp;
+    _Matm = &Matm;
+  }
+
+  assert(_Matp->size() == Ls*LLs);
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  if(switcheroo<Coeff_t>::iscomplex()){
+    thread_loop((auto site=0; site<vol; site++),{
+      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
+    });
+  } else {
+    // The comma separating the loop range from the body was missing here, unlike
+    // every other thread_loop call in this file.
+    thread_loop((auto site=0; site<vol; site++),{
+      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
+    });
+  }
+
+  this->MooeeInvTime += usecond();
+}
+
+// Only active when DOMAIN_WALL_EOFA_DPERP_VEC is defined. Presumably each macro call
+// instantiates the routines of this file for one Impl type -- confirm against the
+// definition of INSTANTIATE_DPERP_DWF_EOFA. MooeeInternal is instantiated explicitly
+// below for the same eight Impl types.
+#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
+
+INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
+INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
+INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
+INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
+
+INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
+INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
+INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
+
+template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -0,0 +1,663 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+#include <Grid/perfmon/PerfCount.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Five-dimensional lattice for the improved staggered operator (dimension 0 has extent Ls).
+// Construct the operator from its four grids without importing a gauge field.
+// The body checks that the 5d grids really are 5d with the fifth dimension at
+// index 0, that this dimension is not split across processors, and that the
+// remaining four dimensions of the 5d grids match the 4d grids in processor
+// decomposition, full dimensions and SIMD layout. Finally the surface lists of
+// the three stencils are built.
+template<class Impl>
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
+							     RealD _mass,
+							     RealD _c1,RealD _c2, RealD _u0,
+							     const ImplParams &p) :
+  Kernels(p),
+  _FiveDimGrid        (&FiveDimGrid),
+  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
+  _FourDimGrid        (&FourDimGrid),
+  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
+  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements,p),
+  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
+  mass(_mass),
+  c1(_c1),
+  c2(_c2),
+  u0(_u0),
+  Umu(&FourDimGrid),
+  UmuEven(&FourDimRedBlackGrid),
+  UmuOdd (&FourDimRedBlackGrid),
+  UUUmu(&FourDimGrid),
+  UUUmuEven(&FourDimRedBlackGrid),
+  UUUmuOdd(&FourDimRedBlackGrid),
+  Lebesgue(&FourDimGrid),
+  LebesgueEvenOdd(&FourDimRedBlackGrid),
+  _tmp(&FiveDimRedBlackGrid)
+{
+
+  // some assertions
+  assert(FiveDimGrid._ndimension==5);
+  assert(FourDimGrid._ndimension==4);
+  assert(FourDimRedBlackGrid._ndimension==4);
+  assert(FiveDimRedBlackGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+
+  // extent of fifth dim and not spread out
+  Ls=FiveDimGrid._fdimensions[0];
+  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+  assert(FiveDimGrid._processors[0]         ==1);
+  assert(FiveDimRedBlackGrid._processors[0] ==1);
+
+  // Other dimensions must match the decomposition of the four-D fields 
+  for(int d=0;d<4;d++){
+    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+
+    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+
+    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+  }
+
+  if (Impl::LsVectorised) { 
+
+    // All SIMD lanes must lie along the fifth dimension; the 4d layout is scalar.
+    int nsimd = Simd::Nsimd();
+    
+    // Dimension zero of the five-d is the Ls direction
+    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
+
+    for(int d=0;d<4;d++){
+      assert(FourDimGrid._simd_layout[d]==1);
+      assert(FourDimRedBlackGrid._simd_layout[d]==1);
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
+    }
+
+  } else {
+    
+    // Dimension zero of the five-d is the Ls direction
+    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
+    assert(FiveDimGrid._simd_layout[0]        ==1);
+
+  }
+  // Surface lists: the full stencil uses the full 4d site count, the two
+  // checkerboarded stencils use the red-black 4d site count.
+  int LLs = FiveDimGrid._rdimensions[0];
+  int vol4= FourDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+
+  vol4=FourDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+  StencilOdd.BuildSurfaceList(LLs,vol4);
+}
+// Refresh the checkerboarded copies of both doubled gauge fields from the
+// full-lattice fields Umu (one-link) and UUUmu (three-link).
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
+{
+  // Even sites
+  pickCheckerboard(Even, UmuEven,   Umu);
+  pickCheckerboard(Even, UUUmuEven, UUUmu);
+
+  // Odd sites
+  pickCheckerboard(Odd, UmuOdd,   Umu);
+  pickCheckerboard(Odd, UUUmuOdd, UUUmu);
+}
+// Convenience constructor: delegates to the grid-only constructor and then
+// imports the thin and fat gauge fields through ImportGauge.
+template<class Impl>
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
+							     GridCartesian         &FiveDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
+							     RealD _mass,
+							     RealD _c1,RealD _c2, RealD _u0,
+							     const ImplParams &p) :
+  ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
+			     FourDimGrid,FourDimRedBlackGrid,
+			     _mass,_c1,_c2,_u0,p)
+{
+  ImportGauge(_Uthin,_Ufat);
+}
+
+///////////////////////////////////////////////////
+// For MILC use; pass three link U's and 1 link U
+///////////////////////////////////////////////////
+// The links are stored as given: no phases, fattening or rescaling are applied
+// here, the caller is expected to have done that already. Slots mu hold the
+// forward links, slots mu+4 the negated adjoint of the link shifted back by
+// three sites (three-link) or one site (one-link).
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
+{
+  for (int mu = 0; mu < Nd; mu++) {
+
+    // Three-link term into UUUmu
+    auto link = PeekIndex<LorentzIndex>(_Utriple, mu);
+    Impl::InsertGaugeField(UUUmu, link, mu);
+
+    link = adj( Cshift(link, mu, -3));
+    Impl::InsertGaugeField(UUUmu, -link, mu+4);
+
+    // One-link term into Umu
+    link = PeekIndex<LorentzIndex>(_Ufat, mu);
+    Impl::InsertGaugeField(Umu, link, mu);
+
+    link = adj( Cshift(link, mu, -1));
+    Impl::InsertGaugeField(Umu, -link, mu+4);
+
+  }
+  CopyGaugeCheckerboards();
+}
+// Import already double-stored fields verbatim (phases, fattening and such
+// like are taken as preapplied), then refresh the checkerboarded copies.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) 
+{
+  UUUmu = _UUU;   // three-link field
+  Umu   = _U;     // one-link field
+
+  CopyGaugeCheckerboards();
+}
+// Double-store the thin and fat gauge fields via Impl::DoubleStore, then
+// rescale every stored link so the hopping term carries the kinetic
+// normalisation 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ):
+//   one-link   slots mu / mu+4 : +/- 0.5*c1/u0
+//   three-link slots mu / mu+4 : +/- 0.5*c2/u0^3
+// The coefficients could instead be passed into the double store to save work.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
+{
+  // Double Store should take two fields for Naik and one hop separately.
+  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
+
+  // Loop-invariant scale factors, written exactly as in the per-link form.
+  const auto oneLinkFwd   =  0.5*c1/u0;
+  const auto oneLinkBwd   = -0.5*c1/u0;
+  const auto threeLinkFwd =  0.5*c2/u0/u0/u0;
+  const auto threeLinkBwd = -0.5*c2/u0/u0/u0;
+
+  for (int mu = 0; mu < Nd; mu++) {
+
+    auto link = PeekIndex<LorentzIndex>(Umu, mu);
+    PokeIndex<LorentzIndex>(Umu, link*oneLinkFwd, mu );
+
+    link = PeekIndex<LorentzIndex>(Umu, mu+4);
+    PokeIndex<LorentzIndex>(Umu, link*oneLinkBwd, mu+4);
+
+    link = PeekIndex<LorentzIndex>(UUUmu, mu);
+    PokeIndex<LorentzIndex>(UUUmu, link*threeLinkFwd, mu );
+
+    link = PeekIndex<LorentzIndex>(UUUmu, mu+4);
+    PokeIndex<LorentzIndex>(UUUmu, link*threeLinkBwd, mu+4);
+  }
+
+  CopyGaugeCheckerboards();
+}
+// Apply the hop in a single direction of the full (non-checkerboarded) stencil.
+//   in, out : source and result fermion fields
+//   dir5    : direction index counted on the 5d grid; 1 is subtracted because
+//             the stencil "directions" table drops the innermost fifth dimension
+//   disp    : displacement forwarded unchanged to Kernels::DhopDirKernel
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
+{
+  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
+                    // we drop off the innermost fifth dimension
+
+  Compressor compressor;
+  Stencil.HaloExchange(in,compressor);
+  auto Umu_v   = Umu.View();
+  auto UUUmu_v = UUUmu.View();
+  auto in_v    = in.View();
+  auto out_v   = out.View();
+  const int vol4 = Umu.Grid()->oSites(); // 4d site count, hoisted out of the loop condition
+  thread_loop( (int ss=0;ss<vol4;ss++),{
+    // The gauge site index is the same for every fifth-dimension slice.
+    int sU=ss;
+    for(int s=0;s<Ls;s++){
+      int sF = s+Ls*sU; // fermion site: s runs fastest
+      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
+    }
+  });
+}
+
+// Force-term helper: not implemented for this operator.
+// Every call stops at assert(0); none of the arguments is read.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
+						     DoubledGaugeField & U,
+						     DoubledGaugeField & UUU,
+						     GaugeField &mat,
+						     const FermionField &A,
+						     const FermionField &B,
+						     int dag)
+{
+  // No force terms in multi-rhs solver staggered
+  assert(0);
+}
+
+// Full-lattice derivative of the hopping term: not implemented, always hits assert(0).
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
+						 const FermionField &A,
+						 const FermionField &B,
+						 int dag)
+{
+  assert(0);
+}
+
+// Checkerboarded (EO) derivative of the hopping term: not implemented, always hits assert(0).
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
+						   const FermionField &A,
+						   const FermionField &B,
+						   int dag)
+{
+  assert(0);
+}
+
+
+// Checkerboarded (OE) derivative of the hopping term: not implemented, always hits assert(0).
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
+						   const FermionField &A,
+						   const FermionField &B,
+						   int dag)
+{
+  assert(0);
+}
+
+/*CHANGE */
+// Dispatch the hopping term to one of the two communication strategies.
+// When GRID_OMP is defined and StaggeredKernelsStatic::Comms equals
+// CommsAndCompute the overlapped-comms path is used; in every other case
+// (including builds without GRID_OMP) the serial-comms path runs.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+						    DoubledGaugeField & U,DoubledGaugeField & UUU,
+						    const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+  else
+#endif
+    // Without GRID_OMP this is the only statement in the function.
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+}
+
+// Hopping term with communication overlapped with interior computation.
+// Only available when GRID_OMP is defined; otherwise the body is assert(0).
+//
+// Sequence:
+//   1. gather the halo and merge the shared-memory part (DhopFaceTime)
+//   2. one OpenMP parallel region in which the first `ncomms` threads drive
+//      st.CommunicateThreaded() while the remaining threads run the kernel
+//      with the interior flag set over a block-partitioned 4d volume
+//   3. merge the received halo (DhopFaceTime)
+//   4. run the kernel with the exterior flag set over st.surface_list
+//      (DhopComputeTime2)
+//
+// DhopTotalTime brackets the whole routine, matching DhopInternalSerialComms,
+// so the "(full)" figures in Report() are meaningful on this path too.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+								   DoubledGaugeField & U,DoubledGaugeField & UUU,
+								   const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+
+  Compressor compressor; 
+
+  int LLs = in.Grid()->_rdimensions[0];
+
+  DhopTotalTime -= usecond();
+
+  DhopFaceTime-=usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
+
+  // Per-thread timings; the max over threads is taken by the reduction clause.
+  double ctime=0;
+  double ptime=0;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms); // at least one thread must be left for compute
+    if (tid >= ncomms) {
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = U.Grid()->oSites(); // 4d vol
+      // Block partition of [0,n): the first `rem` compute threads take one extra site.
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      auto   U_v  =   U.View();
+      auto UUU_v  = UUU.View();
+      auto  in_v  =  in.View();
+      auto out_v  = out.View();
+
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+	  // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
+        }
+      }
+      ptime = usecond() - start;
+    } else {
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
+    }
+  }
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;
+
+  // First to enter, last to leave timing
+  st.CollateThreads();
+
+  DhopFaceTime-=usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime+=usecond();
+
+  DhopComputeTime2-=usecond();
+
+  // Second pass: only the sites listed in st.surface_list, exterior flag set.
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
+  if (dag == DaggerYes) {
+    int sz=st.surface_list.size();
+    thread_loop( (int ss = 0; ss < sz; ss++) ,{
+      int sU = st.surface_list[ss];
+      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
+    });
+  } else {
+    int sz=st.surface_list.size();
+    thread_loop( (int ss = 0; ss < sz; ss++) ,{
+      int sU = st.surface_list[ss];
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
+    });
+  }
+  DhopComputeTime2+=usecond();
+
+  DhopTotalTime += usecond();
+#else
+  assert(0);
+#endif
+
+}
+
+// Hopping term with blocking communication: the halo exchange completes
+// before any site is computed. Accumulates DhopCommTime around the exchange,
+// DhopComputeTime around the site loop and DhopTotalTime around both.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+						    DoubledGaugeField & U,DoubledGaugeField & UUU,
+						    const FermionField &in, FermionField &out,int dag)
+{
+  Compressor compressor;
+  int LLs = in.Grid()->_rdimensions[0];
+
+  DhopTotalTime -= usecond();
+
+  // Communication phase
+  DhopCommTime -= usecond();
+  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
+
+  // Compute phase.
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  DhopComputeTime -= usecond();
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
+  const bool daggered = (dag == DaggerYes);
+  if (!daggered) {
+    thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
+      int sU=ss;
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
+    });
+  } else {
+    thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++), {
+      int sU=ss;
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
+    });
+  }
+  DhopComputeTime += usecond();
+
+  DhopTotalTime   += usecond();
+}
+/*CHANGE END*/
+
+/* ORG
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+						    DoubledGaugeField & U,DoubledGaugeField & UUU,
+						    const FermionField &in, FermionField &out,int dag)
+{
+  Compressor compressor;
+  int LLs = in.Grid()->_rdimensions[0];
+
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
+  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
+  
+  DhopComputeTime -= usecond();
+  auto U_v   =   U.View();
+  auto UUU_v = UUU.View();
+  auto out_v = out.View();
+  auto in_v  =  in.View();
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  if (dag == DaggerYes) {
+    thread_loop(  (int ss = 0; ss < U.Grid()->oSites(); ss++), {
+      int sU=ss;
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
+    });
+  } else {
+    thread_loop(  (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
+      int sU=ss;
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
+    });
+  }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
+}
+*/
+
+
+// Hop from an Even source to an Odd result on the red-black fermion grid.
+// Counts one Dhop call, asserts that `in` is Even, marks `out` as Odd and
+// runs DhopInternal with StencilEven and the Odd-site gauge fields.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
+{
+  DhopCalls+=1;
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check
+
+  assert(in.Checkerboard()==Even);
+  out.Checkerboard() = Odd;
+
+  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
+}
+// Hop from an Odd source to an Even result on the red-black fermion grid.
+// Counts one Dhop call, asserts that `in` is Odd, marks `out` as Even and
+// runs DhopInternal with StencilOdd and the Even-site gauge fields.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
+{
+  DhopCalls+=1;
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check
+
+  assert(in.Checkerboard()==Odd);
+  out.Checkerboard() = Even;
+
+  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
+}
+// Hop on the full (non-checkerboarded) fermion grid.
+// Adds 2 to DhopCalls, whereas each checkerboarded hop adds 1.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
+{
+  DhopCalls+=2;
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+
+  out.Checkerboard() = in.Checkerboard();
+
+  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
+}
+
+// Print the accumulated Dhop timing counters, the derived flop rates and the
+// three stencil reports.
+//
+// Per-call averages and flop rates are printed only after at least one Dhop
+// call has been counted; with zero calls they would be divisions by zero.
+// Each flop-rate group is additionally skipped when its own time counter is
+// not positive.
+//
+// Side effect kept from the original: DhopComputeTime is replaced by its
+// average over ranks (GlobalSum followed by division by the rank count).
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void) 
+{
+  Coordinate latt = GridDefaultLatt();          
+  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+
+  if ( DhopCalls > 0 ) {
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
+	      << DhopTotalTime   / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
+	      << DhopCommTime    / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
+	      << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+    // Average the compute time
+    _FourDimGrid->GlobalSum(DhopComputeTime);
+    DhopComputeTime/=NP;
+
+    // 1154 is the per-site flop count used for this operator.
+    if ( DhopComputeTime > 0 ) {
+      RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+      std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+      std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+      std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+    }
+
+    if ( DhopTotalTime > 0 ) {
+      RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+      std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+      std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+      std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+    }
+  }
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+// Reset every Dhop call/timing counter of this operator and of its three
+// stencils. DhopComputeTime2 is accumulated by the overlapped-comms path and
+// is therefore reset here as well.
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
+{
+  DhopCalls        = 0;
+  DhopTotalTime    = 0;
+  DhopCommTime     = 0;
+  DhopComputeTime  = 0;
+  DhopComputeTime2 = 0;
+  DhopFaceTime     = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
+/////////////////////////////////////////////////////////////////////////
+// Implement the general interface. Here we use SAME mass on all slices
+/////////////////////////////////////////////////////////////////////////
+// Single-direction operator application; forwards unchanged to DhopDir.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+  DhopDir(in, out, dir, disp);
+}
+// Full operator: applies Dhop (no dagger) into `out`, then combines with the
+// mass term through axpy_norm and returns that call's result.
+// NOTE(review): presumably axpy_norm sets out = mass*in + out and returns the
+// squared norm of the result -- confirm against its definition.
+template <class Impl>
+RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Dhop(in, out, DaggerNo);
+  return axpy_norm(out, mass, in, out);
+}
+
+// Daggered full operator: as M, but Dhop is called with DaggerYes. The mass
+// term enters through the same axpy_norm call.
+// NOTE(review): axpy_norm semantics are not visible here -- see its definition.
+template <class Impl>
+RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Dhop(in, out, DaggerYes);
+  return axpy_norm(out, mass, in, out);
+}
+
+// Checkerboard-changing hop without dagger: an Odd source goes through
+// DhopEO, any other source checkerboard through DhopOE.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
+  const bool oddSource = (in.Checkerboard() == Odd);
+  if (!oddSource) {
+    DhopOE(in, out, DaggerNo);
+    return;
+  }
+  DhopEO(in, out, DaggerNo);
+}
+// Daggered checkerboard-changing hop: an Odd source goes through DhopEO, any
+// other source checkerboard through DhopOE, both with DaggerYes.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+  const bool oddSource = (in.Checkerboard() == Odd);
+  if (!oddSource) {
+    DhopOE(in, out, DaggerYes);
+    return;
+  }
+  DhopEO(in, out, DaggerYes);
+}
+
+// Checkerboard-diagonal term: out = mass * in, with out on in's checkerboard.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  // Convert the mass to the field's scalar type before multiplying.
+  typename FermionField::scalar_type scal(mass);
+  out = scal * in;
+}
+
+// Daggered diagonal term; implemented by calling Mooee unchanged.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Mooee(in, out);
+}
+
+// Inverse of the diagonal term: out = (1/mass) * in. There is no guard
+// against mass == 0.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  out = (1.0 / (mass)) * in;
+}
+
+// Daggered inverse diagonal term; implemented by calling MooeeInv unchanged.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
+						   FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  MooeeInv(in, out);
+}
+
+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented.
+////////////////////////////////////////////////////////
+// Placeholder: every call stops at assert(0) and no argument is read.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+								PropagatorField &q_in_2,
+								PropagatorField &q_out,
+								Current curr_type,
+								unsigned int mu)
+{
+  assert(0);
+}
+
+// Placeholder: every call stops at assert(0) and no argument is read.
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+							   PropagatorField &q_out,
+							   Current curr_type,
+							   unsigned int mu, 
+							   unsigned int tmin,
+                                              unsigned int tmax,
+					      ComplexField &lattice_cmplx)
+{
+  assert(0);
+
+}
+  
+NAMESPACE_END(Grid);
+
+
+
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -0,0 +1,615 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#include <Grid.h>
+
+#pragma once 
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////
+// Constructor and gauge import
+/////////////////////////////////
+
+// Construct the 4d operator from the full grid and its red-black partner
+// without importing a gauge field. Stores the improvement coefficients and
+// builds the surface lists of the three stencils with a fifth-dimension
+// extent of 1.
+template <class Impl>
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
+							 RealD _mass,
+							 RealD _c1, RealD _c2,RealD _u0,
+							 const ImplParams &p)
+  : Kernels(p),
+    _grid(&Fgrid),
+    _cbgrid(&Hgrid),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
+    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
+    mass(_mass),
+    Lebesgue(_grid),
+    LebesgueEvenOdd(_cbgrid),
+    Umu(&Fgrid),
+    UmuEven(&Hgrid),
+    UmuOdd(&Hgrid),
+    UUUmu(&Fgrid),
+    UUUmuEven(&Hgrid),
+    UUUmuOdd(&Hgrid) ,
+    _tmp(&Hgrid)
+{
+  // Improvement coefficients
+  c1 = _c1;
+  c2 = _c2;
+  u0 = _u0;
+
+  // No fifth dimension in this operator
+  int LLs = 1;
+
+  // Full-lattice stencil
+  int fullVol = _grid->oSites();
+  Stencil.BuildSurfaceList(LLs, fullVol);
+
+  // Checkerboarded stencils share the red-black site count
+  int halfVol = _cbgrid->oSites();
+  StencilEven.BuildSurfaceList(LLs, halfVol);
+  StencilOdd.BuildSurfaceList(LLs, halfVol);
+}
+
+// Convenience constructor: delegates to the grid-only constructor and then
+// imports the thin and fat gauge fields through ImportGauge.
+template <class Impl>
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
+							 GridRedBlackCartesian &Hgrid, RealD _mass,
+							 RealD _c1, RealD _c2,RealD _u0,
+							 const ImplParams &p)
+  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
+{
+  ImportGauge(_Uthin,_Ufat);
+}
+
+////////////////////////////////////////////////////////////
+// Momentum space propagator should be 
+// https://arxiv.org/pdf/hep-lat/9712010.pdf
+//
+// mom space action.
+//   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
+//
+// must track through staggered flavour/spin reduction in literature to 
+// turn to free propagator for the one component chi field, a la page 4/5
+// of the above link, in order to implement a Fourier-based solver.
+////////////////////////////////////////////////////////////
+// Trivial import of separate three-link and one-link fields: phases,
+// fattening and such like are taken as preapplied, and no rescaling is done.
+// Slots mu hold the forward links, slots mu+4 the negated adjoint of the link
+// shifted back by three sites (three-link) or one site (one-link).
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
+{
+  GaugeLinkField link(GaugeGrid());
+
+  for (int mu = 0; mu < Nd; mu++) {
+
+    // Three-link term into UUUmu
+    link = PeekIndex<LorentzIndex>(_Utriple, mu);
+    PokeIndex<LorentzIndex>(UUUmu, link, mu );
+
+    link = adj( Cshift(link, mu, -3));
+    PokeIndex<LorentzIndex>(UUUmu, -link, mu+4 );
+
+    // One-link term into Umu
+    link = PeekIndex<LorentzIndex>(_Ufat, mu);
+    PokeIndex<LorentzIndex>(Umu, link, mu);
+
+    link = adj( Cshift(link, mu, -1));
+    PokeIndex<LorentzIndex>(Umu, -link, mu+4);
+
+  }
+  CopyGaugeCheckerboards();
+}
+// Import already double-stored fields verbatim, then refresh the
+// checkerboarded copies.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) 
+{
+  UUUmu = _UUU;   // three-link field
+  Umu   = _U;     // one-link field
+
+  CopyGaugeCheckerboards();
+}
+
+// Refresh the checkerboarded copies of both doubled gauge fields from the
+// full-lattice fields Umu (one-link) and UUUmu (three-link).
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
+{
+  // Even sites
+  pickCheckerboard(Even, UmuEven,   Umu);
+  pickCheckerboard(Even, UUUmuEven, UUUmu);
+
+  // Odd sites
+  pickCheckerboard(Odd, UmuOdd,   Umu);
+  pickCheckerboard(Odd, UUUmuOdd, UUUmu);
+}
+// Double-store the thin and fat gauge fields via Impl::DoubleStore, then
+// rescale every stored link so the hopping term carries the kinetic
+// normalisation 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ):
+//   one-link   slots mu / mu+4 : +/- 0.5*c1/u0
+//   three-link slots mu / mu+4 : +/- 0.5*c2/u0^3
+// The coefficients could instead be passed into the double store to save work.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) 
+{
+  GaugeLinkField link(GaugeGrid());
+
+  // Double Store should take two fields for Naik and one hop separately.
+  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
+
+  // Loop-invariant scale factors, written exactly as in the per-link form.
+  const auto oneLinkFwd   =  0.5*c1/u0;
+  const auto oneLinkBwd   = -0.5*c1/u0;
+  const auto threeLinkFwd =  0.5*c2/u0/u0/u0;
+  const auto threeLinkBwd = -0.5*c2/u0/u0/u0;
+
+  for (int mu = 0; mu < Nd; mu++) {
+
+    link = PeekIndex<LorentzIndex>(Umu, mu);
+    PokeIndex<LorentzIndex>(Umu, link*oneLinkFwd, mu );
+
+    link = PeekIndex<LorentzIndex>(Umu, mu+4);
+    PokeIndex<LorentzIndex>(Umu, link*oneLinkBwd, mu+4);
+
+    link = PeekIndex<LorentzIndex>(UUUmu, mu);
+    PokeIndex<LorentzIndex>(UUUmu, link*threeLinkFwd, mu );
+
+    link = PeekIndex<LorentzIndex>(UUUmu, mu+4);
+    PokeIndex<LorentzIndex>(UUUmu, link*threeLinkBwd, mu+4);
+  }
+
+  CopyGaugeCheckerboards();
+}
+
+/////////////////////////////
+// Implement the interface
+/////////////////////////////
+
+// Full operator: applies Dhop (no dagger) into `out`, then combines with the
+// mass term through axpy_norm and returns that call's result.
+// NOTE(review): presumably axpy_norm sets out = mass*in + out and returns the
+// squared norm of the result -- confirm against its definition.
+template <class Impl>
+RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Dhop(in, out, DaggerNo);
+  return axpy_norm(out, mass, in, out);
+}
+
+// Daggered full operator: as M, but Dhop is called with DaggerYes. The mass
+// term enters through the same axpy_norm call.
+// NOTE(review): axpy_norm semantics are not visible here -- see its definition.
+template <class Impl>
+RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Dhop(in, out, DaggerYes);
+  return axpy_norm(out, mass, in, out);
+}
+
+// Checkerboard-changing hop without dagger: an Odd source goes through
+// DhopEO, any other source checkerboard through DhopOE.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
+  const bool oddSource = (in.Checkerboard() == Odd);
+  if (!oddSource) {
+    DhopOE(in, out, DaggerNo);
+    return;
+  }
+  DhopEO(in, out, DaggerNo);
+}
+// Daggered checkerboard-changing hop: an Odd source goes through DhopEO, any
+// other source checkerboard through DhopOE, both with DaggerYes.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+  const bool oddSource = (in.Checkerboard() == Odd);
+  if (!oddSource) {
+    DhopOE(in, out, DaggerYes);
+    return;
+  }
+  DhopEO(in, out, DaggerYes);
+}
+
+// Checkerboard-diagonal term: out = mass * in, with out on in's checkerboard.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  // Convert the mass to the field's scalar type before multiplying.
+  typename FermionField::scalar_type scal(mass);
+  out = scal * in;
+}
+
+// Daggered diagonal term; implemented by calling Mooee unchanged.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  Mooee(in, out);
+}
+
+// Inverse of the diagonal term: out = (1/mass) * in. There is no guard
+// against mass == 0.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  out = (1.0 / (mass)) * in;
+}
+
+// Daggered inverse diagonal term; implemented by calling MooeeInv unchanged.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
+						 FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  MooeeInv(in, out);
+}
+
+///////////////////////////////////
+// Internal
+///////////////////////////////////
+
+// Force-term helper: UNFINISHED. After the halo exchange of B the loop runs
+// the single-hop kernel for mu = 0 only and then stops at assert(0); `mat` is
+// never written. The comment block inside the loop sketches the intended
+// one-link and three-link contributions.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, 
+						   GaugeField & mat,
+						   const FermionField &A, const FermionField &B, int dag) {
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  Compressor compressor;
+
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());
+  Atilde = A;
+
+  st.HaloExchange(B, compressor);
+
+  for (int mu = 0; mu < Nd; mu++) {
+
+    ////////////////////////
+    // Call the single hop
+    ////////////////////////
+    auto U_v   = U.View();
+    auto UUU_v = UUU.View();
+    auto B_v   = B.View();
+    auto Btilde_v   = Btilde.View();
+    thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++), {
+      Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
+    });
+
+    // Force in three link terms
+    //
+    //    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
+    //
+    // dU_ac(x)/dt = i p_ab U_bc(x)
+    //
+    // => dS_f/dt = dS_f/dU_ac(x) . dU_ac(x)/dt =  i p_ab U_bc(x) dS_f/dU_ac(x) 
+    //
+    // One link: form fragments S_f = A U B 
+    //
+    //         write Btilde = U(x) B(x+mu)
+    //
+    // mat+= TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+    // 
+    // Three link: form fragments S_f = A UUU B 
+    //
+    // mat+= outer ( A, UUUB) <-- Best take DhopDeriv with one link or identity matrix
+    // mat+= outer ( AU, UUB) <-- and then use covariant cshift?
+    // mat+= outer ( AUU, UB) <-- Returned from call to DhopDir
+
+    assert(0);// need to figure out the force interface with a blasted three link term.
+    
+  }
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+  // Full-grid derivative: all fields must live on the same grid as _grid.
+  auto ugrid = U.Grid();
+  conformable(ugrid, _grid);
+  conformable(ugrid, V.Grid());
+  conformable(ugrid, mat.Grid());
+
+  // The force inherits the checkerboard tag of U.
+  mat.Checkerboard() = U.Checkerboard();
+
+  DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+  // Half-grid derivative: all fields must live on the checkerboarded grid.
+  auto ugrid = U.Grid();
+  conformable(ugrid, _cbgrid);
+  conformable(ugrid, V.Grid());
+  conformable(ugrid, mat.Grid());
+
+  // V is required on the even sites and U on the odd sites; the force is
+  // tagged odd.
+  assert(V.Checkerboard() == Even);
+  assert(U.Checkerboard() == Odd);
+  mat.Checkerboard() = Odd;
+
+  DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+  // Half-grid derivative: all fields must live on the checkerboarded grid.
+  auto ugrid = U.Grid();
+  conformable(ugrid, _cbgrid);
+  conformable(ugrid, V.Grid());
+  conformable(ugrid, mat.Grid());
+
+  // V is required on the odd sites and U on the even sites; the force is
+  // tagged even.
+  assert(V.Checkerboard() == Odd);
+  assert(U.Checkerboard() == Even);
+  mat.Checkerboard() = Even;
+
+  DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
+{
+  // A full-grid application is counted as two calls (DhopOE and DhopEO
+  // each count one).
+  DhopCalls += 2;
+
+  auto ingrid = in.Grid();
+  conformable(ingrid, _grid);       // input must be on the full grid
+  conformable(ingrid, out.Grid());  // output on the same grid as input
+
+  out.Checkerboard() = in.Checkerboard();
+
+  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
+{
+  DhopCalls += 1;
+
+  auto ingrid = in.Grid();
+  conformable(ingrid, _cbgrid);     // input must be on the half grid
+  conformable(ingrid, out.Grid());  // grid-only comparison: drops the cb check
+
+  // Even-site input produces an odd-site result.
+  assert(in.Checkerboard() == Even);
+  out.Checkerboard() = Odd;
+
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
+{
+  DhopCalls += 1;
+
+  auto ingrid = in.Grid();
+  conformable(ingrid, _cbgrid);     // input must be on the half grid
+  conformable(ingrid, out.Grid());  // grid-only comparison: drops the cb check
+
+  // Odd-site input produces an even-site result.
+  assert(in.Checkerboard() == Odd);
+  out.Checkerboard() = Even;
+
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+  // The directional matrix application is just the single-direction hop.
+  this->DhopDir(in, out, dir, disp);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+
+  // Exchange halos on the full-grid stencil before touching any site.
+  Compressor compressor;
+  Stencil.HaloExchange(in, compressor);
+
+  auto Umu_v   =   Umu.View();
+  auto UUUmu_v = UUUmu.View();
+  auto in_v    =  in.View();
+  auto out_v   = out.View();
+
+  // One single-direction hop kernel per outer site.
+  const int nsite = in.Grid()->oSites();
+  thread_loop( (int sss = 0; sss < nsite; sss++) , {
+      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
+  });
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+                                                  DoubledGaugeField &U,
+                                                  DoubledGaugeField &UUU,
+                                                  const FermionField &in,
+                                                  FermionField &out, int dag)
+{
+  // Dispatch between the two hop implementations. The overlapped
+  // comms/compute path is only compiled with GRID_OMP and only taken when
+  // selected through StaggeredKernelsStatic::Comms; everything else uses the
+  // serial-comms path.
+#ifdef GRID_OMP
+  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) {
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    return;
+  }
+#endif
+  DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+}
+// Hopping term with communication overlapped with the interior compute.
+//
+// Sequence (GRID_OMP builds only; otherwise this asserts):
+//   1. gather the halo faces and merge the shared-memory part,
+//   2. inside one OpenMP parallel region, the first `ncomms` threads drive the
+//      communication while the remaining threads apply the kernel with
+//      (interior=1, exterior=0) to a contiguous share of the sites,
+//   3. merge the received halos,
+//   4. apply the kernel with (interior=0, exterior=1) to the surface sites.
+//
+// Every timer opened with `-= usecond()` is closed with `+= usecond()`,
+// including DhopFaceTime around CommsMerge and DhopTotalTime around the whole
+// routine, so the accumulated counters are elapsed times.
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+								 DoubledGaugeField &U,
+								 DoubledGaugeField &UUU,
+								 const FermionField &in,
+								 FermionField &out, int dag) 
+{
+#ifdef GRID_OMP
+  Compressor compressor; 
+  int len =  U.Grid()->oSites();
+
+  DhopTotalTime   -= usecond();
+
+  DhopFaceTime    -= usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommsMergeSHM(compressor);
+  DhopFaceTime    += usecond();
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime    -= usecond();
+#pragma omp parallel 
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    // At least one thread must be left over for the compute.
+    assert(nthreads > ncomms);
+
+    if (tid >= ncomms) {
+      // Compute thread: split the `len` sites into near-equal contiguous
+      // blocks; the first `rem` compute threads take one extra site.
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = len;
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      auto U_v   = U.View();
+      auto UUU_v = UUU.View();
+      auto in_v  = in.View();
+      auto out_v = out.View();
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+	  // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
+        }
+      }
+    } else {
+      // Communication thread.
+      st.CommunicateThreaded();
+    }
+  }
+  DhopComputeTime    += usecond();
+
+  // First to enter, last to leave timing
+  DhopFaceTime    -= usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime    += usecond();
+
+  // Second pass: only the surface sites, using the received halo data.
+  DhopComputeTime2    -= usecond();
+  {
+    auto U_v   = U.View();
+    auto UUU_v = UUU.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    if (dag == DaggerYes) {
+      int sz=st.surface_list.size();
+      thread_loop( (int ss = 0; ss < sz; ss++) ,{
+	int sU = st.surface_list[ss];
+	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      });
+    } else {
+      int sz=st.surface_list.size();
+      thread_loop( (int ss = 0; ss < sz; ss++) ,{
+	int sU = st.surface_list[ss];
+	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      });
+    }
+  }
+  DhopComputeTime2    += usecond();
+
+  DhopTotalTime   += usecond();
+#else
+  assert(0);
+#endif
+}
+
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+                                                             DoubledGaugeField &U,
+                                                             DoubledGaugeField &UUU,
+                                                             const FermionField &in,
+                                                             FermionField &out, int dag)
+{
+  // Hopping term without overlap: complete the halo exchange first, then
+  // apply the site kernel to every outer site.
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  DhopTotalTime   -= usecond();
+
+  // Communication phase.
+  DhopCommTime    -= usecond();
+  Compressor compressor;
+  st.HaloExchange(in, compressor);
+  DhopCommTime    += usecond();
+
+  auto U_v   =   U.View();
+  auto UUU_v = UUU.View();
+  auto in_v  =  in.View();
+  auto out_v = out.View();
+  const int nsite = in.Grid()->oSites();
+
+  // Compute phase: plain or daggered kernel depending on `dag`.
+  DhopComputeTime -= usecond();
+  if (dag != DaggerYes) {
+    thread_loop( (int sss = 0; sss < nsite; sss++), {
+      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    });
+  } else {
+    thread_loop( (int sss = 0; sss < nsite; sss++), {
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    });
+  }
+  DhopComputeTime += usecond();
+
+  DhopTotalTime   += usecond();
+}
+
+  ////////////////////////////////////////////////////////////////
+  // Reporting
+  ////////////////////////////////////////////////////////////////
+// Print the accumulated Dhop call count, per-call timings and derived
+// flop rates, followed by the reports of the three stencils.
+//
+// The compute time is averaged over ranks through a collective GlobalSum.
+// The average is kept in a local so the member counter DhopComputeTime is
+// left untouched and Report() can be called repeatedly. The per-call ratios
+// are only printed when at least one call was recorded, which avoids
+// dividing by a zero call count.
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::Report(void) 
+{
+  Coordinate latt = _grid->GlobalDimensions();
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  // Average the compute time. This is a collective operation, so it is
+  // performed unconditionally.
+  RealD AvgComputeTime = DhopComputeTime;
+  _grid->GlobalSum(AvgComputeTime);
+  AvgComputeTime/=NP;
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+
+  if ( DhopCalls > 0 ) {
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
+	      << DhopTotalTime   / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
+	      << DhopCommTime    / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
+	      << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+    RealD mflops = 1154*volume*DhopCalls/AvgComputeTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+  
+    RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+  }
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+// Reset every Dhop timing/call counter of this operator, including the
+// second-pass compute timer accumulated by the overlapped-comms path, and
+// the counters of the three stencils.
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
+{
+  DhopCalls        = 0;
+  DhopTotalTime    = 0;
+  DhopCommTime     = 0;
+  DhopComputeTime  = 0;
+  DhopComputeTime2 = 0;
+  DhopFaceTime     = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
+
+//////////////////////////////////////////////////////// 
+// Conserved current - not yet implemented.
+////////////////////////////////////////////////////////
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                              PropagatorField &q_in_2,
+                                                              PropagatorField &q_out,
+                                                              Current curr_type,
+                                                              unsigned int mu)
+{
+  // Placeholder: no conserved-current contraction exists for this action
+  // yet, so any call aborts.
+  assert(0);
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu,
+                                                         unsigned int tmin,
+                                                         unsigned int tmax,
+                                                         ComplexField &lattice_cmplx)
+{
+  // Placeholder: no sequential conserved-current insertion exists for this
+  // action yet, so any call aborts.
+  assert(0);
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
@@ -0,0 +1,497 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl>
+MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
+					   GaugeField            &_Umu,
+					   GridCartesian         &FiveDimGrid,
+					   GridRedBlackCartesian &FiveDimRedBlackGrid,
+					   GridCartesian         &FourDimGrid,
+					   GridRedBlackCartesian &FourDimRedBlackGrid,
+					   RealD _mq1, RealD _mq2, RealD _mq3,
+					   RealD _shift, int _pm, RealD _M5,
+					   RealD _b, RealD _c, const ImplParams &p) :
+  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
+			    _shift, _pm, _M5, _b, _c, p)
+{
+  const int Ls = this->Ls;
+
+  // Approximation data from Approx::higham, sized to Ls. It is only needed
+  // to set the tanh coefficients and is released afterwards.
+  RealD eps = 1.0;
+  Approx::zolotarev_data *zdata = Approx::higham(eps, Ls);
+  assert(zdata->n == Ls);
+
+  std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
+    ",c=" << _c << ") with Ls=" << Ls << std::endl;
+  this->SetCoefficientsTanh(zdata, _b, _c);
+  std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
+    ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
+    ",pm=" << _pm << ")" << std::endl;
+
+  Approx::zolotarev_free(zdata);
+
+  // Shift-operator coefficients: computed for a non-zero shift, otherwise
+  // the tables are sized to Ls and filled with zeros.
+  if(_shift == 0.0){
+    Mooee_shift.resize(Ls, 0.0);
+    MooeeInv_shift_lc.resize(Ls, 0.0);
+    MooeeInv_shift_norm.resize(Ls, 0.0);
+    MooeeInvDag_shift_lc.resize(Ls, 0.0);
+    MooeeInvDag_shift_norm.resize(Ls, 0.0);
+  } else {
+    SetCoefficientsPrecondShiftOps();
+  }
+}
+
+/****************************************************************
+ * Additional EOFA operators only called outside the inverter.  
+ * Since speed is not essential, simple axpby-style
+ * implementations should be fine.
+ ***************************************************************/
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
+{
+  int Ls = this->Ls;
+  RealD alpha = this->alpha;
+
+  Din = Zero();
+
+  // Only sign = +1/-1 combined with dag = 0/1 are handled; any other
+  // combination leaves Din at zero.
+  const bool plus  = (sign ==  1);
+  const bool minus = (sign == -1);
+  if(!plus && !minus){ return; }
+  if((dag != 0) && (dag != 1)){ return; }
+
+  for(int s=0; s<Ls; ++s){
+
+    // s-dependent weight of \Omega_{+} or \Omega_{-}
+    RealD w;
+    if(plus){ w = 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s); }
+    else    { w = 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1); }
+
+    if(dag == 0){
+      // \Omega_{+/-}: slice s of Din receives w times slice 0 of psi
+      axpby_ssp(Din, 0.0, psi, w, psi, s, 0);
+    } else {
+      // \Omega_{+/-}^{\dagger}: slice 0 of Din accumulates w times slice s of psi
+      axpby_ssp(Din, 1.0, Din, w, psi, 0, s);
+    }
+  }
+}
+
+// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
+// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
+{
+  int Ls    = this->Ls;
+  RealD b   = 0.5 * ( 1.0 + this->alpha );
+  RealD c   = 0.5 * ( 1.0 - this->alpha );
+  RealD mq1 = this->mq1;
+
+  for(int s=0; s<Ls; ++s){
+
+    // The s == 0 rule takes precedence when both boundary conditions hold.
+    const bool first = (s == 0);
+    const bool last  = (!first) && (s == (Ls-1));
+
+    // P_- term couples to slice s+1, wrapping to slice 0 with weight mq1*c
+    // on the last slice.
+    int   s_up = s+1;
+    RealD c_up = -c;
+    if(last){ s_up = 0; c_up = mq1*c; }
+
+    // P_+ term couples to slice s-1, wrapping to slice Ls-1 with weight
+    // mq1*c on the first slice.
+    int   s_dn = s-1;
+    RealD c_dn = -c;
+    if(first){ s_dn = Ls-1; c_dn = mq1*c; }
+
+    axpby_ssp_pminus(chi, b, psi, c_up, psi, s, s_up);
+    axpby_ssp_pplus (chi, 1.0, chi, c_dn, psi, s, s_dn);
+  }
+}
+
+// Apply the inverse of Dtilde: for every fifth-dimension slice s,
+//   chi_s = \sum_{sp} ( DtInv_p(s,sp) P_+ + DtInv_m(s,sp) P_- ) psi_{sp}.
+//
+// The result is accumulated in a zero-initialised temporary and copied to
+// chi at the end, so that
+//   - chi actually receives the result,
+//   - both chirality terms of every sp (including sp == 0) contribute, and
+//   - no uninitialised data enters the sum.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+  RealD m = this->mq1;
+  RealD c = 0.5 * this->alpha;
+  RealD d = 0.5;
+
+  RealD DtInv_p(0.0), DtInv_m(0.0);
+  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
+
+  // Accumulator, tagged like the input and cleared before use.
+  FermionField tmp(this->FermionGrid());
+  tmp.Checkerboard() = psi.Checkerboard();
+  tmp = Zero();
+
+  for(int s=0; s<Ls; ++s){
+    for(int sp=0; sp<Ls; ++sp){
+
+      // Matrix elements of the inverse for the P_+ and P_- chiralities.
+      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
+      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
+      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
+      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
+
+      // tmp_s += DtInv_p P_+ psi_sp + DtInv_m P_- psi_sp
+      axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
+      axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
+
+    }}
+
+  chi = tmp;
+}
+
+/*****************************************************************************************************/
+
+template<class Impl>
+RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+{
+  // Scratch field on the same grid as the input.
+  FermionField work(psi.Grid());
+
+  // chi = DW( Meooe5D psi ) + psi, then the 5d term is applied through
+  // M5D(psi, chi).
+  this->Meooe5D(psi, work);
+  this->DW(work, chi, DaggerNo);
+  axpby(chi, 1.0, 1.0, chi, psi);
+  this->M5D(psi, chi);
+
+  RealD nrm = norm2(chi);
+  return nrm;
+}
+
+template<class Impl>
+RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+{
+  // Scratch field on the same grid as the input.
+  FermionField work(psi.Grid());
+
+  // chi = MeooeDag5D( DW^dag psi ), then the daggered 5d term is applied
+  // through M5Ddag(psi, chi), and finally psi is added.
+  this->DW(psi, work, DaggerYes);
+  this->MeooeDag5D(work, chi);
+  this->M5Ddag(psi, chi);
+  axpby(chi, 1.0, 1.0, chi, psi);
+
+  RealD nrm = norm2(chi);
+  return nrm;
+}
+
+/********************************************************************
+ * Performance critical fermion operators called inside the inverter
+ ********************************************************************/
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  // Tridiagonal coefficients: unit diagonal, -1 off-diagonals, with mq1 in
+  // the two wrap-around entries.
+  Vector<Coeff_t> diag (Ls, 1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = this->mq1;
+  lower[0]    = this->mq1;
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5D(psi, chi, chi, lower, diag, upper);
+  }
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  // Tridiagonal coefficients: unit diagonal, -1 off-diagonals, with mq1 in
+  // the two wrap-around entries.
+  Vector<Coeff_t> diag (Ls, 1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
+  upper[Ls-1] = this->mq1;
+  lower[0]    = this->mq1;
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5Ddag(psi, chi, chi, lower, diag, upper);
+  }
+}
+
+// half checkerboard operations
+template<class Impl>
+void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  // coefficients of Mooee: bee on the diagonal, -cee on both off-diagonals
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
+  for(int s=0; s<Ls; s++){
+    Coeff_t offdiag = -this->cee[s];
+    upper[s] = offdiag;
+    lower[s] = offdiag;
+  }
+
+  // wrap-around entries are additionally scaled by -mq1
+  upper[Ls-1] *= -this->mq1;
+  lower[0]    *= -this->mq1;
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5D(psi, psi, chi, lower, diag, upper);
+  }
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
+{
+  int Ls = this->Ls;
+
+  // coefficients of MooeeDag
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
+  for(int s=0; s<Ls; s++){
+
+    // upper: -cee of the next slice, except on the last slice (when it is
+    // not also the first) where the wrap-around entry mq1*cee[0] is used
+    if((s != 0) && (s == (Ls-1))) {
+      upper[s] = this->mq1*this->cee[0];
+    } else {
+      upper[s] = -this->cee[s+1];
+    }
+
+    // lower: -cee of the previous slice, except on the first slice where
+    // the wrap-around entry mq1*cee[Ls-1] is used
+    if(s == 0) {
+      lower[s] = this->mq1*this->cee[Ls-1];
+    } else {
+      lower[s] = -this->cee[s-1];
+    }
+  }
+
+  if(this->shift != 0.0){
+    // fused M + shift operation
+    this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
+  } else {
+    // no shift term
+    this->M5Ddag(psi, psi, chi, lower, diag, upper);
+  }
+}
+
+/****************************************************************************************/
+
+// Computes coefficients for applying Cayley preconditioned shift operators
+//  (Mooee + \Delta) --> Mooee_shift
+//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
+//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
+// For the latter two cases, the operation takes the form
+//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = [ Mooee^{-1} ]_{ij} \psi_{j} +
+//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
+// Fills the five shift-coefficient tables (each of length Ls) from the
+// current values of pm, alpha, k, mq1, shift and the bee/cee coefficients:
+//   Mooee_shift, MooeeInv_shift_lc, MooeeInv_shift_norm,
+//   MooeeInvDag_shift_lc, MooeeInvDag_shift_norm.
+// The tables are resized here and every entry is overwritten.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
+{
+  int   Ls    = this->Ls;
+  int   pm    = this->pm;
+  RealD alpha = this->alpha;
+  RealD k     = this->k;
+  RealD mq1   = this->mq1;
+  RealD shift = this->shift;
+
+  // Initialize
+  Mooee_shift.resize(Ls);
+  MooeeInv_shift_lc.resize(Ls);
+  MooeeInv_shift_norm.resize(Ls);
+  MooeeInvDag_shift_lc.resize(Ls);
+  MooeeInvDag_shift_norm.resize(Ls);
+
+  // Construct Mooee_shift
+  // The overall sign follows pm; for pm != 1 the entries are stored in
+  // reversed s order (idx = Ls-1-s).
+  int idx(0);
+  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
+    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
+  for(int s=0; s<Ls; ++s){
+    idx = (pm == 1) ? (s) : (Ls-1-s);
+    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
+  }
+
+  // Tridiagonal solve for MooeeInvDag_shift_lc
+  {
+    Coeff_t m(0.0);
+    Vector<Coeff_t> d = Mooee_shift;   // right-hand side of the first solve
+    Vector<Coeff_t> u(Ls,0.0);         // unit vector: right-hand side of the second solve
+    Vector<Coeff_t> y(Ls,0.0);         // solution for right-hand side d
+    Vector<Coeff_t> q(Ls,0.0);         // solution for right-hand side u
+    if(pm == 1){ u[0] = 1.0; }
+    else{ u[Ls-1] = 1.0; }
+
+    // Tridiagonal matrix algorithm + Sherman-Morrison formula
+    //
+    // We solve
+    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
+    // where Mooee' is the tridiagonal part of Mooee_{+}, and
+    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
+    // so that the outer-product u \otimes v gives the (0,Ls-1)
+    // entry of Mooee_{+}.
+    //
+    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
+    // and then construct the solution to the original system
+    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
+    // Forward elimination is only performed for pm == 1.
+    if(pm == 1){
+      for(int s=1; s<Ls; ++s){
+	m = -this->cee[s] / this->bee[s-1];
+	d[s] -= m*d[s-1];
+	u[s] -= m*u[s-1];
+      }
+    }
+    // Back substitution, starting from the last slice. For pm == 1 each
+    // entry is a plain division by bee[s]; otherwise the cee[s] coupling
+    // to slice s+1 is included.
+    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
+    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
+    for(int s=Ls-2; s>=0; --s){
+      if(pm == 1){
+	y[s] = d[s] / this->bee[s];
+	q[s] = u[s] / this->bee[s];
+      } else {
+	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
+	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
+      }
+    }
+
+    // Construct MooeeInvDag_shift_lc
+    // Combination y - <v,y>/(1+<v,q>) q; v has a single non-zero entry, at
+    // position Ls-1 for pm == 1 and at position 0 otherwise.
+    for(int s=0; s<Ls; ++s){
+      if(pm == 1){
+	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
+	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
+      } else {
+	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
+	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
+      }
+    }
+
+    // Compute remaining coefficients
+    // N is reused here as the common normalisation 1 + (boundary entry of
+    // MooeeInvDag_shift_lc).
+    // NOTE(review): the closed forms below use bee[s]/cee[s] raised to powers
+    // of Ls; this presumably relies on s-independent coefficients - confirm.
+    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
+    for(int s=0; s<Ls; ++s){
+
+      // MooeeInv_shift_lc
+      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
+      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
+
+      // MooeeInv_shift_norm
+      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
+	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
+
+      // MooeeInvDag_shift_norm
+      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
+     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
+      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
+	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
+    }
+  }
+}
+
+// Recompute coefficients for a different value of shift constant
+// Store a new shift constant and rebuild the shift-coefficient tables.
+//
+// For a non-zero shift the coefficients are recomputed. For a zero shift all
+// five tables are set to Ls zeros; assign() is used rather than resize()
+// because resize() keeps the existing entries of a table that already has
+// length Ls, which would leave coefficients of a previous non-zero shift in
+// place.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
+{
+  this->shift = new_shift;
+  if(new_shift != 0.0){
+    SetCoefficientsPrecondShiftOps();
+  } else {
+    int Ls = this->Ls;
+    Mooee_shift.assign(Ls,0.0);
+    MooeeInv_shift_lc.assign(Ls,0.0);
+    MooeeInv_shift_norm.assign(Ls,0.0);
+    MooeeInvDag_shift_lc.assign(Ls,0.0);
+    MooeeInvDag_shift_norm.assign(Ls,0.0);
+  }
+}
+
+// Builds the dense Ls x Ls matrices for the two chiralities (Pplus, Pminus)
+// from bee, cee, mq1 and, for a non-zero shift, the shift term; optionally
+// inverts them (inv) and/or takes the adjoint (dag); then packs the entries
+// into SIMD words stored in Matp and Matm (each resized to Ls*LLs).
+//
+// Returns immediately, leaving Matp/Matm untouched, when the reduced
+// fifth-dimension extent of the red-black grid equals Ls.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
+						   Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+{
+  int Ls = this->Ls;
+
+  GridBase* grid = this->FermionRedBlackGrid();
+  int LLs = grid->_rdimensions[0];
+
+  if(LLs == Ls){ return; } // Not vectorised in 5th direction
+
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+
+  // Diagonal: bee for both chiralities.
+  for(int s=0; s<Ls; s++){
+    Pplus(s,s)  = this->bee[s];
+    Pminus(s,s) = this->bee[s];
+  }
+
+  // Off-diagonals: Pminus gets the super-diagonal, Pplus the sub-diagonal.
+  for(int s=0; s<Ls-1; s++){
+    Pminus(s,s+1) = -this->cee[s];
+    Pplus(s+1,s) = -this->cee[s+1];
+  }
+
+  // Wrap-around corner entries, weighted by mq1.
+  Pplus (0,Ls-1) = this->mq1*this->cee[0];
+  Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
+
+  // Shift term: added to the last column of Pplus for pm == 1, otherwise
+  // to the first column of Pminus.
+  if(this->shift != 0.0){
+    RealD c = 0.5 * this->alpha;
+    RealD d = 0.5;
+    RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
+    if(this->pm == 1) {
+      for(int s=0; s<Ls; ++s){
+	Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
+      }
+    } else {
+      for(int s=0; s<Ls; ++s){
+	Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
+      }
+    }
+  }
+
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
+
+  // Either the matrices themselves or their inverses are packed.
+  if(inv) {
+    PplusMat  = Pplus.inverse();
+    PminusMat = Pminus.inverse();
+  } else {
+    PplusMat  = Pplus;
+    PminusMat = Pminus;
+  }
+
+  // The adjoint is applied after the optional inversion.
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd = Simd::Nsimd();
+  Matp.resize(Ls*LLs);
+  Matm.resize(Ls*LLs);
+
+  // Pack column s2 of each matrix: SIMD lane l of word (s2,s1) holds the
+  // entry of row l*LLs + s1.
+  for(int s2=0; s2<Ls; s2++){
+    for(int s1=0; s1<LLs; s1++){
+      int istride = LLs;
+      int ostride = 1;
+      Simd Vp;
+      Simd Vm;
+      // The SIMD words are filled lane by lane through scalar pointers.
+      scalar_type *sp = (scalar_type*) &Vp;
+      scalar_type *sm = (scalar_type*) &Vm;
+      for(int l=0; l<Nsimd; l++){
+	if(switcheroo<Coeff_t>::iscomplex()) {
+	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
+	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
+	} else {
+	  // if real
+	  // the real part is written into both components of the scalar
+	  scalar_type tmp;
+	  tmp = PplusMat (l*istride+s1*ostride,s2);
+	  sp[l] = scalar_type(tmp.real(),tmp.real());
+	  tmp = PminusMat(l*istride+s1*ostride,s2);
+	  sm[l] = scalar_type(tmp.real(),tmp.real());
+	}
+      }
+      Matp[LLs*s2+s1] = Vp;
+      Matm[LLs*s2+s1] = Vm;
+    }}
+}
+
+FermOpTemplateInstantiate(MobiusEOFAFermion);
+GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermioncache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermioncache.h
@@ -0,0 +1,445 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/implementation/MobiusEOFAFermioncache.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+{
+  // For every index s inside each block of Ls consecutive outer sites:
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
+  // where s+1 wraps to 0 at s==Ls-1 and s-1 wraps to Ls-1 at s==0.
+  const int Ls = this->Ls;
+  GridBase *grid = psi_i.Grid();
+
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+    for(int s=0; s<Ls; s++){
+      // Neighbour offsets inside the block.  The s==0 case takes precedence
+      // over the s==Ls-1 case, exactly like an if / else-if chain would.
+      const int s_up = ((s != 0) && (s == Ls-1)) ? 0 : s+1;
+      const int s_dn = (s == 0) ? Ls-1 : s-1;
+
+      auto proj = psi[0]; // scratch spinor; psi[0] only supplies the site type
+
+      spProj5m(proj, psi[ss+s_up]);
+      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
+
+      spProj5p(proj, psi[ss+s_dn]);
+      chi[ss+s] = chi[ss+s] + lower[s]*proj;
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					Vector<Coeff_t> &shift_coeffs)
+{
+  // Same nearest-neighbour stencil as M5D, followed by a shift contribution:
+  //   chi[s] += shift_coeffs[s] * proj(psi[shift_s])
+  // with proj = spProj5p and shift_s = Ls-1 when pm == 1,
+  // and  proj = spProj5m and shift_s = 0    otherwise.
+  const int Ls = this->Ls;
+  GridBase *grid = psi_i.Grid();
+
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  // s-slice of psi that feeds the shift term
+  const int shift_s = (this->pm == 1) ? (Ls-1) : 0;
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+    for(int s=0; s<Ls; s++){
+      // Wrapped neighbour offsets; s==0 takes precedence over s==Ls-1.
+      const int s_up = ((s != 0) && (s == Ls-1)) ? 0 : s+1;
+      const int s_dn = (s == 0) ? Ls-1 : s-1;
+
+      auto proj = psi[0]; // scratch spinor; psi[0] only supplies the site type
+
+      spProj5m(proj, psi[ss+s_up]);
+      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
+
+      spProj5p(proj, psi[ss+s_dn]);
+      chi[ss+s] = chi[ss+s] + lower[s]*proj;
+
+      // Shift contribution, sourced from the single slice shift_s.
+      if(this->pm == 1){
+	spProj5p(proj, psi[ss+shift_s]);
+      } else {
+	spProj5m(proj, psi[ss+shift_s]);
+      }
+      chi[ss+s] = chi[ss+s] + shift_coeffs[s]*proj;
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+{
+  // Same stencil shape as M5D but with the two projectors exchanged:
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
+  // where s+1 wraps to 0 at s==Ls-1 and s-1 wraps to Ls-1 at s==0.
+  const int Ls = this->Ls;
+  GridBase *grid = psi_i.Grid();
+
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+    auto proj = psi[0]; // scratch spinor; psi[0] only supplies the site type
+    for(int s=0; s<Ls; s++){
+      // Wrapped neighbour offsets; s==0 takes precedence over s==Ls-1.
+      const int s_up = ((s != 0) && (s == Ls-1)) ? 0 : s+1;
+      const int s_dn = (s == 0) ? Ls-1 : s-1;
+
+      spProj5p(proj, psi[ss+s_up]);
+      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
+
+      spProj5m(proj, psi[ss+s_dn]);
+      chi[ss+s] = chi[ss+s] + lower[s]*proj;
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// M5Ddag stencil plus the shift contribution.
+// For each block of Ls consecutive outer sites:
+//   chi[s]       = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
+//   chi[shift_s] += sum_s shift_coeffs[s] * proj(psi[s])
+// with proj = spProj5p, shift_s = Ls-1 when pm == 1 and proj = spProj5m,
+// shift_s = 0 otherwise.  Unlike M5D_shift, the shift term is gathered from
+// every s into the single slice shift_s.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
+					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					   Vector<Coeff_t> &shift_coeffs)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+    // chi[Ls-1] starts from zero because, for pm == 1, shift terms are
+    // accumulated into it for s < Ls-1 before its own stencil terms are added.
+    chi[ss+Ls-1] = Zero();
+    auto tmp = psi[0]; // scratch spinor; psi[0] only supplies the site type
+    for(int s=0; s<Ls; s++){
+      if(s==0) {
+	// lower neighbour wraps to Ls-1
+	spProj5p(tmp, psi[ss+s+1]);
+	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+	spProj5m(tmp, psi[ss+Ls-1]);
+	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+      } else if(s==(Ls-1)) {
+	// upper neighbour wraps to 0; accumulate (not assign) so that shift
+	// terms already stored in chi[Ls-1] are preserved
+	spProj5p(tmp, psi[ss+0]);
+	chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
+	spProj5m(tmp, psi[ss+s-1]);
+	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+      } else {
+	spProj5p(tmp, psi[ss+s+1]);
+	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
+	spProj5m(tmp, psi[ss+s-1]);
+	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
+      }
+      // Shift contribution of slice s, gathered into chi[shift_s].
+      if(this->pm == 1){ spProj5p(tmp, psi[ss+s]); }
+      else{ spProj5m(tmp, psi[ss+s]); }
+      chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
+    }
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// Apply the inverse of Mooee to psi and store the result in chi, for the
+// shift == 0 case; a non-zero shift is delegated to MooeeInv_shift.
+// The inverse is applied as four successive sweeps over each block of Ls
+// consecutive outer sites, using the member coefficient arrays lee, leem,
+// dee, ueem and uee (presumably an LDU-type factorisation, as the stage
+// comments below suggest -- the arrays are filled outside this file).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &chi_i)
+{
+  // Dispatch before touching chi_i or opening any view: MooeeInv_shift sets
+  // the checkerboard and opens its own views, so doing that work here first
+  // would only be repeated.  This also matches the ordering in MooeeInvDag.
+  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
+
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+
+    auto tmp = psi[0]; // scratch spinor; psi[0] only supplies the site type
+
+    // Apply (L^{\prime})^{-1}: forward substitution, each chi[s] uses chi[s-1]
+    chi[ss] = psi[ss]; // chi[0]=psi[0]
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp, chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
+    }
+
+    // L_m^{-1}: fold every s < Ls-1 into the last slice
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp, chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
+    }
+
+    // U_m^{-1} D^{-1}: scale by 1/dee and subtract the last-slice coupling.
+    // chi[Ls-1] is still unscaled here, hence the explicit /dee[Ls-1].
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp, chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
+    }
+    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
+
+    // Apply U^{-1}: backward substitution, each chi[s] uses chi[s+1]
+    for(int s=Ls-2; s>=0; s--){
+      spProj5m(tmp, chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Variant of MooeeInv used when shift != 0 (MooeeInv dispatches here).
+// Performs the same four sweeps as MooeeInv and additionally adds
+//   MooeeInv_shift_norm[s] * proj( sum_j MooeeInv_shift_lc[j]*psi[j] )
+// to every chi[s], with proj = spProj5p when pm == 1 and spProj5m otherwise.
+// MooeeInv_shift_lc / MooeeInv_shift_norm are coefficient arrays that are
+// populated outside this file.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  int Ls = this->Ls;
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+
+    // Scratch spinors; psi[0] only supplies the site type.
+    auto tmp1        = psi[0];
+    auto tmp2        = psi[0];
+    auto tmp2_spProj = psi[0];
+
+    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
+    chi[ss] = psi[ss]; // chi[0]=psi[0]
+    tmp2 = MooeeInv_shift_lc[0]*psi[ss];
+    for(int s=1; s<Ls; s++){
+      spProj5p(tmp1, chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
+      tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
+    }
+    // Project the accumulated linear combination once; reused for every s below.
+    if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
+    else{ spProj5m(tmp2_spProj, tmp2); }
+
+    // L_m^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+      spProj5m(tmp1, chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
+    }
+
+    // U_m^{-1} D^{-1}
+    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
+      spProj5p(tmp1, chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
+    }
+    // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
+    // The one-line form above is split in three so that tmp1 captures
+    // spProj5m(chi[Ls-1]) BEFORE the shift term is added; the U^{-1} sweep
+    // below consumes that unshifted projection.
+    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
+    spProj5m(tmp1, chi[ss+Ls-1]);
+    chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
+
+    // Apply U^{-1} and add shift term
+    // Statement order matters: on entry tmp1 holds spProj5m of the unshifted
+    // chi[s+1]; it is refreshed from chi[s] before chi[s] receives its shift.
+    for(int s=Ls-2; s>=0; s--){
+      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
+      spProj5m(tmp1, chi[ss+s]);
+      chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionField &chi_i)
+{
+  // A non-zero shift is handled entirely by the dedicated variant.
+  if(this->shift != 0.0){
+    MooeeInvDag_shift(psi_i,chi_i);
+    return;
+  }
+
+  const int Ls = this->Ls;
+  GridBase *grid = psi_i.Grid();
+
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+
+    const int last = ss+Ls-1; // outer index of the final slice of this block
+    auto proj = psi[0];       // scratch spinor; psi[0] only supplies the site type
+
+    // Apply (U^{\prime})^{-dag}: forward sweep, chi[s] depends on chi[s-1]
+    chi[ss] = psi[ss];
+    for(int s=1; s<Ls; s++){
+      spProj5m(proj, chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s] - this->uee[s-1]*proj;
+    }
+
+    // U_m^{-\dag}: fold every s < Ls-1 into the last slice
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(proj, chi[ss+s]);
+      chi[last] = chi[last] - this->ueem[s]*proj;
+    }
+
+    // L_m^{-\dag} D^{-dag}: chi[last] is still unscaled inside this loop
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(proj, chi[last]);
+      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*proj;
+    }
+    chi[last] = (1.0/this->dee[Ls-1])*chi[last];
+
+    // Apply L^{-dag}: backward sweep s = Ls-2 ... 0, chi[s] depends on chi[s+1]
+    for(int s=Ls-1; s-- > 0; ){
+      spProj5p(proj, chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - this->lee[s]*proj;
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Variant of MooeeInvDag used when shift != 0 (MooeeInvDag dispatches here).
+// Performs the same four sweeps as MooeeInvDag and additionally adds
+//   MooeeInvDag_shift_norm[s] * proj( sum_j MooeeInvDag_shift_lc[j]*psi[j] )
+// to every chi[s], with proj = spProj5p when pm == 1 and spProj5m otherwise.
+// MooeeInvDag_shift_lc / MooeeInvDag_shift_norm are coefficient arrays that
+// are populated outside this file.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, FermionField &chi_i)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase *grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+  int Ls = this->Ls;
+
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
+
+    // Scratch spinors; psi[0] only supplies the site type.
+    auto tmp1        = psi[0];
+    auto tmp2        = psi[0];
+    auto tmp2_spProj = psi[0];
+
+    // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
+    chi[ss] = psi[ss];
+    tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
+    for(int s=1; s<Ls; s++){
+      spProj5m(tmp1, chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
+      tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
+    }
+    // Project the accumulated linear combination once; reused for every s below.
+    if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
+    else{ spProj5m(tmp2_spProj, tmp2); }
+
+    // U_m^{-\dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5p(tmp1, chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
+    }
+
+    // L_m^{-\dag} D^{-dag}
+    for(int s=0; s<Ls-1; s++){
+      spProj5m(tmp1, chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
+    }
+    // tmp1 captures spProj5p(chi[Ls-1]) BEFORE the shift term is added; the
+    // L^{-dag} sweep below consumes that unshifted projection.
+    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
+    spProj5p(tmp1, chi[ss+Ls-1]);
+    chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
+
+    // Apply L^{-dag}
+    // Statement order matters: on entry tmp1 holds spProj5p of the unshifted
+    // chi[s+1]; it is refreshed from chi[s] before chi[s] receives its shift.
+    for(int s=Ls-2; s>=0; s--){
+      chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
+      spProj5p(tmp1, chi[ss+s]);
+      chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
+    }
+  });
+
+  this->MooeeInvTime += usecond();
+}
+
+// Instantiations of the routines above, compiled only when
+// MOBIUS_EOFA_DPERP_CACHE is defined.  INSTANTIATE_DPERP_MOBIUS_EOFA is not
+// defined in this file; presumably it expands to explicit instantiations of
+// the M5D / MooeeInv family for the given implementation policy -- TODO confirm.
+#ifdef MOBIUS_EOFA_DPERP_CACHE
+
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
+
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
+
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
@@ -0,0 +1,998 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * Dense matrix versions of routines
+ */
+// Mooee inverse, no dagger: forwards to MooeeInternal with DaggerNo / InverseYes.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
+}
+
+// Shifted Mooee inverse, no dagger.  Forwards to MooeeInternal with exactly
+// the same arguments as MooeeInv; any shift dependence must therefore live
+// inside MooeeInternal or the data it uses (not visible here).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
+}
+
+// Mooee inverse, daggered: forwards to MooeeInternal with DaggerYes / InverseYes.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
+}
+
+// Shifted Mooee inverse, daggered.  Forwards to MooeeInternal with exactly
+// the same arguments as MooeeInvDag; any shift dependence must therefore live
+// inside MooeeInternal or the data it uses (not visible here).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
+{
+  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
+}
+
+// M5D for the layout in which the s index is spread over SIMD lanes.
+// The grid holds LLs = _rdimensions[0] outer s-sites per block and nsimd
+// lanes, with Ls/LLs == nsimd (asserted); coefficient s = o + i*LLs is
+// packed into outer slot o, lane i.
+// Per outer slot v the kernel computes, component by component,
+//   chi(spin 0,1) = d*phi + l * psi[vm](spin 0,1)
+//   chi(spin 2,3) = d*phi + u * psi[vp](spin 2,3)
+// where vp / vm are the wrapped next / previous outer slots and the
+// neighbour data is lane-rotated whenever the wrap-around is crossed.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
+				  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid  = psi_i.Grid();
+  auto psi        = psi_i.View();
+  auto phi        = phi_i.View();
+  auto chi        = chi_i.View();
+  int Ls          = this->Ls;
+  int LLs         = grid->_rdimensions[0];
+  const int nsimd = Simd::Nsimd();
+
+  // Per-slot SIMD copies of the upper / lower / diagonal coefficients.
+  Vector<iSinglet<Simd>> u(LLs);
+  Vector<iSinglet<Simd>> l(LLs);
+  Vector<iSinglet<Simd>> d(LLs);
+
+  assert(Ls/LLs == nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type* u_p = (scalar_type*) &u[0];
+  scalar_type* l_p = (scalar_type*) &l[0];
+  scalar_type* d_p = (scalar_type*) &d[0];
+
+  // Scatter coefficient s = o + i*LLs into scalar position o*nsimd + i,
+  // i.e. outer slot o, lane i.
+  for(int o=0; o<LLs; o++){ // outer
+    for(int i=0; i<nsimd; i++){ //inner
+      int s   = o + i*LLs;
+      int ss  = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // The unrolled kernel below addresses colour components 0..2 explicitly.
+  assert(Nc == 3);
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
+
+#if 0
+    // Disabled reference implementation written with the generic
+    // projector / rotate / reconstruct helpers.
+
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0; v<LLs; v++){
+
+      int vp = (v+1)%LLs;
+      int vm = (v+LLs-1)%LLs;
+
+      spProj5m(hp, psi[ss+vp]);
+      spProj5p(hm, psi[ss+vm]);
+
+      if (vp <= v){ rotate(hp, hp, 1); }
+      if (vm >= v){ rotate(hm, hm, nsimd-1); }
+
+      hp = 0.5*hp;
+      hm = 0.5*hm;
+
+      spRecon5m(fp, hp);
+      spRecon5p(fm, hm);
+
+      chi[ss+v] = d[v]*phi[ss+v];
+      chi[ss+v] = chi[ss+v] + u[v]*fp;
+      chi[ss+v] = chi[ss+v] + l[v]*fm;
+
+    }
+
+#else
+    // Active implementation: the same stencil, hand-unrolled over the
+    // spin and colour components.
+
+    for(int v=0; v<LLs; v++){
+
+      // Prefetch the matching slot of the next block.
+      vprefetch(psi[ss+v+LLs]);
+
+      // Wrapped next / previous outer slot.
+      int vp = (v == LLs-1) ? 0     : v+1;
+      int vm = (v == 0)     ? LLs-1 : v-1;
+
+      // Spin components 2,3 of the "next" neighbour (colour 0..2).
+      Simd hp_00 = psi[ss+vp]()(2)(0);
+      Simd hp_01 = psi[ss+vp]()(2)(1);
+      Simd hp_02 = psi[ss+vp]()(2)(2);
+      Simd hp_10 = psi[ss+vp]()(3)(0);
+      Simd hp_11 = psi[ss+vp]()(3)(1);
+      Simd hp_12 = psi[ss+vp]()(3)(2);
+
+      // Spin components 0,1 of the "previous" neighbour (colour 0..2).
+      Simd hm_00 = psi[ss+vm]()(0)(0);
+      Simd hm_01 = psi[ss+vm]()(0)(1);
+      Simd hm_02 = psi[ss+vm]()(0)(2);
+      Simd hm_10 = psi[ss+vm]()(1)(0);
+      Simd hm_11 = psi[ss+vm]()(1)(1);
+      Simd hm_12 = psi[ss+vm]()(1)(2);
+
+      // vp <= v only when the "next" slot wrapped around: rotate the lanes.
+      // tRotate<2> stands in for rotate(.,.,1) of the disabled branch
+      // (presumably 2 reals == 1 complex lane -- TODO confirm).
+      if(vp <= v){
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+
+      // vm >= v only when the "previous" slot wrapped around: rotate the
+      // other way (counterpart of rotate(.,.,nsimd-1) in the disabled branch).
+      if(vm >= v){
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Can force these to real arithmetic and save 2x.
+      // Spin 0,1 take the lower (hm) term; spin 2,3 take the upper (hp) term.
+      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+
+      // Write all twelve spin-colour components of the result.
+      vstream(chi[ss+v]()(0)(0), p_00);
+      vstream(chi[ss+v]()(0)(1), p_01);
+      vstream(chi[ss+v]()(0)(2), p_02);
+      vstream(chi[ss+v]()(1)(0), p_10);
+      vstream(chi[ss+v]()(1)(1), p_11);
+      vstream(chi[ss+v]()(1)(2), p_12);
+      vstream(chi[ss+v]()(2)(0), p_20);
+      vstream(chi[ss+v]()(2)(1), p_21);
+      vstream(chi[ss+v]()(2)(2), p_22);
+      vstream(chi[ss+v]()(3)(0), p_30);
+      vstream(chi[ss+v]()(3)(1), p_31);
+      vstream(chi[ss+v]()(3)(2), p_32);
+    }
+
+#endif
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// M5D plus shift contribution, for the layout in which the s index is spread
+// over SIMD lanes (coefficient s = o + i*LLs lives in outer slot o, lane i).
+// The disabled branch is a reference built from M5D followed by per-s
+// axpby_ssp_pplus / axpby_ssp_pminus calls; the active branch fuses the
+// shift term into a hand-unrolled copy of the M5D kernel.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
+					FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+					Vector<Coeff_t>& shift_coeffs)
+{
+#if 0
+  auto & psi = psi_i;
+  auto & phi = phi_i;
+  auto & chi = chi_i;
+
+  this->M5D(psi, phi, chi, lower, diag, upper);
+
+  // FIXME: possible gain from vectorizing shift operation as well?
+  Coeff_t one(1.0);
+  int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
+    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
+  }
+
+#else
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid  = psi_i.Grid();
+  auto psi        = psi_i.View();
+  auto phi        = phi_i.View();
+  auto chi        = chi_i.View();
+  int Ls          = this->Ls;
+  int LLs         = grid->_rdimensions[0];
+  const int nsimd = Simd::Nsimd();
+
+  // Per-slot SIMD copies of the upper / lower / diagonal / shift coefficients.
+  Vector<iSinglet<Simd>> u(LLs);
+  Vector<iSinglet<Simd>> l(LLs);
+  Vector<iSinglet<Simd>> d(LLs);
+  Vector<iSinglet<Simd>> s(LLs);
+
+  assert(Ls/LLs == nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type* u_p = (scalar_type*) &u[0];
+  scalar_type* l_p = (scalar_type*) &l[0];
+  scalar_type* d_p = (scalar_type*) &d[0];
+  scalar_type* s_p = (scalar_type*) &s[0];
+
+  // Scatter coefficient s = o + i*LLs into scalar position o*nsimd + i.
+  // Note: the loop-local "int s" shadows the coefficient vector "s" declared
+  // above; inside this loop the vector is reached only through s_p.
+  for(int o=0; o<LLs; o++){ // outer
+    for(int i=0; i<nsimd; i++){ //inner
+      int s   = o + i*LLs;
+      int ss  = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+      s_p[ss] = shift_coeffs[s];
+    }}
+
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // The unrolled kernel below addresses colour components 0..2 explicitly.
+  assert(Nc == 3);
+
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
+
+    // Source of the shift term, loaded once per block: outer slot LLs-1 and
+    // spin components 2,3 when pm == 1, outer slot 0 and spin components 0,1
+    // otherwise.
+    // NOTE(review): the disabled reference branch uses axpby_ssp_pplus for
+    // pm == 1, whereas components 2,3 are the ones the M5D kernel in this
+    // file pairs with spProj5m -- confirm the projector choice is intended.
+    int vs     = (this->pm == 1) ? LLs-1 : 0;
+    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
+    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
+    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
+    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
+    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
+    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
+
+    for(int v=0; v<LLs; v++){
+
+      // Prefetch the matching slot of the next block.
+      vprefetch(psi[ss+v+LLs]);
+
+      // Wrapped next / previous outer slot.
+      int vp = (v == LLs-1) ? 0     : v+1;
+      int vm = (v == 0)     ? LLs-1 : v-1;
+
+      // Spin components 2,3 of the "next" neighbour (colour 0..2).
+      Simd hp_00 = psi[ss+vp]()(2)(0);
+      Simd hp_01 = psi[ss+vp]()(2)(1);
+      Simd hp_02 = psi[ss+vp]()(2)(2);
+      Simd hp_10 = psi[ss+vp]()(3)(0);
+      Simd hp_11 = psi[ss+vp]()(3)(1);
+      Simd hp_12 = psi[ss+vp]()(3)(2);
+
+      // Spin components 0,1 of the "previous" neighbour (colour 0..2).
+      Simd hm_00 = psi[ss+vm]()(0)(0);
+      Simd hm_01 = psi[ss+vm]()(0)(1);
+      Simd hm_02 = psi[ss+vm]()(0)(2);
+      Simd hm_10 = psi[ss+vm]()(1)(0);
+      Simd hm_11 = psi[ss+vm]()(1)(1);
+      Simd hm_12 = psi[ss+vm]()(1)(2);
+
+      // vp <= v only when the "next" slot wrapped around: rotate the lanes.
+      if(vp <= v){
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+
+      // With vs == LLs-1 this condition holds only on the last iteration
+      // (v == LLs-1); hs_* is rotated in place at that point.
+      if(this->pm == 1 && vs <= v){
+	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
+	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
+	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
+	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
+	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
+	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
+      }
+
+      // vm >= v only when the "previous" slot wrapped around: rotate the
+      // other way.
+      if(vm >= v){
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // With vs == 0 this condition holds only on the first iteration
+      // (v == 0).  hs_* lives outside the v loop, so the rotated value is
+      // what every later iteration sees.
+      // NOTE(review): this differs from the pm == 1 case, where the rotation
+      // only affects the last iteration -- confirm the asymmetry is intended.
+      if(this->pm == -1 && vs >= v){
+	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
+	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
+	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
+	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
+	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
+	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
+      }
+
+      // Can force these to real arithmetic and save 2x.
+      // Spin 0,1: d*phi + l*hm, plus s*hs only when pm != 1.
+      // Spin 2,3: d*phi + u*hp, plus s*hs only when pm == 1.
+      // (The conditional operator binds looser than '+', so each trailing
+      // "+ s*hs" line belongs to the branch it follows.)
+      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
+      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
+      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
+      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
+      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
+      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
+      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+
+      // Write all twelve spin-colour components of the result.
+      vstream(chi[ss+v]()(0)(0), p_00);
+      vstream(chi[ss+v]()(0)(1), p_01);
+      vstream(chi[ss+v]()(0)(2), p_02);
+      vstream(chi[ss+v]()(1)(0), p_10);
+      vstream(chi[ss+v]()(1)(1), p_11);
+      vstream(chi[ss+v]()(1)(2), p_12);
+      vstream(chi[ss+v]()(2)(0), p_20);
+      vstream(chi[ss+v]()(2)(1), p_21);
+      vstream(chi[ss+v]()(2)(2), p_22);
+      vstream(chi[ss+v]()(3)(0), p_30);
+      vstream(chi[ss+v]()(3)(1), p_31);
+      vstream(chi[ss+v]()(3)(2), p_32);
+    }
+  });
+
+  this->M5Dtime += usecond();
+
+#endif
+}
+
+// M5Ddag for the layout in which the 5th dimension is spread over SIMD lanes.
+// For every s-slice v inside a block of LLs outer sites the active branch writes
+//   chi(spin 0,1) = diag*phi(spin 0,1) + upper*psi[v+1](spin 0,1)
+//   chi(spin 2,3) = diag*phi(spin 2,3) + lower*psi[v-1](spin 2,3)
+// where v+1 / v-1 wrap inside the block; on a wrap the neighbour's SIMD word is
+// rotated before it is used.
+//   psi_i : field supplying the neighbouring s-slices
+//   phi_i : field multiplied by the diagonal coefficients
+//   chi_i : output; its checkerboard is set to that of psi_i
+//   lower, diag, upper : coefficient vectors, read at index s = o + i*LLs
+//                        for o < LLs and i < Simd::Nsimd()
+// Side effects: increments M5Dcalls and adds the elapsed usecond() to M5Dtime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
+				     Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+{
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  int Ls  = this->Ls;
+  int LLs = grid->_rdimensions[0];
+  int nsimd = Simd::Nsimd();
+
+  // One SIMD word of coefficients per reduced s-slice.
+  Vector<iSinglet<Simd>> u(LLs);
+  Vector<iSinglet<Simd>> l(LLs);
+  Vector<iSinglet<Simd>> d(LLs);
+
+  // The full Ls extent must be exactly LLs slices times nsimd lanes.
+  assert(Ls/LLs == nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+
+  // just directly address via type pun
+  // (each iSinglet<Simd> is written as nsimd consecutive scalar_type values)
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type* u_p = (scalar_type*) &u[0];
+  scalar_type* l_p = (scalar_type*) &l[0];
+  scalar_type* d_p = (scalar_type*) &d[0];
+
+  // Lane i of word o receives the coefficient with 5d index s = o + i*LLs.
+  for(int o=0; o<LLs; o++){ // outer
+    for(int i=0; i<nsimd; i++){ //inner
+      int s  = o + i*LLs;
+      int ss = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+    }}
+
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // Each iteration handles one block of LLs consecutive outer sites.
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
+
+#if 0
+
+    // Compiled-out formulation of the same update written with the
+    // spProj5p/spProj5m, rotate and spRecon5p/spRecon5m helpers.
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0; v<LLs; v++){
+
+      int vp = (v+1)%LLs;
+      int vm = (v+LLs-1)%LLs;
+
+      spProj5p(hp, psi[ss+vp]);
+      spProj5m(hm, psi[ss+vm]);
+
+      if(vp <= v){ rotate(hp, hp, 1); }
+      if(vm >= v){ rotate(hm, hm, nsimd-1); }
+
+      hp = hp*0.5;
+      hm = hm*0.5;
+      spRecon5p(fp, hp);
+      spRecon5m(fm, hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+
+#else
+
+    for(int v=0; v<LLs; v++){
+
+      // Prefetch hint for the slice LLs ahead.
+      // NOTE(review): in the final block ss+v+LLs is >= oSites(); confirm that
+      // vprefetch tolerates a reference formed past the end of the view.
+      vprefetch(psi[ss+v+LLs]);
+
+      // Cyclic neighbours of v inside the block.
+      int vp = (v == LLs-1) ? 0     : v+1;
+      int vm = (v == 0    ) ? LLs-1 : v-1;
+
+      // Spin components 0,1 of the forward neighbour (3 colours each).
+      Simd hp_00 = psi[ss+vp]()(0)(0);
+      Simd hp_01 = psi[ss+vp]()(0)(1);
+      Simd hp_02 = psi[ss+vp]()(0)(2);
+      Simd hp_10 = psi[ss+vp]()(1)(0);
+      Simd hp_11 = psi[ss+vp]()(1)(1);
+      Simd hp_12 = psi[ss+vp]()(1)(2);
+
+      // Spin components 2,3 of the backward neighbour.
+      Simd hm_00 = psi[ss+vm]()(2)(0);
+      Simd hm_01 = psi[ss+vm]()(2)(1);
+      Simd hm_02 = psi[ss+vm]()(2)(2);
+      Simd hm_10 = psi[ss+vm]()(3)(0);
+      Simd hm_11 = psi[ss+vm]()(3)(1);
+      Simd hm_12 = psi[ss+vm]()(3)(2);
+
+      // vp wrapped back to the start of the block: rotate the SIMD word.
+      // (tRotate<2> presumably shifts by one complex lane, matching rotate(hp,hp,1)
+      //  in the compiled-out branch -- confirm against Optimization::Rotate.)
+      if (vp <= v){
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+
+      // vm wrapped to the end of the block: rotate the other way
+      // (counterpart of rotate(hm,hm,nsimd-1) in the compiled-out branch).
+      if(vm >= v){
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // Spin 0,1: diag*phi + upper*forward neighbour.
+      // Spin 2,3: diag*phi + lower*backward neighbour.
+      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
+      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
+      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
+      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
+      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
+      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
+
+      // Write the 12 spin-colour components of the result site.
+      vstream(chi[ss+v]()(0)(0), p_00);
+      vstream(chi[ss+v]()(0)(1), p_01);
+      vstream(chi[ss+v]()(0)(2), p_02);
+      vstream(chi[ss+v]()(1)(0), p_10);
+      vstream(chi[ss+v]()(1)(1), p_11);
+      vstream(chi[ss+v]()(1)(2), p_12);
+      vstream(chi[ss+v]()(2)(0), p_20);
+      vstream(chi[ss+v]()(2)(1), p_21);
+      vstream(chi[ss+v]()(2)(2), p_22);
+      vstream(chi[ss+v]()(3)(0), p_30);
+      vstream(chi[ss+v]()(3)(1), p_31);
+      vstream(chi[ss+v]()(3)(2), p_32);
+
+    }
+
+#endif
+
+  });
+
+  this->M5Dtime += usecond();
+}
+
+// M5Ddag plus the EOFA shift term, for the layout in which the 5th dimension is
+// spread over SIMD lanes. The active (#else) branch computes, per s-slice v,
+//   chi(spin 0,1) = diag*phi + upper*psi[v+1](spin 0,1)  [+ shift*hs when pm == 1]
+//   chi(spin 2,3) = diag*phi + lower*psi[v-1](spin 2,3)  [+ shift*hs otherwise]
+// where hs is taken from slice vs of the current block (vs = LLs-1 and spin 0,1
+// when pm == 1, vs = 0 and spin 2,3 otherwise).
+//   psi_i, phi_i : inputs;  chi_i : output (checkerboard copied from psi_i)
+//   lower, diag, upper, shift_coeffs : coefficient vectors, read at s = o + i*LLs
+// Side effects: increments M5Dcalls and adds the elapsed usecond() to M5Dtime.
+template<class Impl>
+void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
+					   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+					   Vector<Coeff_t>& shift_coeffs)
+{
+#if 0
+  // Compiled-out formulation: plain M5Ddag followed by a slice-by-slice shift.
+  auto & psi = psi_i;
+  auto & phi = phi_i;
+  auto & chi = chi_i;
+  this->M5Ddag(psi, phi, chi, lower, diag, upper);
+
+  // FIXME: possible gain from vectorizing shift operation as well?
+  Coeff_t one(1.0);
+  int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
+    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
+  }
+
+#else
+  chi_i.Checkerboard() = psi_i.Checkerboard();
+  GridBase* grid = psi_i.Grid();
+  auto psi = psi_i.View();
+  auto phi = phi_i.View();
+  auto chi = chi_i.View();
+  int Ls  = this->Ls;
+  int LLs = grid->_rdimensions[0];
+  int nsimd = Simd::Nsimd();
+
+  // One SIMD word of coefficients per reduced s-slice (s holds the shift coefficients).
+  Vector<iSinglet<Simd>> u(LLs);
+  Vector<iSinglet<Simd>> l(LLs);
+  Vector<iSinglet<Simd>> d(LLs);
+  Vector<iSinglet<Simd>> s(LLs);
+
+  // The full Ls extent must be exactly LLs slices times nsimd lanes.
+  assert(Ls/LLs == nsimd);
+  assert(phi.Checkerboard() == psi.Checkerboard());
+
+
+  // just directly address via type pun
+  // (each iSinglet<Simd> is written as nsimd consecutive scalar_type values)
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type* u_p = (scalar_type*) &u[0];
+  scalar_type* l_p = (scalar_type*) &l[0];
+  scalar_type* d_p = (scalar_type*) &d[0];
+  scalar_type* s_p = (scalar_type*) &s[0];
+
+  // Lane i of word o receives the coefficient with 5d index o + i*LLs.
+  // Note: the loop-local `int s` below shadows the Vector `s` declared above;
+  // inside this loop `s` is the integer index and the Vector is reached via s_p.
+  for(int o=0; o<LLs; o++){ // outer
+    for(int i=0; i<nsimd; i++){ //inner
+      int s  = o + i*LLs;
+      int ss = o*nsimd + i;
+      u_p[ss] = upper[s];
+      l_p[ss] = lower[s];
+      d_p[ss] = diag[s];
+      s_p[ss] = shift_coeffs[s];
+    }}
+
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
+  // Each iteration handles one block of LLs consecutive outer sites.
+  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
+
+    // Source of the shift term, loaded once per block (not once per v).
+    int vs     = (this->pm == 1) ? LLs-1 : 0;
+    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
+    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
+    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
+    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
+    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
+    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
+
+    for(int v=0; v<LLs; v++){
+
+      // Prefetch hint for the slice LLs ahead.
+      // NOTE(review): in the final block ss+v+LLs is >= oSites(); confirm that
+      // vprefetch tolerates a reference formed past the end of the view.
+      vprefetch(psi[ss+v+LLs]);
+
+      // Cyclic neighbours of v inside the block.
+      int vp = (v == LLs-1) ? 0     : v+1;
+      int vm = (v == 0    ) ? LLs-1 : v-1;
+
+      // Spin components 0,1 of the forward neighbour.
+      Simd hp_00 = psi[ss+vp]()(0)(0);
+      Simd hp_01 = psi[ss+vp]()(0)(1);
+      Simd hp_02 = psi[ss+vp]()(0)(2);
+      Simd hp_10 = psi[ss+vp]()(1)(0);
+      Simd hp_11 = psi[ss+vp]()(1)(1);
+      Simd hp_12 = psi[ss+vp]()(1)(2);
+
+      // Spin components 2,3 of the backward neighbour.
+      Simd hm_00 = psi[ss+vm]()(2)(0);
+      Simd hm_01 = psi[ss+vm]()(2)(1);
+      Simd hm_02 = psi[ss+vm]()(2)(2);
+      Simd hm_10 = psi[ss+vm]()(3)(0);
+      Simd hm_11 = psi[ss+vm]()(3)(1);
+      Simd hm_12 = psi[ss+vm]()(3)(2);
+
+      // vp wrapped back to the start of the block: rotate the SIMD word.
+      if (vp <= v){
+	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+      }
+
+      // With vs == LLs-1 this condition holds only on the last iteration (v == LLs-1).
+      if(this->pm == 1 && vs <= v){
+	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
+	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
+	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
+	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
+	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
+	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
+      }
+
+      // vm wrapped to the end of the block: rotate the other way.
+      if(vm >= v){
+	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+      }
+
+      // With vs == 0 this condition holds only on the first iteration (v == 0).
+      // NOTE(review): hs_* live outside the v loop and are not reloaded, so the
+      // rotation applied here is still in effect for every later v of the block.
+      // Confirm this matches the compiled-out reference branch above.
+      if(this->pm == -1 && vs >= v){
+	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
+	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
+	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
+	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
+	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
+	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
+      }
+
+      // Spin 0,1: diag*phi + upper*forward neighbour, plus shift*hs only when pm == 1.
+      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
+      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
+      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
+      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
+      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
+      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
+      // Spin 2,3: diag*phi + lower*backward neighbour, plus shift*hs only when pm != 1.
+      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
+      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
+      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
+      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
+      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
+      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
+
+      // Write the 12 spin-colour components of the result site.
+      vstream(chi[ss+v]()(0)(0), p_00);
+      vstream(chi[ss+v]()(0)(1), p_01);
+      vstream(chi[ss+v]()(0)(2), p_02);
+      vstream(chi[ss+v]()(1)(0), p_10);
+      vstream(chi[ss+v]()(1)(1), p_11);
+      vstream(chi[ss+v]()(1)(2), p_12);
+      vstream(chi[ss+v]()(2)(0), p_20);
+      vstream(chi[ss+v]()(2)(1), p_21);
+      vstream(chi[ss+v]()(2)(2), p_22);
+      vstream(chi[ss+v]()(3)(0), p_30);
+      vstream(chi[ss+v]()(3)(1), p_31);
+      vstream(chi[ss+v]()(3)(2), p_32);
+
+    }
+
+  });
+
+  this->M5Dtime += usecond();
+
+#endif
+}
+
+#ifdef AVX512
+#include<simd/Intel512common.h>
+#include<simd/Intel512avx.h>
+#include<simd/Intel512single.h>
+#endif
+
+// Applies the dense s-direction matrices Matp / Matm to one 4d site.
+// For each output slice s1 it accumulates, over all s2 < LLs and SIMD lanes l,
+//   chi[s1](spin 0,1) += Matp[LLs*s + s1] * broadcast_l( psi[s2](spin 0,1) )
+//   chi[s1](spin 2,3) += Matm[LLs*s + s1] * broadcast_l( psi[s2](spin 2,3) )
+// with s = s2 + l*LLs, and then stores the result at chi[s1 + LLs*site].
+//   psi_i : input field;  chi_i : output field
+//   LLs   : number of s-slices per site;  site : 4d site index
+//   Matp, Matm : matrices indexed as [LLs*s + s1]
+// Two implementations: a portable one (AVX512 undefined) and an inline-assembly
+// one built from the VPREFETCH1/VBCASTCDUP/VMULMEM/VMADDMEM/VSTORE macros
+// (presumably provided by the Intel512*.h headers included above).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
+					       int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+{
+  auto psi = psi_i.View();
+  auto chi = chi_i.View();
+#ifndef AVX512
+  {
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    SiteHalfSpinor SiteChiP;
+    SiteHalfSpinor SiteChiM;
+
+    // Ls*Ls * 2 * 12 * vol flops
+    for(int s1=0; s1<LLs; s1++){
+
+      for(int s2=0; s2<LLs; s2++){
+	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
+
+	  // s: matrix row selected by (slice s2, lane l); lex: psi site for slice s2.
+	  int s = s2 + l*LLs;
+	  int lex = s2 + LLs*site;
+
+	  // First term of the sum for this s1: reset the accumulators.
+	  if( s2==0 && l==0 ){
+	    SiteChiP=Zero();
+	    SiteChiM=Zero();
+	  }
+
+	  // Broadcast lane l of spin components 0,1 ...
+	  for(int sp=0; sp<2;  sp++){
+            for(int co=0; co<Nc; co++){
+              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
+            }}
+
+	  // ... and of spin components 2,3.
+	  for(int sp=0; sp<2;  sp++){
+            for(int co=0; co<Nc; co++){
+              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
+            }}
+
+	  // Multiply-accumulate with the matrix element for (s, s1).
+	  for(int sp=0; sp<2;  sp++){
+            for(int co=0; co<Nc; co++){
+              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
+              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
+            }}
+	}}
+
+      // Store the finished slice s1: P accumulator to spin 0,1, M accumulator to spin 2,3.
+      {
+	int lex = s1 + LLs*site;
+	for(int sp=0; sp<2;  sp++){
+	  for(int co=0; co<Nc; co++){
+	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
+	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+	  }}
+      }
+    }
+  }
+#else
+  {
+    // pointers
+    //  MASK_REGS;
+    // Register allocation for the assembly path: zmm1-zmm12 hold the 12
+    // spin-colour accumulators, zmm13-zmm24 the broadcast inputs.
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+#define BCAST0  %%zmm13
+#define BCAST1  %%zmm14
+#define BCAST2  %%zmm15
+#define BCAST3  %%zmm16
+#define BCAST4  %%zmm17
+#define BCAST5  %%zmm18
+#define BCAST6  %%zmm19
+#define BCAST7  %%zmm20
+#define BCAST8  %%zmm21
+#define BCAST9  %%zmm22
+#define BCAST10 %%zmm23
+#define BCAST11 %%zmm24
+
+    // Byte distance between the matrix elements of consecutive lanes:
+    // LLs*LLs elements, i.e. from index LLs*s2+s1 to LLs*(s2+LLs)+s1.
+    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
+
+    for(int s1=0; s1<LLs; s1++){
+
+      for(int s2=0; s2<LLs; s2++){
+
+	int lex = s2 + LLs*site;
+	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
+	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
+	uint64_t a2 = (uint64_t) &psi[lex];
+
+	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
+
+	  // NOTE(review): neither asm statement below lists the zmm registers
+	  // it writes as clobbers, and the accumulators are expected to survive
+	  // between separate asm statements; confirm this is safe with the
+	  // compilers in use.
+	  // First term (s2 == 0 and l == 0): initialise the accumulators with a multiply.
+	  if((s2+l)==0) {
+	    asm(
+		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
+		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
+		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
+		VBCASTCDUP(0,%2,BCAST0)
+		VBCASTCDUP(1,%2,BCAST1)
+		VBCASTCDUP(2,%2,BCAST2)
+		VBCASTCDUP(3,%2,BCAST3)
+		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
+		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
+		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
+		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
+		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
+		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
+		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
+		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
+		VMULMEM(0,%1,BCAST8,Chi_22)
+		VMULMEM(0,%1,BCAST9,Chi_30)
+		VMULMEM(0,%1,BCAST10,Chi_31)
+		VMULMEM(0,%1,BCAST11,Chi_32)
+		: : "r" (a0), "r" (a1), "r" (a2)                            );
+	  } else {
+	    // Every later term: multiply-add into the existing accumulators.
+	    asm(
+		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
+		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
+		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
+		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
+		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
+		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
+		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
+		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
+		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
+		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
+		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
+		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
+		: : "r" (a0), "r" (a1), "r" (a2)                            );
+	  }
+
+	  // Next lane: step the matrix pointers by LLs*LLs elements and the
+	  // psi pointer by one scalar.
+	  a0 = a0 + incr;
+	  a1 = a1 + incr;
+              a2 = a2 + sizeof(typename Simd::scalar_type);
+	}
+      }
+
+      // Store the 12 accumulators to chi[s1 + LLs*site].
+      {
+	int lexa = s1+LLs*site;
+	asm (
+	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
+	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
+	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
+	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
+	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+      }
+    }
+  }
+
+  // Remove the register aliases so they do not leak into later code.
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+
+#endif
+};
+
+// Z-mobius (complex coefficient) counterpart of MooeeInternalAsm.
+// It is not implemented for EOFA: the routine ignores all of its arguments,
+// writes an error line to std::cout and terminates the process via exit(-1).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
+						int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+{
+  const char* const message = "Error: zMobius not implemented for EOFA";
+  std::cout << message << std::endl;
+  exit(-1);
+}
+
+// Applies the s-direction matrix pair (Matp, Matm) to psi and writes chi.
+//   psi : input field;  chi : output field (checkerboard copied from psi)
+//   dag : non-zero selects the daggered matrices
+//   inv : non-zero uses the cached member matrices MatpInv/MatmInv (dag == 0)
+//         or MatpInvDag/MatmInvDag (dag != 0); zero builds the matrices on the
+//         fly with MooeeInternalCompute(dag, inv, ...)
+// The selected Matp must hold Ls*LLs entries (asserted).
+// Coefficient types reported complex by switcheroo<Coeff_t>::iscomplex() are
+// routed to MooeeInternalZAsm, all others to MooeeInternalAsm, once per site.
+// Side effects: increments MooeeInvCalls and adds the elapsed usecond() to
+// MooeeInvTime (for inv == 0 as well).
+template<class Impl>
+void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
+{
+  chi.Checkerboard() = psi.Checkerboard();
+
+  int Ls  = this->Ls;
+  int LLs = psi.Grid()->_rdimensions[0];
+  int vol = psi.Grid()->oSites()/LLs;
+
+  // Scratch storage; only filled when the matrices are computed on the fly.
+  Vector<iSinglet<Simd>> Matp;
+  Vector<iSinglet<Simd>> Matm;
+
+  // Matrices actually applied. These were previously named _Matp/_Matm, which
+  // are identifiers reserved to the implementation (underscore + capital),
+  // and were left uninitialised until one of three separate ifs assigned them.
+  Vector<iSinglet<Simd>>* matp_sel = nullptr;
+  Vector<iSinglet<Simd>>* matm_sel = nullptr;
+
+  // The three cases are mutually exclusive and exhaustive.
+  if(!inv){
+    // Qualified with this-> like every other member access in this function,
+    // so the lookup does not depend on which class declares the member.
+    this->MooeeInternalCompute(dag, inv, Matp, Matm);
+    matp_sel = &Matp;
+    matm_sel = &Matm;
+  } else if(dag){
+    matp_sel = &this->MatpInvDag;
+    matm_sel = &this->MatmInvDag;
+  } else {
+    matp_sel = &this->MatpInv;
+    matm_sel = &this->MatmInv;
+  }
+
+  assert(matp_sel->size() == Ls*LLs);
+
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
+  if(switcheroo<Coeff_t>::iscomplex()){
+    thread_loop( (auto site=0; site<vol; site++),{
+      this->MooeeInternalZAsm(psi, chi, LLs, site, *matp_sel, *matm_sel);
+    });
+  } else {
+    thread_loop( (auto site=0; site<vol; site++),{
+      this->MooeeInternalAsm(psi, chi, LLs, site, *matp_sel, *matm_sel);
+    });
+  }
+
+  this->MooeeInvTime += usecond();
+}
+
+// Explicit instantiations for the 5d-vectorised implementations. Everything in
+// this section is compiled only when MOBIUS_EOFA_DPERP_VEC is defined.
+#ifdef MOBIUS_EOFA_DPERP_VEC
+
+// Instantiations generated by the INSTANTIATE_DPERP_MOBIUS_EOFA macro
+// (defined outside this section).
+INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
+
+INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
+INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
+
+// MooeeInternal is instantiated explicitly for the same eight implementations.
+template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
+
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -0,0 +1,452 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Single-direction hopping term: chi = DhopDir(psi, dir, disp), followed by an
+// in-place rescaling of every s-slice through ag5xpby_ssp. The first
+// 2*((Ls-1)/2) slices are treated as pairs (s, s+1) scaled by -scale and +scale;
+// the final slice Ls-1 is scaled by p[(Ls-1)/2]*scale/amax.
+// The same code serves the daggered and undaggered operator; a common helper
+// routine for the rescaling would be a natural refactoring.
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  const int Ls     = this->Ls;
+  const int npairs = (Ls-1)/2;
+
+  this->DhopDir(psi,chi,dir,disp);
+
+  // Walk the paired slices two at a time.
+  for(int s=0; s<2*npairs; s+=2){
+    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s);
+    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1);
+  }
+
+  // Unpaired last slice.
+  ag5xpby_ssp(chi,p[npairs]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
+}
+// Checkerboarded hopping term shared by Meooe and MeooeDag.
+// Calls DhopEO when psi lives on the Odd checkerboard and DhopOE otherwise
+// (always with DaggerNo; the dag argument is not consulted in this body), then
+// rescales every s-slice of chi in place exactly as Mdir does: paired slices
+// by -scale / +scale, the last slice by p[(Ls-1)/2]*scale/amax.
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  const int  Ls         = this->Ls;
+  const int  npairs     = (Ls-1)/2;
+  const bool odd_source = ( psi.Checkerboard() == Odd );
+
+  if ( odd_source ) {
+    this->DhopEO(psi,chi,DaggerNo);
+  } else {
+    this->DhopOE(psi,chi,DaggerNo);
+  }
+
+  // Walk the paired slices two at a time.
+  for(int s=0; s<2*npairs; s+=2){
+    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s);
+    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1);
+  }
+
+  // Unpaired last slice.
+  ag5xpby_ssp(chi,p[npairs]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
+}
+
+// Site-diagonal part of the operator, shared by Mooee and MooeeDag.
+// The daggered and undaggered forms differ only in the sign of the couplings
+// between the paired slices and the last slice (sign = -1 for dag, +1 otherwise).
+//   psi : input;  chi : output;  dag : non-zero selects the daggered form
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  const int sign   = dag ? (-1) : 1;
+  const int Ls     = this->Ls;
+  const int last   = Ls-1;
+  const int npairs = (Ls-1)/2;
+
+  // Each 2x2 block sits on slices (lo, hi); the Dw site diagonal is multiplied
+  // by G5 so that it acts as Hw. p and q are consumed in reverse order.
+  for(int blk=0; blk<npairs; blk++){
+    const int   lo = 2*blk;
+    const int   hi = lo+1;
+    const RealD pp = p[npairs-1-blk];
+    const RealD qq = q[npairs-1-blk];
+    const auto  offdiag = amax*sqrt(qq)*scale;
+
+    ag5xpby_ssp(chi,-dw_diag*scale,psi,offdiag,psi, lo,hi);
+    ag5xpby_ssp(chi, dw_diag*scale,psi,offdiag,psi, hi,lo);
+
+    // Coupling of slice hi to the last slice.
+    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,hi,last);
+  }
+
+  // Last slice: R g5 psi[Ls-1] + p[npairs] H term ...
+  const RealD R=(1+mass)/(1-mass);
+  ag5xpbg5y_ssp(chi,R*scale,psi,p[npairs]*scale*dw_diag/amax,psi,last,last);
+
+  // ... plus the couplings back to the odd-numbered slices.
+  for(int blk=0; blk<npairs; blk++){
+    const RealD pp = p[npairs-1-blk];
+    axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,last,2*blk+1);
+  }
+}
+
+// Inverse of the site-diagonal operator, applied as three successive factors
+// (Linv, Dinv, Uinv), shared by MooeeInv and MooeeInvDag.
+//   psi : input;  chi : output;  dag : non-zero flips the sign of the
+//         couplings between the paired slices and the last slice
+// Data flow: psi -> chi (Linv), chi -> tmp (Dinv), tmp -> chi (Uinv, which also
+// divides by scale).
+// Every block shares the denominator dw_diag^2 + amax^2*q; it is computed once
+// per loop iteration instead of being spelled out in each coefficient, and the
+// Dinv loop no longer declares a p value it never used.
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  const int sign   = dag ? (-1) : 1;
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+
+  FermionField tmp(psi.Grid());
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Linv
+  ///////////////////////////////////////////////////////////////////////////////////////
+  axpy(chi,0.0,psi,psi); // Identity piece
+
+  for(int b=0;b<nblock;b++){
+    const int   s  = 2*b;
+    const RealD pp = p[nblock-1-b];
+    const RealD qq = q[nblock-1-b];
+    const auto  denom  = dw_diag*dw_diag + amax*amax* qq;
+    const RealD coeff1 = sign*sqrt(amax*amax*amax*pp*qq) / denom;
+    const RealD coeff2 = sign*sqrt(amax*pp)*dw_diag / denom; // Implicit g5 here
+    axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s);
+    axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
+  // Compute Seeinv (coeff of gamma5)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  const RealD R=(1+mass)/(1-mass);
+  RealD Seeinv = R + p[nblock]*dw_diag/amax;
+  for(int b=0;b<nblock;b++){
+    Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
+  }
+  Seeinv = 1.0/Seeinv;
+
+  for(int b=0;b<nblock;b++){
+    const int   s  = 2*b;
+    const RealD qq = q[nblock-1-b];
+    const auto  denom  = dw_diag*dw_diag + amax*amax* qq;
+    const RealD coeff1 = dw_diag / denom; // Implicit g5 here
+    const RealD coeff2 = amax*sqrt(qq) / denom;
+    ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
+    ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
+  }
+  ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
+
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // Uinv
+  ///////////////////////////////////////////////////////////////////////////////////////
+  for(int b=0;b<nblock;b++){
+    const int   s  = 2*b;
+    const RealD pp = p[nblock-1-b];
+    const RealD qq = q[nblock-1-b];
+    const auto  denom  = dw_diag*dw_diag + amax*amax* qq;
+    const RealD coeff1 = -sign*sqrt(amax*amax*amax*pp*qq) / denom;
+    const RealD coeff2 = -sign*sqrt(amax*pp)*dw_diag / denom; // Implicit g5 here
+    axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1);
+    axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
+  }
+  axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
+}
+
+// Full (unpreconditioned) operator shared by M and Mdag.
+//   psi : input;  chi : output;  dag : non-zero flips the sign of the couplings
+//         between the paired slices and the last slice
+// DW is always applied with DaggerNo; only `sign` depends on dag.
+template<class Impl>
+void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
+{
+  // For partial frac Hw case (b5=c5=1) chroma quirkily computes
+  //
+  // Conventions for partfrac appear to be a mess.
+  // Tony's Nara lectures have
+  //
+  // BlockDiag(  H/p_i  1             | 1       )
+  //          (  1      p_i H / q_i^2 | 0       )
+  //           ---------------------------------
+  //           ( -1      0                | R  +p0 H  )
+  //
+  //Chroma     ( -2H    2sqrt(q_i)    |   0         )
+  //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
+  //           ---------------------------------
+  //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
+  //
+  // Edwards/Joo/Kennedy/Wenger
+  //
+  // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
+  // incorporate the approx scale factor. This is obtained by propagating the
+  // scale on "H" out to the off diagonal elements as follows:
+  //
+  // BlockDiag(  H/p_i  1             | 1       )
+  //          (  1      p_i H / q_i^2 | 0       )
+  //           ---------------------------------
+  //          ( -1      0                | R  + p_0 H  )
+  //
+  // becomes:
+  // BlockDiag(  H/ sp_i  1               | 1             )
+  //          (  1      sp_i H / s^2q_i^2 | 0             )
+  //           ---------------------------------
+  //           ( -1      0                | R + p_0/s H   )
+  //
+  //
+  // This is implemented in Chroma by
+  //           p0' = p0/approxMax
+  //           p_i' = p_i*approxMax
+  //           q_i' = q_i*approxMax*approxMax
+  //
+  // After the equivalence transform is applied the matrix becomes
+  //
+  //Chroma     ( -2H    sqrt(q'_i)    |   0         )
+  //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
+  //           ---------------------------------
+  //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
+  //
+  //     =     ( -2H    sqrt(q_i)amax    |   0              )
+  //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
+  //           ---------------------------------
+  //           ( 0     -sqrt(p_i*amax)   |  2 R gamma_5 + p0/amax 2H
+  //
+
+  FermionField D(psi.Grid());
+
+  const int Ls     = this->Ls;
+  const int sign   = dag ? (-1) : 1;
+  const int last   = Ls-1;
+  const int nblock = (Ls-1)/2;
+
+  this->DW(psi,D,DaggerNo);
+
+  for(int blk=0; blk<nblock; blk++){
+
+    const int    lo = 2*blk;
+    const int    hi = lo+1;
+    const double pp = p[nblock-1-blk];
+    const double qq = q[nblock-1-blk];
+    const auto   offdiag = amax*sqrt(qq)*scale;
+
+    // 2x2 block on slices (lo, hi); Dw is multiplied by G5 so that it acts as Hw.
+    ag5xpby_ssp(chi,-1.0*scale,D,offdiag,psi, lo,hi);
+    ag5xpby_ssp(chi, 1.0*scale,D,offdiag,psi, hi,lo);
+
+    // Pick up last column
+    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,hi,last);
+  }
+
+  // Last row: R g5 psi[Ls-1] + p[nblock] H ...
+  const double R=(1+this->mass)/(1-this->mass);
+  ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,last,last);
+
+  // ... plus the couplings back to the odd-numbered slices.
+  for(int blk=0; blk<nblock; blk++){
+    const double pp = p[nblock-1-blk];
+    axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,last,2*blk+1);
+  }
+
+}
+
+// out = M in (undaggered); returns norm2(out).
+template<class Impl>
+RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
+{
+  this->M_internal(in, out, DaggerNo);
+  const RealD out_norm2 = norm2(out);
+  return out_norm2;
+}
+// out = Mdag in (daggered); returns norm2(out).
+template<class Impl>
+RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
+{
+  this->M_internal(in, out, DaggerYes);
+  const RealD out_norm2 = norm2(out);
+  return out_norm2;
+}
+
+// Checkerboarded hopping term; forwards to Meooe_internal with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
+{
+  this->Meooe_internal(in, out, DaggerNo);
+}
+// Checkerboarded hopping term; forwards to Meooe_internal with DaggerYes.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
+{
+  this->Meooe_internal(in, out, DaggerYes);
+}
+// Site-diagonal term; forwards to Mooee_internal with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
+{
+  this->Mooee_internal(in, out, DaggerNo);
+}
+// Site-diagonal term; forwards to Mooee_internal with DaggerYes.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
+{
+  this->Mooee_internal(in, out, DaggerYes);
+}
+
+// Inverse of the site-diagonal term; forwards to MooeeInv_internal with DaggerNo.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
+{
+  this->MooeeInv_internal(in, out, DaggerNo);
+}
+// Inverse of the site-diagonal term; forwards to MooeeInv_internal with DaggerYes.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
+{
+  this->MooeeInv_internal(in, out, DaggerYes);
+}
+
+
+// Force term, full-grid variant.
+// Fills a work field D from U one fifth-dimension slice at a time through
+// ag5xpby_ssp (second coefficient 0.0, so only the first term contributes),
+// then passes D and V to DhopDeriv, which accumulates into `mat`.
+// NOTE(review): `dag` is accepted but never read; DhopDeriv is always called
+// with DaggerNo -- confirm this is intended.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+  const int sLast  = Ls-1;
+
+  FermionField D(V.Grid());
+
+  // Slices come in pairs: coefficient -scale on the even slice, +scale on the odd one.
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(D,-scale,U,0.0,U,s  ,s  );
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
+  }
+
+  // The last slice uses p[nblock]*scale/amax instead.
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,sLast,sLast);
+
+  this->DhopDeriv(mat,D,V,DaggerNo);
+}
+// Force term routed through DhopDerivOE.
+// Fills a work field D from U one fifth-dimension slice at a time through
+// ag5xpby_ssp (second coefficient 0.0), then passes D and V to DhopDerivOE.
+// NOTE(review): `dag` is accepted but never read; DhopDerivOE is always
+// called with DaggerNo -- confirm this is intended.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+  const int sLast  = Ls-1;
+
+  FermionField D(V.Grid());
+
+  // Slices come in pairs: coefficient -scale on the even slice, +scale on the odd one.
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(D,-scale,U,0.0,U,s  ,s  );
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
+  }
+
+  // The last slice uses p[nblock]*scale/amax instead.
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,sLast,sLast);
+
+  this->DhopDerivOE(mat,D,V,DaggerNo);
+}
+// Force term routed through DhopDerivEO.
+// Fills a work field D from U one fifth-dimension slice at a time through
+// ag5xpby_ssp (second coefficient 0.0), then passes D and V to DhopDerivEO.
+// NOTE(review): `dag` is accepted but never read; DhopDerivEO is always
+// called with DaggerNo -- confirm this is intended.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  const int Ls     = this->Ls;
+  const int nblock = (Ls-1)/2;
+  const int sLast  = Ls-1;
+
+  FermionField D(V.Grid());
+
+  // Slices come in pairs: coefficient -scale on the even slice, +scale on the odd one.
+  for(int s=0; s<2*nblock; s+=2){
+    ag5xpby_ssp(D,-scale,U,0.0,U,s  ,s  );
+    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
+  }
+
+  // The last slice uses p[nblock]*scale/amax instead.
+  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,sLast,sLast);
+
+  this->DhopDerivEO(mat,D,V,DaggerNo);
+}
+
+// SetCoefficientsTanh: delegates to SetCoefficientsZolotarev, passing the
+// reciprocal of `scale` as the zolo_hi argument.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata, RealD scale)
+{
+  const RealD zolo_hi = 1.0/scale;
+  SetCoefficientsZolotarev(zolo_hi, zdata);
+}
+// Installs the partial-fraction coefficients from a zolotarev_data record:
+//   p[n] =  zdata->alpha[n]   for 0 <= n < zdata->da
+//   q[n] = -zdata->ap[n]      for 0 <= n < zdata->dd
+// and refreshes the members R, dw_diag, scale and amax (amax = zolo_hi).
+// Asserts the degree match Ls == 2*da - 1.
+// Debug aid: zdata->n, ->da, ->db, ->dn and ->dd can be streamed to
+// GridLogMessage here when checking the degrees.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi, Approx::zolotarev_data *zdata)
+{
+  int Ls = this->Ls;
+  assert(Ls == (2*zdata->da -1) );
+
+  const int np = zdata->da;
+  const int nq = zdata->dd;
+
+  // Scalar members of the partial fraction form.
+  R       = (1+mass)/(1-mass);
+  dw_diag = (4.0-this->M5);
+  amax    = zolo_hi;
+
+  // Chroma conventions carry an extra factor of two.
+  if (part_frac_chroma_convention) {
+    scale = 2.0;
+  } else {
+    scale = 1.0;
+  }
+
+  p.resize(np);
+  for(int i=0;i<np;i++){
+    p[i] = zdata->alpha[i];
+  }
+
+  q.resize(nq);
+  for(int i=0;i<nq;i++){
+    q[i] = -zdata->ap[i];
+  }
+}
+
+// Copies slice Ls-1 of the 5d solution into the 4d field via ExtractSlice,
+// after checking both fields live on the expected grids.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+{
+  const int sLast = this->Ls - 1;
+
+  conformable(solution5d.Grid(),this->FermionGrid());
+  conformable(exported4d.Grid(),this->GaugeGrid());
+
+  ExtractSlice(exported4d, solution5d, sLast, sLast);
+}
+// Builds the 5d source from a 4d input: the input is inserted at slice Ls-1
+// of a zeroed 5d field, multiplied by Gamma5, and passed through Dminus into
+// imported5d.  Both fields are first checked against the expected grids.
+template<class Impl>
+void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+{
+  const int sLast = this->Ls - 1;
+
+  conformable(imported5d.Grid(),this->FermionGrid());
+  conformable(input4d.Grid()   ,this->GaugeGrid());
+
+  FermionField work(this->FermionGrid());
+  work=Zero();
+  InsertSlice(input4d, work, sLast, sLast);
+
+  Gamma G5(Gamma::Algebra::Gamma5);
+  work=G5*work;
+
+  this->Dminus(work,imported5d);
+}
+
+// Constructor.
+// Forwards the gauge field, the four grids, M5 and the implementation
+// parameters to WilsonFermion5D and stores the mass.  Ls must be odd; the
+// coefficients are then taken from Approx::higham(1.0, Ls-1) and installed
+// through SetCoefficientsTanh with scale 1.0.
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+                                                         GridCartesian         &FiveDimGrid,
+                                                         GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                         GridCartesian         &FourDimGrid,
+                                                         GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                         RealD _mass,RealD M5,
+                                                         const ImplParams &p)
+  : WilsonFermion5D<Impl>(_Umu,
+                          FiveDimGrid, FiveDimRedBlackGrid,
+                          FourDimGrid, FourDimRedBlackGrid,
+                          M5, p),
+    mass(_mass)
+{
+  int Ls = this->Ls;
+  assert((Ls&0x1)==1); // Odd Ls required
+
+  // NB: chroma casts the zolotarev range to "float", which gives a genuinely
+  // different operator.  That could be replicated here to demonstrate
+  // compatibility:
+  //      RealD eps = (zolo_lo / zolo_hi);
+  //      zdata = bfm_zolotarev(eps,nrational,0);
+  const int nrational = Ls-1;
+  Approx::zolotarev_data *approx = Approx::higham(1.0,nrational);
+
+  SetCoefficientsTanh(approx,1.0);
+
+  Approx::zolotarev_free(approx);
+}
+ 
+FermOpTemplateInstantiate(PartialFractionFermion5D); // NOTE(review): macro is defined outside this file; presumably expands to explicit instantiations of the class template -- confirm
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsInstantiation.cc
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsInstantiation.cc
@@ -1,43 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
-
-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
@@ -0,0 +1,242 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
+
+    Copyright (C) 2017
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Full operator (*NOT* the even/odd form): out = Dhop(in) + Mooee(in).
+// Returns norm2(out).
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+{
+  // Hopping (Wilson) part goes straight into `out`.
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerNo);
+
+  // Clover part is built in a scratch field and accumulated.
+  FermionField clover(out.Grid());
+  Mooee(in, clover);
+  out += clover;
+
+  const RealD normSq = norm2(out);
+  return normSq;
+}
+
+// Daggered full operator: out = Dhop(in, DaggerYes) + MooeeDag(in).
+// Returns norm2(out).
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+{
+  // Hopping (Wilson) part goes straight into `out`.
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerYes);
+
+  // Clover part is built in a scratch field and accumulated.
+  FermionField clover(out.Grid());
+  MooeeDag(in, clover);
+  out += clover;
+
+  const RealD normSq = norm2(out);
+  return normSq;
+}
+
+template <class Impl>
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+{ // Imports the gauge field through the base class, then rebuilds CloverTerm, its site-wise inverse, and the even/odd copies of both and of their adjoints.
+  WilsonFermion<Impl>::ImportGauge(_Umu);
+  GridBase *grid = _Umu.Grid();
+  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
+
+  // Compute the six field strength components, one per direction pair (mu>nu)
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Compute the Clover Operator acting on Colour and Spin.
+  // The anisotropy enters here: csw_r multiplies the Bx/By/Bz terms, csw_t the Ex/Ey/Ez terms.
+  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += diag_mass; // diag_mass is added last, on top of the six clover contributions
+
+  int lvol = _Umu.Grid()->lSites(); // upper bound of the per-site inversion loop below
+  int DimRep = Impl::Dimension;
+
+  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); // scratch (Ns*DimRep) x (Ns*DimRep) matrix for one site
+  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); // receives the inverse for that site
+
+  Coordinate lcoor;
+  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
+
+  for (int site = 0; site < lvol; site++) // one dense inversion per local site, executed serially
+  {
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+    peekLocalSite(Qx, CloverTerm, lcoor); // fetch this site's clover matrix as a scalar object
+    Qxinv = Zero();
+    //if (csw!=0){
+    for (int j = 0; j < Ns; j++)
+      for (int k = 0; k < Ns; k++)
+        for (int a = 0; a < DimRep; a++)
+          for (int b = 0; b < DimRep; b++){
+	    auto zz =  Qx()(j, k)(a, b);
+            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz); // flatten: row = a + j*DimRep, col = b + k*DimRep
+	  }
+    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
+
+    EigenInvCloverOp = EigenCloverOp.inverse();
+    //std::cout << EigenInvCloverOp << std::endl;
+    for (int j = 0; j < Ns; j++)
+      for (int k = 0; k < Ns; k++)
+        for (int a = 0; a < DimRep; a++)
+          for (int b = 0; b < DimRep; b++)
+            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); // unflatten with the same index mapping
+    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
+    //  }
+    pokeLocalSite(Qxinv, CloverTermInv, lcoor); // store the inverse at the same site of CloverTermInv
+  }
+
+  // Separate the even and odd parts of the term, its adjoint, its inverse and the adjoint of the inverse
+  pickCheckerboard(Even, CloverTermEven, CloverTerm);
+  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
+
+  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+
+  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
+  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
+
+  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+}
+
+// Mooee: MooeeInternal with dagger off and inverse off.
+template <class Impl>
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  const int dagger  = DaggerNo;
+  const int inverse = InverseNo;
+  this->MooeeInternal(in, out, dagger, inverse);
+}
+
+// MooeeDag: MooeeInternal with dagger on and inverse off.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  const int dagger  = DaggerYes;
+  const int inverse = InverseNo;
+  this->MooeeInternal(in, out, dagger, inverse);
+}
+
+// MooeeInv: MooeeInternal with dagger off and inverse on.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  const int dagger  = DaggerNo;
+  const int inverse = InverseYes;
+  this->MooeeInternal(in, out, dagger, inverse);
+}
+
+// MooeeInvDag: MooeeInternal with dagger on and inverse on.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  const int dagger  = DaggerYes;
+  const int inverse = InverseYes;
+  this->MooeeInternal(in, out, dagger, inverse);
+}
+
+// Shared worker behind Mooee, MooeeDag, MooeeInv and MooeeInvDag.
+// Selects the clover field that matches (dag, inv, grid kind, checkerboard)
+// and multiplies it onto `in`, writing `out`.
+//   - full grid:          CloverTerm / CloverTermInv, with adj() applied on the fly when dag is set
+//   - checkerboarded grid: the stored Even/Odd (and Dag) copies
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+{
+  out.Checkerboard() = in.Checkerboard();
+  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+
+  CloverFieldType *Clover;
+
+  // Full (non-checkerboarded) grid: no stored adjoint copies are used.
+  if (!in.Grid()->_isCheckerBoarded)
+  {
+    Clover = (inv) ? &CloverTermInv : &CloverTerm;
+    if (dag)
+    {
+      out = adj(*Clover) * in;
+    }
+    else
+    {
+      out = *Clover * in;
+    }
+    return;
+  }
+
+  // Checkerboarded grid: pick the stored copy for this parity.
+  const bool onOdd = (in.Checkerboard() == Odd);
+  if (dag)
+  {
+    if (onOdd)
+    {
+      Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
+    }
+    else
+    {
+      Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
+    }
+  }
+  else
+  {
+    if (onOdd)
+    {
+      Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
+    }
+    else
+    {
+      Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
+    }
+  }
+  out = *Clover * in;
+
+} // MooeeInternal
+
+
+// Derivative parts: MooDeriv is an unimplemented stub; its body is a lone assert(0) and none of the arguments are touched.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+{
+  assert(0); // not implemented yet; NOTE(review): compiled out under NDEBUG, leaving a silent no-op
+}
+
+// Derivative parts: MeeDeriv is an unimplemented stub; its body is a lone assert(0) and none of the arguments are touched.
+template <class Impl>
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  assert(0); // not implemented yet; NOTE(review): compiled out under NDEBUG, leaving a silent no-op
+}
+
+FermOpTemplateInstantiate(WilsonCloverFermion); // NOTE(review): the three macros here are defined outside this file; presumably each expands to explicit instantiations for one family of Impl types -- confirm
+AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
+//GparityFermOpTemplateInstantiate(WilsonCloverFermion); // the Gparity variant is deliberately left commented out
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DInstantiation.cc
@@ -1,40 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonFermion5D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-FermOpTemplateInstantiate(WilsonFermion5D);
-GparityFermOpTemplateInstantiate(WilsonFermion5D);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -34,10 +34,6 @@ directory

 NAMESPACE_BEGIN(Grid);

-const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
-int WilsonFermionStatic::HandOptDslash;
-
 /////////////////////////////////
 // Constructor and gauge import
 /////////////////////////////////
@@ -464,18 +460,10 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
  st.HaloExchange(in, compressor);

  int Opt = WilsonKernelsStatic::Opt;
-  auto U_v  = U.View();
-  auto in_v = in.View();
-  auto out_v= out.View();
-  auto st_v = st.View();
  if (dag == DaggerYes) {
-    accelerator_loop( sss,in_v, {
-      Kernels::DhopSiteDag(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-    });
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
-    accelerator_loop( sss,in_v, {
-      Kernels::DhopSite(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-    });
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
 };
 /*Change ends */
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionInstantiation.cc
@@ -1,43 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-FermOpTemplateInstantiate(WilsonFermion);
-AdjointFermOpTemplateInstantiate(WilsonFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonFermion);
-GparityFermOpTemplateInstantiate(WilsonFermion);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
@@ -103,8 +103,8 @@ template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeF

 //INSTANTIATE_ASM(WilsonImplF);
 //INSTANTIATE_ASM(WilsonImplD);
-INSTANTIATE_ASM(GparityWilsonImplF);
-INSTANTIATE_ASM(GparityWilsonImplD);
+//INSTANTIATE_ASM(GparityWilsonImplF);
+//INSTANTIATE_ASM(GparityWilsonImplD);
 //INSTANTIATE_ASM(ZWilsonImplF);
 //INSTANTIATE_ASM(ZWilsonImplD);
 //INSTANTIATE_ASM(DomainWallVec5dImplF);
@@ -116,8 +116,8 @@ INSTANTIATE_ASM(GparityWilsonImplD);
 //INSTANTIATE_ASM(WilsonImplDF);
 //INSTANTIATE_ASM(ZWilsonImplFH);
 //INSTANTIATE_ASM(ZWilsonImplDF);
-INSTANTIATE_ASM(GparityWilsonImplFH);
-INSTANTIATE_ASM(GparityWilsonImplDF);
+//INSTANTIATE_ASM(GparityWilsonImplFH);
+//INSTANTIATE_ASM(GparityWilsonImplDF);
 //INSTANTIATE_ASM(DomainWallVec5dImplFH);
 //INSTANTIATE_ASM(DomainWallVec5dImplDF);
 //INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
@@ -575,20 +575,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Impl> void  accelerator
-WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
 #define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
@@ -600,23 +586,6 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> accelerator
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
 #define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@@ -628,24 +597,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> void accelerator
-WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
 #define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@@ -658,23 +609,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,S
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT(ss,F)

-  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> accelerator
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
 #define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
  ZERO_RESULT;							\
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
@@ -686,25 +620,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldV
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
  HAND_RESULT(ss,F)
-  
-  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> void accelerator
-WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset, perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;

 #define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
@@ -718,25 +633,6 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,S
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT_EXT(ss,F)

-  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-  perm++;
-#endif
-}
-
-template<class Impl>
-accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset, perm, ptype;
-  int nmu=0;
-
 #define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
  ZERO_RESULT; \
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
@@ -749,70 +645,8 @@ accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, Double
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
  HAND_RESULT_EXT(ss,F)

-  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-  perm++;
-#endif
-}
-
-  ////////////////////////////////////////////////
-  // Specialise Gparity to simple implementation
-  ////////////////////////////////////////////////
-#define HAND_SPECIALISE_EMPTY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st,	       	\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st,	       	\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-
-
-#ifdef GRID_NVCC
-#define HAND_SPECIALISE_GPARITY(IMPL) HAND_SPECIALISE_EMPTY(IMPL)
-#else
 #define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void							\
+  template<> void						\
  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -828,9 +662,9 @@ accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, Double
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-					    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -844,9 +678,9 @@ accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, Double
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void							\
+  template<> void						\
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -860,9 +694,9 @@ accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, Double
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -878,7 +712,7 @@ accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, Double
 									\
  template<> void							\
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -893,9 +727,9 @@ accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, Double
    nmu = 0;								\
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
+  template<> void						\
+  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -910,39 +744,12 @@ accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, Double
    nmu = 0;								\
    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }
-#endif

 HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
 HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
 HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
 HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);

-  
-////////////// Wilson ; uses this implementation /////////////////////
-
-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); 
-
-//INSTANTIATE_THEM(GparityWilsonImplF);
-//INSTANTIATE_THEM(GparityWilsonImplD);
-//INSTANTIATE_THEM(GparityWilsonImplFH);
-//INSTANTIATE_THEM(GparityWilsonImplDF);
-//INSTANTIATE_THEM(DomainWallVec5dImplFH);
-//INSTANTIATE_THEM(DomainWallVec5dImplDF);
-//INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
-//INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-
 NAMESPACE_END(Grid);
 #undef REGISTER
 #undef LOAD_CHIMU_BODY
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -456,7 +456,6 @@ template<class Impl> void
 WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-#ifndef GRID_NVCC
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -475,16 +474,12 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
 }

 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-#ifndef GRID_NVCC
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

@@ -502,16 +497,12 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
 }

 template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-#ifndef GRID_NVCC
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -530,16 +521,12 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
 }

 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-#ifndef GRID_NVCC
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

@@ -557,23 +544,19 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
 }

 template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-#ifndef GRID_NVCC
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

  HAND_DECLARATIONS(ignore);

-  int offset,local,perm, ptype;
+  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -586,23 +569,19 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT_EXT(ss);
-#else
-  assert(0);
-#endif
 }

 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-#ifndef GRID_NVCC
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

  HAND_DECLARATIONS(ignore);

  StencilEntry *SE;
-  int offset,local,perm, ptype;
+  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@@ -614,9 +593,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT_EXT(ss);
-#else
-  assert(0);
-#endif
 }

 ////////////// Wilson ; uses this implementation /////////////////////
@@ -672,3 +648,9 @@ NAMESPACE_END(Grid);
 #undef Chimu_30
 #undef Chimu_31
 #undef Chimu_32
+#undef HAND_STENCIL_LEG
+#undef HAND_STENCIL_LEG_INT
+#undef HAND_STENCIL_LEG_EXT
+#undef HAND_RESULT
+#undef HAND_RESULT_INT
+#undef HAND_RESULT_EXT
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsInstantiation.cc
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsInstantiation.cc
@@ -1,51 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h>
-			   //#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Move these
-int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
-int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
-
-// FIXME: Break these out to parallel make
-FermOpTemplateInstantiate(WilsonKernels);
-AdjointFermOpTemplateInstantiate(WilsonKernels);
-TwoIndexFermOpTemplateInstantiate(WilsonKernels);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
@@ -0,0 +1,97 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+
+/*
+ * BF sequence
+ *
+ void bfmbase<Float>::MooeeInv(Fermion_t psi, 
+ Fermion_t chi, 
+ int dag, int cb)
+
+ double m    = this->mass;
+ double tm   = this->twistedmass;
+ double mtil = 4.0+this->mass;
+
+ double sq = mtil*mtil + tm*tm;
+
+ double a = mtil/sq;
+ double b = -tm /sq;
+ if(dag) b=-b;
+ axpibg5x(chi,psi,a,b);
+
+ void bfmbase<Float>::Mooee(Fermion_t psi, 
+ Fermion_t chi, 
+ int dag,int cb)
+ double a = 4.0+this->mass;
+ double b = this->twistedmass;
+ if(dag) b=-b;
+ axpibg5x(chi,psi,a,b);
+*/
+
+// Site-diagonal term: out = axpibg5x(in) with a = 4+mass, b = +mu (presumably a*in + i*b*gamma5*in; confirm in axpibg5x).
+template<class Impl> void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();   // result is tagged with the input's checkerboard
+  const RealD diag  = 4.0+this->mass;
+  const RealD twist = this->mu;
+  axpibg5x(out,in,diag,twist);
+}
+// Daggered site-diagonal term: identical to Mooee except that the mu coefficient passed to axpibg5x is negated.
+template<class Impl> void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();   // result is tagged with the input's checkerboard
+  const RealD diag  = 4.0+this->mass;
+  const RealD twist = -this->mu;
+  axpibg5x(out,in,diag,twist);
+}
+// Inverse site-diagonal term: out = axpibg5x(in) with a = mtil/(mtil^2+mu^2), b = -mu/(mtil^2+mu^2), mtil = 4+mass.
+template<class Impl> void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+  const RealD tm   = this->mu;
+  const RealD mtil = 4.0+this->mass;
+  const RealD sq   = mtil*mtil+tm*tm;     // common denominator; not guarded against zero (mtil == 0 and mu == 0)
+  const RealD a    = mtil/sq;
+  const RealD b    = -tm /sq;
+  out.Checkerboard() = in.Checkerboard(); // set explicitly, as Mooee/MooeeDag do, rather than relying on axpibg5x
+  axpibg5x(out,in,a,b);
+}
+// Daggered inverse site-diagonal term: same as MooeeInv but with b = +mu/(mtil^2+mu^2), mtil = 4+mass.
+template<class Impl> void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+  const RealD tm   = this->mu;
+  const RealD mtil = 4.0+this->mass;
+  const RealD sq   = mtil*mtil+tm*tm;     // common denominator; not guarded against zero (mtil == 0 and mu == 0)
+  const RealD a    = mtil/sq;
+  const RealD b    = tm /sq;
+  out.Checkerboard() = in.Checkerboard(); // set explicitly, as Mooee/MooeeDag do, rather than relying on axpibg5x
+  axpibg5x(out,in,a,b);
+}
+
+FermOpTemplateInstantiate(WilsonTMFermion);
+
+NAMESPACE_END(Grid);