Compiles GPU and CPU, still gives good performance on CPU

2025-06-25 19:22:03 +01:00 · 2019-06-05 13:28:16 +01:00
parent 18d3cde29a
commit 0ee6e77cbc
71 changed files with 1512 additions and 33769 deletions
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
@ -1,433 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Constructs the domain-wall EOFA operator.
-//
-// Every argument is forwarded to AbstractEOFAFermion, with the two scalar
-// arguments that follow _M5 fixed to 1.0 and 0.0.  The coefficients are then
-// set through SetCoefficientsTanh from an Approx::higham approximation with
-// Ls terms, which is released again before the constructor returns.
-template<class Impl>
-DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
-    GaugeField            &_Umu,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mq1, RealD _mq2, RealD _mq3,
-    RealD _shift, int _pm, RealD _M5, const ImplParams &p)
-  : AbstractEOFAFermion<Impl>(_Umu,
-                              FiveDimGrid, FiveDimRedBlackGrid,
-                              FourDimGrid, FourDimRedBlackGrid,
-                              _mq1, _mq2, _mq3, _shift, _pm, _M5,
-                              1.0, 0.0, p)
-{
-  const RealD eps = 1.0;
-  Approx::zolotarev_data *approx = Approx::higham(eps, this->Ls);
-
-  // The order of the returned approximation must equal Ls.
-  assert(approx->n == this->Ls);
-
-  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-
-  this->SetCoefficientsTanh(approx, 1.0, 0.0);
-  Approx::zolotarev_free(approx);
-}
-
-/***************************************************************
- * Additional EOFA operators only called outside the inverter.
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-// Zeroes Din and then issues at most one axpby_ssp call on psi.  The last two
-// (slice index) arguments handed to axpby_ssp depend on (sign, dag):
-//   sign = +1, dag = 0 : (Ls-1, 0)
-//   sign = +1, dag = 1 : (0, Ls-1)
-//   sign = -1, dag = 0 or 1 : (0, 0)
-// Any other (sign, dag) combination leaves Din equal to Zero().
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  const int last = this->Ls - 1;
-
-  Din = Zero();
-
-  const bool plain    = (dag == 0);
-  const bool daggered = (dag == 1);
-  if(!plain && !daggered){ return; }
-
-  if(sign == -1){
-    axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0);
-  } else if(sign == 1){
-    const int s_first  = plain ? last : 0;
-    const int s_second = plain ? 0    : last;
-    axpby_ssp(Din, 0.0, psi, 1.0, psi, s_first, s_second);
-  }
-}
-
-// For DWF this operator is the identity: chi is assigned a copy of psi.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  chi = psi;
-}
-
-// For DWF the inverse is the identity as well: chi is assigned a copy of psi.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  chi = psi;
-}
-
-/*****************************************************************************************************/
-
-// Applies the operator to psi and stores the result in chi.
-//
-// Sequence of calls (order matters, chi is reused as an accumulator):
-//   1. Meooe5D(psi, hop)
-//   2. DW(hop, chi, DaggerNo)
-//   3. axpby(chi, 1.0, 1.0, chi, psi)
-//   4. M5D(psi, chi)
-// Returns norm2(chi).
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  // Scratch field on the same grid as the input.
-  FermionField hop(psi.Grid());
-
-  this->Meooe5D(psi, hop);
-  this->DW(hop, chi, DaggerNo);
-
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  this->M5D(psi, chi);
-
-  return norm2(chi);
-}
-
-// Applies the daggered operator to psi and stores the result in chi.
-//
-// Sequence of calls (order matters, chi is reused as an accumulator):
-//   1. DW(psi, hop, DaggerYes)
-//   2. MeooeDag5D(hop, chi)
-//   3. M5Ddag(psi, chi)
-//   4. axpby(chi, 1.0, 1.0, chi, psi)
-// Returns norm2(chi).
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  // Scratch field on the same grid as the input.
-  FermionField hop(psi.Grid());
-
-  this->DW(psi, hop, DaggerYes);
-  this->MeooeDag5D(hop, chi);
-
-  this->M5Ddag(psi, chi);
-
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  return norm2(chi);
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-// Builds the coefficient vectors for this operator and forwards to the
-// six-argument M5D, passing chi both as the phi input and as the output.
-//   diag  : 1.0 in every entry
-//   upper : -1.0, except upper[Ls-1] = mq1 + shiftm
-//   lower : -1.0, except lower[0]    = mq1 + shiftp
-// shiftp / shiftm stay zero unless shift != 0, in which case pm selects which
-// one of the two receives +/- shift*(mq3-mq2).
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  const int   Ls    = this->Ls;
-  const int   pm    = this->pm;
-  const RealD shift = this->shift;
-  const RealD mq1   = this->mq1;
-  const RealD mq2   = this->mq2;
-  const RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  Coeff_t shiftp(0.0);
-  Coeff_t shiftm(0.0);
-  if(shift != 0.0){
-    const RealD dmass = mq3-mq2;
-    if(pm == 1){
-      shiftp = shift*dmass;
-    } else {
-      shiftm = -shift*dmass;
-    }
-  }
-
-  Vector<Coeff_t> diag (Ls, 1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = mq1 + shiftm;
-  lower[0]    = mq1 + shiftp;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5D(psi, chi, chi, lower, diag, upper);
-}
-
-// Builds the coefficient vectors for the daggered operator and forwards to
-// the six-argument M5Ddag, passing chi both as the phi input and as the output.
-//   diag  : 1.0 in every entry
-//   upper : -1.0, except upper[Ls-1] = mq1 + shiftp
-//   lower : -1.0, except lower[0]    = mq1 + shiftm
-// Compared with M5D the roles of shiftp and shiftm in the corner entries are
-// exchanged; the shift values themselves are computed the same way.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  const int   Ls    = this->Ls;
-  const int   pm    = this->pm;
-  const RealD shift = this->shift;
-  const RealD mq1   = this->mq1;
-  const RealD mq2   = this->mq2;
-  const RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  Coeff_t shiftp(0.0);
-  Coeff_t shiftm(0.0);
-  if(shift != 0.0){
-    const RealD dmass = mq3-mq2;
-    if(pm == 1){
-      shiftp = shift*dmass;
-    } else {
-      shiftm = -shift*dmass;
-    }
-  }
-
-  Vector<Coeff_t> diag (Ls, 1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = mq1 + shiftp;
-  lower[0]    = mq1 + shiftm;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5Ddag(psi, chi, chi, lower, diag, upper);
-}
-
-// half checkerboard operations
-// Forwards to the six-argument M5D with psi supplied as both psi and phi.
-//   diag  : copy of bee
-//   upper : -cee[s], except upper[Ls-1] = dm
-//   lower : -cee[s], except lower[0]    = dp
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  // Both off-diagonals start out as -cee ...
-  for(int s=0; s<Ls; s++){
-    upper[s] = -this->cee[s];
-  }
-  for(int s=0; s<Ls; s++){
-    lower[s] = -this->cee[s];
-  }
-
-  // ... and then the two corner entries are overwritten.
-  lower[0]    = this->dp;
-  upper[Ls-1] = this->dm;
-
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-// Forwards to the six-argument M5Ddag with psi supplied as both psi and phi.
-//   diag  : copy of bee
-//   upper : -cee[s], except upper[Ls-1] = dp
-//   lower : -cee[s], except lower[0]    = dm
-// (the dp / dm corner entries are exchanged relative to Mooee)
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  // Both off-diagonals start out as -cee ...
-  for(int s=0; s<Ls; s++){
-    upper[s] = -this->cee[s];
-  }
-  for(int s=0; s<Ls; s++){
-    lower[s] = -this->cee[s];
-  }
-
-  // ... and then the two corner entries are overwritten.
-  lower[0]    = this->dm;
-  upper[Ls-1] = this->dp;
-
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-//Zolo
-// Fills every coefficient table used by the DWF EOFA operator and its
-// even-odd preconditioned inverse.
-//
-// The four parameters (zolo_hi, gamma, b, c) are not read anywhere in this
-// body; all values are derived from the members Ls, pm, mq1, mq2, mq3, shift
-// and M5.
-//
-// Tables written: bs, cs, aee, aeo, bee, beo, cee, ceo (resized to Ls; aeo is
-// only resized, never assigned here), dp, dm, dee (size Ls+1), lee, leem, uee,
-// ueem (size Ls).  Finally MooeeInternalCompute is called twice to fill
-// MatpInv/MatmInv and MatpInvDag/MatmInvDag.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-  RealD shift = this->shift;
-
-  ////////////////////////////////////////////////////////
-  // Constants for the preconditioned matrix Cayley form
-  ////////////////////////////////////////////////////////
-  this->bs.resize(Ls);
-  this->cs.resize(Ls);
-  this->aee.resize(Ls);
-  this->aeo.resize(Ls);
-  this->bee.resize(Ls);
-  this->beo.resize(Ls);
-  this->cee.resize(Ls);
-  this->ceo.resize(Ls);
-
-  // Diagonal (bee) and off-diagonal (cee) entries are the same for every slice.
-  for(int i=0; i<Ls; ++i){
-    this->bee[i] = 4.0 - this->M5 + 1.0;
-    this->cee[i] = 1.0;
-  }
-
-  // aee mirrors cee; bs/beo are fixed to 1 and cs/ceo to 0.
-  for(int i=0; i<Ls; ++i){
-    this->aee[i] = this->cee[i];
-    this->bs[i] = this->beo[i] = 1.0;
-    this->cs[i] = this->ceo[i] = 0.0;
-  }
-
-  //////////////////////////////////////////
-  // EOFA shift terms
-  //////////////////////////////////////////
-  // dp / dm are the corner entries: mq1*cee, with the shift term added to dp
-  // when pm == 1 or subtracted from dm when pm == -1.  Any other pm leaves
-  // both unshifted.
-  if(pm == 1){
-    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-    this->dm = mq1*this->cee[Ls-1];
-  } else if(this->pm == -1) {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-  } else {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1];
-  }
-
-  //////////////////////////////////////////
-  // LDU decomposition of eeoo
-  //////////////////////////////////////////
-  this->dee.resize(Ls+1);
-  this->lee.resize(Ls);
-  this->leem.resize(Ls);
-  this->uee.resize(Ls);
-  this->ueem.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){
-
-    if(i < Ls-1){
-
-      this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-      // leem[i] = dm/bee[i] * prod_{j<i} aee[j]/bee[j]
-      this->leem[i] = this->dm/this->bee[i];
-      for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-      this->dee[i] = this->bee[i];
-
-      this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-      // ueem[i] = dp/bee[0] * prod_{1<=j<=i} cee[j]/bee[j]
-      this->ueem[i] = this->dp / this->bee[0];
-      for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
-    } else {
-
-      // Last slice: no off-diagonal factors.  dee[Ls-1] is set below.
-      this->lee[i]  = 0.0;
-      this->leem[i] = 0.0;
-      this->uee[i]  = 0.0;
-      this->ueem[i] = 0.0;
-
-    }
-  }
-
-  {
-    // delta_d = (1/bee[0]) * prod_{1<=j<Ls-1} cee[j]/bee[j]; it enters the two
-    // last diagonal entries dee[Ls-1] (with dm) and dee[Ls] (with dp).
-    Coeff_t delta_d = 1.0 / this->bee[0];
-    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-  }
-
-  // Precompute the dense inverse matrices (dag = 0 and dag = 1).
-  int inv = 1;
-  this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-  this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-}
-
-// Recompute Cayley-form coefficients for different shift
-//
-// Stores new_shift in this->shift and re-runs SetCoefficientsTanh with a fresh
-// Approx::higham(1.0, Ls) approximation, exactly as the constructor does.
-// The approximation data is only needed for the duration of that call, so it
-// is released afterwards with Approx::zolotarev_free (the constructor already
-// does this; without it every refresh leaks one zolotarev_data object).
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-  Approx::zolotarev_free(zdata);
-}
-
-// Builds the dense Ls x Ls matrices Pplus / Pminus from bee, cee, dp and dm,
-// optionally inverts and/or takes the adjoint of them, and packs the result
-// into the SIMD tables Matp / Matm.
-//
-// dag  : nonzero -> adjointInPlace() is applied to both matrices
-// inv  : nonzero -> the matrices are replaced by their inverses first
-// Matp : output, resized to Ls*LLs, filled from Pplus
-// Matm : output, resized to Ls*LLs, filled from Pminus
-//
-// Returns immediately, leaving Matp / Matm untouched, when the reduced
-// dimension LLs equals Ls.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						       Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  int Ls = this->Ls;
-
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  // Both matrices share the diagonal bee.
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-
-  // Pminus: super-diagonal -cee[s].
-  for(int s=0; s<Ls-1; s++){
-    Pminus(s,s+1) = -this->cee[s];
-  }
-
-  // Pplus: sub-diagonal -cee[s+1].
-  for(int s=0; s<Ls-1; s++){
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-
-  // Corner entries carrying dp / dm.
-  Pplus (0,Ls-1) = this->dp;
-  Pminus(Ls-1,0) = this->dm;
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-
-#if(0)
-  std::cout << GridLogMessage << "Pplus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pplus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-  std::cout << GridLogMessage << "Pminus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pminus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-#endif
-
-  if(inv) {
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else {
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  // Pack column s2 of each matrix into LLs SIMD words: lane l of word s1 takes
-  // the matrix entry at row l*LLs + s1.  The SIMD words are filled through a
-  // scalar_type pointer cast.
-  // NOTE(review): this indexing assumes Nsimd*LLs == Ls -- confirm; it is not
-  // asserted in this function.
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type*) &Vp;
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real
-	  // Only the real part is kept; it is stored in both components.
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-// Instantiation macros for DomainWallEOFAFermion; the macros are defined
-// outside this file.
-// NOTE(review): presumably one expansion per supported Impl type -- confirm
-// against the macro definitions.
-FermOpTemplateInstantiate(DomainWallEOFAFermion);
-GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
@ -1,255 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-// Pminus forwards
-// Pplus  backwards..
-//
-// Per group of Ls consecutive outer sites (base index ss) and slice s:
-//   chi[ss+s] = diag[s]*phi[ss+s]
-//             + upper[s]*spProj5m(psi[ss+fwd])
-//             + lower[s]*spProj5p(psi[ss+bwd])
-// where fwd = s+1 and bwd = s-1, wrapping to 0 and Ls-1 at the two ends.
-// chi_i takes the checkerboard of psi_i; phi_i must share it.
-// Updates M5Dcalls and M5Dtime.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
-                                      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  const int Ls = this->Ls;
-  GridBase* grid = psi_i.Grid();
-  auto phi = phi_i.View();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    for(int s=0; s<Ls; s++){
-      // Neighbour slices.  The s==0 case is tested first so that it wins when
-      // s is simultaneously the first and the last slice.
-      const bool at_first = (s == 0);
-      const bool at_last  = !at_first && (s == Ls-1);
-      const int  fwd      = at_last  ? 0    : s+1;
-      const int  bwd      = at_first ? Ls-1 : s-1;
-
-      auto tmp = psi[0];
-
-      spProj5m(tmp, psi[ss+fwd]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-
-      spProj5p(tmp, psi[ss+bwd]);
-      chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Daggered counterpart of the six-argument M5D: the two projectors are
-// exchanged.  Per group of Ls consecutive outer sites (base index ss) and
-// slice s:
-//   chi[ss+s] = diag[s]*phi[ss+s]
-//             + upper[s]*spProj5p(psi[ss+fwd])
-//             + lower[s]*spProj5m(psi[ss+bwd])
-// where fwd = s+1 and bwd = s-1, wrapping to 0 and Ls-1 at the two ends.
-// chi_i takes the checkerboard of psi_i; phi_i must share it.
-// Updates M5Dcalls and M5Dtime.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
-                                         Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  const int Ls = this->Ls;
-
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    auto tmp = psi[0];
-    for(int s=0; s<Ls; s++){
-      // Neighbour slices.  The s==0 case is tested first so that it wins when
-      // s is simultaneously the first and the last slice.
-      const bool at_first = (s == 0);
-      const bool at_last  = !at_first && (s == Ls-1);
-      const int  fwd      = at_last  ? 0    : s+1;
-      const int  bwd      = at_first ? Ls-1 : s-1;
-
-      spProj5p(tmp, psi[ss+fwd]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-
-      spProj5m(tmp, psi[ss+bwd]);
-      chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Applies the inverse of Mooee slice by slice, using the precomputed factors
-// lee, leem, dee, ueem and uee.
-//
-// psi_i : input field (read only)
-// chi_i : output field; its checkerboard is set to that of psi_i
-//
-// Outer sites are handled in groups of Ls consecutive entries (base index ss).
-// The stages keep the factor names of the original comments:
-// (L')^{-1}, L_m^{-1}, U_m^{-1} D^{-1} and U^{-1}.
-// Updates MooeeInvCalls and MooeeInvTime.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi=psi_i.View();
-  auto chi=chi_i.View();
-  int Ls = this->Ls;
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    // Scratch spinors; the initial value only fixes the type.
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-    }
-
-    // U_m^{-1} D^{-1}
-    // chi[ss+Ls-1] is not written inside the loop below, so its spProj5p
-    // projection is computed once instead of once per slice.  Doing it up
-    // front also means tmp1 holds that projection for the last-slice update
-    // even if the loop body never executes.
-    spProj5p(tmp1, chi[ss+Ls-1]);
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
-    }
-    spProj5m(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// Applies the inverse of the daggered Mooee slice by slice, using the complex
-// conjugates of the precomputed factors uee, dee, lee, ueem and leem.
-//
-// psi_i : input field (read only)
-// chi_i : output field; its checkerboard is set to that of psi_i
-//
-// Outer sites are handled in groups of Ls consecutive entries (base index ss).
-// The stages keep the factor names of the original comments:
-// (U')^{-dagger}, U_m^{-dagger}, L_m^{-dagger} D^{-dagger} and L^{-dagger}.
-// Updates MooeeInvCalls and MooeeInvTime.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  int Ls = this->Ls;
-
-  // Output and input must sit on the same checkerboard.  (The previous check
-  // compared psi with itself and therefore could never fail.)
-  assert(chi.Checkerboard() == psi.Checkerboard());
-
-  Vector<Coeff_t> ueec(Ls);
-  Vector<Coeff_t> deec(Ls+1);
-  Vector<Coeff_t> leec(Ls);
-  Vector<Coeff_t> ueemc(Ls);
-  Vector<Coeff_t> leemc(Ls);
-
-  // Conjugate every factor once, outside the site loop.  deec has one extra
-  // entry (index Ls) that is handled after the loop.
-  for(int s=0; s<Ls; s++){
-    ueec[s]  = conjugate(this->uee[s]);
-    deec[s]  = conjugate(this->dee[s]);
-    leec[s]  = conjugate(this->lee[s]);
-    ueemc[s] = conjugate(this->ueem[s]);
-    leemc[s] = conjugate(this->leem[s]);
-  }
-  deec[Ls] = conjugate(this->dee[Ls]);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    // Scratch spinors; the initial value only fixes the type.
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss] = psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    // chi[ss+Ls-1] is not written inside the loop below, so its spProj5m
-    // projection is computed once instead of once per slice.  Doing it up
-    // front also means tmp1 holds that projection for the last-slice update
-    // even if the loop body never executes.
-    spProj5m(tmp1, chi[ss+Ls-1]);
-    for(int s=0; s<Ls-1; s++){
-      chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-    }
-    spProj5p(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// The instantiations below are compiled only when
-// DOMAIN_WALL_EOFA_DPERP_CACHE is defined.
-// NOTE(review): presumably this macro selects this file's implementation of
-// the fifth-dimension routines over the alternative one -- confirm in the
-// build configuration.
-#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-
-// First group of implementation types.
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-// Second group: the FH / DF variants of the same implementations.
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
@ -1,613 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-/*
- * Dense matrix versions of routines
- */
-// Dense-matrix version: delegates to MooeeInternal with the dagger and the
-// inverse flags both set.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi,
-                                              FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-// Dense-matrix version: delegates to MooeeInternal with the inverse flag set
-// and the dagger flag cleared.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi,
-                                           FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-// Variant of the six-argument M5D for grids whose reduced first dimension LLs
-// is smaller than Ls by a factor of the SIMD width (asserted below).
-//
-// psi_i : field supplying the neighbouring slices
-// phi_i : field multiplied by the diagonal coefficients
-// chi_i : output; its checkerboard is set to that of psi_i
-// lower, diag, upper : per-slice coefficients, indexed by s in [0, Ls)
-//
-// Per outer-site group (base index ss) and v in [0, LLs):
-//   spin components 0,1 of chi[ss+v] = d[v]*phi + l[v]*(components 0,1 of psi[ss+vm])
-//   spin components 2,3 of chi[ss+v] = d[v]*phi + u[v]*(components 2,3 of psi[ss+vp])
-// with vp / vm the cyclic forward / backward neighbours of v inside the group.
-// Updates M5Dcalls and M5Dtime.  The colour loop is unrolled by hand for
-// three colours (see the Nc assert).
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  // SIMD-packed copies of the coefficient vectors, one word per v.
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  // Lane i of word o receives the coefficient of slice s = o + i*LLs.
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  // The kernel below addresses colour indices 0..2 explicitly.
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-    // Disabled alternative written with the generic projector / reconstruct
-    // helpers; kept for reference.
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      // NOTE(review): on the last group this index lies past the group being
-      // processed -- presumably harmless because it is only a prefetch hint;
-      // confirm.
-      vprefetch(psi[ss+v+LLs]);
-
-      // Cyclic neighbours of v inside the group.
-      int vp = (v==LLs-1) ? 0     : v+1;
-      int vm = (v==0)     ? LLs-1 : v-1;
-
-      // Spin components 2,3 of the forward neighbour.
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      // Spin components 0,1 of the backward neighbour.
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      // The forward neighbour wrapped around: rotate its SIMD lanes.
-      // NOTE(review): tRotate<2> presumably corresponds to rotate(...,1) of
-      // the disabled branch (two reals per complex lane) -- confirm.
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // The backward neighbour wrapped around: rotate in the other direction.
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      // Components 0,1 combine phi with the backward neighbour (l);
-      // components 2,3 combine phi with the forward neighbour (u).
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      // Write all twelve spin-colour components of the result.
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-						   int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{ /* Dense fifth-dimension kernel for the block of LLs outer slices selected by `site`.
-   * For every output slice s1 it sums over the input slice s2 and the SIMD lane l:
-   *   chi[s1 + LLs*site], spin 0..1 += Matp[LLs*(s2 + l*LLs) + s1] * (lane l of psi[s2 + LLs*site], spin 0..1)
-   *   chi[s1 + LLs*site], spin 2..3 += Matm[LLs*(s2 + l*LLs) + s1] * (lane l of psi[s2 + LLs*site], spin 2..3)
-   * psi_i is read and chi_i is written through views; Matp and Matm are only read.
-   * Without AVX512 the sum is written with vbroadcast / real_madd / vstream; with AVX512
-   * the same loop nest is expressed as inline assembly on zmm registers. The assembly
-   * macros (VBCASTCDUP, VMULMEM, VMADDMEM, VSTORE, VPREFETCH1) are not defined in this
-   * function; presumably they come from the Intel512 headers included above - TODO confirm.
-   */
-  GridBase* grid = psi_i.Grid(); // NOTE(review): `grid` is not referenced again in the visible body
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP; // lane-broadcast copy of the upper two spin components of psi
-    SiteHalfSpinor BcastM; // lane-broadcast copy of the lower two spin components of psi
-    SiteHalfSpinor SiteChiP; // accumulator for the upper two spin components of chi
-    SiteHalfSpinor SiteChiM; // accumulator for the lower two spin components of chi
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs; // matrix row index built from the outer slice s2 and the lane l
-	  int lex = s2 + LLs*site; // index of the input slice inside this site block
-
-	  if( s2==0 && l==0 ){ // first term of the sum for this s1: reset both accumulators
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-	      SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-	    }}
-	}}
-
-      {
-	int lex = s1 + LLs*site; // index of the output slice inside this site block
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>); // byte distance between the matrix entries of consecutive lanes
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site; // index of the input slice inside this site block
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-	  if((s2+l)==0) { // first term for this s1: the VMULMEM form is used instead of VMADDMEM
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else { // every later term accumulates into the Chi_* registers
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-	  a0 = a0 + incr; // next lane: matrix entry LLs*LLs elements further on
-	  a1 = a1 + incr;
-                        a2 = a2 + sizeof(typename Simd::scalar_type); // next lane: one scalar further into psi[lex]
-	}
-      }
-
-      {
-	int lexa = s1+LLs*site; // index of the output slice inside this site block
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-};
-
-// Complex-coefficient (zMobius) counterpart of MooeeInternalAsm.
-// EOFA has no zMobius kernel: the function ignores its arguments, reports the
-// problem on std::cout and terminates the process with exit status -1.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-                                                    int LLs, int site,
-                                                    Vector<iSinglet<Simd> >& Matp,
-                                                    Vector<iSinglet<Simd> >& Matm)
-{
-  const char* const message = "Error: zMobius not implemented for EOFA";
-  std::cout << message << std::endl;
-  exit(-1);
-}
-
-// Applies the dense fifth-dimension Mooee matrices to psi and stores the result in chi.
-//   inv && dag  : uses the cached MatpInvDag / MatmInvDag
-//   inv && !dag : uses the cached MatpInv / MatmInv
-//   !inv        : builds the matrices on the fly with MooeeInternalCompute(dag, inv, ...)
-// The per-site kernel is MooeeInternalZAsm when Coeff_t is complex and MooeeInternalAsm
-// otherwise; it is run for every block of LLs outer slices.
-// Side effects: chi takes psi's checkerboard, MooeeInvCalls is incremented and the
-// elapsed time is added to MooeeInvTime.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  // Scratch matrices, filled only when nothing cached matches the request.
-  Vector<iSinglet<Simd> > Matp;
-  Vector<iSinglet<Simd> > Matm;
-
-  // Always point at a valid pair before use; start from nullptr so that a missed
-  // case cannot silently dereference an indeterminate pointer.
-  Vector<iSinglet<Simd> > *_Matp = nullptr;
-  Vector<iSinglet<Simd> > *_Matm = nullptr;
-
-  if(inv){
-    if(dag){
-      _Matp = &this->MatpInvDag;
-      _Matm = &this->MatmInvDag;
-    } else {
-      _Matp = &this->MatpInv;
-      _Matm = &this->MatmInv;
-    }
-  } else {
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    // The loop header and the body are two separate macro arguments: "( header ),{ body }".
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef DOMAIN_WALL_EOFA_DPERP_VEC // everything up to the matching #endif is compiled only when this macro is defined
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD); // macro is defined outside this chunk; presumably instantiates the Dperp kernels for the given Impl - TODO confirm
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); // explicit instantiations of MooeeInternal, one per Impl
-template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif // DOMAIN_WALL_EOFA_DPERP_VEC
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
@ -1,497 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Builds a Mobius EOFA operator on top of AbstractEOFAFermion.
-// All arguments are forwarded unchanged to the base class. The constructor then
-//   1. obtains Approx::higham(1.0, Ls) data and feeds it to SetCoefficientsTanh(b, c),
-//   2. logs the Mobius and EOFA parameters,
-//   3. fills the shift-operator coefficient vectors: computed when _shift is
-//      non-zero, otherwise sized to Ls with zero entries.
-template<class Impl>
-MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-    GaugeField            &_Umu,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mq1, RealD _mq2, RealD _mq3,
-    RealD _shift, int _pm, RealD _M5,
-    RealD _b, RealD _c, const ImplParams &p) :
-  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-                            FourDimGrid, FourDimRedBlackGrid,
-                            _mq1, _mq2, _mq3,
-                            _shift, _pm, _M5, _b, _c, p)
-{
-  int nslices = this->Ls;
-
-  // The approximation data is only needed while the coefficients are being set.
-  RealD epsilon = 1.0;
-  Approx::zolotarev_data *approx = Approx::higham(epsilon, this->Ls);
-  assert(approx->n == this->Ls);
-
-  std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b
-            << ",c=" << _c << ") with Ls=" << nslices << std::endl;
-  this->SetCoefficientsTanh(approx, _b, _c);
-  std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1
-            << ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift
-            << ",pm=" << _pm << ")" << std::endl;
-
-  Approx::zolotarev_free(approx);
-
-  if(_shift == 0.0){
-    // No shift term: the coefficient vectors only need the right length.
-    Mooee_shift.resize(nslices, 0.0);
-    MooeeInv_shift_lc.resize(nslices, 0.0);
-    MooeeInv_shift_norm.resize(nslices, 0.0);
-    MooeeInvDag_shift_lc.resize(nslices, 0.0);
-    MooeeInvDag_shift_norm.resize(nslices, 0.0);
-  } else {
-    SetCoefficientsPrecondShiftOps();
-  }
-}
-
-/****************************************************************
- * Additional EOFA operators only called outside the inverter.  
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-// Applies \Omega_{+/-} (dag == 0) or its dagger (dag == 1) to psi, result in Din.
-//   sign == +1 : weight(s) = 2 (1-alpha)^(Ls-s-1) / (1+alpha)^(Ls-s)
-//   sign == -1 : weight(s) = 2 (1-alpha)^s        / (1+alpha)^(s+1)
-// Without dagger, slice s of Din receives weight(s) times slice 0 of psi; with
-// dagger, slice 0 of Din accumulates weight(s) times slice s of psi.
-// Any other (sign, dag) combination leaves Din zero.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  const int   nslices = this->Ls;
-  const RealD a       = this->alpha;
-
-  Din = Zero();
-
-  const bool plus  = (sign ==  1);
-  const bool minus = (sign == -1);
-  if(!plus && !minus){ return; }
-  if((dag != 0) && (dag != 1)){ return; }
-
-  for(int s=0; s<nslices; ++s){
-    RealD weight;
-    if(plus){
-      weight = 2.0*std::pow(1.0-a,nslices-s-1)/std::pow(1.0+a,nslices-s);
-    } else {
-      weight = 2.0*std::pow(1.0-a,s)/std::pow(1.0+a,s+1);
-    }
-
-    if(dag == 0){
-      axpby_ssp(Din, 0.0, psi, weight, psi, s, 0);
-    } else {
-      axpby_ssp(Din, 1.0, Din, weight, psi, 0, s);
-    }
-  }
-}
-
-// Operator that maps the usual Ddwf onto TWQCD's EOFA Dirac operator
-// (arXiv:1706.05843, Eqn. 6); the same operator connects the preconditioned and
-// unpreconditioned systems of Appendix B.2.
-// For each slice s, chi[s] is built from psi[s] (weight b) plus a P- hop and a P+ hop:
-//   P- neighbour: s+1 with weight -c, except on the last slice (neighbour 0, weight mq1*c)
-//   P+ neighbour: s-1 with weight -c, except on the first slice (neighbour Ls-1, weight mq1*c)
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  const int   nslices = this->Ls;
-  const RealD b       = 0.5 * ( 1.0 + this->alpha );
-  const RealD c       = 0.5 * ( 1.0 - this->alpha );
-  const RealD mq1     = this->mq1;
-
-  for(int s=0; s<nslices; ++s){
-    // The first-slice rule takes precedence when a slice is both first and last.
-    const bool on_first = (s == 0);
-    const bool on_last  = !on_first && (s == (nslices-1));
-
-    if(on_last){
-      axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
-    } else {
-      axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
-    }
-
-    if(on_first){
-      axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, nslices-1);
-    } else {
-      axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
-    }
-  }
-}
-
-// Applies the dense inverse of Dtilde to psi and stores the result in chi.
-// For every output slice s the result is the sum over input slices sp of
-//   DtInv_p(s,sp) * P+ psi[sp]  +  DtInv_m(s,sp) * P- psi[sp],
-// with the coefficients given by the closed-form expressions below.
-//
-// The sum is accumulated in a zero-initialised temporary and copied to chi at the
-// end, so the routine also works when psi and chi refer to the same field.
-// NOTE(review): this relies on axpby_ssp_pplus/pminus(z,a,x,b,y,s,sp) computing
-// z[s] = a*x[s] + b*P(+/-) y[sp], as their use in Dtilde suggests - confirm against
-// their definition.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-  RealD m = this->mq1;
-  RealD c = 0.5 * this->alpha;
-  RealD d = 0.5;
-
-  RealD DtInv_p(0.0), DtInv_m(0.0);
-  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
-
-  // Start from zero so that every contribution can simply be accumulated; the
-  // temporary is never read while it still holds indeterminate data.
-  FermionField tmp(this->FermionGrid());
-  tmp = Zero();
-
-  for(int s=0; s<Ls; ++s){
-    for(int sp=0; sp<Ls; ++sp){
-
-      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-
-      // Both chirality terms are added to the running sum; neither overwrites the other.
-      axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
-      axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
-
-    }}
-
-  // Hand the result to the caller.
-  chi = tmp;
-}
-
-/*****************************************************************************************************/
-
-// Applies the full operator to psi, leaving the result in chi.
-// Steps, in order: Meooe5D on psi, DW (no dagger) on that, add psi, then M5D(psi, chi).
-// Returns norm2 of the resulting chi.
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  FermionField scratch(psi.Grid());
-
-  this->Meooe5D(psi, scratch);
-  this->DW(scratch, chi, DaggerNo);
-
-  // chi <- chi + psi
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  this->M5D(psi, chi);
-
-  const RealD result_norm = norm2(chi);
-  return result_norm;
-}
-
-// Applies the adjoint of the full operator to psi, leaving the result in chi.
-// Steps, in order: DW (dagger) on psi, MeooeDag5D on that, M5Ddag(psi, chi), then add psi.
-// Returns norm2 of the resulting chi.
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  FermionField scratch(psi.Grid());
-
-  this->DW(psi, scratch, DaggerYes);
-  this->MeooeDag5D(scratch, chi);
-  this->M5Ddag(psi, chi);
-
-  // chi <- chi + psi
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  const RealD result_norm = norm2(chi);
-  return result_norm;
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-// Fifth-dimension hopping term with unit diagonal: off-diagonal entries are -1,
-// except the wrap-around entries (upper[Ls-1], lower[0]) which are mq1.
-// chi is both the additive input and the output of the underlying kernel.
-// A non-zero shift selects the fused "M + shift" kernel.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  const int nslices = this->Ls;
-
-  Vector<Coeff_t> diag (nslices, 1.0);
-  Vector<Coeff_t> upper(nslices,-1.0);
-  Vector<Coeff_t> lower(nslices,-1.0);
-  upper[nslices-1] = this->mq1;
-  lower[0]         = this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5D(psi, chi, chi, lower, diag, upper);
-}
-
-// Adjoint counterpart of M5D: same coefficient vectors (unit diagonal, -1
-// off-diagonal, mq1 on the wrap-around entries), passed to the dagger kernels.
-// chi is both the additive input and the output of the underlying kernel.
-// A non-zero shift selects the fused "M + shift" kernel.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  const int nslices = this->Ls;
-
-  Vector<Coeff_t> diag (nslices, 1.0);
-  Vector<Coeff_t> upper(nslices,-1.0);
-  Vector<Coeff_t> lower(nslices,-1.0);
-  upper[nslices-1] = this->mq1;
-  lower[0]         = this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5Ddag(psi, chi, chi, lower, diag, upper);
-}
-
-// half checkerboard operations
-//
-// Mooee: diagonal bee[s], off-diagonals -cee[s], with the wrap-around entries
-// (upper[Ls-1], lower[0]) additionally multiplied by -mq1. psi is used for both
-// input arguments of the kernel and chi receives the result. A non-zero shift
-// selects the fused "M + shift" kernel.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  const int nslices = this->Ls;
-
-  // coefficients of Mooee
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(nslices);
-  Vector<Coeff_t> lower(nslices);
-  for(int s=0; s<nslices; s++){
-    const Coeff_t hop = -this->cee[s];
-    upper[s] = hop;
-    lower[s] = hop;
-  }
-  upper[nslices-1] *= -this->mq1;
-  lower[0]         *= -this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-// Adjoint of Mooee: diagonal bee[s]; the off-diagonals take cee from the
-// neighbouring slice:
-//   upper[s] = -cee[s+1], except on the last slice where it is mq1*cee[0]
-//   lower[s] = -cee[s-1], except on the first slice where it is mq1*cee[Ls-1]
-// psi is used for both input arguments of the kernel and chi receives the result.
-// A non-zero shift selects the fused "M + shift" kernel.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  const int nslices = this->Ls;
-
-  // coefficients of MooeeDag
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(nslices);
-  Vector<Coeff_t> lower(nslices);
-  for(int s=0; s<nslices; s++){
-    // The first-slice rule takes precedence when a slice is both first and last.
-    const bool on_first = (s == 0);
-    const bool on_last  = !on_first && (s == (nslices-1));
-
-    if(on_last){
-      upper[s] = this->mq1*this->cee[0];
-    } else {
-      upper[s] = -this->cee[s+1];
-    }
-
-    if(on_first){
-      lower[s] = this->mq1*this->cee[nslices-1];
-    } else {
-      lower[s] = -this->cee[s-1];
-    }
-  }
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-// Computes coefficients for applying Cayley preconditioned shift operators
-//  (Mooee + \Delta) --> Mooee_shift
-//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-// For the latter two cases, the operation takes the form
-//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-template<class Impl>
-void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-{ /* Inputs are read from the object: Ls, pm, alpha, k, mq1, shift, bee[] and cee[].
-   * Outputs are the five member vectors listed above, each resized to Ls and then
-   * overwritten element by element. The function takes no arguments and returns nothing.
-   */
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD alpha = this->alpha;
-  RealD k     = this->k;
-  RealD mq1   = this->mq1;
-  RealD shift = this->shift;
-
-  // Initialize
-  Mooee_shift.resize(Ls);
-  MooeeInv_shift_lc.resize(Ls);
-  MooeeInv_shift_norm.resize(Ls);
-  MooeeInvDag_shift_lc.resize(Ls);
-  MooeeInvDag_shift_norm.resize(Ls);
-
-  // Construct Mooee_shift
-  int idx(0);
-  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
-    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) ); // overall prefactor; its sign follows pm
-  for(int s=0; s<Ls; ++s){
-    idx = (pm == 1) ? (s) : (Ls-1-s); // entries are stored in reverse slice order when pm != 1
-    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-  }
-
-  // Tridiagonal solve for MooeeInvDag_shift_lc
-  {
-    Coeff_t m(0.0); // elimination multiplier
-    Vector<Coeff_t> d = Mooee_shift; // right-hand side of the first solve (modified in place)
-    Vector<Coeff_t> u(Ls,0.0); // right-hand side of the second solve: a unit vector
-    Vector<Coeff_t> y(Ls,0.0); // solution of Mooee'*y = d
-    Vector<Coeff_t> q(Ls,0.0); // solution of Mooee'*q = u
-    if(pm == 1){ u[0] = 1.0; }
-    else{ u[Ls-1] = 1.0; }
-
-    // Tridiagonal matrix algorithm + Sherman-Morrison formula
-    //
-    // We solve
-    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-    // where Mooee' is the tridiagonal part of Mooee_{+}, and
-    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-    // so that the outer-product u \otimes v gives the (0,Ls-1)
-    // entry of Mooee_{+}.
-    //
-    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-    // and then construct the solution to the original system
-    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-    if(pm == 1){ // forward elimination is performed only for pm == 1
-      for(int s=1; s<Ls; ++s){
-	m = -this->cee[s] / this->bee[s-1];
-	d[s] -= m*d[s-1];
-	u[s] -= m*u[s-1];
-      }
-    }
-    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-    for(int s=Ls-2; s>=0; --s){
-      if(pm == 1){ // after the elimination above only the diagonal division remains
-	y[s] = d[s] / this->bee[s];
-	q[s] = u[s] / this->bee[s];
-      } else { // pm != 1: back substitution using the already solved entry s+1
-	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-      }
-    }
-
-    // Construct MooeeInvDag_shift_lc
-    for(int s=0; s<Ls; ++s){
-      if(pm == 1){
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-      } else {
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-      }
-    }
-
-    // Compute remaining coefficients
-    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]); // N is reused here as the common normalisation of the two *_norm vectors
-    for(int s=0; s<Ls; ++s){
-
-      // MooeeInv_shift_lc
-      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
-      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
-
-      // MooeeInv_shift_norm
-      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
-
-      // MooeeInvDag_shift_norm
-      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
-     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
-	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-    }
-  }
-}
-
-// Recompute coefficients for a different value of shift constant.
-// Stores new_shift in this->shift. A non-zero shift recomputes every coefficient
-// vector through SetCoefficientsPrecondShiftOps(); a zero shift leaves all five
-// vectors with exactly Ls zero entries.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-  if(new_shift != 0.0){
-    SetCoefficientsPrecondShiftOps();
-    return;
-  }
-
-  // resize(Ls, 0.0) would only value-fill elements appended beyond the current size.
-  // After a previous non-zero shift the vectors already hold Ls entries, so the old
-  // coefficients would survive. assign() replaces the whole contents.
-  const int Ls = this->Ls;
-  Mooee_shift.assign(Ls, 0.0);
-  MooeeInv_shift_lc.assign(Ls, 0.0);
-  MooeeInv_shift_norm.assign(Ls, 0.0);
-  MooeeInvDag_shift_lc.assign(Ls, 0.0);
-  MooeeInvDag_shift_norm.assign(Ls, 0.0);
-}
-
-// Builds the dense Ls x Ls fifth-dimension matrices for the two chiralities and
-// packs them into SIMD words.
-//   inv != 0 : the matrices are inverted
-//   dag != 0 : the (possibly inverted) matrices are replaced by their adjoints
-// Matp and Matm are resized to Ls*LLs, where LLs is _rdimensions[0] of the
-// red-black fermion grid. Entry [LLs*s2 + s1] holds, in lane l, matrix element
-// (l*LLs + s1, s2). For a real Coeff_t the real part is stored in both components
-// of the scalar. Returns without touching Matp/Matm when LLs == Ls.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-                                                   Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  const int Ls = this->Ls;
-
-  GridBase* rbgrid = this->FermionRedBlackGrid();
-  const int LLs = rbgrid->_rdimensions[0];
-
-  if(Ls == LLs){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Mplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Mminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  // Diagonal and nearest-neighbour entries.
-  for(int s=0; s<Ls; s++){
-    Mplus (s,s) = this->bee[s];
-    Mminus(s,s) = this->bee[s];
-    if(s < Ls-1){
-      Mminus(s,s+1) = -this->cee[s];
-      Mplus (s+1,s) = -this->cee[s+1];
-    }
-  }
-
-  // Wrap-around entries carry the mass.
-  Mplus (0,Ls-1) = this->mq1*this->cee[0];
-  Mminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-
-  // Shift term: one extra column, on the chirality selected by pm.
-  if(this->shift != 0.0){
-    const RealD c = 0.5 * this->alpha;
-    const RealD d = 0.5;
-    const RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-    const bool on_plus = (this->pm == 1);
-    for(int s=0; s<Ls; ++s){
-      if(on_plus){
-        Mplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-      } else {
-        Mminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-      }
-    }
-  }
-
-  Eigen::MatrixXcd OutPlus ;
-  Eigen::MatrixXcd OutMinus;
-
-  if(!inv) {
-    OutPlus  = Mplus;
-    OutMinus = Mminus;
-  } else {
-    OutPlus  = Mplus.inverse();
-    OutMinus = Mminus.inverse();
-  }
-
-  if(dag){
-    OutPlus.adjointInPlace();
-    OutMinus.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  const bool complex_coeff = switcheroo<Coeff_t>::iscomplex();
-
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      Simd WordP;
-      Simd WordM;
-      // Fill the SIMD words lane by lane through scalar pointers.
-      scalar_type *lanes_p = (scalar_type*) &WordP;
-      scalar_type *lanes_m = (scalar_type*) &WordM;
-      for(int l=0; l<Nsimd; l++){
-        const int row = l*LLs + s1;
-        if(complex_coeff) {
-          lanes_p[l] = OutPlus (row,s2);
-          lanes_m[l] = OutMinus(row,s2);
-        } else {
-          // if real
-          scalar_type tmp;
-          tmp = OutPlus (row,s2);
-          lanes_p[l] = scalar_type(tmp.real(),tmp.real());
-          tmp = OutMinus(row,s2);
-          lanes_m[l] = scalar_type(tmp.real(),tmp.real());
-        }
-      }
-      Matp[LLs*s2+s1] = WordP;
-      Matm[LLs*s2+s1] = WordM;
-    }}
-}
-
-FermOpTemplateInstantiate(MobiusEOFAFermion); // macro defined outside this chunk; presumably emits the explicit instantiations for the standard Impl types - TODO confirm
-GparityFermOpTemplateInstantiate(MobiusEOFAFermion); // same, presumably for the G-parity Impl types - TODO confirm
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
@ -1,998 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * Dense matrix versions of routines
- */
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
-					FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					Vector<Coeff_t>& shift_coeffs)
-{
-#if 0
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-
-  this->M5D(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				     Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					   Vector<Coeff_t>& shift_coeffs)
-{
-#if 0
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-  this->M5Ddag(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-					       int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs;
-	  int lex = s2 + LLs*site;
-
-	  if( s2==0 && l==0 ){
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-	}}
-
-      {
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-
-	  if((s2+l)==0) {
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-
-	  a0 = a0 + incr;
-	  a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type);
-	}
-      }
-
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-};
-
-// Z-mobius version
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-};
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  Vector<iSinglet<Simd>>   Matp;
-  Vector<iSinglet<Simd>>   Matm;
-  Vector<iSinglet<Simd>>* _Matp;
-  Vector<iSinglet<Simd>>* _Matm;
-
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if(inv && dag){
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  }
-
-  if(inv && (!dag)){
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef MOBIUS_EOFA_DPERP_VEC
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
@ -1,242 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-
-    Copyright (C) 2017
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// *NOT* EO
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
-{
-  FermionField temp(out.Grid());
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerNo);
-
-  // Clover term
-  Mooee(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
-{
-  FermionField temp(out.Grid());
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerYes);
-
-  // Clover term
-  MooeeDag(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-{
-  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu.Grid();
-  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
-
-  // Compute the field strength terms mu>nu
-  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
-
-  // Compute the Clover Operator acting on Colour and Spin
-  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
-
-  int lvol = _Umu.Grid()->lSites();
-  int DimRep = Impl::Dimension;
-
-  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-
-  Coordinate lcoor;
-  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
-
-  for (int site = 0; site < lvol; site++)
-  {
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = Zero();
-    //if (csw!=0){
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++){
-	    auto zz =  Qx()(j, k)(a, b);
-            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
-	  }
-    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
-
-    EigenInvCloverOp = EigenCloverOp.inverse();
-    //std::cout << EigenInvCloverOp << std::endl;
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
-    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
-    //  }
-    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
-  }
-
-  // Separate the even and odd parts
-  pickCheckerboard(Even, CloverTermEven, CloverTerm);
-  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
-
-  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
-  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerNo, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerYes, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerNo, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerYes, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
-{
-  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
-
-  if (dag)
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    {
-      if (in.Checkerboard() == Odd)
-      {
-        Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
-      }
-      else
-      {
-        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
-      }
-      out = *Clover * in;
-    }
-    else
-    {
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
-    }
-  }
-  else
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    {
-
-      if (in.Checkerboard() == Odd)
-      {
-        //  std::cout << "Calling clover term Odd" << std::endl;
-        Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
-      }
-      else
-      {
-        //  std::cout << "Calling clover term Even" << std::endl;
-        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
-      }
-      out = *Clover * in;
-      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
-    }
-    else
-    {
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
-    }
-  }
-
-} // MooeeInternal
-
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
-{
-  assert(0);
-}
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
-{
-  assert(0); // not implemented yet
-}
-
-FermOpTemplateInstantiate(WilsonCloverFermion);
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -386,11 +386,9 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
-#endif
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
@ -401,111 +399,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
-#ifdef GRID_OMP
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();

+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  double ctime=0;
-  double ptime=0;
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);

-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  auto U_v   = U.View();
-  auto in_v  = in.View();
-  auto out_v = out.View();
-  int Opt = WilsonKernelsStatic::Opt;
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  { 
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid = tid - ncomms;
-      int n = U.Grid()->oSites();
-      int chunk = n / nthreads;
-      int rem = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-	myblock = ttid * chunk + ttid;
-	myn = chunk+1;
-      } else {
-	myblock = ttid*chunk + rem;
-	myn = chunk;
-      }
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
      
-      // do the compute
-      if (dag == DaggerYes) {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
- 	  Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
-	}
-      } else {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
-	  Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
-	}
-      }
-      ptime = usecond() - start;
-    }
-    {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
+  DhopComputeTime-=usecond();
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
+  DhopComputeTime+=usecond();

-  // First to enter, last to leave timing
-  st.CollateThreads();
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  DhopCommTime   +=usecond();

+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
-    });
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
-    });
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
-#else 
-  assert(0);
-#endif
 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-					 DoubledGaugeField & U,
-					 const FermionField &in, FermionField &out,int dag)
+						    DoubledGaugeField & U,
+						    const FermionField &in, 
+						    FermionField &out,int dag)
 {
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
@ -515,24 +472,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
  DhopCommTime+=usecond();
  
  DhopComputeTime-=usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-
-  auto U_v = U.View();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
-    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
-    //      int sU = ss;
-    //      int sF = LLs * sU;
-    //      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    //    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
-    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
-    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
-    //      int sU = ss;
-    //      int sF = LLs * sU;
-    //      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    //    }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -375,78 +375,47 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 						      const FermionField &in,
 						      FermionField &out, int dag) {
  assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
+
  Compressor compressor(dag);
  int len =  U.Grid()->oSites();
-  const int LLs =  1;

+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
+  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
  st.HaloGather(in,compressor);
+  st.CommunicateBegin(requests);
+
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-      auto U_v   = U.View();
-      auto in_v  = in.View();
-      auto out_v = out.View();
-      auto st_v  = st.View();
-      int Opt = WilsonKernelsStatic::Opt;

-      if (dag == DaggerYes) {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } 
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt;
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } 

-    } else {
-      st.CommunicateThreaded();
-    }
-  }  //pragma
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  st.CommsMerge(compressor);

-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  =  st.View();
-    int Opt = WilsonKernelsStatic::Opt;
-    if (dag == DaggerYes) {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    } else {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    }
+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
-#else
-  assert(0);
-#endif
 };


--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
@ -73,7 +73,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  return;
 }

-#ifdef GPU_VEC
+#if 1
 #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  if (SE._is_local) {							\
    int mask = Nsimd >> (ptype + 1);					\
@ -96,7 +96,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
      spProj(chi, in_t);						\
    }									\
  } else {								\
-    chi  = buf[SE._offset+s];						\
+    chi  = (buf[SE._offset+s]);						\
  }									\
  synchronise();
 #endif
@ -106,15 +106,9 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
 							    SiteHalfSpinor *buf, int Ls, int s,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
-#ifdef __CUDA_ARCH__
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor     result;
-#endif

  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
@ -173,11 +167,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTm(result, Uchi);
-#ifdef GPU_VEC
-  insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
+    insertLane (lane,out[sF],result);
  }
 }

@ -186,15 +176,10 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
 							 SiteHalfSpinor *buf,  int Ls, int s,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
-#ifdef __CUDA_ARCH__
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor     result;
-#endif
+
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
@ -255,11 +240,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);

-#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
  }
 };

@ -287,6 +268,25 @@ GPU_EMPTY(GparityWilsonImplFH);
 GPU_EMPTY(GparityWilsonImplD);
 GPU_EMPTY(GparityWilsonImplDF);

+#define KERNEL_CALL(A) \
+      const uint64_t nsimd = Simd::Nsimd(); \
+      const uint64_t    NN = Nsite*Ls*nsimd;\
+      accelerator_loopN( sss, NN, {         \
+	  uint64_t cur  = sss;              \
+	  cur = cur / nsimd;                \
+	  uint64_t   s  = cur%Ls;           \
+	  cur = cur / Ls;                   \
+	  uint64_t   sU = cur;              \
+	  WilsonKernels<Impl>::A(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);\
+      });
+ 
+#define HOST_CALL(A) \
+  accelerator_loopN( ss, Ls*Nsite, {					\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v);	\
+  });
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
@ -297,25 +297,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    auto out_v = out.View();
    auto st_v  =  st.View();

-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-#define KERNEL_CALL(A) \
-      const uint64_t nsimd = Simd::Nsimd(); \
-      const uint64_t    NN = Nsite*Ls*nsimd;\
-      accelerator_loopN( sss, NN, {         \
-	  uint64_t cur  = sss;              \
-	  cur = cur / nsimd;                \
-	  uint64_t   s  = cur%Ls;           \
-	  cur = cur / Ls;                   \
-	  uint64_t   sU = cur; 
-	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        WilsonKernels<Impl>::GenericDhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
+   if( interior && exterior ) { 
+     if (Opt == WilsonKernelsStatic::OptGpu) {
+       KERNEL_CALL(GpuDhopSite);
+     } else {
+       HOST_CALL(GenericDhopSite);
+     }
+   } else if( interior ) {
+     HOST_CALL(GenericDhopSiteInt);
+   } else if( exterior ) { 
+     HOST_CALL(GenericDhopSiteExt);
+   }
+
  }
  template <class Impl>
  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
@ -327,25 +320,16 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    auto out_v = out.View();
    auto st_v  = st.View();

-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  //	  uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   s  = cur%Ls;
-	  //	  uint64_t   sF = cur;         
-	  cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-	int sF = Ls * sU;
-	WilsonKernels<Impl>::GenericDhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
+    if( interior && exterior ) { 
+      if (Opt == WilsonKernelsStatic::OptGpu) {
+	KERNEL_CALL(GpuDhopSiteDag);
+      } else {
+	HOST_CALL(GenericDhopSiteDag);
+      }
+    } else if( interior ) {
+      HOST_CALL(GenericDhopSiteDagInt);
+    } else if( exterior ) { 
+      HOST_CALL(GenericDhopSiteDagExt);
    }
  }

--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -267,7 +267,6 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  int ptype;

  SE = st.GetEntry(ptype, dir, sF);
-  //  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
  if (gamma == Xp) {						
    if (SE->_is_local ) {					
      int perm= SE->_permute;					
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
@@ -1,97 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * BF sequence
- *
- void bfmbase<Float>::MooeeInv(Fermion_t psi, 
- Fermion_t chi, 
- int dag, int cb)
-
- double m    = this->mass;
- double tm   = this->twistedmass;
- double mtil = 4.0+this->mass;
-
- double sq = mtil*mtil + tm*tm;
-
- double a = mtil/sq;
- double b = -tm /sq;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-
- void bfmbase<Float>::Mooee(Fermion_t psi, 
- Fermion_t chi, 
- int dag,int cb)
- double a = 4.0+this->mass;
- double b = this->twistedmass;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-*/
-
-template<class Impl>
-void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = -this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = -tm /sq;
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = tm /sq;
-  axpibg5x(out,in,a,b);
-}
-
-FermOpTemplateInstantiate(WilsonTMFermion);
-
-NAMESPACE_END(Grid);