Compiles for both GPU and CPU; still gives good performance on CPU

2025-09-20 02:01:05 +01:00 · 2019-06-05 13:28:16 +01:00
parent 18d3cde29a
commit 0ee6e77cbc
71 changed files with 1512 additions and 33769 deletions
--- a/Grid/qcd/action/fermion/CayleyFermion5D.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc
@@ -1,668 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
-				       GridCartesian         &FiveDimGrid,
-				       GridRedBlackCartesian &FiveDimRedBlackGrid,
-				       GridCartesian         &FourDimGrid,
-				       GridRedBlackCartesian &FourDimRedBlackGrid,
-				       RealD _mass,RealD _M5,const ImplParams &p) :
-  WilsonFermion5D<Impl>(_Umu,
-			FiveDimGrid,
-			FiveDimRedBlackGrid,
-			FourDimGrid,
-			FourDimRedBlackGrid,_M5,p),
-  mass(_mass)
-{ 
-}
-
-///////////////////////////////////////////////////////////////
-// Physical surface field utilities
-///////////////////////////////////////////////////////////////
-template<class Impl>  
-void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-{
-  int Ls = this->Ls;
-  FermionField tmp(this->FermionGrid());
-  tmp = solution5d;
-  conformable(solution5d.Grid(),this->FermionGrid());
-  conformable(exported4d.Grid(),this->GaugeGrid());
-  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
-  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
-  ExtractSlice(exported4d, tmp, 0, 0);
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::P(const FermionField &psi, FermionField &chi)
-{
-  int Ls= this->Ls;
-  chi=Zero();
-  for(int s=0;s<Ls;s++){
-    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
-    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s+1)%Ls);
-  }
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::Pdag(const FermionField &psi, FermionField &chi)
-{
-  int Ls= this->Ls;
-  chi=Zero();
-  for(int s=0;s<Ls;s++){
-    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
-    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s-1+Ls)%Ls);
-  }
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solution5d,FermionField &exported4d)
-{
-  int Ls = this->Ls;
-  FermionField tmp(this->FermionGrid());
-  tmp = solution5d;
-  conformable(solution5d.Grid(),this->FermionGrid());
-  conformable(exported4d.Grid(),this->GaugeGrid());
-  axpby_ssp_pplus (tmp, 0., solution5d, 1., solution5d, 0, 0);
-  axpby_ssp_pminus(tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
-  ExtractSlice(exported4d, tmp, 0, 0);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,FermionField &imported5d)
-{
-  int Ls = this->Ls;
-  FermionField tmp(this->FermionGrid());
-  conformable(imported5d.Grid(),this->FermionGrid());
-  conformable(input4d.Grid()   ,this->GaugeGrid());
-  tmp = Zero();
-  InsertSlice(input4d, tmp, 0   , 0);
-  InsertSlice(input4d, tmp, Ls-1, 0);
-  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
-  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
-  imported5d=tmp;
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-{
-  int Ls = this->Ls;
-  FermionField tmp(this->FermionGrid());
-  conformable(imported5d.Grid(),this->FermionGrid());
-  conformable(input4d.Grid()   ,this->GaugeGrid());
-  tmp = Zero();
-  InsertSlice(input4d, tmp, 0   , 0);
-  InsertSlice(input4d, tmp, Ls-1, 0);
-  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
-  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
-  Dminus(tmp,imported5d);
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
-{
-  int Ls=this->Ls;
-
-  FermionField tmp_f(this->FermionGrid());
-  this->DW(psi,tmp_f,DaggerNo);
-
-  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
-  }
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
-{
-  int Ls=this->Ls;
-
-  FermionField tmp_f(this->FermionGrid());
-  this->DW(psi,tmp_f,DaggerYes);
-
-  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
-  }
-}
-
-template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
-{
-  this->Report();
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP     = this->_FourDimGrid->_Nprocessors;
-  if ( M5Dcalls > 0 ) {
-    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
-
-    // Flops = 10.0*(Nc*Ns) *Ls*vol
-    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-
-    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
-    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
-    // write = 1
-    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
-    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
-  }
-
-  if ( MooeeInvCalls > 0 ) {
-
-    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_NVCC
-    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#else
-    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
-    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#endif
-  }
-
-}
-template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
-{
-  this->ZeroCounters();
-  M5Dflops=0;
-  M5Dcalls=0;
-  M5Dtime=0;
-  MooeeInvFlops=0;
-  MooeeInvCalls=0;
-  MooeeInvTime=0;
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
-{
-  int Ls=this->Ls;
-  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
-  M5D(psi,chi,chi,lower,diag,upper);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
-{
-  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bs;
-  Vector<Coeff_t> upper= cs;
-  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
-  M5D(psi,psi,Din,lower,diag,upper);
-}
-// FIXME Redunant with the above routine; check this and eliminate
-template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
-{
-  int Ls=this->Ls;
-  Vector<Coeff_t> diag = beo;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int i=0;i<Ls;i++) {
-    upper[i]=-ceo[i];
-    lower[i]=-ceo[i];
-  }
-  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
-  M5D(psi,psi,chi,lower,diag,upper);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-{
-  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int i=0;i<Ls;i++) {
-    upper[i]=-cee[i];
-    lower[i]=-cee[i];
-  }
-  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
-  M5D(psi,psi,chi,lower,diag,upper);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-{
-  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  for (int s=0;s<Ls;s++){
-    // Assemble the 5d matrix
-    if ( s==0 ) {
-      upper[s] = -cee[s+1] ;
-      lower[s] = mass*cee[Ls-1];
-    } else if ( s==(Ls-1)) { 
-      upper[s] = mass*cee[0];
-      lower[s] = -cee[s-1];
-    } else {
-      upper[s]=-cee[s+1];
-      lower[s]=-cee[s-1];
-    }
-  }
-  // Conjugate the terms 
-  for (int s=0;s<Ls;s++){
-    diag[s] =conjugate(diag[s]);
-    upper[s]=conjugate(upper[s]);
-    lower[s]=conjugate(lower[s]);
-  }
-  M5Ddag(psi,psi,chi,lower,diag,upper);
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
-{
-  int Ls=this->Ls;
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
-  M5Ddag(psi,chi,chi,lower,diag,upper);
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
-{
-  int Ls=this->Ls;
-  Vector<Coeff_t> diag =bs;
-  Vector<Coeff_t> upper=cs;
-  Vector<Coeff_t> lower=cs; 
-
-  for (int s=0;s<Ls;s++){
-    if ( s== 0 ) {
-      upper[s] = cs[s+1];
-      lower[s] =-mass*cs[Ls-1];
-    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass*cs[0];
-      lower[s] = cs[s-1];
-    } else { 
-      upper[s] = cs[s+1];
-      lower[s] = cs[s-1];
-    }
-    upper[s] = conjugate(upper[s]);
-    lower[s] = conjugate(lower[s]);
-    diag[s]  = conjugate(diag[s]);
-  }
-  M5Ddag(psi,psi,Din,lower,diag,upper);
-}
-
-template<class Impl>
-RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
-{
-  FermionField Din(psi.Grid());
-  
-  // Assemble Din
-  Meooe5D(psi,Din);
-  
-  this->DW(Din,chi,DaggerNo);
-  // ((b D_W + D_w hop terms +1) on s-diag
-  axpby(chi,1.0,1.0,chi,psi); 
-  
-  M5D(psi,chi);
-  return(norm2(chi));
-}
-
-template<class Impl>
-RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
-{
-  // Under adjoint
-  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
-  //D2- P+     D2+            P-D1-^dag D2+dag
-  
-  FermionField Din(psi.Grid());
-  // Apply Dw
-  this->DW(psi,Din,DaggerYes); 
-  
-  MeooeDag5D(Din,chi);
-  
-  M5Ddag(psi,chi);
-  // ((b D_W + D_w hop terms +1) on s-diag
-  axpby (chi,1.0,1.0,chi,psi); 
-  return norm2(chi);
-}
-
-// half checkerboard operations
-template<class Impl>
-void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-{
-  Meooe5D(psi,this->tmp()); 
-
-  if ( psi.Checkerboard() == Odd ) {
-    this->DhopEO(this->tmp(),chi,DaggerNo);
-  } else {
-    this->DhopOE(this->tmp(),chi,DaggerNo);
-  }
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-{
-  // Apply 4d dslash
-  if ( psi.Checkerboard() == Odd ) {
-    this->DhopEO(psi,this->tmp(),DaggerYes);
-  } else {
-    this->DhopOE(psi,this->tmp(),DaggerYes);
-  }
-  MeooeDag5D(this->tmp(),chi); 
-}
-
-template<class Impl>
-void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  Meo5D(psi,this->tmp());
-  // Apply 4d dslash fragment
-  this->DhopDir(this->tmp(),chi,dir,disp);
-}
-// force terms; five routines; default to Dhop on diagonal
-template<class Impl>
-void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  FermionField Din(V.Grid());
-  
-  if ( dag == DaggerNo ) {
-    //      U d/du [D_w D5] V = U d/du DW D5 V
-    Meooe5D(V,Din);
-    this->DhopDeriv(mat,U,Din,dag);
-  } else {
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-    Meooe5D(U,Din);
-    this->DhopDeriv(mat,Din,V,dag);
-  }
-};
-template<class Impl>
-void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  FermionField Din(V.Grid());
-  
-  if ( dag == DaggerNo ) {
-    //      U d/du [D_w D5] V = U d/du DW D5 V
-    Meooe5D(V,Din);
-    this->DhopDerivOE(mat,U,Din,dag);
-  } else {
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-    Meooe5D(U,Din);
-    this->DhopDerivOE(mat,Din,V,dag);
-  }
-};
-template<class Impl>
-void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  FermionField Din(V.Grid());
-  
-  if ( dag == DaggerNo ) {
-    //      U d/du [D_w D5] V = U d/du DW D5 V
-    Meooe5D(V,Din);
-    this->DhopDerivEO(mat,U,Din,dag);
-  } else {
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-    Meooe5D(U,Din);
-    this->DhopDerivEO(mat,Din,V,dag);
-  }
-};
-  
-// Tanh
-template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
-{
-  Vector<Coeff_t> gamma(this->Ls);
-  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
-  SetCoefficientsInternal(1.0,gamma,b,c);
-}
-//Zolo
-template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
-{
-  Vector<Coeff_t> gamma(this->Ls);
-  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
-  SetCoefficientsInternal(zolo_hi,gamma,b,c);
-}
-//Zolo
-template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
-{
-  int Ls=this->Ls;
-
-  ///////////////////////////////////////////////////////////
-  // The Cayley coeffs (unprec)
-  ///////////////////////////////////////////////////////////
-  assert(gamma.size()==Ls);
-
-  omega.resize(Ls);
-  bs.resize(Ls);
-  cs.resize(Ls);
-  as.resize(Ls);
-  
-  // 
-  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
-  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
-  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
-  //
-  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
-  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
-  //
-  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
-  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
-  //
-  // So 
-  //
-  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
-  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
-  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-  //
-  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
-  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
-  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-  // 
-    
-  double bpc = b+c;
-  double bmc = b-c;
-  _b = b;
-  _c = c;
-  _gamma  = gamma; // Save the parameters so we can change mass later.
-  _zolo_hi= zolo_hi;
-  for(int i=0; i < Ls; i++){
-    as[i] = 1.0;
-    omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
-    assert(omega[i]!=Coeff_t(0.0));
-    bs[i] = 0.5*(bpc/omega[i] + bmc);
-    cs[i] = 0.5*(bpc/omega[i] - bmc);
-  }
-
-  ////////////////////////////////////////////////////////
-  // Constants for the preconditioned matrix Cayley form
-  ////////////////////////////////////////////////////////
-  bee.resize(Ls);
-  cee.resize(Ls);
-  beo.resize(Ls);
-  ceo.resize(Ls);
-  
-  for(int i=0;i<Ls;i++){
-    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    assert(bee[i]!=Coeff_t(0.0));
-    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
-    beo[i]=as[i]*bs[i];
-    ceo[i]=-as[i]*cs[i];
-  }
-  aee.resize(Ls);
-  aeo.resize(Ls);
-  for(int i=0;i<Ls;i++){
-    aee[i]=cee[i];
-    aeo[i]=ceo[i];
-  }
-  
-  //////////////////////////////////////////
-  // LDU decomposition of eeoo
-  //////////////////////////////////////////
-  dee.resize(Ls);
-  lee.resize(Ls);
-  leem.resize(Ls);
-  uee.resize(Ls);
-  ueem.resize(Ls);
-  
-  for(int i=0;i<Ls;i++){
-    
-    dee[i] = bee[i];
-    
-    if ( i < Ls-1 ) {
-
-      assert(bee[i]!=Coeff_t(0.0));
-      assert(bee[0]!=Coeff_t(0.0));
-      
-      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      
-      leem[i]=mass*cee[Ls-1]/bee[0];
-      for(int j=0;j<i;j++) {
-	assert(bee[j+1]!=Coeff_t(0.0));
-	leem[i]*= aee[j]/bee[j+1];
-      }
-      
-      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      
-      ueem[i]=mass;
-      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
-      ueem[i]*= aee[0]/bee[0];
-      
-    } else { 
-      lee[i] =0.0;
-      leem[i]=0.0;
-      uee[i] =0.0;
-      ueem[i]=0.0;
-    }
-  }
-	
-  { 
-    Coeff_t delta_d=mass*cee[Ls-1];
-    for(int j=0;j<Ls-1;j++) {
-      assert(bee[j] != Coeff_t(0.0));
-      delta_d *= cee[j]/bee[j];
-    }
-    dee[Ls-1] += delta_d;
-  }  
-
-  int inv=1;
-  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
-  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
-}
-
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
-						 Vector<iSinglet<Simd> > & Matp,
-						 Vector<iSinglet<Simd> > & Matm)
-{
-  int Ls=this->Ls;
-
-  GridBase *grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if ( LLs == Ls ) {
-    return; // Not vectorised in 5th direction
-  }
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-  
-  for(int s=0;s<Ls;s++){
-    Pplus(s,s) = bee[s];
-    Pminus(s,s)= bee[s];
-  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pminus(s,s+1) = -cee[s];
-  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pplus(s+1,s) = -cee[s+1];
-  }
-  Pplus (0,Ls-1) = mass*cee[0];
-  Pminus(Ls-1,0) = mass*cee[Ls-1];
-  
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-  
-  if ( inv ) {
-    PplusMat =Pplus.inverse();
-    PminusMat=Pminus.inverse();
-  } else { 
-    PplusMat =Pplus;
-    PminusMat=Pminus;
-  }
-  
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-  
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd=Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  for(int s2=0;s2<Ls;s2++){
-    for(int s1=0;s1<LLs;s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type *)&Vp;
-      scalar_type *sm = (scalar_type *)&Vm;
-      for(int l=0;l<Nsimd;l++){
-	if ( switcheroo<Coeff_t>::iscomplex() ) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else { 
-	  // if real
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-
-FermOpTemplateInstantiate(CayleyFermion5D);
-GparityFermOpTemplateInstantiate(CayleyFermion5D);
-
-NAMESPACE_END(Grid);
-
-
--- a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -1,247 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-// Pminus fowards
-// Pplus  backwards..
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
-				const FermionField &phi_i, 
-				FermionField &chi_i,
-				Vector<Coeff_t> &lower,
-				Vector<Coeff_t> &diag,
-				Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls =this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-  // 10 = 3 complex mult + 2 complex add
-  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-    for(int s=0;s<Ls;s++){
-      auto tmp = psi[0];
-      if ( s==0 ) {
-	spProj5m(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5p(tmp,psi[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) {
-	spProj5m(tmp,psi[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5p(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { 
-	spProj5m(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5p(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  });
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
-				   const FermionField &phi_i, 
-				   FermionField &chi_i,
-				   Vector<Coeff_t> &lower,
-				   Vector<Coeff_t> &diag,
-				   Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls =this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-    auto tmp = psi[0];
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	spProj5p(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) {
-	spProj5p(tmp,psi[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { 
-	spProj5p(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  });
-  M5Dtime+=usecond();
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls=this->Ls;
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-    auto tmp = psi[0];
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss]=psi[ss]; // chi[0]=psi[0]
-    for(int s=1;s<Ls;s++){
-      spProj5p(tmp,chi[ss+s-1]);  
-      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
-    }
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp,chi[ss+s]);    
-      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
-    }
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-      spProj5p(tmp,chi[ss+Ls-1]); 
-      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
-    }	
-    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
-      
-    // Apply U^{-1}
-    for (int s=Ls-2;s>=0;s--){
-      spProj5m(tmp,chi[ss+s+1]);  
-      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
-    }
-  });
-
-  MooeeInvTime+=usecond();
-
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  int Ls=this->Ls;
-
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-  assert(psi.Checkerboard() == psi.Checkerboard());
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-
-    auto tmp = psi[0];
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss]=psi[ss];
-    for (int s=1;s<Ls;s++){
-      spProj5m(tmp,chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s]-conjugate(uee[s-1])*tmp;
-    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-      spProj5p(tmp,chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - conjugate(ueem[s])*tmp;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      spProj5m(tmp,chi[ss+Ls-1]);
-      chi[ss+s] = conjugate(1.0/dee[s])*chi[ss+s]-conjugate(leem[s]/dee[Ls-1])*tmp;
-    }	
-    chi[ss+Ls-1]= conjugate(1.0/dee[Ls-1])*chi[ss+Ls-1];
-  
-    // Apply L^{-dagger}
-    for (int s=Ls-2;s>=0;s--){
-      spProj5p(tmp,chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - conjugate(lee[s])*tmp;
-    }
-  });
-
-  MooeeInvTime+=usecond();
-
-}
-
-#ifdef CAYLEY_DPERP_CACHE
-INSTANTIATE_DPERP(WilsonImplF);
-INSTANTIATE_DPERP(WilsonImplD);
-INSTANTIATE_DPERP(GparityWilsonImplF);
-INSTANTIATE_DPERP(GparityWilsonImplD);
-INSTANTIATE_DPERP(ZWilsonImplF);
-INSTANTIATE_DPERP(ZWilsonImplD);
-
-INSTANTIATE_DPERP(WilsonImplFH);
-INSTANTIATE_DPERP(WilsonImplDF);
-INSTANTIATE_DPERP(GparityWilsonImplFH);
-INSTANTIATE_DPERP(GparityWilsonImplDF);
-INSTANTIATE_DPERP(ZWilsonImplFH);
-INSTANTIATE_DPERP(ZWilsonImplDF);
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc
@@ -1,284 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-// Pminus fowards
-// Pplus  backwards..
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
-				const FermionField &phi_i, 
-				FermionField &chi_i,
-				Vector<Coeff_t> &lower,
-				Vector<Coeff_t> &diag,
-				Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  Coeff_t *lower_v = &lower[0];
-  Coeff_t *diag_v  = &diag[0];
-  Coeff_t *upper_v = &upper[0];
-  int Ls =this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-  
-  const uint64_t nsimd = grid->Nsimd();
-  const uint64_t sites4d = nsimd * grid->oSites() / Ls;
-  
-  // 10 = 3 complex mult + 2 complex add
-  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond();
-  
-  accelerator_loopN( sss, sites4d ,{
-    uint64_t lane = sss % nsimd;
-    uint64_t ss   = Ls * (sss / nsimd);
-    
-    for(int s=0;s<Ls;s++){
-      auto res = extractLane(lane,phi[ss+s]);
-      res = diag_v[s]*res;
-      
-      auto tmp = extractLane(lane,psi[ss+(s+1)%Ls]);
-      spProj5m(tmp,tmp);
-      res += upper_v[s]*tmp;
-      
-      tmp = extractLane(lane,psi[ss+(s+Ls-1)%Ls]);
-      spProj5p(tmp,tmp);
-      res += lower_v[s]*tmp;
-      
-      insertLane(lane,chi[ss+s],res);
-    }
-  });
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
-				   const FermionField &phi_i, 
-				   FermionField &chi_i,
-				   Vector<Coeff_t> &lower,
-				   Vector<Coeff_t> &diag,
-				   Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  Coeff_t *lower_v = &lower[0];
-  Coeff_t *diag_v  = &diag[0];
-  Coeff_t *upper_v = &upper[0];
-  int Ls =this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  const uint64_t nsimd = grid->Nsimd();
-  const uint64_t sites4d = nsimd * grid->oSites() / Ls;
-  
-  // 10 = 3 complex mult + 2 complex add
-  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  accelerator_loopN( sss, sites4d ,{
-    uint64_t lane = sss % nsimd;
-    uint64_t ss   = Ls * (sss / nsimd);
-    
-    for(int s=0;s<Ls;s++){
-      auto res = extractLane(lane,phi[ss+s]);
-      res = diag_v[s]*res;
-      
-      auto tmp = extractLane(lane,psi[ss+(s+1)%Ls]);
-      spProj5p(tmp,tmp);
-      res += upper_v[s]*tmp;
-      
-      tmp = extractLane(lane,psi[ss+(s+Ls-1)%Ls]);
-      spProj5m(tmp,tmp);
-      res += lower_v[s]*tmp;
-      
-      insertLane(lane,chi[ss+s],res);
-    }
-  });
-  M5Dtime+=usecond();
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  Coeff_t *lee_v  = &lee[0];
-  Coeff_t *leem_v = &leem[0];
-  Coeff_t *uee_v  = &uee[0];
-  Coeff_t *ueem_v = &ueem[0];
-  Coeff_t *dee_v  = &dee[0];
-  
-  int Ls=this->Ls;
-  const uint64_t nsimd = grid->Nsimd();
-  const uint64_t sites4d = nsimd * grid->oSites() / Ls;
-  
-  typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
-  
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-  
-  accelerator_loopN( sss, sites4d ,{
-    uint64_t lane = sss % nsimd;
-    uint64_t ss   = Ls * (sss / nsimd);
-    ScalarSiteSpinor res, tmp, acc;
-    
-    // X = Nc*Ns
-    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
-    // Apply (L^{\prime})^{-1} L_m^{-1}
-    res = extractLane(lane,psi[ss]);
-    spProj5m(tmp,res);
-    acc = leem_v[0]*tmp;
-    spProj5p(tmp,res);
-    insertLane(lane,chi[ss],res);
-    
-    for(int s=1;s<Ls-1;s++){
-      res = extractLane(lane,psi[ss+s]);
-      res -= lee_v[s-1]*tmp;
-      spProj5m(tmp,res);
-      acc += leem_v[s]*tmp;
-      spProj5p(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-    res = extractLane(lane,psi[ss+Ls-1]);
-    res = res - lee_v[Ls-2]*tmp - acc;
-    
-    // Apply U_m^{-1} D^{-1} U^{-1}
-    res = (1.0/dee_v[Ls-1])*res;
-    insertLane(lane,chi[ss+Ls-1],res);
-    spProj5p(acc,res);
-    spProj5m(tmp,res);
-    for (int s=Ls-2;s>=0;s--){
-      res = extractLane(lane,chi[ss+s]);
-      res = (1.0/dee_v[s])*res - uee_v[s]*tmp - ueem_v[s]*acc;
-      spProj5m(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-  });
-  
-  MooeeInvTime+=usecond();
-
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  Coeff_t *lee_v  = &lee[0];
-  Coeff_t *leem_v = &leem[0];
-  Coeff_t *uee_v  = &uee[0];
-  Coeff_t *ueem_v = &ueem[0];
-  Coeff_t *dee_v  = &dee[0];
-  
-  int Ls=this->Ls;
-  const uint64_t nsimd = grid->Nsimd();
-  const uint64_t sites4d = nsimd * grid->oSites() / Ls;
-  
-  typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
-  
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-  
-  accelerator_loopN( sss, sites4d ,{
-    uint64_t lane = sss % nsimd;
-    uint64_t ss   = Ls * (sss / nsimd);
-    ScalarSiteSpinor res, tmp, acc;
-    
-    // X = Nc*Ns
-    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
-    // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
-    res = extractLane(lane,psi[ss]);
-    spProj5p(tmp,res);
-    acc = conjugate(ueem_v[0])*tmp;
-    spProj5m(tmp,res);
-    insertLane(lane,chi[ss],res);
-    
-    for(int s=1;s<Ls-1;s++){
-      res = extractLane(lane,psi[ss+s]);
-      res -= conjugate(uee_v[s-1])*tmp;
-      spProj5p(tmp,res);
-      acc += conjugate(ueem_v[s])*tmp;
-      spProj5m(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-    res = extractLane(lane,psi[ss+Ls-1]);
-    res = res - conjugate(uee_v[Ls-2])*tmp - acc;
-    
-    // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
-    res = conjugate(1.0/dee_v[Ls-1])*res;
-    insertLane(lane,chi[ss+Ls-1],res);
-    spProj5m(acc,res);
-    spProj5p(tmp,res);
-    for (int s=Ls-2;s>=0;s--){
-      res = extractLane(lane,chi[ss+s]);
-      res = conjugate(1.0/dee_v[s])*res - conjugate(lee_v[s])*tmp - conjugate(leem_v[s])*acc;
-      spProj5p(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-  });
-  
-  MooeeInvTime+=usecond();
-  
-}
-
-#ifdef CAYLEY_DPERP_GPU
-INSTANTIATE_DPERP(WilsonImplF);
-INSTANTIATE_DPERP(WilsonImplD);
-INSTANTIATE_DPERP(GparityWilsonImplF);
-INSTANTIATE_DPERP(GparityWilsonImplD);
-INSTANTIATE_DPERP(ZWilsonImplF);
-INSTANTIATE_DPERP(ZWilsonImplD);
-
-INSTANTIATE_DPERP(WilsonImplFH);
-INSTANTIATE_DPERP(WilsonImplDF);
-INSTANTIATE_DPERP(GparityWilsonImplFH);
-INSTANTIATE_DPERP(GparityWilsonImplDF);
-INSTANTIATE_DPERP(ZWilsonImplFH);
-INSTANTIATE_DPERP(ZWilsonImplDF);
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -1,838 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * Dense matrix versions of routines
- */
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
-}
-  
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
-				const FermionField &phi_i, 
-				FermionField &chi_i,
-				Vector<Coeff_t> &lower,
-				Vector<Coeff_t> &diag,
-				Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0];
-  const int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o+i*LLs;
-      int ss = o*nsimd+i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  assert(Nc==3);
-
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
-#if 0
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0;v<LLs;v++){
-
-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
-
-      spProj5m(hp,psi[ss+vp]);
-      spProj5p(hm,psi[ss+vm]);
-
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
-	
-      hp=0.5*hp;
-      hm=0.5*hm;
-
-      spRecon5m(fp,hp);
-      spRecon5p(fm,hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v]     +u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-#else
-    for(int v=0;v<LLs;v++){
-      
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp= (v==LLs-1) ? 0     : v+1;
-      int vm= (v==0    ) ? LLs-1 : v-1;
-	
-      Simd hp_00 = psi[ss+vp]()(2)(0); 
-      Simd hp_01 = psi[ss+vp]()(2)(1); 
-      Simd hp_02 = psi[ss+vp]()(2)(2); 
-      Simd hp_10 = psi[ss+vp]()(3)(0); 
-      Simd hp_11 = psi[ss+vp]()(3)(1); 
-      Simd hp_12 = psi[ss+vp]()(3)(2); 
-	
-      Simd hm_00 = psi[ss+vm]()(0)(0); 
-      Simd hm_01 = psi[ss+vm]()(0)(1); 
-      Simd hm_02 = psi[ss+vm]()(0)(2); 
-      Simd hm_10 = psi[ss+vm]()(1)(0); 
-      Simd hm_11 = psi[ss+vm]()(1)(1); 
-      Simd hm_12 = psi[ss+vm]()(1)(2); 
-
-      if ( vp<=v ) {
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      if ( vm>=v ) {
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
-      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
-      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-      vstream(chi[ss+v]()(0)(0),p_00);
-      vstream(chi[ss+v]()(0)(1),p_01);
-      vstream(chi[ss+v]()(0)(2),p_02);
-      vstream(chi[ss+v]()(1)(0),p_10);
-      vstream(chi[ss+v]()(1)(1),p_11);
-      vstream(chi[ss+v]()(1)(2),p_12);
-      vstream(chi[ss+v]()(2)(0),p_20);
-      vstream(chi[ss+v]()(2)(1),p_21);
-      vstream(chi[ss+v]()(2)(2),p_22);
-      vstream(chi[ss+v]()(3)(0),p_30);
-      vstream(chi[ss+v]()(3)(1),p_31);
-      vstream(chi[ss+v]()(3)(2),p_32);
-
-    }
-#endif
-  });
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
-				   const FermionField &phi_i, 
-				   FermionField &chi_i,
-				   Vector<Coeff_t> &lower,
-				   Vector<Coeff_t> &diag,
-				   Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi=psi_i.View();
-  auto phi=phi_i.View();
-  auto chi=chi_i.View();
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0];
-  int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o+i*LLs;
-      int ss = o*nsimd+i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
-#if 0
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0;v<LLs;v++){
-
-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
-
-      spProj5p(hp,psi[ss+vp]);
-      spProj5m(hm,psi[ss+vm]);
-
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
-      
-      hp=hp*0.5;
-      hm=hm*0.5;
-      spRecon5p(fp,hp);
-      spRecon5m(fm,hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-#else
-    for(int v=0;v<LLs;v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp= (v==LLs-1) ? 0     : v+1;
-      int vm= (v==0    ) ? LLs-1 : v-1;
-	
-      Simd hp_00 = psi[ss+vp]()(0)(0); 
-      Simd hp_01 = psi[ss+vp]()(0)(1); 
-      Simd hp_02 = psi[ss+vp]()(0)(2); 
-      Simd hp_10 = psi[ss+vp]()(1)(0); 
-      Simd hp_11 = psi[ss+vp]()(1)(1); 
-      Simd hp_12 = psi[ss+vp]()(1)(2); 
-	
-      Simd hm_00 = psi[ss+vm]()(2)(0); 
-      Simd hm_01 = psi[ss+vm]()(2)(1); 
-      Simd hm_02 = psi[ss+vm]()(2)(2); 
-      Simd hm_10 = psi[ss+vm]()(3)(0); 
-      Simd hm_11 = psi[ss+vm]()(3)(1); 
-      Simd hm_12 = psi[ss+vm]()(3)(2); 
-
-      if ( vp<=v ) {
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      if ( vm>=v ) {
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
-      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
-      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-
-      vstream(chi[ss+v]()(0)(0),p_00);
-      vstream(chi[ss+v]()(0)(1),p_01);
-      vstream(chi[ss+v]()(0)(2),p_02);
-      vstream(chi[ss+v]()(1)(0),p_10);
-      vstream(chi[ss+v]()(1)(1),p_11);
-      vstream(chi[ss+v]()(1)(2),p_12);
-      vstream(chi[ss+v]()(2)(0),p_20);
-      vstream(chi[ss+v]()(2)(1),p_21);
-      vstream(chi[ss+v]()(2)(2),p_22);
-      vstream(chi[ss+v]()(3)(0),p_30);
-      vstream(chi[ss+v]()(3)(1),p_31);
-      vstream(chi[ss+v]()(3)(2),p_32);
-    }
-#endif
-  });
-  M5Dtime+=usecond();
-}
-
-
-#ifdef AVX512 
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
-#include <simd/Intel512single.h>
-#endif 
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField &chi_i,
-					     int LLs, int site,
-					     Vector<iSinglet<Simd> > &Matp,
-					     Vector<iSinglet<Simd> > &Matm)
-{
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-	  int s=s2+l*LLs;
-	  int lex=s2+LLs*site;
-	
-	  if ( s2==0 && l==0) {
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-	
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
-	    }}
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	    }}
-
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
-	      SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
-	    }}
-
-	}}
-      {
-	int lex = s1+LLs*site;
-	for(int sp=0;sp<2;sp++){
-	  for(int co=0;co<Nc;co++){
-	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0   %%zmm13
-#define BCAST1   %%zmm14
-#define BCAST2   %%zmm15
-#define BCAST3   %%zmm16
-#define BCAST4   %%zmm17
-#define BCAST5   %%zmm18
-#define BCAST6   %%zmm19
-#define BCAST7   %%zmm20
-#define BCAST8   %%zmm21
-#define BCAST9   %%zmm22
-#define BCAST10  %%zmm23
-#define BCAST11  %%zmm24
-
-    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	int lex=s2+LLs*site;
-	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t)&psi[lex];
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	  if ( (s2+l)==0 ) {
-	    asm (
-		 VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
-		 VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
-		 VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
-		 VBCASTCDUP(0,%2,BCAST0)   
-		 VBCASTCDUP(1,%2,BCAST1)   
-		 VBCASTCDUP(2,%2,BCAST2)   
-		 VBCASTCDUP(3,%2,BCAST3)   
-		 VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
-		 VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
-		 VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
-		 VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
-		 VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
-		 VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
-		 VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
-		 VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
-		 VMULMEM (0,%1,BCAST8,Chi_22)         
-		 VMULMEM (0,%1,BCAST9,Chi_30)
-		 VMULMEM (0,%1,BCAST10,Chi_31)       
-		 VMULMEM (0,%1,BCAST11,Chi_32)
-		 : : "r" (a0), "r" (a1), "r" (a2)  );
-	  } else { 
-	    asm (
-		 VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
-		 VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
-		 VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
-		 VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
-		 VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
-		 VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
-		 VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
-		 VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
-		 VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
-		 VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
-		 VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
-		 VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
-		 : : "r" (a0), "r" (a1), "r" (a2)  );
-	  }
-	  a0 = a0+incr;
-	  a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type);
-	}}
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      }
-    }
-  }
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-};
-
-// Z-mobius version
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField &chi_i,
-					      int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
-{
-#ifndef AVX512
-  {
-    auto psi = psi_i.View();
-    auto chi = chi_i.View();
-
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-	  int s=s2+l*LLs;
-	  int lex=s2+LLs*site;
-	
-	  if ( s2==0 && l==0) {
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-	
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
-	    }}
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	    }}
-
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
-	      SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
-	    }}
-
-
-	}}
-      {
-	int lex = s1+LLs*site;
-	for(int sp=0;sp<2;sp++){
-	  for(int co=0;co<Nc;co++){
-	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  {
-    auto psi = psi_i.View();
-    auto chi = chi_i.View();
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %zmm0
-#define Chi_01 %zmm1
-#define Chi_02 %zmm2
-#define Chi_10 %zmm3
-#define Chi_11 %zmm4
-#define Chi_12 %zmm5
-#define Chi_20 %zmm6
-#define Chi_21 %zmm7
-#define Chi_22 %zmm8
-#define Chi_30 %zmm9
-#define Chi_31 %zmm10
-#define Chi_32 %zmm11
-#define pChi_00 %%zmm0
-#define pChi_01 %%zmm1
-#define pChi_02 %%zmm2
-#define pChi_10 %%zmm3
-#define pChi_11 %%zmm4
-#define pChi_12 %%zmm5
-#define pChi_20 %%zmm6
-#define pChi_21 %%zmm7
-#define pChi_22 %%zmm8
-#define pChi_30 %%zmm9
-#define pChi_31 %%zmm10
-#define pChi_32 %%zmm11
-
-#define BCAST_00   %zmm12
-#define  SHUF_00   %zmm13
-#define BCAST_01   %zmm14
-#define  SHUF_01   %zmm15
-#define BCAST_02   %zmm16
-#define  SHUF_02   %zmm17
-#define BCAST_10   %zmm18
-#define  SHUF_10   %zmm19
-#define BCAST_11   %zmm20
-#define  SHUF_11   %zmm21
-#define BCAST_12   %zmm22
-#define  SHUF_12   %zmm23
-
-#define Mp  %zmm24
-#define Mps %zmm25
-#define Mm  %zmm26
-#define Mms %zmm27
-#define N 8
-    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	int lex=s2+LLs*site;
-	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t)&psi[lex];
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	  if ( (s2+l)==0 ) {
-	    LOAD64(%r8,a0);
-	    LOAD64(%r9,a1);
-	    LOAD64(%r10,a2);
-	    asm (
-		 VLOAD(0,%r8,Mp)// i r
-		 VLOAD(0,%r9,Mm)
-		 VSHUF(Mp,Mps)  // r i 
-		 VSHUF(Mm,Mms)
-		 VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
-		 VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
-
-		 VMULIDUP(0*N,%r10,Mps,Chi_00)
-		 VMULIDUP(1*N,%r10,Mps,Chi_01)
-		 VMULIDUP(2*N,%r10,Mps,Chi_02)
-		 VMULIDUP(3*N,%r10,Mps,Chi_10)
-		 VMULIDUP(4*N,%r10,Mps,Chi_11)
-		 VMULIDUP(5*N,%r10,Mps,Chi_12)
-
-		 VMULIDUP(6*N ,%r10,Mms,Chi_20)
-		 VMULIDUP(7*N ,%r10,Mms,Chi_21)
-		 VMULIDUP(8*N ,%r10,Mms,Chi_22)
-		 VMULIDUP(9*N ,%r10,Mms,Chi_30)
-		 VMULIDUP(10*N,%r10,Mms,Chi_31)
-		 VMULIDUP(11*N,%r10,Mms,Chi_32)
-
-		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
-		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
-		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-		 VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
-		 VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
-		 VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
-		 VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
-		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-		 );
-	  } else { 
-	    LOAD64(%r8,a0);
-	    LOAD64(%r9,a1);
-	    LOAD64(%r10,a2);
-	    asm (
-		 VLOAD(0,%r8,Mp)
-		 VSHUF(Mp,Mps)
-
-		 VLOAD(0,%r9,Mm)
-		 VSHUF(Mm,Mms)
-
-		 VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
-		 VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
-		 VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
-		 VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
-		 VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
-		 VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
-
-		 VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
-		 VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
-		 VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
-		 VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
-		 VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
-		 VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
-
-		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
-		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
-		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-		 VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
-		 VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
-		 VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
-		 VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
-		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-		 );
-	  }
-	  a0 = a0+incr;
-	  a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type);
-	}}
-      {
-	int lexa = s1+LLs*site;
-	/*
-	  SiteSpinor tmp;
-	  asm (
-	  VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	  VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	  VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	  VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	  : : "r" ((uint64_t)&tmp) : "memory" );
-	*/
-
-	asm (
-	     VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	     VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	     VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	     VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-	//      if ( 1 || (site==0) ) { 
-	//	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
-	//      }
-      }
-    }
-  }
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-};
-
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
-{
-  // Applies the s-direction coefficient matrices to psi one site at a time, writing chi.
-  //   dag : non-zero selects the daggered matrices
-  //   inv : non-zero selects the cached inverse matrices; zero rebuilds the forward ones
-  chi.Checkerboard()=psi.Checkerboard();
-
-  const int Ls     = this->Ls;
-  const int LLs    = psi.Grid()->_rdimensions[0];
-  const int nsites = psi.Grid()->oSites()/LLs;
-
-  // Scratch storage, only filled on the non-inverse path.
-  Vector<iSinglet<Simd> >  scratchP;
-  Vector<iSinglet<Simd> >  scratchM;
-  Vector<iSinglet<Simd> >  *useP = nullptr;
-  Vector<iSinglet<Simd> >  *useM = nullptr;
-
-  if ( !inv ) {
-    // Forward matrices are recomputed on every call.
-    MooeeInternalCompute(dag,inv,scratchP,scratchM);
-    useP = &scratchP;
-    useM = &scratchM;
-  } else if ( dag ) {
-    useP = &MatpInvDag;
-    useM = &MatmInvDag;
-  } else {
-    useP = &MatpInv;
-    useM = &MatmInv;
-  }
-  assert(useP->size()==Ls*LLs);
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  // Complex coefficients use the Z kernel, real coefficients the plain one.
-  if ( switcheroo<Coeff_t>::iscomplex() ) {
-    thread_loop( (auto site=0;site<nsites;site++),{
-      MooeeInternalZAsm(psi,chi,LLs,site,*useP,*useM);
-    });
-  } else {
-    thread_loop( (auto site=0;site<nsites;site++),{
-      MooeeInternalAsm(psi,chi,LLs,site,*useP,*useM);
-    });
-  }
-  MooeeInvTime+=usecond();
-}
-
-// Instantiations for the 5d-vectorised implementations.
-// NOTE(review): INSTANTIATE_DPERP is defined outside this view; presumably it
-// instantiates the s-direction kernels for the named Impl -- confirm.
-INSTANTIATE_DPERP(DomainWallVec5dImplD);
-INSTANTIATE_DPERP(DomainWallVec5dImplF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplFH);
-
-// Explicit instantiation of MooeeInternal for each of the eight Impl types above.
-template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-template void CayleyFermion5D<DomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -1,320 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
-{
-  // Delegates to the Zolotarev setup, with the reciprocal of scale as zolo_hi.
-  const RealD zolo_hi = 1.0/scale;
-  SetCoefficientsZolotarev(zolo_hi,zdata);
-}
-// Fill the per-slice coefficient tables (Beta, cc, cc_d, sqrt_cc, Aee, See) and the
-// scalars R, ZoloHiInv and dw_diag from the supplied approximation data.
-//   zolo_hi : its reciprocal is stored as ZoloHiInv
-//   zdata   : supplies beta[0..Ls-1]; zdata->db must equal Ls
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
-{
-  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-  int Ls = this->Ls;
-  assert(zdata->db==Ls);// Beta has Ls coeffs
-
-  R=(1+this->mass)/(1-this->mass);
-
-  Beta.resize(Ls);
-  cc.resize(Ls);
-  cc_d.resize(Ls);
-  sqrt_cc.resize(Ls);
-  // cc is the reciprocal of Beta, cc_d its square root.
-  for(int i=0; i < Ls ; i++){
-    Beta[i] = zdata -> beta[i];
-    cc[i] = 1.0/Beta[i];
-    cc_d[i]=std::sqrt(cc[i]);
-  }
-    
-  // The last slice is overridden after the loop.
-  cc_d[Ls-1]=1.0;
-  for(int i=0; i < Ls-1 ; i++){
-    sqrt_cc[i]= std::sqrt(cc[i]*cc[i+1]);
-  }    
-  // Overrides the loop's value for the link into the last slice.
-  // NOTE(review): index Ls-2 is -1 when Ls==1; the constructor only checks that Ls
-  // is odd -- confirm callers guarantee Ls>=3.
-  sqrt_cc[Ls-2]=std::sqrt(cc[Ls-2]);
-
-
-  ZoloHiInv =1.0/zolo_hi;
-  dw_diag = (4.0-this->M5)*ZoloHiInv;
-    
-  See.resize(Ls);
-  Aee.resize(Ls);
-  // Aee carries an alternating sign starting with + at s=0; R is added on the last slice.
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    Aee[s] = sign * Beta[s] * dw_diag;
-    sign   = - sign;
-  }
-  Aee[Ls-1] += R;
-    
-  // See follows the recurrence See[s] = Aee[s] - 1/See[s-1].
-  See[0] = Aee[0];
-  for(int s=1;s<Ls;s++){
-    See[s] = Aee[s] - 1.0/See[s-1];
-  }
-  for(int s=0;s<Ls;s++){
-    std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
-  }
-}
-
-
-
-template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
-{
-  // Builds chi slice by slice from DW(psi) and neighbouring slices of psi;
-  // returns norm2(chi).
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  FermionField D(psi.Grid());
-  this->DW(psi,D,DaggerNo);
-
-  // Coefficient used only on the final slice.
-  const RealD Rlast=(1.0+mass)/(1.0-mass);
-
-  for(int s=0;s<Ls;s++){
-    const int sign = (s%2==0) ? 1 : -1;  // +,-,+,... starting at s=0
-    if ( s==0 ) {
-      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-    } else if ( s==last ){
-      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
-      ag5xpby_ssp(chi,Rlast,psi,1.0,chi,s,s);
-    } else {
-      // Interior slice: couples to both s+1 and s-1.
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
-      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-    }
-  }
-  return norm2(chi);
-}
-template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
-{
-  // (g5 Dw) = Dw^dag g5 = (g5 Dw)^dag, so this matrix is already hermitian and the
-  // remainder is symmetric: the adjoint is simply M itself.
-  const RealD nrm = M(psi,chi);
-  return nrm;
-}
-template<class Impl>
-void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  // Single-direction hop followed by the per-slice gamma5 rescaling, in place on chi.
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
-
-  for(int s=0;s<Ls;s++){
-    if ( s!=last ){
-      const int sign = (s%2==0) ? 1 : -1;  // +,-,+,... starting at s=0
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-    } else {
-      // Last slice: no cc factor and no sign.
-      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-    }
-  }
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-{
-  // Checkerboarded hop, then the per-slice gamma5 rescaling, in place on chi.
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  // Apply 4d dslash; g5 Dslash is hermitian
-  if ( psi.Checkerboard() != Odd ) {
-    this->DhopOE(psi,chi,DaggerNo);
-  } else {
-    this->DhopEO(psi,chi,DaggerNo);
-  }
-
-  for(int s=0;s<Ls;s++){
-    if ( s!=last ){
-      const int sign = (s%2==0) ? 1 : -1;  // +,-,+,... starting at s=0
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-    } else {
-      // Last slice: no cc factor and no sign.
-      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-    }
-  }
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-{
-  // The adjoint is implemented by forwarding to Meooe unchanged.
-  Meooe(psi,chi);
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-{
-  // Same slice structure as M, with dw_diag * psi in place of the hopping term.
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  // Coefficient used only on the final slice.
-  const double Rlast=(1+mass)/(1-mass);
-
-  for(int s=0;s<Ls;s++){
-    const int sign = (s%2==0) ? 1 : -1;  // +,-,+,... starting at s=0
-    if ( s==0 ) {
-      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-    } else if ( s==last ){
-      // Drop the CC here.
-      ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
-      ag5xpby_ssp(chi,Rlast,psi,1.0,chi,s,s);
-    } else {
-      // Interior slice: couples to both s+1 and s-1.
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
-      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-    }
-  }
-}
-
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-{
-  // The adjoint is implemented by forwarding to Mooee unchanged.
-  Mooee(psi,chi);
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-{
-  // Three passes over the slices: forward sweep, diagonal scaling, backward sweep.
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  // Apply Linv: slice 0 first, then each slice from the one below it
-  axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0);
-  for(int s=1;s<=last;s++){
-    axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
-  }
-
-  // Apply Dinv
-  for(int s=0;s<=last;s++){
-    ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
-  }
-
-  // Apply Uinv = (Linv)^T: last slice first, then downwards
-  axpby_ssp(chi,1.0/cc_d[last],chi,0.0,chi,last,last);
-  int s = last-1;
-  while ( s>=0 ){
-    axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
-    s--;
-  }
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  // The adjoint inverse is implemented by forwarding to MooeeInv unchanged.
-  MooeeInv(psi,chi);
-}
-
-// force terms; five routines; default to Dhop on diagonal
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  // Rescale U slice by slice with the gamma5 coefficients, then take the hopping
-  // derivative against V. `dag` is not consulted: DaggerNo is always passed.
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  FermionField scaledU(V.Grid());
-
-  for(int s=0;s<Ls;s++){
-    if ( s!=last ){
-      const int sign = (s%2==0) ? 1 : -1;  // +,-,+,... starting at s=0
-      ag5xpby_ssp(scaledU,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-    } else {
-      // Last slice: no cc factor and no sign.
-      ag5xpby_ssp(scaledU,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-    }
-  }
-  this->DhopDeriv(mat,scaledU,V,DaggerNo);
-}
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  // Rescale U slice by slice with the gamma5 coefficients, then take the OE hopping
-  // derivative against V. `dag` is not consulted: DaggerNo is always passed.
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  FermionField scaledU(V.Grid());
-
-  for(int s=0;s<Ls;s++){
-    if ( s!=last ){
-      const int sign = (s%2==0) ? 1 : -1;  // +,-,+,... starting at s=0
-      ag5xpby_ssp(scaledU,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-    } else {
-      // Last slice: no cc factor and no sign.
-      ag5xpby_ssp(scaledU,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-    }
-  }
-  this->DhopDerivOE(mat,scaledU,V,DaggerNo);
-}
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  // Rescale U slice by slice with the gamma5 coefficients, then take the EO hopping
-  // derivative against V. `dag` is not consulted: DaggerNo is always passed.
-  const int Ls   = this->Ls;
-  const int last = Ls-1;
-
-  FermionField scaledU(V.Grid());
-
-  for(int s=0;s<Ls;s++){
-    if ( s!=last ){
-      const int sign = (s%2==0) ? 1 : -1;  // +,-,+,... starting at s=0
-      ag5xpby_ssp(scaledU,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-    } else {
-      // Last slice: no cc factor and no sign.
-      ag5xpby_ssp(scaledU,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-    }
-  }
-  this->DhopDerivEO(mat,scaledU,V,DaggerNo);
-}
-    
-// Constructors
-template<class Impl>
-ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
-    GaugeField            &_Umu,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mass,RealD M5,const ImplParams &p)
-  : WilsonFermion5D<Impl>(_Umu,
-                          FiveDimGrid, FiveDimRedBlackGrid,
-                          FourDimGrid, FourDimRedBlackGrid,M5,p),
-    mass(_mass)
-{
-  // Forwards the grids, M5 and parameters to the WilsonFermion5D base and stores the mass.
-  // Odd Ls required
-  const int Ls = this->Ls;
-  assert((Ls & 0x1) == 1);
-}
-
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      // Extracts slice Ls-1 of the 5d solution into the 4d field.
-      const int sLast = this->Ls - 1;
-      conformable(solution5d.Grid(),this->FermionGrid());
-      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, sLast, sLast);
-    }
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      // Places the 4d source on slice Ls-1 of an otherwise zero 5d field,
-      // multiplies by gamma5, then applies Dminus to produce the 5d source.
-      const int sLast = this->Ls - 1;
-      conformable(imported5d.Grid(),this->FermionGrid());
-      conformable(input4d.Grid()   ,this->GaugeGrid());
-
-      FermionField staging(this->FermionGrid());
-      staging=Zero();
-      InsertSlice(input4d, staging, sLast, sLast);
-      staging=Gamma(Gamma::Algebra::Gamma5)*staging;
-      this->Dminus(staging,imported5d);
-    }
-
-// NOTE(review): macro defined outside this view; presumably emits the explicit
-// instantiations of ContinuedFractionFermion5D for the supported Impl types -- confirm.
-FermOpTemplateInstantiate(ContinuedFractionFermion5D);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
@@ -1,433 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
-    GaugeField            &_Umu,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mq1, RealD _mq2, RealD _mq3,
-    RealD _shift, int _pm, RealD _M5, const ImplParams &p)
-  : AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-                              FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-                              _shift, _pm, _M5, 1.0, 0.0, p)
-{
-  // Obtain approximation data for Ls points, use it to set the coefficients,
-  // then release it.
-  const RealD eps = 1.0;
-  Approx::zolotarev_data *approx = Approx::higham(eps,this->Ls);
-  assert(approx->n == this->Ls);
-
-  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-  this->SetCoefficientsTanh(approx, 1.0, 0.0);
-
-  Approx::zolotarev_free(approx);
-}
-
-/***************************************************************
- * Additional EOFA operators only called outside the inverter.
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  // Zeroes Din, then copies a single slice of psi into a single slice of Din.
-  // Which slices are used depends on (sign, dag); any other combination leaves Din zero.
-  const int top = this->Ls - 1;
-
-  Din = Zero();
-  if(dag == 0){
-    if(sign == 1)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, top, 0); }
-    else if(sign == -1) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-  } else if(dag == 1){
-    if(sign == 1)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, top); }
-    else if(sign == -1) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-  }
-}
-
-// This is just the identity for DWF
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  // Identity: the output is a copy of the input.
-  chi = psi;
-}
-
-// This is just the identity for DWF
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  // Identity: the output is a copy of the input.
-  chi = psi;
-}
-
-/*****************************************************************************************************/
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  // chi = DW(Meooe5D(psi)) + psi, then the two-argument M5D(psi, chi) is applied;
-  // returns norm2(chi).
-  FermionField hopIn(psi.Grid());
-
-  this->Meooe5D(psi, hopIn);
-  this->DW(hopIn, chi, DaggerNo);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  this->M5D(psi, chi);
-
-  const RealD nrm = norm2(chi);
-  return nrm;
-}
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  // chi = MeooeDag5D(DW^dag(psi)), then the two-argument M5Ddag(psi, chi) is applied
-  // and psi is added; returns norm2(chi).
-  FermionField hopOut(psi.Grid());
-
-  this->DW(psi, hopOut, DaggerYes);
-  this->MeooeDag5D(hopOut, chi);
-  this->M5Ddag(psi, chi);
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  const RealD nrm = norm2(chi);
-  return nrm;
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  // Builds the lower/diag/upper coefficient vectors and forwards to the
-  // six-argument M5D overload with chi as both phi and the output.
-  const int   Ls    = this->Ls;
-  const int   pm    = this->pm;
-  const RealD shift = this->shift;
-  const RealD mq1   = this->mq1;
-  const RealD mq2   = this->mq2;
-  const RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  // Only one of the two is non-zero, chosen by pm.
-  Coeff_t shiftp(0.0), shiftm(0.0);
-  if(shift != 0.0){
-    const RealD dmq = mq3-mq2;
-    if(pm == 1){ shiftp = shift*dmq; }
-    else{ shiftm = -shift*dmq; }
-  }
-
-  Vector<Coeff_t> diag (Ls, 1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = mq1 + shiftm;   // wrap-around entries carry the mass and the shift
-  lower[0]    = mq1 + shiftp;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5D(psi, chi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  // Builds the lower/diag/upper coefficient vectors and forwards to the
-  // six-argument M5Ddag overload with chi as both phi and the output.
-  // Relative to M5D the roles of shiftp and shiftm are swapped.
-  const int   Ls    = this->Ls;
-  const int   pm    = this->pm;
-  const RealD shift = this->shift;
-  const RealD mq1   = this->mq1;
-  const RealD mq2   = this->mq2;
-  const RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  // Only one of the two is non-zero, chosen by pm.
-  Coeff_t shiftp(0.0), shiftm(0.0);
-  if(shift != 0.0){
-    const RealD dmq = mq3-mq2;
-    if(pm == 1){ shiftp = shift*dmq; }
-    else{ shiftm = -shift*dmq; }
-  }
-
-  Vector<Coeff_t> diag (Ls, 1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = mq1 + shiftp;
-  lower[0]    = mq1 + shiftm;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5Ddag(psi, chi, chi, lower, diag, upper);
-}
-
-// half checkerboard operations
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  // Diagonal bee, off-diagonals -cee, wrap-around entries dm (upper) and dp (lower),
-  // passed to the six-argument M5D with psi as both input fields.
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  for(int s=0; s<Ls; s++){
-    lower[s] = upper[s] = -this->cee[s];
-  }
-  // Wrap-around entries overwrite the generic off-diagonal value.
-  upper[Ls-1] = this->dm;
-  lower[0]    = this->dp;
-
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  // As Mooee, but with dp and dm exchanged and the six-argument M5Ddag applied.
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  for(int s=0; s<Ls; s++){
-    lower[s] = upper[s] = -this->cee[s];
-  }
-  // Wrap-around entries overwrite the generic off-diagonal value.
-  upper[Ls-1] = this->dp;
-  lower[0]    = this->dm;
-
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-//Zolo
-// Fill the coefficient tables (bs, cs, aee, aeo, bee, beo, cee, ceo), the shift
-// terms dp/dm, the LDU factors (dee, lee, leem, uee, ueem), and finally the cached
-// inverse matrices MatpInv/MatmInv and their daggered versions.
-// NOTE(review): zolo_hi, gamma, b and c are not referenced in this body; every
-// coefficient is derived from M5, mq1..mq3, shift and pm -- confirm this is intended.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-  RealD shift = this->shift;
-
-  ////////////////////////////////////////////////////////
-  // Constants for the preconditioned matrix Cayley form
-  ////////////////////////////////////////////////////////
-  this->bs.resize(Ls);
-  this->cs.resize(Ls);
-  this->aee.resize(Ls);
-  this->aeo.resize(Ls);
-  this->bee.resize(Ls);
-  this->beo.resize(Ls);
-  this->cee.resize(Ls);
-  this->ceo.resize(Ls);
-
-  // Same diagonal and off-diagonal value on every slice.
-  for(int i=0; i<Ls; ++i){
-    this->bee[i] = 4.0 - this->M5 + 1.0;
-    this->cee[i] = 1.0;
-  }
-
-  // aeo is resized above but not assigned in this routine.
-  for(int i=0; i<Ls; ++i){
-    this->aee[i] = this->cee[i];
-    this->bs[i] = this->beo[i] = 1.0;
-    this->cs[i] = this->ceo[i] = 0.0;
-  }
-
-  //////////////////////////////////////////
-  // EOFA shift terms
-  //////////////////////////////////////////
-  // The shift enters dp when pm==1, dm when pm==-1, and neither otherwise.
-  if(pm == 1){
-    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-    this->dm = mq1*this->cee[Ls-1];
-  } else if(this->pm == -1) {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-  } else {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1];
-  }
-
-  //////////////////////////////////////////
-  // LDU decomposition of eeoo
-  //////////////////////////////////////////
-  // dee has one extra entry: dee[Ls-1] and dee[Ls] hold the two last-slice
-  // diagonals computed in the block below.
-  this->dee.resize(Ls+1);
-  this->lee.resize(Ls);
-  this->leem.resize(Ls);
-  this->uee.resize(Ls);
-  this->ueem.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){
-
-    if(i < Ls-1){
-
-      this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-      // dm/bee[i] times the running product of aee[j]/bee[j] for j<i
-      this->leem[i] = this->dm/this->bee[i];
-      for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-      this->dee[i] = this->bee[i];
-
-      this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-      // dp/bee[0] times the running product of cee[j]/bee[j] for 1<=j<=i
-      this->ueem[i] = this->dp / this->bee[0];
-      for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
-    } else {
-
-      // Last slice: no off-diagonal factors.
-      this->lee[i]  = 0.0;
-      this->leem[i] = 0.0;
-      this->uee[i]  = 0.0;
-      this->ueem[i] = 0.0;
-
-    }
-  }
-
-  {
-    // delta_d = (1/bee[0]) * prod_{j=1}^{Ls-2} cee[j]/bee[j]
-    Coeff_t delta_d = 1.0 / this->bee[0];
-    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-  }
-
-  // Cache the inverse matrices, plain (dag=0) and daggered (dag=1).
-  int inv = 1;
-  this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-  this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-}
-
-// Recompute Cayley-form coefficients for different shift
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  // Store the new shift, then rebuild the coefficients exactly as the constructor
-  // does: fresh approximation data for Ls points, passed to SetCoefficientsTanh.
-  this->shift = new_shift;
-  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-  // Release the approximation data, as the constructor does after the same call;
-  // without this every refresh leaked one zolotarev_data.
-  Approx::zolotarev_free(zdata);
-}
-
-// Build the Ls x Ls matrices Pplus and Pminus from bee, cee, dp and dm, optionally
-// invert and/or take the adjoint, and pack them into SIMD vectors Matp / Matm.
-//   dag  : non-zero takes the adjoint (after the optional inversion)
-//   inv  : non-zero inverts the matrices
-//   Matp, Matm : outputs, resized to Ls*LLs; left untouched when LLs == Ls
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						       Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  int Ls = this->Ls;
-
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  // Both matrices share the diagonal bee.
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-
-  // Pminus: super-diagonal
-  for(int s=0; s<Ls-1; s++){
-    Pminus(s,s+1) = -this->cee[s];
-  }
-
-  // Pplus: sub-diagonal
-  for(int s=0; s<Ls-1; s++){
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-
-  // Corner entries carry the mass/shift terms.
-  Pplus (0,Ls-1) = this->dp;
-  Pminus(Ls-1,0) = this->dm;
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-
-#if(0)
-  std::cout << GridLogMessage << "Pplus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pplus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-  std::cout << GridLogMessage << "Pminus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pminus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-#endif
-
-  if(inv) {
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else {
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  // Entry [LLs*s2+s1] holds, in SIMD lane l, matrix element (l*LLs + s1, s2).
-  // NOTE(review): the row index reaches (Nsimd-1)*LLs + LLs-1, so this presumably
-  // relies on Nsimd*LLs == Ls -- confirm.
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      // The SIMD words are filled lane by lane through scalar pointers.
-      scalar_type *sp = (scalar_type*) &Vp;
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real
-	  // the real part is written into both components of the scalar
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-// NOTE(review): both macros are defined outside this view; presumably they emit the
-// explicit instantiations of DomainWallEOFAFermion for the standard and G-parity
-// Impl types -- confirm.
-FermOpTemplateInstantiate(DomainWallEOFAFermion);
-GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
@@ -1,255 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-// Pminus fowards
-// Pplus  backwards..
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  int Ls = this->Ls;
-  GridBase* grid = psi_i.Grid();
-  auto phi = phi_i.View();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-  
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    for(int s=0; s<Ls; s++){
-      auto tmp = psi[0];
-      if(s==0) {
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5m(tmp, psi[ss+0]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  int Ls = this->Ls;
-
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    auto tmp = psi[0];
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5p(tmp, psi[ss+0]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi=psi_i.View();
-  auto chi=chi_i.View();
-  int Ls = this->Ls;
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
-    }
-    spProj5m(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  int Ls = this->Ls;
-
-  assert(psi.Checkerboard() == psi.Checkerboard());
-
-  Vector<Coeff_t> ueec(Ls);
-  Vector<Coeff_t> deec(Ls+1);
-  Vector<Coeff_t> leec(Ls);
-  Vector<Coeff_t> ueemc(Ls);
-  Vector<Coeff_t> leemc(Ls);
-
-  for(int s=0; s<ueec.size(); s++){
-    ueec[s]  = conjugate(this->uee[s]);
-    deec[s]  = conjugate(this->dee[s]);
-    leec[s]  = conjugate(this->lee[s]);
-    ueemc[s] = conjugate(this->ueem[s]);
-    leemc[s] = conjugate(this->leem[s]);
-  }
-  deec[Ls] = conjugate(this->dee[Ls]);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss] = psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-    }
-    spProj5p(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@@ -1,613 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-/*
- * Dense matrix versions of routines
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v==LLs-1) ? 0     : v+1;
-      int vm = (v==0)     ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-						   int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs;
-	  int lex = s2 + LLs*site;
-
-	  if( s2==0 && l==0 ){
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-	      SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-	    }}
-	}}
-
-      {
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-	  if((s2+l)==0) {
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-	  a0 = a0 + incr;
-	  a1 = a1 + incr;
-                        a2 = a2 + sizeof(typename Simd::scalar_type);
-	}
-      }
-
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-};
-
-// Z-mobius version
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-};
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  Vector<iSinglet<Simd> > Matp;
-  Vector<iSinglet<Simd> > Matm;
-  Vector<iSinglet<Simd> > *_Matp;
-  Vector<iSinglet<Simd> > *_Matm;
-
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if(inv && dag){
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  }
-
-  if(inv && (!dag)){
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    thread_loop((auto site=0; site<vol; site++){
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
@@ -0,0 +1,267 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
+public:
+  // Implementation ("Impl") class bundling site types, gauge-link multiply helpers and DoubleStore; sets LsVectorised=true.
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+  // Compile-time properties taken from the Representation and Options template arguments.
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  static const int Nhcs = Options::Nhcs; // length of the outer vector in iImplHalfCommSpinor
+  // Coefficient type and the lower-precision SIMD type, both supplied by Options.
+  typedef typename Options::_Coeff_t Coeff_t;      
+  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
+  // Site-level tensor layouts, templated on the element type vtype.
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  // Concrete site and lattice types; SiteHalfCommSpinor is built on SimdL rather than Simd.
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef Lattice<SiteSpinor>          FermionField;
+  typedef Lattice<SitePropagator>      PropagatorField;
+
+  /////////////////////////////////////////////////
+  // Make the doubled gauge field a *scalar*
+  /////////////////////////////////////////////////
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef Lattice<SiteDoubledGaugeField>                      DoubledGaugeField;
+  // Compressor, parameter and stencil types associated with this implementation.
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+  // Parameters stored by value; initialised from the constructor argument.
+  ImplParams Params;
+
+  DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+  // Fill the SIMD value reg from memory via vsplat (presumably a broadcast to every lane -- confirm).
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    vsplat(reg, memory);
+  }
+  // phi = U(mu) * chi. Without GPU_VEC each scalar link element is first vsplat'ed into a SiteGaugeLink; SE and St are unused.
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, int mu, StencilEntry *SE,
+					  StencilView &St) 
+  {
+#ifdef GPU_VEC
+    // Gauge link is scalarised
+    mult(&phi(), &U(mu), &chi());
+#else
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+#endif
+  }
+#ifdef GPU_VEC // GPU_VEC build: helpers operating on scalar_object spinors; the lane argument is unused by both
+  static accelerator_inline void copyLinkGpu(int lane,
+					     SiteDoubledGaugeField & UU,
+					     const SiteDoubledGaugeField &U)
+  {
+    UU = U; // plain copy of the whole doubled link
+  }
+  static accelerator_inline void multLinkGpu(int lane,
+					     typename SiteHalfSpinor::scalar_object &phi,
+					     const SiteDoubledGaugeField &U,
+					     const typename SiteHalfSpinor::scalar_object &chi,
+					     int mu) 
+  {
+#if 1 // active path: copy U(mu) word by word through extract_type pointers
+    typedef typename ExtractTypeMap<typename Simd::scalar_type>::extract_type extract_type;
+    // Local destination for the link in direction mu.
+    SiteScalarGaugeLink U_l;
+    // View source and destination as flat arrays of extract_type.
+    extract_type * U_mem  = (extract_type *) &U(mu);
+    extract_type * U_stack= (extract_type *) &U_l;
+    // Element-wise copy; the count is sizeof(U_l) measured in extract_type units.
+    for(int w=0;w<(sizeof(U_l)/sizeof(extract_type)) ;w++) U_stack[w] = U_mem[w];
+
+    phi() =  U_l() * chi();
+#else // disabled alternative: plain copy of U(mu)
+    auto U_l = U(mu);
+
+    phi() =  U_l * chi();
+#endif
+  }
+#else // build without GPU_VEC: same product on SiteHalfSpinor objects; lane is unused
+  static accelerator_inline void multLinkGpu(int lane,
+					     SiteHalfSpinor &phi,
+					     const SiteDoubledGaugeField &U,
+					     const SiteHalfSpinor &chi,
+					     int mu) 
+  {
+    auto U_l   = U(mu);
+    phi() =  U_l * chi();
+  }
+#endif
+  // Propagator version of multLink: always splats U(mu) into a SiteGaugeLink before calling mult.
+  static accelerator_inline void multLinkProp(SitePropagator &phi,
+					      const SiteDoubledGaugeField &U,
+					      const SitePropagator &chi,int mu) 
+  {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+  }
+  // Fill Uds for every local site of GaugeGrid: entries 0..3 come from Umu, entries 4..7 from Uadj (built below).
+  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
+  {
+    SiteScalarGaugeField  ScalarUmu;
+    SiteDoubledGaugeField ScalarUds;
+    // Uadj(mu) = adj(Cshift(U_mu, mu, -1)) for each of the Nd directions.
+    GaugeLinkField U(Umu.Grid());
+    GaugeField  Uadj(Umu.Grid());
+    for (int mu = 0; mu < Nd; mu++) {
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+      U = adj(Cshift(U, mu, -1));
+      PokeIndex<LorentzIndex>(Uadj, U, mu);
+    }
+    // Site-by-site copy via scalar site objects. NOTE(review): loops below use a literal 4 while the loop above uses Nd -- confirm they always agree.
+    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+      Coordinate lcoor;
+      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+      // Entries 0..3: links read from Umu.
+      peekLocalSite(ScalarUmu, Umu, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
+      // Entries 4..7: links read from Uadj.
+      peekLocalSite(ScalarUmu, Uadj, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
+      
+      pokeLocalSite(ScalarUds, Uds, lcoor);
+    }
+  }
+  // The force / extraction hooks below are not implemented here: each body consists only of assert(0).
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
+  {
+    assert(0);
+  }
+
+  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+    assert(0);
+  } 
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0);
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0);
+  }
+
+  // InsertForce5D asserts immediately; an earlier implementation is retained below, commented out.
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+
+    assert(0);
+    // Following lines to be revised after Peter's addition of half prec
+    // missing put lane...
+    /*
+      typedef decltype(traceIndex<SpinIndex>(outerProduct(Btilde[0], Atilde[0]))) result_type;
+      unsigned int LLs = Btilde.Grid()->_rdimensions[0];
+      conformable(Atilde.Grid(),Btilde.Grid());
+      GridBase* grid = mat.Grid();
+      GridBase* Bgrid = Btilde.Grid();
+      unsigned int dimU = grid->Nd();
+      unsigned int dimF = Bgrid->Nd();
+      GaugeLinkField tmp(grid); 
+      tmp = Zero();
+    
+      // FIXME 
+      // Current implementation works, thread safe, probably suboptimal
+      // Passing through the local coordinate for grid transformation
+      // the force grid is in general very different from the Ls vectorized grid
+
+      for (int so = 0; so < grid->oSites(); so++) {
+      std::vector<typename result_type::scalar_object> vres(Bgrid->Nsimd());
+      std::vector<int> ocoor;  grid->oCoorFromOindex(ocoor,so); 
+      for (int si = 0; si < tmp.Grid()->iSites(); si++){
+      typename result_type::scalar_object scalar_object; scalar_object = Zero();
+      std::vector<int> local_coor;      
+      std::vector<int> icoor; grid->iCoorFromIindex(icoor,si);
+      grid->InOutCoorToLocalCoor(ocoor, icoor, local_coor);
+      for (int s = 0; s < LLs; s++) {
+      std::vector<int> slocal_coor(dimF);
+      slocal_coor[0] = s;
+      for (int s4d = 1; s4d< dimF; s4d++) slocal_coor[s4d] = local_coor[s4d-1];
+      int sF = Bgrid->oIndexReduced(slocal_coor);  
+      assert(sF < Bgrid->oSites());
+
+      extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), vres); 
+      // sum across the 5d dimension
+      for (auto v : vres) scalar_object += v;  
+      }
+      tmp[so].putlane(scalar_object, si);
+      }
+      }
+      PokeIndex<LorentzIndex>(mat, tmp, mu);
+    */
+  }
+};
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // CoeffReal, vComplex (whichever prec)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // CoeffReal, float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // CoeffReal, double
+// CoeffRealHalfComms variants; presumably reduced-precision comms via the Options-supplied SimdL type -- confirm against the Options definition.
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // CoeffRealHalfComms, vComplex (whichever prec)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // CoeffRealHalfComms, float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // CoeffRealHalfComms, double
+// CoeffComplex variants (Z-prefixed aliases).
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // CoeffComplex, vComplex (whichever prec)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // CoeffComplex, float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // CoeffComplex, double
+// CoeffComplexHalfComms variants (Z-prefixed aliases).
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // CoeffComplexHalfComms, vComplex (whichever prec)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // CoeffComplexHalfComms, float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // CoeffComplexHalfComms, double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
@@ -1,625 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#include <Grid.h>
-
-NAMESPACE_BEGIN(Grid);
-
-const std::vector<int> 
-ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> 
-ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
-
-/////////////////////////////////
-// Constructor and gauge import
-/////////////////////////////////
-
-
-template <class Impl>
-ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
-							 RealD _mass,
-							 RealD _c1, RealD _c2,RealD _u0,
-							 const ImplParams &p)
-  : Kernels(p),
-    _grid(&Fgrid),
-    _cbgrid(&Hgrid),
-    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
-    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
-    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
-    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
-    Umu(&Fgrid),
-    UmuEven(&Hgrid),
-    UmuOdd(&Hgrid),
-    UUUmu(&Fgrid),
-    UUUmuEven(&Hgrid),
-    UUUmuOdd(&Hgrid) ,
-    _tmp(&Hgrid)
-{
-  int vol4;
-  int LLs=1;
-  c1=_c1;
-  c2=_c2;
-  u0=_u0;
-  vol4= _grid->oSites();
-  Stencil.BuildSurfaceList(LLs,vol4);
-  vol4= _cbgrid->oSites();
-  StencilEven.BuildSurfaceList(LLs,vol4);
-  StencilOdd.BuildSurfaceList(LLs,vol4);
-}
-
-template <class Impl>
-ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
-							 GridRedBlackCartesian &Hgrid, RealD _mass,
-							 RealD _c1, RealD _c2,RealD _u0,
-							 const ImplParams &p)
-  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
-{
-  ImportGauge(_Uthin,_Ufat);
-}
-
-////////////////////////////////////////////////////////////
-// Momentum space propagator should be 
-// https://arxiv.org/pdf/hep-lat/9712010.pdf
-//
-// mom space action.
-//   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
-//
-// must track through staggered flavour/spin reduction in literature to 
-// turn to free propagator for the one component chi field, a la page 4/5
-// of above link to implmement fourier based solver.
-////////////////////////////////////////////////////////////
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
-{
-  /////////////////////////////////////////////////////////////////
-  // Trivial import; phases and fattening and such like preapplied
-  /////////////////////////////////////////////////////////////////
-  GaugeLinkField U(GaugeGrid());
-
-  for (int mu = 0; mu < Nd; mu++) {
-
-    U = PeekIndex<LorentzIndex>(_Utriple, mu);
-    PokeIndex<LorentzIndex>(UUUmu, U, mu );
-
-    U = adj( Cshift(U, mu, -3));
-    PokeIndex<LorentzIndex>(UUUmu, -U, mu+4 );
-
-    U = PeekIndex<LorentzIndex>(_Ufat, mu);
-    PokeIndex<LorentzIndex>(Umu, U, mu);
-
-    U = adj( Cshift(U, mu, -1));
-    PokeIndex<LorentzIndex>(Umu, -U, mu+4);
-
-  }
-  CopyGaugeCheckerboards();
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) 
-{
-
-  Umu   = _U;
-  UUUmu = _UUU;
-  CopyGaugeCheckerboards();
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
-{
-  pickCheckerboard(Even, UmuEven,  Umu);
-  pickCheckerboard(Odd,  UmuOdd ,  Umu);
-  pickCheckerboard(Even, UUUmuEven,UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) 
-{
-  GaugeLinkField U(GaugeGrid());
-
-  ////////////////////////////////////////////////////////
-  // Double Store should take two fields for Naik and one hop separately.
-  ////////////////////////////////////////////////////////
-  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
-
-  ////////////////////////////////////////////////////////
-  // Apply scale factors to get the right fermion Kinetic term
-  // Could pass coeffs into the double store to save work.
-  // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) 
-  ////////////////////////////////////////////////////////
-  for (int mu = 0; mu < Nd; mu++) {
-
-    U = PeekIndex<LorentzIndex>(Umu, mu);
-    PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
-    
-    U = PeekIndex<LorentzIndex>(Umu, mu+4);
-    PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
-
-    U = PeekIndex<LorentzIndex>(UUUmu, mu);
-    PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu );
-    
-    U = PeekIndex<LorentzIndex>(UUUmu, mu+4);
-    PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
-  }
-
-  CopyGaugeCheckerboards();
-}
-
-/////////////////////////////
-// Implement the interface
-/////////////////////////////
-
-template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerNo);
-  return axpy_norm(out, mass, in, out);
-}
-
-template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerYes);
-  return axpy_norm(out, mass, in, out);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerNo);
-  } else {
-    DhopOE(in, out, DaggerNo);
-  }
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerYes);
-  } else {
-    DhopOE(in, out, DaggerYes);
-  }
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  typename FermionField::scalar_type scal(mass);
-  out = scal * in;
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Mooee(in, out);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  out = (1.0 / (mass)) * in;
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
-						 FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  MooeeInv(in, out);
-}
-
-///////////////////////////////////
-// Internal
-///////////////////////////////////
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, 
-						   GaugeField & mat,
-						   const FermionField &A, const FermionField &B, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-
-  Compressor compressor;
-
-  FermionField Btilde(B.Grid());
-  FermionField Atilde(B.Grid());
-  Atilde = A;
-
-  st.HaloExchange(B, compressor);
-
-  for (int mu = 0; mu < Nd; mu++) {
-
-    ////////////////////////
-    // Call the single hop
-    ////////////////////////
-    auto U_v   = U.View();
-    auto UUU_v = UUU.View();
-    auto B_v   = B.View();
-    auto Btilde_v   = Btilde.View();
-    thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++), {
-      Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
-    });
-
-    // Force in three link terms
-    //
-    //    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
-    //
-    // dU_ac(x)/dt = i p_ab U_bc(x)
-    //
-    // => dS_f/dt = dS_f/dU_ac(x) . dU_ac(x)/dt =  i p_ab U_bc(x) dS_f/dU_ac(x) 
-    //
-    // One link: form fragments S_f = A U B 
-    //
-    //         write Btilde = U(x) B(x+mu)
-    //
-    // mat+= TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
-    // 
-    // Three link: form fragments S_f = A UUU B 
-    //
-    // mat+= outer ( A, UUUB) <-- Best take DhopDeriv with one linke or identity matrix
-    // mat+= outer ( AU, UUB) <-- and then use covariant cshift?
-    // mat+= outer ( AUU, UB) <-- Returned from call to DhopDir
-
-    assert(0);// need to figure out the force interface with a blasted three link term.
-    
-  }
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
-  conformable(U.Grid(), _grid);
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-
-  mat.Checkerboard() = U.Checkerboard();
-
-  DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
-  conformable(U.Grid(), _cbgrid);
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-
-  assert(V.Checkerboard() == Even);
-  assert(U.Checkerboard() == Odd);
-  mat.Checkerboard() = Odd;
-
-  DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
-  conformable(U.Grid(), _cbgrid);
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-
-  assert(V.Checkerboard() == Odd);
-  assert(U.Checkerboard() == Even);
-  mat.Checkerboard() = Even;
-
-  DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
-{
-  DhopCalls+=2;
-  conformable(in.Grid(), _grid);  // verifies full grid
-  conformable(in.Grid(), out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
-{
-  DhopCalls+=1;
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-
-  assert(in.Checkerboard() == Even);
-  out.Checkerboard() = Odd;
-
-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
-{
-  DhopCalls+=1;
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-
-  assert(in.Checkerboard() == Odd);
-  out.Checkerboard() = Even;
-
-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
-  DhopDir(in, out, dir, disp);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
-
-  Compressor compressor;
-  Stencil.HaloExchange(in, compressor);
-  auto Umu_v   =   Umu.View();
-  auto UUUmu_v = UUUmu.View();
-  auto in_v    =  in.View();
-  auto out_v   = out.View();
-  thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) , {
-      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
-  });
-};
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
-						  DoubledGaugeField &U,
-						  DoubledGaugeField &UUU,
-						  const FermionField &in,
-						  FermionField &out, int dag) 
-{
-#ifdef GRID_OMP
-  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
-  else
-#endif
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-								 DoubledGaugeField &U,
-								 DoubledGaugeField &UUU,
-								 const FermionField &in,
-								 FermionField &out, int dag) 
-{
-#ifdef GRID_OMP
-  Compressor compressor; 
-  int len =  U.Grid()->oSites();
-  const int LLs =  1;
-
-  DhopTotalTime   -= usecond();
-
-  DhopFaceTime    -= usecond();
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime    += usecond();
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime    -= usecond();
-#pragma omp parallel 
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-
-      // do the compute
-      auto U_v   = U.View();
-      auto UUU_v = UUU.View();
-      auto in_v  = in.View();
-      auto out_v = out.View();
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
-        }
-      }
-    } else {
-      st.CommunicateThreaded();
-    }
-  }
-  DhopComputeTime    += usecond();
-
-  // First to enter, last to leave timing
-  DhopFaceTime    -= usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime    -= usecond();
-
-  DhopComputeTime2    -= usecond();
-  {
-    auto U_v   = U.View();
-    auto UUU_v = UUU.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    if (dag == DaggerYes) {
-      int sz=st.surface_list.size();
-      thread_loop( (int ss = 0; ss < sz; ss++) ,{
-	int sU = st.surface_list[ss];
-	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
-      });
-    } else {
-      int sz=st.surface_list.size();
-      thread_loop( (int ss = 0; ss < sz; ss++) ,{
-	int sU = st.surface_list[ss];
-	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
-      });
-    }
-  }
-  DhopComputeTime2    += usecond();
-#else
-  assert(0);
-#endif
-}
-
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
-							     DoubledGaugeField &U,
-							     DoubledGaugeField &UUU,
-							     const FermionField &in,
-							     FermionField &out, int dag) 
-{
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-
-  DhopTotalTime   -= usecond();
-
-  DhopCommTime    -= usecond();
-  Compressor compressor;
-  st.HaloExchange(in, compressor);
-  DhopCommTime    += usecond();
-
-  auto U_v   =   U.View();
-  auto UUU_v = UUU.View();
-  auto in_v  =  in.View();
-  auto out_v = out.View();
-  DhopComputeTime -= usecond();
-  if (dag == DaggerYes) {
-    thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++), {
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
-    });
-  } else {
-    thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++), {
-      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
-    });
-  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-};
-
-  ////////////////////////////////////////////////////////////////
-  // Reporting
-  ////////////////////////////////////////////////////////////////
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::Report(void) 
-{
-  Coordinate latt = _grid->GlobalDimensions();
-  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _grid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime   = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-
-//////////////////////////////////////////////////////// 
-// Conserved current - not yet implemented.
-////////////////////////////////////////////////////////
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-							      PropagatorField &q_in_2,
-							      PropagatorField &q_out,
-							      Current curr_type,
-							      unsigned int mu)
-{
-  assert(0);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
-                                                         PropagatorField &q_out,
-                                                         Current curr_type,
-                                                         unsigned int mu, 
-                                                         unsigned int tmin,
-                                              unsigned int tmax,
-					      ComplexField &lattice_cmplx)
-{
-  assert(0);
-
-}
-
-
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
-
-//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-//TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -1,672 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
-#include <Grid/perfmon/PerfCount.h>
-
-NAMESPACE_BEGIN(Grid);
-  
-// S-direction is INNERMOST and takes no part in the parity.
-//
-// Stencil geometry handed to the three stencils in the constructor: sixteen
-// legs over dimensions 1..4 of the five-dimensional grid (dimension 0 is the
-// s direction). Entries 0-3, 4-7, 8-11 and 12-15 pair each of those
-// dimensions with a displacement of +1, -1, +3 and -3 respectively.
-const std::vector<int> 
-ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
-const std::vector<int> 
-ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
-
-// Constructor without gauge links: stores the four grids, builds the three
-// stencils, sizes the gauge-field members on the four-dimensional grids and
-// checks that the 5d and 4d grids are mutually consistent. The gauge fields
-// are only allocated here; they are filled by the ImportGauge* methods.
-//
-// FiveDimGrid / FiveDimRedBlackGrid : grids the stencils are built on
-// FourDimGrid / FourDimRedBlackGrid : grids the gauge-link fields live on
-// _mass, _c1, _c2, _u0              : stored in mass, c1, c2, u0
-// p                                 : implementation parameters, forwarded to
-//                                     Kernels and to the stencils
-template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
-							     GridRedBlackCartesian &FiveDimRedBlackGrid,
-							     GridCartesian         &FourDimGrid,
-							     GridRedBlackCartesian &FourDimRedBlackGrid,
-							     RealD _mass,
-							     RealD _c1,RealD _c2, RealD _u0,
-							     const ImplParams &p) :
-  Kernels(p),
-  _FiveDimGrid        (&FiveDimGrid),
-  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid        (&FourDimGrid),
-  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements,p),
-  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
-  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
-  mass(_mass),
-  c1(_c1),
-  c2(_c2),
-  u0(_u0),
-  Umu(&FourDimGrid),
-  UmuEven(&FourDimRedBlackGrid),
-  UmuOdd (&FourDimRedBlackGrid),
-  UUUmu(&FourDimGrid),
-  UUUmuEven(&FourDimRedBlackGrid),
-  UUUmuOdd(&FourDimRedBlackGrid),
-  Lebesgue(&FourDimGrid),
-  LebesgueEvenOdd(&FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid)
-{
-
-  // some assertions
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
-  assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
-
-  // extent of fifth dim and not spread out
-  Ls=FiveDimGrid._fdimensions[0];
-  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FiveDimGrid._processors[0]         ==1);
-  assert(FiveDimRedBlackGrid._processors[0] ==1);
-
-  // Other dimensions must match the decomposition of the four-D fields 
-  // (5d dimension d+1 is compared against 4d dimension d)
-  for(int d=0;d<4;d++){
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
-
-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
-
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
-  }
-
-  if (Impl::LsVectorised) { 
-
-    // All SIMD lanes must lie along dimension 0; every other dimension must
-    // have a SIMD layout of 1.
-    int nsimd = Simd::Nsimd();
-    
-    // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
-    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
-
-    for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]==1);
-      assert(FourDimRedBlackGrid._simd_layout[d]==1);
-      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
-    }
-
-  } else {
-    
-    // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-    assert(FiveDimGrid._simd_layout[0]        ==1);
-
-  }
-  // Build the surface-site lists of the stencils; the overlapped-comms Dhop
-  // iterates over st.surface_list after communication has completed.
-  int LLs = FiveDimGrid._rdimensions[0];
-  int vol4= FourDimGrid.oSites();
-  Stencil.BuildSurfaceList(LLs,vol4);
-
-  vol4=FourDimRedBlackGrid.oSites();
-  StencilEven.BuildSurfaceList(LLs,vol4);
-  StencilOdd.BuildSurfaceList(LLs,vol4);
-}
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
-{
-  // Refresh the red-black copies from the full-lattice link fields:
-  // first UUUmu, then Umu.
-  pickCheckerboard(Even, UUUmuEven, UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd,  UUUmu);
-
-  pickCheckerboard(Even, UmuEven, Umu);
-  pickCheckerboard(Odd,  UmuOdd,  Umu);
-}
-// Constructor with gauge links: delegates all grid/stencil set-up to the
-// link-free constructor, then imports the links through
-// ImportGauge(_Uthin,_Ufat), which also applies the c1/c2/u0 scale factors.
-template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
-							     GridCartesian         &FiveDimGrid,
-							     GridRedBlackCartesian &FiveDimRedBlackGrid,
-							     GridCartesian         &FourDimGrid,
-							     GridRedBlackCartesian &FourDimRedBlackGrid,
-							     RealD _mass,
-							     RealD _c1,RealD _c2, RealD _u0,
-							     const ImplParams &p) :
-  ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
-			     FourDimGrid,FourDimRedBlackGrid,
-			     _mass,_c1,_c2,_u0,p)
-{
-  ImportGauge(_Uthin,_Ufat);
-}
-
-///////////////////////////////////////////////////
-// For MILC use; pass three link U's and 1 link U
-///////////////////////////////////////////////////
-// Import links that need no further processing: no c1/c2/u0 rescaling is
-// applied in this routine.
-//   _Utriple : source for UUUmu (backward links use a shift of -3)
-//   _Ufat    : source for Umu   (backward links use a shift of -1)
-// For every direction mu the forward link goes to slot mu, and minus the
-// adjoint of the shifted link goes to slot mu+4. The checkerboarded copies
-// are refreshed at the end.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
-{
-  /////////////////////////////////////////////////////////////////
-  // Trivial import; phases and fattening and such like preapplied
-  /////////////////////////////////////////////////////////////////
-  for (int mu = 0; mu < Nd; mu++) {
-
-    // forward three-hop link
-    auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
-    Impl::InsertGaugeField(UUUmu,U,mu);
-
-    // backward three-hop link; assigning to U evaluates the expression
-    U = adj( Cshift(U, mu, -3));
-    Impl::InsertGaugeField(UUUmu,-U,mu+4);
-
-    // forward one-hop link
-    U = PeekIndex<LorentzIndex>(_Ufat, mu);
-    Impl::InsertGaugeField(Umu,U,mu);
-
-    // backward one-hop link
-    U = adj( Cshift(U, mu, -1));
-    Impl::InsertGaugeField(Umu,-U,mu+4);
-
-  }
-  CopyGaugeCheckerboards();
-}
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) 
-{
-  // The caller supplies fully prepared doubled fields (phases, fattening and
-  // the like already applied), so they are copied verbatim and the
-  // checkerboarded copies are rebuilt from them.
-  UUUmu = _UUU;
-  Umu   = _U;
-
-  CopyGaugeCheckerboards();
-}
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
-{
-  // Fill the doubled one-hop (Umu) and three-hop (UUUmu) fields from the
-  // thin and fat links.
-  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
-
-  // Scale factors giving the kinetic term
-  //   0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ):
-  // forward slots (mu) take +coefficient, backward slots (mu+4) take -coefficient.
-  const RealD oneHop   = 0.5*c1/u0;
-  const RealD threeHop = 0.5*c2/u0/u0/u0;
-
-  // Multiply one Lorentz component of a doubled field by a real factor, in place.
-  auto rescale = [](DoubledGaugeField &field, int component, RealD factor) {
-    auto link = PeekIndex<LorentzIndex>(field, component);
-    PokeIndex<LorentzIndex>(field, link*factor, component);
-  };
-
-  for (int mu = 0; mu < Nd; mu++) {
-    rescale(Umu,   mu,    oneHop);
-    rescale(Umu,   mu+4, -oneHop);
-    rescale(UUUmu, mu,    threeHop);
-    rescale(UUUmu, mu+4, -threeHop);
-  }
-
-  CopyGaugeCheckerboards();
-}
-// Apply a single stencil leg of the hopping term on the full lattice.
-//   in, out : source and destination fermion fields
-//   dir5    : direction counted on the five-dimensional grid; shifted down by
-//             one below because the stencil "directions" table omits s
-//   disp    : forwarded unchanged to the kernel
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
-{
-  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
-                    // we drop off the innermost fifth dimension
-
-  Compressor compressor;
-  Stencil.HaloExchange(in,compressor);
-  auto Umu_v   = Umu.View();
-  auto UUUmu_v = UUUmu.View();
-  auto in_v    = in.View();
-  auto out_v   = out.View();
-  // NOTE(review): the s loop runs over Ls, whereas DhopInternal* pass
-  // in.Grid()->_rdimensions[0] to the kernels; confirm this is intended when
-  // Impl::LsVectorised is set.
-  thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
-    for(int s=0;s<Ls;s++){
-      int sU=ss;          // site index on the four-dimensional gauge grid
-      int sF = s+Ls*sU;   // fermion site index, s running fastest
-      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
-    }
-  });
-}
-
-// Force-term kernel: not provided for this operator. The body is assert(0),
-// so a call aborts when assertions are enabled; no argument is touched.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
-						     DoubledGaugeField & U,
-						     DoubledGaugeField & UUU,
-						     GaugeField &mat,
-						     const FermionField &A,
-						     const FermionField &B,
-						     int dag)
-{
-  // No force terms in multi-rhs solver staggered
-  assert(0);
-}
-
-// Not provided: the body is assert(0) and no argument is touched.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
-						 const FermionField &A,
-						 const FermionField &B,
-						 int dag)
-{
-  assert(0);
-}
-
-// Not provided: the body is assert(0) and no argument is touched.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
-						   const FermionField &A,
-						   const FermionField &B,
-						   int dag)
-{
-  assert(0);
-}
-
-
-// Not provided: the body is assert(0) and no argument is touched.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
-						   const FermionField &A,
-						   const FermionField &B,
-						   int dag)
-{
-  assert(0);
-}
-
-/*CHANGE */
-// Dispatch for the hopping term. When built with GRID_OMP and the global
-// StaggeredKernelsStatic::Comms setting equals CommsAndCompute, the variant
-// that overlaps communication with compute is used; in every other case
-// (including builds without GRID_OMP) the serial-communication variant runs.
-// All arguments are forwarded unchanged.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-						    DoubledGaugeField & U,DoubledGaugeField & UUU,
-						    const FermionField &in, FermionField &out,int dag)
-{
-#ifdef GRID_OMP
-  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
-  else
-#endif
-    // Note: this statement is the "else" branch when GRID_OMP is defined.
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
-}
-
-// Hopping term with communication overlapped with interior compute.
-// Only available when built with GRID_OMP; otherwise the body is assert(0).
-//
-// Phases:
-//   1. gather the halo (timed as DhopFaceTime)
-//   2. one OpenMP parallel region in which the first ncomms threads run
-//      st.CommunicateThreaded() while the remaining threads apply the kernel
-//      with the trailing flags (1,0) -- "interior" per the comments below --
-//      to a contiguous share of the 4d sites
-//   3. merge the received data (DhopFaceTime)
-//   4. apply the kernel with flags (0,1) to the sites in st.surface_list
-//      (timed as DhopComputeTime2)
-//
-// st, lo    : stencil and Lebesgue order handed to the kernels
-// U, UUU    : doubled gauge fields handed to the kernels
-// in, out   : source and destination fermion fields
-// dag       : DaggerYes selects DhopSiteDag, anything else DhopSite
-//
-// DhopTotalTime brackets the whole routine so that Report(), which divides
-// by it, sees the time spent on this path as it does on the serial path.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
-								   DoubledGaugeField & U,DoubledGaugeField & UUU,
-								   const FermionField &in, FermionField &out,int dag)
-{
-#ifdef GRID_OMP
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-
-  Compressor compressor; 
-
-  int LLs = in.Grid()->_rdimensions[0];
-
-  DhopTotalTime -= usecond();
-
-  DhopFaceTime-=usecond();
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();
-
-  // Per-thread timings; the reduction keeps the maximum over threads.
-  double ctime=0;
-  double ptime=0;
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms); // at least one thread must be left for compute
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;                 // number of compute threads
-      int ttid  = tid - ncomms;           // index among the compute threads
-      int n     = U.Grid()->oSites(); // 4d vol
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      // Split n sites into contiguous blocks; the first "rem" threads take
-      // one extra site each.
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-
-      // do the compute
-      auto   U_v  =   U.View();
-      auto UUU_v  = UUU.View();
-      auto  in_v  =  in.View();
-      auto out_v  = out.View();
-
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
-        }
-      }
-      ptime = usecond() - start;
-    } else {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
-  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
-
-  // First to enter, last to leave timing
-  st.CollateThreads();
-
-  DhopFaceTime-=usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
-
-  DhopComputeTime2-=usecond();
-
-  auto   U_v  =   U.View();
-  auto UUU_v  = UUU.View();
-  auto  in_v  =  in.View();
-  auto out_v  = out.View();
-  // Second pass over the surface sites only: Interior = 0; Exterior = 1.
-  int sz=st.surface_list.size();
-  if (dag == DaggerYes) {
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
-    });
-  } else {
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
-    });
-  }
-  DhopComputeTime2+=usecond();
-
-  DhopTotalTime += usecond();
-#else
-  assert(0);
-#endif
-
-}
-
-// Hopping term with communication completed before any compute starts.
-//   st, lo   : stencil and Lebesgue order handed to the kernels
-//   U, UUU   : doubled gauge fields handed to the kernels
-//   in, out  : source and destination fermion fields
-//   dag      : DaggerYes selects DhopSiteDag, anything else DhopSite
-// Accumulates DhopTotalTime, DhopCommTime and DhopComputeTime.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-						    DoubledGaugeField & U,DoubledGaugeField & UUU,
-						    const FermionField &in, FermionField &out,int dag)
-{
-  int LLs = in.Grid()->_rdimensions[0];
-  Compressor compressor;
-
-  DhopTotalTime -= usecond();
-
-  // Halo exchange, timed as communication.
-  DhopCommTime -= usecond();
-  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
-
-  // Site loop, timed as compute. The loop runs over the sites of the 4d
-  // gauge grid; the kernel receives LLs to build the 5d fermion index.
-  DhopComputeTime -= usecond();
-  auto gauge_v     =   U.View();
-  auto gauge3_v    = UUU.View();
-  auto source_v    =  in.View();
-  auto result_v    = out.View();
-  const int nsites = U.Grid()->oSites();
-  if (dag != DaggerYes) {
-    thread_loop( (int site = 0; site < nsites; site++) ,{
-      Kernels::DhopSite(st,lo,gauge_v,gauge3_v,st.CommBuf(),LLs,site,source_v,result_v);
-    });
-  } else {
-    thread_loop( (int site = 0; site < nsites; site++) ,{
-      Kernels::DhopSiteDag(st,lo,gauge_v,gauge3_v,st.CommBuf(),LLs,site,source_v,result_v);
-    });
-  }
-  DhopComputeTime += usecond();
-
-  DhopTotalTime   += usecond();
-}
-/*CHANGE END*/
-
-/* ORG
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-						    DoubledGaugeField & U,DoubledGaugeField & UUU,
-						    const FermionField &in, FermionField &out,int dag)
-{
-  Compressor compressor;
-  int LLs = in.Grid()->_rdimensions[0];
-
-  DhopTotalTime -= usecond();
-  DhopCommTime -= usecond();
-  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
-  
-  DhopComputeTime -= usecond();
-  auto U_v   =   U.View();
-  auto UUU_v = UUU.View();
-  auto out_v = out.View();
-  auto in_v  =  in.View();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  if (dag == DaggerYes) {
-    thread_loop(  (int ss = 0; ss < U.Grid()->oSites(); ss++), {
-      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
-    });
-  } else {
-    thread_loop(  (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
-      int sU=ss;
-      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
-    });
-  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-}
-*/
-
-
-// Hopping term from an Even-checkerboard source to an Odd-checkerboard
-// result. Both fields must live on the red-black fermion grid. Uses the
-// stencil built for an Even source together with the Odd gauge copies, and
-// counts as one Dhop call.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
-{
-  DhopCalls+=1;
-  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
-  conformable(in.Grid(),out.Grid()); // drops the cb check
-
-  assert(in.Checkerboard()==Even);
-  out.Checkerboard() = Odd;
-
-  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
-}
-// Hopping term from an Odd-checkerboard source to an Even-checkerboard
-// result. Both fields must live on the red-black fermion grid. Uses the
-// stencil built for an Odd source together with the Even gauge copies, and
-// counts as one Dhop call.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
-{
-  DhopCalls+=1;
-  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
-  conformable(in.Grid(),out.Grid()); // drops the cb check
-
-  assert(in.Checkerboard()==Odd);
-  out.Checkerboard() = Even;
-
-  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
-}
-// Hopping term on the full (unchecker-boarded) fermion grid. The result
-// takes over the source's checkerboard label. The call counter is advanced
-// by two, whereas the half-grid variants DhopEO/DhopOE advance it by one.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
-{
-  DhopCalls+=2;
-  conformable(in.Grid(),FermionGrid()); // verifies full grid
-  conformable(in.Grid(),out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
-}
-
-// Print the Dhop call count, the per-call timings and the derived flop
-// rates, followed by the reports of the three stencils.
-//
-// The per-call averages and rates are printed only when at least one Dhop
-// call has been recorded, because each of them divides by DhopCalls or by an
-// accumulated time. The rank-averaged compute time is held in a local
-// variable, so the DhopComputeTime member keeps its accumulated value and
-// Report() may be called repeatedly.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Report(void) 
-{
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _FourDimGrid->_Nprocessors;
-  RealD NN = _FourDimGrid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-
-  // NOTE(review): assumes DhopCalls is identical on every rank, since the
-  // GlobalSum inside this branch is a collective operation -- confirm.
-  if ( DhopCalls > 0 ) {
-    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
-	      << DhopTotalTime   / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
-	      << DhopCommTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
-	      << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-    // Average the compute time over ranks without touching the member counter
-    RealD AvgComputeTime = DhopComputeTime;
-    _FourDimGrid->GlobalSum(AvgComputeTime);
-    AvgComputeTime/=NP;
-
-    RealD mflops = 1154*volume*DhopCalls/AvgComputeTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-    RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-  }
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-// Reset the Dhop call counter, every Dhop timer of this operator and the
-// statistics kept by the three stencils.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
-{
-  DhopCalls        = 0;
-  DhopTotalTime    = 0;
-  DhopCommTime     = 0;
-  DhopComputeTime  = 0;
-  DhopComputeTime2 = 0; // accumulated by the overlapped-comms surface pass
-  DhopFaceTime     = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-/////////////////////////////////////////////////////////////////////////
-// Implement the general interface. Here we use SAME mass on all slices
-/////////////////////////////////////////////////////////////////////////
-// Single-direction piece of the operator: forwards directly to DhopDir.
-// No mass term is added here.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
-  DhopDir(in, out, dir, disp);
-}
-// Full operator: hopping term (DaggerNo) combined with the mass term through
-// axpy_norm(out, mass, in, out); the value of that call is returned.
-template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-
-  Dhop(in, out, DaggerNo);
-
-  const RealD result = axpy_norm(out, mass, in, out);
-  return result;
-}
-
-// Daggered operator: hopping term (DaggerYes) combined with the mass term
-// through axpy_norm(out, mass, in, out); the value of that call is returned.
-template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-
-  Dhop(in, out, DaggerYes);
-
-  const RealD result = axpy_norm(out, mass, in, out);
-  return result;
-}
-
-// Off-diagonal checkerboard block without dagger: an Odd source is routed to
-// DhopEO, any other source to DhopOE.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  const bool oddSource = (in.Checkerboard() == Odd);
-  if (!oddSource) {
-    DhopOE(in, out, DaggerNo);
-    return;
-  }
-  DhopEO(in, out, DaggerNo);
-}
-// Off-diagonal checkerboard block with dagger: an Odd source is routed to
-// DhopEO, any other source to DhopOE.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  const bool oddSource = (in.Checkerboard() == Odd);
-  if (!oddSource) {
-    DhopOE(in, out, DaggerYes);
-    return;
-  }
-  DhopEO(in, out, DaggerYes);
-}
-
-// Diagonal checkerboard block: out = mass * in, with the mass converted to
-// the field's scalar type first.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  typename FermionField::scalar_type massScalar(mass);
-
-  out.Checkerboard() = in.Checkerboard();
-  out = massScalar * in;
-}
-
-// Daggered diagonal block: delegates to Mooee, i.e. the same multiplication
-// by the mass is applied.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Mooee(in, out);
-}
-
-// Inverse of the diagonal checkerboard block: out = (1/mass) * in.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  const auto inverseMass = 1.0 / (mass);
-
-  out.Checkerboard() = in.Checkerboard();
-  out = inverseMass * in;
-}
-
-// Daggered inverse of the diagonal block: delegates to MooeeInv, i.e. the
-// same multiplication by 1/mass is applied.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
-						   FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  MooeeInv(in, out);
-}
-
-//////////////////////////////////////////////////////// 
-// Conserved current - not yet implemented.
-////////////////////////////////////////////////////////
-// Placeholder only: the body consists of assert(0), so a call aborts when
-// assertions are enabled. None of the arguments is read or written.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-								PropagatorField &q_in_2,
-								PropagatorField &q_out,
-								Current curr_type,
-								unsigned int mu)
-{
-  assert(0);
-}
-
-// Placeholder only: the body consists of assert(0), so a call aborts when
-// assertions are enabled. None of the arguments is read or written.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
-							   PropagatorField &q_out,
-							   Current curr_type,
-							   unsigned int mu, 
-							   unsigned int tmin,
-                                              unsigned int tmax,
-					      ComplexField &lattice_cmplx)
-{
-  assert(0);
-
-}
-
-// Explicit template instantiations of ImprovedStaggeredFermion5D through the
-// two staggered instantiation macros (defined outside this file).
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
-FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
-  
-NAMESPACE_END(Grid);
-
-
-
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
@@ -1,497 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Construct the Mobius EOFA operator. All arguments are forwarded to the
-// AbstractEOFAFermion base. The body then sets the tanh coefficients from a
-// higham() approximation of size Ls and prepares the shift-operator
-// coefficient vectors.
-template<class Impl>
-MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-					   GaugeField            &_Umu,
-					   GridCartesian         &FiveDimGrid,
-					   GridRedBlackCartesian &FiveDimRedBlackGrid,
-					   GridCartesian         &FourDimGrid,
-					   GridRedBlackCartesian &FourDimRedBlackGrid,
-					   RealD _mq1, RealD _mq2, RealD _mq3,
-					   RealD _shift, int _pm, RealD _M5,
-					   RealD _b, RealD _c, const ImplParams &p) :
-  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-			    _shift, _pm, _M5, _b, _c, p)
-{
-  const int Ls = this->Ls;
-
-  // Approximation data used for the tanh coefficients; released below once
-  // the coefficients have been set.
-  const RealD eps = 1.0;
-  Approx::zolotarev_data *zdata = Approx::higham(eps, Ls);
-  assert(zdata->n == this->Ls);
-
-  std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
-    ",c=" << _c << ") with Ls=" << Ls << std::endl;
-  this->SetCoefficientsTanh(zdata, _b, _c);
-  std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
-    ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
-    ",pm=" << _pm << ")" << std::endl;
-
-  Approx::zolotarev_free(zdata);
-
-  if(_shift == 0.0){
-    // No shift: the shift coefficient vectors are simply sized to Ls with zeros.
-    Mooee_shift.resize(Ls, 0.0);
-    MooeeInv_shift_lc.resize(Ls, 0.0);
-    MooeeInv_shift_norm.resize(Ls, 0.0);
-    MooeeInvDag_shift_lc.resize(Ls, 0.0);
-    MooeeInvDag_shift_norm.resize(Ls, 0.0);
-  } else {
-    SetCoefficientsPrecondShiftOps();
-  }
-}
-
-/****************************************************************
- * Additional EOFA operators only called outside the inverter.  
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-// Apply Omega_{+}, Omega_{-} or one of their daggers, selected by
-// sign (+1 / -1) and dag (0 / 1). Din is zeroed first and stays zero for any
-// other (sign, dag) combination.
-//   dag == 0 : slice s of Din is set to a coefficient times slice 0 of psi
-//   dag == 1 : slice 0 of Din accumulates a coefficient times slice sp of psi
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  const int   Ls       = this->Ls;
-  const RealD alpha    = this->alpha;
-  const bool  plus     = (sign == 1);
-  const bool  minus    = (sign == -1);
-
-  Din = Zero();
-
-  if(dag == 0) {
-    if(plus) { // \Omega_{+}
-      for(int s=0; s<Ls; ++s){
-        axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
-      }
-    } else if(minus) { // \Omega_{-}
-      for(int s=0; s<Ls; ++s){
-        axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
-      }
-    }
-  } else if(dag == 1) {
-    if(plus) { // \Omega_{+}^{\dagger}
-      for(int sp=0; sp<Ls; ++sp){
-        axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
-      }
-    } else if(minus) { // \Omega_{-}^{\dagger}
-      for(int sp=0; sp<Ls; ++sp){
-        axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
-      }
-    }
-  }
-}
-
-// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
-// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
-// chi = Dtilde psi, built slice by slice along s:
-//   P_- part: b*psi_s plus a hop to slice s+1 with coefficient -c; on the
-//             last slice the hop wraps to slice 0 with coefficient mq1*c
-//   P_+ part: added on top, a hop to slice s-1 with coefficient -c; on the
-//             first slice the hop wraps to slice Ls-1 with coefficient mq1*c
-// with b = (1+alpha)/2 and c = (1-alpha)/2.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  const int   Ls   = this->Ls;
-  const RealD b    = 0.5 * ( 1.0 + this->alpha );
-  const RealD c    = 0.5 * ( 1.0 - this->alpha );
-  const RealD mq1  = this->mq1;
-  const RealD wrap = mq1*c; // coefficient of the boundary (wrap-around) hop
-  const RealD hop  = -c;    // coefficient of an ordinary neighbour hop
-
-  for(int s=0; s<Ls; ++s){
-    // The first-slice case takes priority when a slice is both first and last.
-    const bool first = (s == 0);
-    const bool last  = (!first) && (s == (Ls-1));
-
-    if(last) {
-      axpby_ssp_pminus(chi, b, psi, wrap, psi, s, 0);
-    } else {
-      axpby_ssp_pminus(chi, b, psi, hop, psi, s, s+1);
-    }
-
-    if(first) {
-      axpby_ssp_pplus (chi, 1.0, chi, wrap, psi, s, Ls-1);
-    } else {
-      axpby_ssp_pplus (chi, 1.0, chi, hop, psi, s, s-1);
-    }
-  }
-}
-
-// chi = Dtilde^{-1} psi, using closed-form matrix elements in the s index.
-//
-// For every pair (s, sp) the P_+ and P_- coefficients DtInv_p / DtInv_m are
-// evaluated and the corresponding projected slice of psi is accumulated into
-// slice s of a scratch field, which is copied to chi at the end. The scratch
-// field is needed because each output slice reads every slice of psi.
-//
-// The scratch field is zeroed up front and every term is accumulated with
-// coefficient 1.0, so the P_+ term at sp == 0 is kept and no uninitialised
-// data is read.
-// NOTE(review): this relies on axpby_ssp_pplus/pminus(z,a,x,b,y,s,sp)
-// computing z_s = a*x_s + b*P_{+/-} y_sp, as their use in Dtilde suggests --
-// confirm against their definition.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-  RealD m = this->mq1;
-  RealD c = 0.5 * this->alpha;
-  RealD d = 0.5;
-
-  RealD DtInv_p(0.0), DtInv_m(0.0);
-  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
-
-  FermionField tmp(this->FermionGrid());
-  tmp = Zero();
-
-  for(int s=0; s<Ls; ++s){
-    for(int sp=0; sp<Ls; ++sp){
-
-      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-
-      axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
-      axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
-
-    }}
-
-  // Hand the accumulated result back to the caller.
-  chi = tmp;
-}
-
-/*****************************************************************************************************/
-
-// Unpreconditioned operator. Applies Meooe5D, then DW (no dagger), adds psi,
-// and finally calls M5D(psi, chi). Returns norm2 of the result.
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  FermionField scratch(psi.Grid());
-
-  this->Meooe5D(psi, scratch);
-  this->DW(scratch, chi, DaggerNo);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  this->M5D(psi, chi);
-
-  const RealD normSq = norm2(chi);
-  return normSq;
-}
-
-// Daggered unpreconditioned operator. Applies DW (dagger), then MeooeDag5D,
-// then M5Ddag(psi, chi), and finally adds psi. Returns norm2 of the result.
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  FermionField scratch(psi.Grid());
-
-  this->DW(psi, scratch, DaggerYes);
-  this->MeooeDag5D(scratch, chi);
-  this->M5Ddag(psi, chi);
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  const RealD normSq = norm2(chi);
-  return normSq;
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-// Fifth-dimension term with unit diagonal and -1 off-diagonals; the two
-// boundary entries (upper[Ls-1], lower[0]) are set to mq1. chi is passed to
-// the coefficient-taking overload as both its third and fourth argument.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = this->mq1;
-  lower[0]    = this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-  } else {
-    // no shift term
-    this->M5D(psi, chi, chi, lower, diag, upper);
-  }
-}
-
-// Daggered fifth-dimension term with unit diagonal and -1 off-diagonals; the
-// two boundary entries (upper[Ls-1], lower[0]) are set to mq1. chi is passed
-// to the coefficient-taking overload as both its third and fourth argument.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = this->mq1;
-  lower[0]    = this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-  } else {
-    // no shift term
-    this->M5Ddag(psi, chi, chi, lower, diag, upper);
-  }
-}
-
-// half checkerboard operations
-// Mooee on a half checkerboard: diagonal taken from bee, off-diagonals
-// from -cee, with the two wrap-around entries additionally scaled by -mq1.
-// psi is passed as both the psi and phi inputs of the kernel.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  const int Ls5 = this->Ls;
-
-  // coefficients of Mooee
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> lower(Ls5);
-  for(int s=0; s<Ls5; s++){ lower[s] = -this->cee[s]; }
-  Vector<Coeff_t> upper = lower;   // identical before the boundary scaling
-  lower[0]     *= -this->mq1;
-  upper[Ls5-1] *= -this->mq1;
-
-  // non-zero shift: fused M + shift operation
-  if(this->shift != 0.0){
-    this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-// MooeeDag on a half checkerboard: diagonal from bee; the off-diagonal
-// entries use the neighbouring slice's cee (s+1 for upper, s-1 for lower),
-// with mq1-weighted entries at the two ends. psi is passed as both the psi
-// and phi inputs of the daggered kernel.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  const int Ls5  = this->Ls;
-  const int last = Ls5-1;
-
-  // coefficients of MooeeDag
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls5);
-  Vector<Coeff_t> lower(Ls5);
-  for(int s=0; s<Ls5; s++){
-    // upper: wrap-around entry only on the last slice (the s==0 case wins if both hold)
-    if(s != 0 && s == last){ upper[s] = this->mq1*this->cee[0]; }
-    else                   { upper[s] = -this->cee[s+1]; }
-
-    // lower: wrap-around entry only on the first slice
-    if(s == 0){ lower[s] = this->mq1*this->cee[last]; }
-    else      { lower[s] = -this->cee[s-1]; }
-  }
-
-  // non-zero shift: fused M + shift operation
-  if(this->shift != 0.0){
-    this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-// Computes the coefficient vectors used to apply the Cayley preconditioned shift operators
-//  (Mooee + \Delta) --> Mooee_shift
-//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-// For the latter two cases, the operation takes the form
-//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-template<class Impl>
-void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-{ // NOTE(review): in the header formula the first term is presumably MooeeInv_{ij} \psi_{j} -- confirm
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;    // selects the branch: pm == 1 versus anything else
-  RealD alpha = this->alpha;
-  RealD k     = this->k;
-  RealD mq1   = this->mq1;
-  RealD shift = this->shift;
-
-  // Size all five coefficient vectors to Ls; every entry is overwritten below
-  Mooee_shift.resize(Ls);
-  MooeeInv_shift_lc.resize(Ls);
-  MooeeInv_shift_norm.resize(Ls);
-  MooeeInvDag_shift_lc.resize(Ls);
-  MooeeInvDag_shift_norm.resize(Ls);
-
-  // Construct Mooee_shift: N is a common prefactor whose sign follows pm
-  int idx(0);
-  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
-    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-  for(int s=0; s<Ls; ++s){
-    idx = (pm == 1) ? (s) : (Ls-1-s); // entries are stored in reverse order when pm != 1
-    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-  }
-
-  // Tridiagonal solve for MooeeInvDag_shift_lc
-  {
-    Coeff_t m(0.0);                    // elimination multiplier
-    Vector<Coeff_t> d = Mooee_shift;   // right-hand side, modified in place by the elimination
-    Vector<Coeff_t> u(Ls,0.0);         // unit vector of the rank-one correction
-    Vector<Coeff_t> y(Ls,0.0);         // solution of Mooee'*y = d
-    Vector<Coeff_t> q(Ls,0.0);         // solution of Mooee'*q = u
-    if(pm == 1){ u[0] = 1.0; }
-    else{ u[Ls-1] = 1.0; }
-
-    // Tridiagonal matrix algorithm + Sherman-Morrison formula
-    //
-    // We solve
-    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-    // where Mooee' is the tridiagonal part of Mooee_{+}, and
-    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-    // so that the outer-product u \otimes v gives the (0,Ls-1)
-    // entry of Mooee_{+}.
-    // (The code below mirrors this for pm != 1: u = (0,...,0,1) and the factor mq1*cee[Ls-1] multiplies y[0], q[0].)
-    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-    // and then construct the solution to the original system
-    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-    if(pm == 1){ // forward elimination of the sub-diagonal entries -cee[s]
-      for(int s=1; s<Ls; ++s){
-	m = -this->cee[s] / this->bee[s-1];
-	d[s] -= m*d[s-1];
-	u[s] -= m*u[s-1];
-      }
-    }
-    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-    for(int s=Ls-2; s>=0; --s){
-      if(pm == 1){ // rows are already decoupled by the elimination above: divide by the diagonal
-	y[s] = d[s] / this->bee[s];
-	q[s] = u[s] / this->bee[s];
-      } else { // back substitution using the coupling cee[s] to slice s+1
-	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-      }
-    }
-
-    // Construct MooeeInvDag_shift_lc from y and q using the Sherman-Morrison expression quoted above
-    for(int s=0; s<Ls; ++s){
-      if(pm == 1){
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-      } else {
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-      }
-    }
-
-    // Compute remaining coefficients; N is reused here as a new normalisation (1 + one end entry of MooeeInvDag_shift_lc)
-    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
-    for(int s=0; s<Ls; ++s){
-
-      // MooeeInv_shift_lc: product of powers of bee[s] and cee[s] whose exponents sum to Ls-1
-      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
-      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
-
-      // MooeeInv_shift_norm: -MooeeInvDag_shift_lc[s] divided by (bee^Ls + mq1*cee^Ls) and by N
-      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
-
-      // MooeeInvDag_shift_norm: minus the MooeeInv_shift_lc expression, with the same two divisors
-      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
-     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
-	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-    }
-  }
-}
-
-// Recompute the shift coefficients for a different value of the shift constant.
-// Stores new_shift in this->shift. A non-zero shift triggers a full recompute
-// via SetCoefficientsPrecondShiftOps(); a zero shift resets all five
-// coefficient vectors to Ls zeros.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-  if(new_shift != 0.0){
-    SetCoefficientsPrecondShiftOps();
-  } else {
-    // assign() rather than resize(Ls,0.0): resize only initialises elements
-    // it appends, so vectors that already hold Ls entries from an earlier
-    // non-zero shift would keep their stale values.
-    int Ls = this->Ls;
-    Mooee_shift.assign(Ls,Coeff_t(0.0));
-    MooeeInv_shift_lc.assign(Ls,Coeff_t(0.0));
-    MooeeInv_shift_norm.assign(Ls,Coeff_t(0.0));
-    MooeeInvDag_shift_lc.assign(Ls,Coeff_t(0.0));
-    MooeeInvDag_shift_norm.assign(Ls,Coeff_t(0.0));
-  }
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						   Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{ // Builds dense Ls x Ls matrices Pplus/Pminus and packs them, lane by lane, into Matp/Matm
-  int Ls = this->Ls;
-  // dag != 0 requests the adjoint, inv != 0 the inverse; both are applied to the dense matrices below
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction: Matp/Matm are left untouched
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-  // Both matrices share the diagonal bee[s]
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-  // Pminus gets the super-diagonal -cee[s]; Pplus gets the sub-diagonal -cee[s+1]
-  for(int s=0; s<Ls-1; s++){
-    Pminus(s,s+1) = -this->cee[s];
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-  // Wrap-around corner entries, weighted by mq1
-  Pplus (0,Ls-1) = this->mq1*this->cee[0];
-  Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-  // Shift term: added to the last column of Pplus (pm == 1) or to the first column of Pminus (otherwise)
-  if(this->shift != 0.0){
-    RealD c = 0.5 * this->alpha;
-    RealD d = 0.5;
-    RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-    if(this->pm == 1) {
-      for(int s=0; s<Ls; ++s){
-	Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-      }
-    } else {
-      for(int s=0; s<Ls; ++s){
-	Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-      }
-    }
-  }
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-  // Optional inversion first ...
-  if(inv) {
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else {
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-  // ... then optional adjoint, so dag && inv yields the adjoint of the inverse
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-  // Pack: entry LLs*s2+s1 holds, in SIMD lane l, the matrix element at row l*LLs+s1 and column s2
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      int istride = LLs; // row stride between consecutive SIMD lanes
-      int ostride = 1;   // row stride between consecutive s1 values
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type*) &Vp; // lanes are written through a scalar pointer (type pun)
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real: the real part is stored in both components of the scalar
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-FermOpTemplateInstantiate(MobiusEOFAFermion); // presumably expands to explicit instantiations for the standard Impl types -- confirm against the macro definition
-GparityFermOpTemplateInstantiate(MobiusEOFAFermion); // presumably the same for the G-parity Impl types -- confirm
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
@@ -1,998 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * Dense matrix versions of routines
- */
-// Inverse of Mooee, delegated to MooeeInternal with the
-// (no dagger, inverse) flag combination.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerNo;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-// Shifted inverse of Mooee. In this dense-matrix version it issues exactly
-// the same MooeeInternal call as MooeeInv (no dagger, inverse).
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerNo;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-// Adjoint of the inverse of Mooee, delegated to MooeeInternal with the
-// (dagger, inverse) flag combination.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerYes;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-// Shifted adjoint-inverse of Mooee. In this dense-matrix version it issues
-// exactly the same MooeeInternal call as MooeeInvDag (dagger, inverse).
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerYes;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{ // Vectorised kernel: chi = diag*phi plus lower/upper-weighted spin components of neighbouring slices of psi
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0]; // reduced extent of dimension 0; Ls/LLs == nsimd is asserted below
-  const int nsimd = Simd::Nsimd();
-  // Per-slice coefficients repacked as SIMD vectors, one entry per reduced slice
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // fill the SIMD coefficient vectors lane by lane through scalar pointers (type pun)
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  // lane i of entry o receives the coefficient of slice s = o + i*LLs
-  for(int o=0; o<LLs; o++){ // outer: reduced slice index
-    for(int i=0; i<nsimd; i++){ // inner: SIMD lane
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond(); // timer start; the matching += is at the end of the function
-
-  assert(Nc == 3); // the unrolled kernel below hard-codes colour indices 0..2
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // ss advances by LLs: one block of LLs consecutive sites per iteration
-
-#if 0 // disabled reference implementation using spProj5/spRecon5 and rotate()
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]); // NOTE(review): on the last block this index is >= oSites() -- confirm the prefetch is harmless there
-
-      int vp = (v == LLs-1) ? 0     : v+1; // next slice, wrapping inside the block
-      int vm = (v == 0)     ? LLs-1 : v-1; // previous slice, wrapping inside the block
-      // hp_*: spin components 2,3 of the next slice
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-      // hm_*: spin components 0,1 of the previous slice
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-      // wrap-around in vp: rotate SIMD lanes (counterpart of rotate(hp,hp,1) in the disabled branch)
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      // wrap-around in vm: rotate SIMD lanes (counterpart of rotate(hm,hm,nsimd-1) in the disabled branch)
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Spin 0,1 of the result: d*phi + l*hm; spin 2,3: d*phi + u*hp. Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      // write the twelve spin-colour components of chi[ss+v]
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
-					FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					Vector<Coeff_t>& shift_coeffs)
-{ // Fused kernel: the M5D band operation plus a shift term weighted by shift_coeffs
-#if 0 // disabled reference implementation: plain M5D followed by one axpby_ssp_pplus/pminus call per slice
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-
-  this->M5D(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0]; // reduced extent of dimension 0; Ls/LLs == nsimd is asserted below
-  const int nsimd = Simd::Nsimd();
-  // Per-slice coefficients (upper, lower, diag, shift) repacked as SIMD vectors
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // fill the SIMD coefficient vectors lane by lane through scalar pointers (type pun)
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-  // lane i of entry o receives the coefficient of slice o + i*LLs
-  for(int o=0; o<LLs; o++){ // outer: reduced slice index
-    for(int i=0; i<nsimd; i++){ // inner: SIMD lane
-      int s   = o + i*LLs; // note: this int shadows the coefficient vector s declared above
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond(); // timer start; the matching += is at the end of the function
-
-  assert(Nc == 3); // the unrolled kernel below hard-codes colour indices 0..2
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // ss advances by LLs: one block of LLs consecutive sites per iteration
-    // hs_*: source of the shift term, loaded once per block from slice vs (spin 2,3 if pm == 1, spin 0,1 otherwise)
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]); // NOTE(review): on the last block this index is >= oSites() -- confirm the prefetch is harmless there
-
-      int vp = (v == LLs-1) ? 0     : v+1; // next slice, wrapping inside the block
-      int vm = (v == 0)     ? LLs-1 : v-1; // previous slice, wrapping inside the block
-      // hp_*: spin components 2,3 of the next slice
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-      // hm_*: spin components 0,1 of the previous slice
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-      // wrap-around in vp: rotate SIMD lanes of hp
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      // pm == 1: vs == LLs-1, so this fires only on the last v of the block
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-      // wrap-around in vm: rotate SIMD lanes of hm
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-      // NOTE(review): hs_* live outside the v loop; with vs == 0 this fires at v == 0 and the rotated value is reused for all later v -- confirm intended
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      // The s*hs shift term is added to spin 2,3 when pm == 1 and to spin 0,1 otherwise. Can force these to real arithmetic and save 2x.
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      // write the twelve spin-colour components of chi[ss+v]
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-// Vectorised daggered kernel.
-// For every block of LLs consecutive sites it writes
-//   spin 0,1 of chi = diag*phi + upper * (spin 0,1 of the next slice of psi)
-//   spin 2,3 of chi = diag*phi + lower * (spin 2,3 of the previous slice of psi)
-// with SIMD lane rotations applied where the neighbour index wraps inside
-// the block. lower/diag/upper hold one coefficient per slice (Ls entries).
-// Requires Ls/LLs == Simd::Nsimd(), matching checkerboards of psi and phi,
-// and Nc == 3 (the colour loop is unrolled by hand).
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				     Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  // Per-slice coefficients repacked as SIMD vectors, one entry per reduced slice
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  // lane i of entry o receives the coefficient of slice s = o + i*LLs
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  // The unrolled kernel below hard-codes colour indices 0..2, exactly like
-  // M5D, so it carries the same guard.
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      // neighbouring slices, wrapping inside the block
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      // hp_*: spin components 0,1 of the next slice
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      // hm_*: spin components 2,3 of the previous slice
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      // wrap-around in vp: rotate SIMD lanes of hp
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // wrap-around in vm: rotate SIMD lanes of hm
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // spin 0,1: d*phi + u*hp; spin 2,3: d*phi + l*hm
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      // write the twelve spin-colour components of chi[ss+v]
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl> // M5Ddag_shift: chi = diag*phi + upper/lower * neighbouring 5th-dim slices of psi, plus shift_coeffs * one fixed slice of psi
-void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					   Vector<Coeff_t>& shift_coeffs)
-{
-#if 0 // disabled reference path: plain M5Ddag followed by a per-slice loop that adds the shift term
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-  this->M5Ddag(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-  }
-
-#else // active path: shift term fused into the same loop as the tridiagonal part
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd); // the Ls coefficients are packed as LLs vectors of nsimd lanes each
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs; // index into the input coefficient vectors; this int shadows the Vector 's' inside the loop
-      int ss = o*nsimd + i; // scalar slot: lane i of packed vector o
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond(); // timer start; the matching += is after the loop
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    int vs     = (this->pm == 1) ? LLs-1 : 0; // slice feeding the shift term: last one for pm==1, first one otherwise
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]); // prefetches the same v in the next group of LLs sites
-
-      int vp = (v == LLs-1) ? 0     : v+1; // cyclic neighbours within this group of LLs
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){ // holds only when vp wrapped round to 0
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(this->pm == 1 && vs <= v){ // hs_* live outside the v loop, so a rotation applied here persists for later v
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      if(vm >= v){ // holds only when vm wrapped round to LLs-1
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      if(this->pm == -1 && vs >= v){ // vs is 0 here, so this fires at v==0 and the rotated hs_* are reused for v>0
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-      // spin rows 0,1: d*phi + u*hp (+ s*hs when pm==1); spin rows 2,3: d*phi + l*hm (+ s*hs when pm!=1)
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl> // MooeeInternalAsm: for one 4d 'site', chi[s1] = sum over (s2, SIMD lane) of Mat * broadcast(psi[s2], lane); Matp acts on spin 0,1 and Matm on spin 2,3
-void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-					       int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512 // generic path written with vbroadcast / real_madd
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs; // matrix row selected by (s2, lane)
-	  int lex = s2 + LLs*site;
-
-	  if( s2==0 && l==0 ){ // first term for this s1: reset both accumulators
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-	}}
-
-      {
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-  }
-#else // AVX512 path: the same sum, with the twelve Chi_* accumulators pinned to named zmm registers
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>); // byte step between the matrix entries of consecutive lanes (LLs*LLs elements)
-
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-
-	  if((s2+l)==0) { // first term for this s1: uses VMULMEM rather than VMADDMEM, presumably to initialise the accumulators
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-
-	  a0 = a0 + incr;
-	  a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type); // advance the psi pointer by one scalar per lane iteration
-	}
-      }
-
-      { // NOTE(review): no zmm clobbers are declared above; this store relies on Chi_* surviving between asm statements
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-};
-
-// Z-mobius version: not implemented for EOFA; prints an error and terminates the process with exit(-1)
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-};
-
-template<class Impl> // MooeeInternal: selects the matrix pair for (dag, inv) and applies it site by site through the per-site kernel
-void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs; // number of groups of LLs outer sites; the loops below run once per group
-
-  Vector<iSinglet<Simd>>   Matp; // local storage, filled only in the !inv case
-  Vector<iSinglet<Simd>>   Matm;
-  Vector<iSinglet<Simd>>* _Matp; // set by exactly one of the three branches below
-  Vector<iSinglet<Simd>>* _Matm;
-
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if(inv && dag){ // inverse cases use the matrices already stored on the object
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  }
-
-  if(inv && (!dag)){
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  if(!inv){ // non-inverse case: the matrices are rebuilt into the locals on every call
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++; // NOTE(review): counted and timed under MooeeInv* even when inv==0
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){ // this branch calls the Z kernel, which in this file only prints an error and exits
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef MOBIUS_EOFA_DPERP_VEC // everything up to the matching #endif is compiled only when this macro is defined
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-// explicit instantiations of MooeeInternal for the same eight implementation types
-template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif // MOBIUS_EOFA_DPERP_VEC
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -1,452 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl> // Mdir: single-direction hop of psi into chi, followed by an in-place per-slice rescale of chi
-void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  // this does both dag and undag but is trivial; make a common helper routine
-  int Ls = this->Ls;
-
-  this->DhopDir(psi,chi,dir,disp);
-
-  int nblock=(Ls-1)/2; // slices are handled in pairs (s, s+1); the last slice Ls-1 is handled after the loop
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-  }
-  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-
-}
-template<class Impl> // Meooe_internal: checkerboard hop chosen from psi's parity, then the same per-slice rescale as Mdir
-void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  int Ls = this->Ls;
-  if ( psi.Checkerboard() == Odd ) { // NOTE(review): 'dag' is never read; both hops are called with DaggerNo
-    this->DhopEO(psi,chi,DaggerNo);
-  } else {
-    this->DhopOE(psi,chi,DaggerNo);
-  }
-
-  int nblock=(Ls-1)/2; // slices are handled in pairs (s, s+1); the last slice Ls-1 is handled after the loop
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-  }
-  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-}
-
-template<class Impl> // Mooee_internal: same block layout as M_internal, with the constant dw_diag standing where M_internal uses DW(psi)
-void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  // again dag and undag are trivially related
-  int sign = dag ? (-1) : 1; // the dagger only flips the sign of the couplings to/from the last slice
-  int Ls = this->Ls;
-      
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-	
-    int s = 2*b;
-    RealD pp = p[nblock-1-b]; // coefficients are consumed in reverse order relative to b
-    RealD qq = q[nblock-1-b];
-	
-    // Do each 2x2 block aligned at s and multiplies Dw site diagonal by G5 so Hw
-    ag5xpby_ssp(chi,-dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s  ,s+1); 
-    ag5xpby_ssp(chi, dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s+1,s);
-    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-  }
-      
-  {
-    RealD R=(1+mass)/(1-mass);
-    //R g5 psi[Ls-1] + p[0] H
-    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
-	
-    for(int b=0;b<nblock;b++){ // accumulate the odd slices of psi into the last slice of chi
-      int s = 2*b+1;
-      RealD pp = p[nblock-1-b];
-      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-    }
-  }
-}
-
-template<class Impl> // MooeeInv_internal: applies the three stages labelled Linv, Dinv and Uinv in the comments below
-void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  int sign = dag ? (-1) : 1;
-  int Ls = this->Ls;
-
-  FermionField tmp(psi.Grid()); // receives the Dinv result; read back by the Uinv stage
-      
-  ///////////////////////////////////////////////////////////////////////////////////////
-  //Linv
-  ///////////////////////////////////////////////////////////////////////////////////////
-  int nblock=(Ls-1)/2;
-
-  axpy(chi,0.0,psi,psi); // Identity piece
-      
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    RealD pp = p[nblock-1-b];
-    RealD qq = q[nblock-1-b];
-    RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-    RealD coeff2=sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-    axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s);
-    axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
-  }
-      
-  ///////////////////////////////////////////////////////////////////////////////////////
-  //Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
-  // Compute Seeinv (coeff of gamma5)
-  ///////////////////////////////////////////////////////////////////////////////////////
-  RealD R=(1+mass)/(1-mass);
-  RealD Seeinv = R + p[nblock]*dw_diag/amax;
-  for(int b=0;b<nblock;b++){
-    Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
-  }    
-  Seeinv = 1.0/Seeinv; // from here on Seeinv holds the reciprocal of the sum built above
-      
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    RealD pp = p[nblock-1-b]; // unused in this loop; only qq enters the coefficients
-    RealD qq = q[nblock-1-b];
-    RealD coeff1=dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-    RealD coeff2=amax*sqrt(qq) / ( dw_diag*dw_diag + amax*amax* qq);
-    ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
-    ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
-  }
-  ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
-      
-  ///////////////////////////////////////////////////////////////////////////////////////
-  // Uinv
-  ///////////////////////////////////////////////////////////////////////////////////////
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    RealD pp = p[nblock-1-b];
-    RealD qq = q[nblock-1-b];
-    RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-    RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-    axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1); // every output slice is divided by scale in this stage
-    axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
-  }
-  axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
-}
-
-template<class Impl> // M_internal: computes D = DW(psi) once, then assembles chi slice by slice from D and psi
-void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  FermionField D(psi.Grid());
-  
-  int Ls = this->Ls;
-  int sign = dag ? (-1) : 1; // the dagger only flips the sign of the couplings to/from the last slice
-
-  // For partial frac Hw case (b5=c5=1) chroma quirkily computes
-  //
-  // Conventions for partfrac appear to be a mess.
-  // Tony's Nara lectures have
-  //
-  // BlockDiag(  H/p_i  1             | 1       )    
-  //          (  1      p_i H / q_i^2 | 0       )  
-  //           ---------------------------------
-  //           ( -1      0                | R  +p0 H  )
-  //
-  //Chroma     ( -2H    2sqrt(q_i)    |   0         )
-  //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
-  //           ---------------------------------
-  //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
-  //
-  // Edwards/Joo/Kennedy/Wenger
-  //
-  // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
-  // incorporate the approx scale factor. This is obtained by propagating the
-  // scale on "H" out to the off diagonal elements as follows:
-  //
-  // BlockDiag(  H/p_i  1             | 1       ) 
-  //          (  1      p_i H / q_i^2 | 0       )  
-  //           ---------------------------------
-  //          ( -1      0                | R  + p_0 H  )
-  //
-  // becomes:
-  // BlockDiag(  H/ sp_i  1               | 1             ) 
-  //          (  1      sp_i H / s^2q_i^2 | 0             )  
-  //           ---------------------------------
-  //           ( -1      0                | R + p_0/s H   )
-  //
-  //
-  // This is implemented in Chroma by
-  //           p0' = p0/approxMax
-  //           p_i' = p_i*approxMax
-  //           q_i' = q_i*approxMax*approxMax
-  //
-  // After the equivalence transform is applied the matrix becomes
-  // 
-  //Chroma     ( -2H    sqrt(q'_i)    |   0         )
-  //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
-  //           ---------------------------------
-  //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
-  //
-  //     =     ( -2H    sqrt(q_i)amax    |   0              )
-  //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
-  //           ---------------------------------
-  //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
-  //
-
-  this->DW(psi,D,DaggerNo); // always called with DaggerNo; 'dag' enters only through 'sign'
-
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-	
-    int s = 2*b;
-    double pp = p[nblock-1-b]; // coefficients are consumed in reverse order relative to b
-    double qq = q[nblock-1-b];
-	
-    // Do each 2x2 block aligned at s and
-    ag5xpby_ssp(chi,-1.0*scale,D,amax*sqrt(qq)*scale,psi, s  ,s+1); // Multiplies Dw by G5 so Hw
-    ag5xpby_ssp(chi, 1.0*scale,D,amax*sqrt(qq)*scale,psi, s+1,s);
-	
-    // Pick up last column
-    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-  }
-	
-  {
-    double R=(1+this->mass)/(1-this->mass);
-    //R g5 psi[Ls] + p[0] H
-    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
-    for(int b=0;b<nblock;b++){ // accumulate the odd slices of psi into the last slice of chi
-      int s = 2*b+1;
-      double pp = p[nblock-1-b];
-      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-    }
-  }
-
-}
-
-template<class Impl> // M: out = M_internal(in) with DaggerNo; returns norm2(out)
-RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
-{
-  M_internal(in,out,DaggerNo);
-  return norm2(out);
-}
-template<class Impl> // Mdag: out = M_internal(in) with DaggerYes; returns norm2(out)
-RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
-{
-  M_internal(in,out,DaggerYes);
-  return norm2(out);
-}
-
-template<class Impl> // Meooe: forwards to Meooe_internal with DaggerNo
-void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
-{
-  Meooe_internal(in,out,DaggerNo);
-}
-template<class Impl> // MeooeDag: forwards to Meooe_internal with DaggerYes (Meooe_internal does not read its dag argument)
-void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
-{
-  Meooe_internal(in,out,DaggerYes);
-}
-template<class Impl> // Mooee: forwards to Mooee_internal with DaggerNo
-void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
-{
-  Mooee_internal(in,out,DaggerNo);
-}
-template<class Impl> // MooeeDag: forwards to Mooee_internal with DaggerYes
-void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
-{
-  Mooee_internal(in,out,DaggerYes);
-}
-
-template<class Impl> // MooeeInv: forwards to MooeeInv_internal with DaggerNo
-void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
-{
-  MooeeInv_internal(in,out,DaggerNo);
-}
-template<class Impl> // MooeeInvDag: forwards to MooeeInv_internal with DaggerYes
-void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
-{
-  MooeeInv_internal(in,out,DaggerYes);
-}
-
-
-// force terms; five routines; default to Dhop on diagonal
-template<class Impl> // MDeriv: builds D as the per-slice rescaled copy of U, then calls DhopDeriv(mat, D, V)
-void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int nblock=(Ls-1)/2; // same slice pairing and coefficients as the rescale in Mdir
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-  }
-  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-  this->DhopDeriv(mat,D,V,DaggerNo); // NOTE(review): 'dag' is never read; DaggerNo is always passed
-};
-template<class Impl> // MoeDeriv: builds D as the per-slice rescaled copy of U, then calls DhopDerivOE(mat, D, V)
-void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int nblock=(Ls-1)/2; // same slice pairing and coefficients as the rescale in Mdir
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-  }
-  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-  this->DhopDerivOE(mat,D,V,DaggerNo); // NOTE(review): 'dag' is never read; DaggerNo is always passed
-};
-template<class Impl> // MeoDeriv: builds D as the per-slice rescaled copy of U, then calls DhopDerivEO(mat, D, V)
-void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int nblock=(Ls-1)/2; // same slice pairing and coefficients as the rescale in Mdir
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-  }
-  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-  this->DhopDerivEO(mat,D,V,DaggerNo); // NOTE(review): 'dag' is never read; DaggerNo is always passed
-};
-
-template<class Impl> // SetCoefficientsTanh: forwards to SetCoefficientsZolotarev with zolo_hi = 1/scale
-void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
-  SetCoefficientsZolotarev(1.0/scale,zdata);
-}
-template<class Impl> // SetCoefficientsZolotarev: copies the coefficients out of zdata into p and q, and sets R, dw_diag, scale and amax
-void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
-
-  // check on degree matching
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-  int Ls = this->Ls;
-
-  assert(Ls == (2*zdata->da -1) ); // the supplied approximation must satisfy Ls == 2*da - 1
-
-  // Part frac
-  //      RealD R;
-  R=(1+mass)/(1-mass); // with the local declaration above commented out, this writes an R declared elsewhere (presumably a class member)
-  dw_diag = (4.0-this->M5);
-
-  //      std::vector<RealD> p; 
-  //      std::vector<RealD> q;
-  p.resize(zdata->da);
-  q.resize(zdata->dd);
-	
-  for(int n=0;n<zdata->da;n++){
-    p[n] = zdata -> alpha[n];
-  }
-  for(int n=0;n<zdata->dd;n++){
-    q[n] = -zdata -> ap[n]; // note the sign flip: q stores the negated ap values
-  }
-      
-  scale= part_frac_chroma_convention ? 2.0 : 1.0; // Chroma conventions annoy me
-
-  amax=zolo_hi;
-}
-
-    template<class Impl> // ExportPhysicalFermionSolution: after grid checks, extracts slice Ls-1 of the 5d solution into exported4d
-    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      int Ls = this->Ls;
-      conformable(solution5d.Grid(),this->FermionGrid());
-      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
-    }
-    template<class Impl> // ImportPhysicalFermionSource: places input4d in slice Ls-1 of a zeroed 5d field, multiplies by gamma5, then applies Dminus
-    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      int Ls = this->Ls;
-      conformable(imported5d.Grid(),this->FermionGrid());
-      conformable(input4d.Grid()   ,this->GaugeGrid());
-      FermionField tmp(this->FermionGrid());
-      tmp=Zero(); // every slice other than Ls-1 stays zero
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
-      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
-      this->Dminus(tmp,imported5d);
-    }
-
-// Constructors
-template<class Impl> // constructor: forwards the grids, M5 and p to WilsonFermion5D, stores the mass, then loads coefficients from Approx::higham
-PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
-							 GridCartesian         &FiveDimGrid,
-							 GridRedBlackCartesian &FiveDimRedBlackGrid,
-							 GridCartesian         &FourDimGrid,
-							 GridRedBlackCartesian &FourDimRedBlackGrid,
-							 RealD _mass,RealD M5,
-							 const ImplParams &p) :
-  WilsonFermion5D<Impl>(_Umu,
-			FiveDimGrid, FiveDimRedBlackGrid,
-			FourDimGrid, FourDimRedBlackGrid,M5,p),
-  mass(_mass)
-
-{
-  int Ls = this->Ls;
-
-  assert((Ls&0x1)==1); // Odd Ls required
-  int nrational=Ls-1;
-
-
-  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational); // released below with zolotarev_free once the coefficients are copied
-
-  // NB: chroma uses a cast to "float" for the zolotarev range(!?).
-  // this creates a real difference in the operator which I do not like but we can replicate here
-  // to demonstrate compatibility
-  //      RealD eps = (zolo_lo / zolo_hi);
-  //      zdata = bfm_zolotarev(eps,nrational,0);
-      
-  SetCoefficientsTanh(zdata,1.0);
-
-  Approx::zolotarev_free(zdata);
-
-}
- 
-FermOpTemplateInstantiate(PartialFractionFermion5D); // presumably expands to the explicit instantiations of this class; confirm against the macro definition
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/StaggeredKernels.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernels.cc
@@ -1,294 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
-
-#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);		\
-    } else {							\
-      chi_p = &in[SE->_offset];				\
-    }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  multLink(Uchi, U[sU], *chi_p, Dir);			
-
-#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);		\
-    } else {							\
-      chi_p = &in[SE->_offset];				\
-    }								\
-  } else if ( st.same_node[Dir] ) {				\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], *chi_p, Dir);			\
-  }
-
-#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    nmu++;							\
-    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U[sU], *chi_p, Dir);			\
-  }
-
-template <class Impl>
-StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
-
-////////////////////////////////////////////////////////////////////////////////////
-// Generic implementation; move to different file?
-// Int, Ext, Int+Ext cases for comms overlap
-////////////////////////////////////////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out, int dag) {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0;
-    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
-    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
-    skew=8;
-    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
-    if ( dag ) { 
-      Uchi = - Uchi;
-    } 
-    vstream(out[sF], Uchi);
-  }
-};
-
-  ///////////////////////////////////////////////////
-  // Only contributions from interior of our node
-  ///////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU, 
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  int skew ;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0;
-    Uchi=Zero();
-    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
-    skew=8;
-    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
-    if ( dag ) {
-      Uchi = - Uchi;
-    }
-    vstream(out[sF], Uchi);
-  }
-};
-
-
-  ///////////////////////////////////////////////////
-  // Only contributions from exterior of our node
-  ///////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU,
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
-  const SiteSpinor *chi_p;
-  //  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  int skew ;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0;
-    Uchi=Zero();
-    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
-    skew=8;
-    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
-
-    if ( nmu ) { 
-      if ( dag ) { 
-	out[sF] = out[sF] - Uchi;
-      } else { 
-	out[sF] = out[sF] + Uchi;
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////
-// Driving / wrapping routine to select right kernel
-////////////////////////////////////////////////////////////////////////////////////
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs, int sU,
-					 const FermionFieldView &in, FermionFieldView &out,
-					 int interior,int exterior)
-{
-  int dag=1;
-  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-				      SiteSpinor *buf, int LLs, int sU,
-				      const FermionFieldView &in, FermionFieldView &out,
-				      int interior,int exterior)
-{
-  int dag=0;
-  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-				      SiteSpinor *buf, int LLs,
-				      int sU, const FermionFieldView &in, FermionFieldView &out,
-				      int dag,int interior,int exterior) 
-{
-  switch(Opt) {
-#ifdef AVX512
-  case OptInlineAsm:
-    if ( interior && exterior ) {
-      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else { 
-      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
-      assert(0);
-    }
-    break;
-#endif
-  case OptHandUnroll:
-    if ( interior && exterior ) {
-      DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( interior ) {
-      DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( exterior ) {
-      DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-  case OptGeneric:
-    if ( interior && exterior ) {
-      DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( interior ) {
-      DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( exterior ) {
-      DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-  default:
-    std::cout<<"Oops Opt = "<<Opt<<std::endl;
-    assert(0);
-    break;
-  }
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
-					    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
-{
-  // Disp should be either +1,-1,+3,-3
-  // What about "dag" ?
-  // Because we work out pU . dS/dU 
-  // U
-  assert(0);
-}
-
-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
-
-NAMESPACE_END(Grid);
-
-
--- a/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
@@ -1,972 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid.h>
-
-#ifdef AVX512
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
-#endif
-
-// Interleave operations from two directions
-// This looks just like a 2 spin multiply and reuse same sequence from the Wilson
-// Kernel. But the spin index becomes a mu index instead.
-#define Chi_00 %zmm0
-#define Chi_01 %zmm1
-#define Chi_02 %zmm2
-#define Chi_10 %zmm3
-#define Chi_11 %zmm4
-#define Chi_12 %zmm5
-#define Chi_20 %zmm6
-#define Chi_21 %zmm7
-#define Chi_22 %zmm8
-#define Chi_30 %zmm9
-#define Chi_31 %zmm10
-#define Chi_32 %zmm11
-
-#define UChi_00 %zmm12
-#define UChi_01 %zmm13
-#define UChi_02 %zmm14
-#define UChi_10 %zmm15
-#define UChi_11 %zmm16
-#define UChi_12 %zmm17
-#define UChi_20 %zmm18
-#define UChi_21 %zmm19
-#define UChi_22 %zmm20
-#define UChi_30 %zmm21
-#define UChi_31 %zmm22
-#define UChi_32 %zmm23
-
-#define pChi_00 %%zmm0
-#define pChi_01 %%zmm1
-#define pChi_02 %%zmm2
-#define pChi_10 %%zmm3
-#define pChi_11 %%zmm4
-#define pChi_12 %%zmm5
-#define pChi_20 %%zmm6
-#define pChi_21 %%zmm7
-#define pChi_22 %%zmm8
-#define pChi_30 %%zmm9
-#define pChi_31 %%zmm10
-#define pChi_32 %%zmm11
-
-#define pUChi_00 %%zmm12
-#define pUChi_01 %%zmm13
-#define pUChi_02 %%zmm14
-#define pUChi_10 %%zmm15
-#define pUChi_11 %%zmm16
-#define pUChi_12 %%zmm17
-#define pUChi_20 %%zmm18
-#define pUChi_21 %%zmm19
-#define pUChi_22 %%zmm20
-#define pUChi_30 %%zmm21
-#define pUChi_31 %%zmm22
-#define pUChi_32 %%zmm23
-
-#define T0 %zmm24
-#define T1 %zmm25
-#define T2 %zmm26
-#define T3 %zmm27
-
-#define Z00 %zmm26
-#define Z10 %zmm27
-#define Z0 Z00
-#define Z1 %zmm28
-#define Z2 %zmm29
-
-#define Z3 %zmm30
-#define Z4 %zmm31
-#define Z5 Chi_31
-#define Z6 Chi_32
-
-#define MULT_ADD_LS(g0,g1,g2,g3)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"						\
-        "movq %2, %%r10 \n\t"						\
-        "movq %3, %%r11 \n\t" :  : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
-  asm (									\
-  VSHUF(Chi_00,T0)      VSHUF(Chi_10,T1)				\
-  VSHUF(Chi_20,T2)      VSHUF(Chi_30,T3)				\
-  VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10)		\
-  VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11)		\
-  VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12)		\
-  VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30)		\
-  VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31)		\
-  VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32)		\
-  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
-  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
-  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
-  VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
-  VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
-  VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
-  VSHUF(Chi_01,T0)	  VSHUF(Chi_11,T1)				\
-  VSHUF(Chi_21,T2)	  VSHUF(Chi_31,T3)				\
-  VMADDSUBIDUP(1,%r8,T0,UChi_00)     VMADDSUBIDUP(1,%r9,T1,UChi_10)	\
-  VMADDSUBIDUP(4,%r8,T0,UChi_01)     VMADDSUBIDUP(4,%r9,T1,UChi_11)	\
-  VMADDSUBIDUP(7,%r8,T0,UChi_02)     VMADDSUBIDUP(7,%r9,T1,UChi_12)	\
-  VMADDSUBIDUP(1,%r10,T2,UChi_20)     VMADDSUBIDUP(1,%r11,T3,UChi_30)	\
-  VMADDSUBIDUP(4,%r10,T2,UChi_21)     VMADDSUBIDUP(4,%r11,T3,UChi_31)	\
-  VMADDSUBIDUP(7,%r10,T2,UChi_22)     VMADDSUBIDUP(7,%r11,T3,UChi_32)	\
-  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
-  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
-  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
-  VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
-  VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
-  VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
-  VSHUF(Chi_02,T0)    VSHUF(Chi_12,T1)					\
-  VSHUF(Chi_22,T2)    VSHUF(Chi_32,T3)					\
-  VMADDSUBIDUP(2,%r8,T0,UChi_00)     VMADDSUBIDUP(2,%r9,T1,UChi_10)     \
-  VMADDSUBIDUP(5,%r8,T0,UChi_01)     VMADDSUBIDUP(5,%r9,T1,UChi_11)     \
-  VMADDSUBIDUP(8,%r8,T0,UChi_02)     VMADDSUBIDUP(8,%r9,T1,UChi_12)     \
-  VMADDSUBIDUP(2,%r10,T2,UChi_20)     VMADDSUBIDUP(2,%r11,T3,UChi_30)     \
-  VMADDSUBIDUP(5,%r10,T2,UChi_21)     VMADDSUBIDUP(5,%r11,T3,UChi_31)     \
-  VMADDSUBIDUP(8,%r10,T2,UChi_22)     VMADDSUBIDUP(8,%r11,T3,UChi_32)     \
-  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
-  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
-  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
-  VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
-  VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
-  VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
-
-#define MULT_LS(g0,g1,g2,g3)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"						\
-        "movq %2, %%r10 \n\t"						\
-        "movq %3, %%r11 \n\t" :  : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
-  asm (									\
-  VSHUF(Chi_00,T0)      VSHUF(Chi_10,T1)				\
-  VSHUF(Chi_20,T2)      VSHUF(Chi_30,T3)				\
-  VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10)		\
-  VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11)		\
-  VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12)		\
-  VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30)		\
-  VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31)		\
-  VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32)		\
-  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
-  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
-  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
-  VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
-  VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
-  VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
-  VSHUF(Chi_01,T0)	  VSHUF(Chi_11,T1)				\
-  VSHUF(Chi_21,T2)	  VSHUF(Chi_31,T3)				\
-  VMADDSUBIDUP(1,%r8,T0,UChi_00)     VMADDSUBIDUP(1,%r9,T1,UChi_10)	\
-  VMADDSUBIDUP(4,%r8,T0,UChi_01)     VMADDSUBIDUP(4,%r9,T1,UChi_11)	\
-  VMADDSUBIDUP(7,%r8,T0,UChi_02)     VMADDSUBIDUP(7,%r9,T1,UChi_12)	\
-  VMADDSUBIDUP(1,%r10,T2,UChi_20)     VMADDSUBIDUP(1,%r11,T3,UChi_30)	\
-  VMADDSUBIDUP(4,%r10,T2,UChi_21)     VMADDSUBIDUP(4,%r11,T3,UChi_31)	\
-  VMADDSUBIDUP(7,%r10,T2,UChi_22)     VMADDSUBIDUP(7,%r11,T3,UChi_32)	\
-  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
-  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
-  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
-  VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
-  VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
-  VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
-  VSHUF(Chi_02,T0)    VSHUF(Chi_12,T1)					\
-  VSHUF(Chi_22,T2)    VSHUF(Chi_32,T3)					\
-  VMADDSUBIDUP(2,%r8,T0,UChi_00)     VMADDSUBIDUP(2,%r9,T1,UChi_10)     \
-  VMADDSUBIDUP(5,%r8,T0,UChi_01)     VMADDSUBIDUP(5,%r9,T1,UChi_11)     \
-  VMADDSUBIDUP(8,%r8,T0,UChi_02)     VMADDSUBIDUP(8,%r9,T1,UChi_12)     \
-  VMADDSUBIDUP(2,%r10,T2,UChi_20)     VMADDSUBIDUP(2,%r11,T3,UChi_30)     \
-  VMADDSUBIDUP(5,%r10,T2,UChi_21)     VMADDSUBIDUP(5,%r11,T3,UChi_31)     \
-  VMADDSUBIDUP(8,%r10,T2,UChi_22)     VMADDSUBIDUP(8,%r11,T3,UChi_32)     \
-  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
-  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
-  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
-  VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
-  VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
-  VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
-
-#define MULT_ADD_XYZTa(g0,g1)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"	 :  : "r"(g0), "r"(g1) : "%r8","%r9");\
-	   __asm__ (						\
-	   VSHUF(Chi_00,T0)				\
-	   VSHUF(Chi_10,T1)						\
-	   VMOVIDUP(0,%r8,Z0 )						\
-           VMOVIDUP(3,%r8,Z1 )						\
-           VMOVIDUP(6,%r8,Z2 )						\
-           VMADDSUB(Z0,T0,UChi_00)					\
-	   VMADDSUB(Z1,T0,UChi_01)					\
-	   VMADDSUB(Z2,T0,UChi_02)					\
-									\
-	   VMOVIDUP(0,%r9,Z0 )						\
-           VMOVIDUP(3,%r9,Z1 )						\
-           VMOVIDUP(6,%r9,Z2 )						\
-           VMADDSUB(Z0,T1,UChi_10)					\
-           VMADDSUB(Z1,T1,UChi_11)            \
-           VMADDSUB(Z2,T1,UChi_12)            \
-	   							\
-								\
-	   VMOVRDUP(0,%r8,Z3 )					\
-	   VMOVRDUP(3,%r8,Z4 )					\
-	   VMOVRDUP(6,%r8,Z5 )					\
-           VMADDSUB(Z3,Chi_00,UChi_00)/*rr * ir = ri rr*/	\
-           VMADDSUB(Z4,Chi_00,UChi_01)				\
-           VMADDSUB(Z5,Chi_00,UChi_02)				\
-								\
-	   VMOVRDUP(0,%r9,Z3 )					\
-	   VMOVRDUP(3,%r9,Z4 )					\
-	   VMOVRDUP(6,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_10,UChi_10)				\
-           VMADDSUB(Z4,Chi_10,UChi_11)\
-           VMADDSUB(Z5,Chi_10,UChi_12)				\
-	   							\
-								\
-	   VMOVIDUP(1,%r8,Z0 )					\
-	   VMOVIDUP(4,%r8,Z1 )					\
-	   VMOVIDUP(7,%r8,Z2 )					\
-	   VSHUF(Chi_01,T0)					\
-           VMADDSUB(Z0,T0,UChi_00)				\
-           VMADDSUB(Z1,T0,UChi_01)				\
-           VMADDSUB(Z2,T0,UChi_02)				\
-								\
-	   VMOVIDUP(1,%r9,Z0 )					\
-	   VMOVIDUP(4,%r9,Z1 )					\
-	   VMOVIDUP(7,%r9,Z2 )					\
-	   VSHUF(Chi_11,T1)					\
-           VMADDSUB(Z0,T1,UChi_10)				\
-           VMADDSUB(Z1,T1,UChi_11)				\
-           VMADDSUB(Z2,T1,UChi_12)				\
-								\
-	   VMOVRDUP(1,%r8,Z3 )					\
-	   VMOVRDUP(4,%r8,Z4 )					\
-	   VMOVRDUP(7,%r8,Z5 )					\
-           VMADDSUB(Z3,Chi_01,UChi_00)				\
-           VMADDSUB(Z4,Chi_01,UChi_01)				\
-           VMADDSUB(Z5,Chi_01,UChi_02)				\
-								\
-	   VMOVRDUP(1,%r9,Z3 )					\
-	   VMOVRDUP(4,%r9,Z4 )					\
-	   VMOVRDUP(7,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_11,UChi_10)				\
-           VMADDSUB(Z4,Chi_11,UChi_11)				\
-           VMADDSUB(Z5,Chi_11,UChi_12)				\
-	   							\
-	   VSHUF(Chi_02,T0)					\
-	   VSHUF(Chi_12,T1)					\
-	   VMOVIDUP(2,%r8,Z0 )					\
-	   VMOVIDUP(5,%r8,Z1 )					\
-	   VMOVIDUP(8,%r8,Z2 )					\
-           VMADDSUB(Z0,T0,UChi_00)				\
-           VMADDSUB(Z1,T0,UChi_01)			      \
-           VMADDSUB(Z2,T0,UChi_02)			      \
-	   VMOVIDUP(2,%r9,Z0 )					\
-	   VMOVIDUP(5,%r9,Z1 )					\
-	   VMOVIDUP(8,%r9,Z2 )					\
-           VMADDSUB(Z0,T1,UChi_10)			      \
-           VMADDSUB(Z1,T1,UChi_11)			      \
-           VMADDSUB(Z2,T1,UChi_12)			      \
-	   /*55*/					      \
-	   VMOVRDUP(2,%r8,Z3 )		  \
-	   VMOVRDUP(5,%r8,Z4 )					\
-	   VMOVRDUP(8,%r8,Z5 )				      \
-           VMADDSUB(Z3,Chi_02,UChi_00)			      \
-           VMADDSUB(Z4,Chi_02,UChi_01)			      \
-           VMADDSUB(Z5,Chi_02,UChi_02)			      \
-	   VMOVRDUP(2,%r9,Z3 )		  \
-	   VMOVRDUP(5,%r9,Z4 )					\
-	   VMOVRDUP(8,%r9,Z5 )				      \
-           VMADDSUB(Z3,Chi_12,UChi_10)			      \
-           VMADDSUB(Z4,Chi_12,UChi_11)			      \
-           VMADDSUB(Z5,Chi_12,UChi_12)			      \
-	   /*61 insns*/							);
-
-#define MULT_ADD_XYZT(g0,g1)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"	 :  : "r"(g0), "r"(g1) : "%r8","%r9");\
-  __asm__ (							  \
-  VSHUFMEM(0,%r8,Z00)		   VSHUFMEM(0,%r9,Z10)			\
-  VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)	          \
-   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
-   VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
-   VSHUFMEM(3,%r8,Z00)	      VSHUFMEM(3,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
-   VSHUFMEM(6,%r8,Z00)	      VSHUFMEM(6,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
-   VMADDMEM(0,%r8,T0,UChi_00)  VMADDMEM(0,%r9,T1,UChi_10)		  \
-   VMADDMEM(3,%r8,T0,UChi_01)  VMADDMEM(3,%r9,T1,UChi_11)		  \
-   VMADDMEM(6,%r8,T0,UChi_02)  VMADDMEM(6,%r9,T1,UChi_12)		  \
-   VSHUFMEM(1,%r8,Z00)	      VSHUFMEM(1,%r9,Z10)		  \
-   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
-   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
-   VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
-   VSHUFMEM(4,%r8,Z00)	      VSHUFMEM(4,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
-   VSHUFMEM(7,%r8,Z00)	      VSHUFMEM(7,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
-   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
-   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
-   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
-   VSHUFMEM(2,%r8,Z00)	      VSHUFMEM(2,%r9,Z10)			\
-   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)			\
-   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)			\
-   VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
-   VSHUFMEM(5,%r8,Z00)	      VSHUFMEM(5,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
-   VSHUFMEM(8,%r8,Z00)	      VSHUFMEM(8,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
-   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
-   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)	          \
-   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)	       \
-   VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
-   VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
-   VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
-
-#define MULT_XYZT(g0,g1)					\
-    asm ( "movq %0, %%r8 \n\t"						\
-	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
-	   __asm__ (						\
-	   VSHUF(Chi_00,T0)				\
-	   VSHUF(Chi_10,T1)						\
-	   VMOVIDUP(0,%r8,Z0 )						\
-           VMOVIDUP(3,%r8,Z1 )						\
-           VMOVIDUP(6,%r8,Z2 )						\
-	   /*6*/							\
-           VMUL(Z0,T0,UChi_00)            \
-           VMUL(Z1,T0,UChi_01)            \
-           VMUL(Z2,T0,UChi_02)            \
-	   VMOVIDUP(0,%r9,Z0 )						\
-           VMOVIDUP(3,%r9,Z1 )						\
-           VMOVIDUP(6,%r9,Z2 )						\
-           VMUL(Z0,T1,UChi_10)            \
-           VMUL(Z1,T1,UChi_11)            \
-           VMUL(Z2,T1,UChi_12)            \
-	   VMOVRDUP(0,%r8,Z3 )					\
-	   VMOVRDUP(3,%r8,Z4 )					\
-	   VMOVRDUP(6,%r8,Z5 )					\
-	   /*18*/						\
-           VMADDSUB(Z3,Chi_00,UChi_00)				\
-           VMADDSUB(Z4,Chi_00,UChi_01)\
-           VMADDSUB(Z5,Chi_00,UChi_02) \
-	   VMOVRDUP(0,%r9,Z3 )					\
-	   VMOVRDUP(3,%r9,Z4 )					\
-	   VMOVRDUP(6,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_10,UChi_10)				\
-           VMADDSUB(Z4,Chi_10,UChi_11)\
-           VMADDSUB(Z5,Chi_10,UChi_12)				\
-	   VMOVIDUP(1,%r8,Z0 )					\
-	   VMOVIDUP(4,%r8,Z1 )					\
-	   VMOVIDUP(7,%r8,Z2 )					\
-	   /*28*/						\
-	   VSHUF(Chi_01,T0)					\
-           VMADDSUB(Z0,T0,UChi_00)      \
-           VMADDSUB(Z1,T0,UChi_01)       \
-           VMADDSUB(Z2,T0,UChi_02)        \
-	   VMOVIDUP(1,%r9,Z0 )					\
-	   VMOVIDUP(4,%r9,Z1 )					\
-	   VMOVIDUP(7,%r9,Z2 )					\
-	   VSHUF(Chi_11,T1)					\
-           VMADDSUB(Z0,T1,UChi_10)				\
-           VMADDSUB(Z1,T1,UChi_11)				\
-           VMADDSUB(Z2,T1,UChi_12)        \
-	   VMOVRDUP(1,%r8,Z3 )					\
-	   VMOVRDUP(4,%r8,Z4 )					\
-	   VMOVRDUP(7,%r8,Z5 )					\
-           /*38*/						\
-           VMADDSUB(Z3,Chi_01,UChi_00)    \
-           VMADDSUB(Z4,Chi_01,UChi_01)    \
-           VMADDSUB(Z5,Chi_01,UChi_02)    \
-	   VMOVRDUP(1,%r9,Z3 )					\
-	   VMOVRDUP(4,%r9,Z4 )					\
-	   VMOVRDUP(7,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_11,UChi_10)				\
-           VMADDSUB(Z4,Chi_11,UChi_11)    \
-           VMADDSUB(Z5,Chi_11,UChi_12)				\
-	   /*48*/						\
-	   VSHUF(Chi_02,T0)					\
-	   VSHUF(Chi_12,T1)					\
-	   VMOVIDUP(2,%r8,Z0 )					\
-	   VMOVIDUP(5,%r8,Z1 )					\
-	   VMOVIDUP(8,%r8,Z2 )					\
-           VMADDSUB(Z0,T0,UChi_00)				\
-           VMADDSUB(Z1,T0,UChi_01)			      \
-           VMADDSUB(Z2,T0,UChi_02)			      \
-	   VMOVIDUP(2,%r9,Z0 )					\
-	   VMOVIDUP(5,%r9,Z1 )					\
-	   VMOVIDUP(8,%r9,Z2 )					\
-           VMADDSUB(Z0,T1,UChi_10)			      \
-           VMADDSUB(Z1,T1,UChi_11)			      \
-           VMADDSUB(Z2,T1,UChi_12)			      \
-	   /*55*/					      \
-	   VMOVRDUP(2,%r8,Z3 )		  \
-	   VMOVRDUP(5,%r8,Z4 )					\
-	   VMOVRDUP(8,%r8,Z5 )				      \
-           VMADDSUB(Z3,Chi_02,UChi_00)			      \
-           VMADDSUB(Z4,Chi_02,UChi_01)			      \
-           VMADDSUB(Z5,Chi_02,UChi_02)			      \
-	   VMOVRDUP(2,%r9,Z3 )		  \
-	   VMOVRDUP(5,%r9,Z4 )					\
-	   VMOVRDUP(8,%r9,Z5 )				      \
-           VMADDSUB(Z3,Chi_12,UChi_10)			      \
-           VMADDSUB(Z4,Chi_12,UChi_11)			      \
-           VMADDSUB(Z5,Chi_12,UChi_12)			      \
-	   /*61 insns*/							);
-
-#define MULT_XYZTa(g0,g1)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
-  __asm__ (							  \
-   VSHUFMEM(0,%r8,Z00)		   VSHUFMEM(0,%r9,Z10)	  \
-   VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)	          \
-   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
-   VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
-   VSHUFMEM(3,%r8,Z00)	      VSHUFMEM(3,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
-   VSHUFMEM(6,%r8,Z00)	      VSHUFMEM(6,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
-   VMULMEM(0,%r8,T0,UChi_00)  VMULMEM(0,%r9,T1,UChi_10)		  \
-   VMULMEM(3,%r8,T0,UChi_01)  VMULMEM(3,%r9,T1,UChi_11)		  \
-   VMULMEM(6,%r8,T0,UChi_02)  VMULMEM(6,%r9,T1,UChi_12)		  \
-   VSHUFMEM(1,%r8,Z00)	      VSHUFMEM(1,%r9,Z10)		  \
-   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
-   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
-   VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
-   VSHUFMEM(4,%r8,Z00)	      VSHUFMEM(4,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
-   VSHUFMEM(7,%r8,Z00)	      VSHUFMEM(7,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
-   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
-   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
-   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
-   VSHUFMEM(2,%r8,Z00)	      VSHUFMEM(2,%r9,Z10)			\
-   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)			\
-   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)			\
-   VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
-   VSHUFMEM(5,%r8,Z00)	      VSHUFMEM(5,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
-   VSHUFMEM(8,%r8,Z00)	      VSHUFMEM(8,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
-   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
-   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)	          \
-   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)	       \
-   VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
-   VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
-   VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
-
-
-#define LOAD_CHI(a0,a1,a2,a3)						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_00)						\
-       VLOAD(1,%%r8,pChi_01)						\
-       VLOAD(2,%%r8,pChi_02)						\
-       : : "r" (a0) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_10)						\
-       VLOAD(1,%%r8,pChi_11)						\
-       VLOAD(2,%%r8,pChi_12)						\
-       : : "r" (a1) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_20)						\
-       VLOAD(1,%%r8,pChi_21)						\
-       VLOAD(2,%%r8,pChi_22)						\
-       : : "r" (a2) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_30)						\
-       VLOAD(1,%%r8,pChi_31)						\
-       VLOAD(2,%%r8,pChi_32)						\
-       : : "r" (a3) : "%r8" );						
-
-#define LOAD_CHIa(a0,a1)						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_00)						\
-       VLOAD(1,%%r8,pChi_01)						\
-       VLOAD(2,%%r8,pChi_02)						\
-       : : "r" (a0) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_10)						\
-       VLOAD(1,%%r8,pChi_11)						\
-       VLOAD(2,%%r8,pChi_12)						\
-       : : "r" (a1) : "%r8" );						
-
-#define PF_CHI(a0)							
-#define PF_CHIa(a0)							\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VPREFETCH1(0,%%r8)						\
-       VPREFETCH1(1,%%r8)						\
-       VPREFETCH1(2,%%r8)						\
-       : : "r" (a0) : "%r8" );						\
-
-#define PF_GAUGE_XYZT(a0)							
-#define PF_GAUGE_XYZTa(a0)						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VPREFETCH1(0,%%r8)						\
-       VPREFETCH1(1,%%r8)						\
-       VPREFETCH1(2,%%r8)						\
-       VPREFETCH1(3,%%r8)						\
-       VPREFETCH1(4,%%r8)						\
-       VPREFETCH1(5,%%r8)						\
-       VPREFETCH1(6,%%r8)						\
-       VPREFETCH1(7,%%r8)						\
-       VPREFETCH1(8,%%r8)						\
-       : : "r" (a0) : "%r8" );						\
-
-#define PF_GAUGE_LS(a0)							
-#define PF_GAUGE_LSa(a0)							\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VPREFETCH1(0,%%r8)						\
-       VPREFETCH1(1,%%r8)						\
-       : : "r" (a0) : "%r8" );						\
-  
-
-#define REDUCE(out)					\
-  asm (							\
-  VADD(UChi_00,UChi_10,UChi_00)				\
-  VADD(UChi_01,UChi_11,UChi_01)				\
-  VADD(UChi_02,UChi_12,UChi_02)				\
-  VADD(UChi_30,UChi_20,UChi_30)				\
-  VADD(UChi_31,UChi_21,UChi_31)				\
-  VADD(UChi_32,UChi_22,UChi_32)				\
-  VADD(UChi_00,UChi_30,UChi_00)				\
-  VADD(UChi_01,UChi_31,UChi_01)				\
-  VADD(UChi_02,UChi_32,UChi_02)				);	\
-  asm (								\
-       VSTORE(0,%0,pUChi_00)					\
-       VSTORE(1,%0,pUChi_01)					\
-       VSTORE(2,%0,pUChi_02)					\
-       : : "r" (out) : "memory" );
-
-#define nREDUCE(out)							\
-  asm (									\
-       VADD(UChi_00,UChi_10,UChi_00)					\
-       VADD(UChi_01,UChi_11,UChi_01)					\
-       VADD(UChi_02,UChi_12,UChi_02)					\
-       VADD(UChi_30,UChi_20,UChi_30)					\
-       VADD(UChi_31,UChi_21,UChi_31)					\
-       VADD(UChi_32,UChi_22,UChi_32)					\
-       VADD(UChi_00,UChi_30,UChi_00)					\
-       VADD(UChi_01,UChi_31,UChi_01)					\
-       VADD(UChi_02,UChi_32,UChi_02)				);	\
-  asm (VZERO(Chi_00)							\
-       VSUB(UChi_00,Chi_00,UChi_00)					\
-       VSUB(UChi_01,Chi_00,UChi_01)					\
-       VSUB(UChi_02,Chi_00,UChi_02)				);	\
-  asm (								\
-       VSTORE(0,%0,pUChi_00)					\
-       VSTORE(1,%0,pUChi_01)					\
-       VSTORE(2,%0,pUChi_02)					\
-       : : "r" (out) : "memory" );
-
-#define REDUCEa(out)					\
-  asm (							\
-  VADD(UChi_00,UChi_10,UChi_00)				\
-  VADD(UChi_01,UChi_11,UChi_01)				\
-  VADD(UChi_02,UChi_12,UChi_02)	);			\
-  asm (							\
-  VSTORE(0,%0,pUChi_00)					\
-  VSTORE(1,%0,pUChi_01)					\
-  VSTORE(2,%0,pUChi_02)					\
-  : : "r" (out) : "memory" );
-
-// FIXME is sign right in the VSUB ?
-#define nREDUCEa(out)					\
-  asm (							\
-  VADD(UChi_00,UChi_10,UChi_00)				\
-  VADD(UChi_01,UChi_11,UChi_01)				\
-  VADD(UChi_02,UChi_12,UChi_02)	);			\
-  asm (VZERO(Chi_00)							\
-       VSUB(UChi_00,Chi_00,UChi_00)					\
-       VSUB(UChi_01,Chi_00,UChi_01)					\
-       VSUB(UChi_02,Chi_00,UChi_02)				);	\
-  asm (									\
-       VSTORE(0,%0,pUChi_00)				\
-       VSTORE(1,%0,pUChi_01)				\
-       VSTORE(2,%0,pUChi_02)				\
-       : : "r" (out) : "memory" );
-
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_0,Chi_0);\
-      permute##dir(Chi_1,Chi_1);\
-      permute##dir(Chi_2,Chi_2);
-
-NAMESPACE_BEGIN(Grid);
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-					 DoubledGaugeFieldView &U,
-					 DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs,
-					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  assert(0);
-};
-
-
-//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in[o] ; } else { out =(uint64_t) &buf[o]; }
-
-#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }
-
-#define PREPARE_XYZT(X,Y,Z,T,skew,UU)			\
-  PREPARE(X,Y,Z,T,skew,UU);				\
-  PF_GAUGE_XYZT(gauge0);					\
-  PF_GAUGE_XYZT(gauge1);					\
-  PF_GAUGE_XYZT(gauge2);					\
-  PF_GAUGE_XYZT(gauge3);					
-
-#define PREPARE_LS(X,Y,Z,T,skew,UU)			\
-  PREPARE(X,Y,Z,T,skew,UU);				\
-  PF_GAUGE_LS(gauge0);					\
-  PF_GAUGE_LS(gauge1);					\
-  PF_GAUGE_LS(gauge2);					\
-  PF_GAUGE_LS(gauge3);					
-
-#define PREPARE(X,Y,Z,T,skew,UU)					\
-  SE0=st.GetEntry(ptype,X+skew,sF);					\
-  o0 = SE0->_offset;							\
-  l0 = SE0->_is_local;							\
-  p0 = SE0->_permute;							\
-  CONDITIONAL_MOVE(l0,o0,addr0);					\
-  PF_CHI(addr0);							\
-  									\
-  SE1=st.GetEntry(ptype,Y+skew,sF);					\
-  o1 = SE1->_offset;							\
-  l1 = SE1->_is_local;							\
-  p1 = SE1->_permute;							\
-  CONDITIONAL_MOVE(l1,o1,addr1);					\
-  PF_CHI(addr1);							\
-  									\
-  SE2=st.GetEntry(ptype,Z+skew,sF);					\
-  o2 = SE2->_offset;							\
-  l2 = SE2->_is_local;							\
-  p2 = SE2->_permute;							\
-  CONDITIONAL_MOVE(l2,o2,addr2);					\
-  PF_CHI(addr2);							\
-  									\
-  SE3=st.GetEntry(ptype,T+skew,sF);					\
-  o3 = SE3->_offset;							\
-  l3 = SE3->_is_local;							\
-  p3 = SE3->_permute;							\
-  CONDITIONAL_MOVE(l3,o3,addr3);					\
-  PF_CHI(addr3);							\
-  									\
-  gauge0 =(uint64_t)&UU[sU]( X );				\
-  gauge1 =(uint64_t)&UU[sU]( Y );				\
-  gauge2 =(uint64_t)&UU[sU]( Z );				\
-  gauge3 =(uint64_t)&UU[sU]( T ); 
-  
-  // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeFieldView &U,
-								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-   for(int s=0;s<LLs;s++){
-
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) {
-      nREDUCE(addr0);
-    } else { 
-      REDUCE(addr0);
-    }
-   }
-#else 
-    assert(0);
-#endif
-   
-}
-
-#include <simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeFieldView &U,
-								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) {
-      nREDUCE(addr0);
-    } else { 
-      REDUCE(addr0);
-    }
-  }
-#else 
-  assert(0);
-#endif
-}
-   
-   
-
-
-#define PERMUTE_DIR3 __asm__ (	\
-  VPERM3(Chi_00,Chi_00)	\
-  VPERM3(Chi_01,Chi_01)	\
-  VPERM3(Chi_02,Chi_02)	);
-
-#define PERMUTE_DIR2 __asm__ (	\
-  VPERM2(Chi_10,Chi_10)	\
-  VPERM2(Chi_11,Chi_11)	\
-  VPERM2(Chi_12,Chi_12) );
-
-#define PERMUTE_DIR1 __asm__ (	\
-  VPERM1(Chi_00,Chi_00)	\
-  VPERM1(Chi_01,Chi_01)	\
-  VPERM1(Chi_02,Chi_02)	);
-
-#define PERMUTE_DIR0 __asm__ (			\
-  VPERM0(Chi_10,Chi_10)	\
-  VPERM0(Chi_11,Chi_11)	\
-  VPERM0(Chi_12,Chi_12) );
-
-#define PERMUTE01 \
-  if ( p0 ) { PERMUTE_DIR3; }\
-  if ( p1 ) { PERMUTE_DIR2; }
-
-#define PERMUTE23 \
-  if ( p2 ) { PERMUTE_DIR1; }\
-  if ( p3 ) { PERMUTE_DIR0; }
-
-  // This is the single precision 5th direction vectorised kernel
-
-#include <simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeFieldView &U,
-							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
-							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-  for(int s=0;s<LLs;s++){
-    
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) { 
-      nREDUCEa(addr0);
-    } else { 
-      REDUCEa(addr0);
-    }
-  }
-#else 
-  assert(0);
-#endif
-}
-
-#include <simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeFieldView &U,
-							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
-							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-  for(int s=0;s<LLs;s++){
-    
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) {
-      nREDUCEa(addr0);
-    } else { 
-      REDUCEa(addr0);
-    }
-  }
-#else 
-  assert(0);
-#endif
-}
-
-#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL)			    \
-  template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo,	\
-				  DoubledGaugeFieldView &U,			\
-				  DoubledGaugeFieldView &UUU,		\
-				  SiteSpinor *buf, int LLs,		\
-				  int sU, const FermionFieldView &in, FermionFieldView &out,int dag);
-
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -1,396 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#define LOAD_CHI(b)		\
-  const SiteSpinor & ref (b[offset]);	\
-    Chi_0=ref()()(0);\
-    Chi_1=ref()()(1);\
-    Chi_2=ref()()(2);
-
-
-// To splat or not to splat depends on the implementation
-#define MULT(A,UChi)				\
-  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
-    UChi ## _0  = U_00*Chi_0;	       \
-    UChi ## _1  = U_10*Chi_0;\
-    UChi ## _2  = U_20*Chi_0;\
-    UChi ## _0 += U_01*Chi_1;\
-    UChi ## _1 += U_11*Chi_1;\
-    UChi ## _2 += U_21*Chi_1;\
-    UChi ## _0 += U_02*Chi_2;\
-    UChi ## _1 += U_12*Chi_2;\
-    UChi ## _2 += U_22*Chi_2;
-
-#define MULT_ADD(U,A,UChi)			\
-  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
-    UChi ## _0 += U_00*Chi_0;	       \
-    UChi ## _1 += U_10*Chi_0;\
-    UChi ## _2 += U_20*Chi_0;\
-    UChi ## _0 += U_01*Chi_1;\
-    UChi ## _1 += U_11*Chi_1;\
-    UChi ## _2 += U_21*Chi_1;\
-    UChi ## _0 += U_02*Chi_2;\
-    UChi ## _1 += U_12*Chi_2;\
-    UChi ## _2 += U_22*Chi_2;
-
-
-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-
-#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);	\
-  offset = SE->_offset;			\
-  local  = SE->_is_local;		\
-  perm   = SE->_permute;		\
-  if ( local ) {						\
-    LOAD_CHI(in);					\
-    if ( perm) {						\
-      PERMUTE_DIR(Perm);					\
-    }								\
-  } else {							\
-    LOAD_CHI(buf);						\
-  }								
-
-#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
-  {								\
-    MULT(Dir,even);						\
-  }
-
-#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even)			\
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
-  {								\
-    MULT_ADD(U,Dir,even);					\
-  }
-
-
-
-#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);			\
-  offset = SE->_offset;					\
-  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ( local ) {					\
-    LOAD_CHI(in);				\
-    if ( perm) {					\
-      PERMUTE_DIR(Perm);				\
-    }							\
-  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
-  }							\
-  if (SE->_is_local || st.same_node[Dir] ) {		\
-    MULT_ADD(U,Dir,even);				\
-  }
-
-#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);			\
-  offset = SE->_offset;					\
-  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
-    { MULT_ADD(U,Dir,even); }					\
-  }								
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
-					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
-					  SiteSpinor *buf, int LLs, int sU, 
-					  const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
-  int offset,local,perm, ptype;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    skew = 0;
-    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
-    HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);   
-    HAND_STENCIL_LEG      (U,Zp,1,skew,even);  
-    HAND_STENCIL_LEG      (U,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG      (U,Xm,3,skew,even);  
-    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
-    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
-    skew = 8;
-    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
-    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG(UUU,Zp,1,skew,even);  
-    HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG(UUU,Xm,3,skew,even);  
-    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
-    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
-    
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else { 
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out[sF],result);
-  }
-}
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
-  int offset, ptype, local, perm;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
-
-    skew = 0;
-    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
-    skew = 8;
-    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
-
-    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else { 
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out[sF],result);
-  }
-}
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
-  int offset, ptype, local, perm;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
-    int nmu=0;
-    skew = 0;
-    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
-    skew = 8;
-    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
-
-    // Add sum of all exterior connected stencil legs
-    if ( nmu ) { 
-      if ( dag ) {
-	result()()(0) = - even_0 - odd_0;
-	result()()(1) = - even_1 - odd_1;
-	result()()(2) = - even_2 - odd_2;
-      } else { 
-	result()()(0) = even_0 + odd_0;
-	result()()(1) = even_1 + odd_1;
-	result()()(2) = even_2 + odd_2;
-      }
-      out[sF] = out[sF] + result;
-    }
-  }
-}
-
-
-#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
-  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-									\
-  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-									\
-  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
-
-NAMESPACE_END(Grid);
-
-
--- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
@@ -0,0 +1,203 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class S, class Representation = FundamentalRepresentation >
+class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+  // Staggered-fermion implementation policy with LsVectorised=true; gauge links are kept as scalars and vsplat-ed on use.
+public:
+
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  typedef RealD   Coeff_t ;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+
+  // Necessary? (isFundamental above already exposes the representation's own flag.)
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+
+  // NOTE(review): presumably brings Simd, GaugeField, GaugeLinkField, SiteGaugeLink etc. into scope - macro defined elsewhere.
+  INHERIT_GIMPL_TYPES(Gimpl);
+  // Site-level tensor shapes, parameterised on the element type (scalar or SIMD vector).
+  template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+
+  // Make the doubled gauge field a *scalar*
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef iImplPropagator<Simd>        SitePropagator;
+
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+  typedef Lattice<SitePropagator> PropagatorField;
+
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+
+
+  typedef Lattice<SiteSpinor>            FermionField;
+
+  typedef StaggeredImplParams ImplParams;
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+
+  ImplParams Params;
+
+  StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+  // Load one (scalar) link element from memory into the SIMD register reg via vsplat.
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
+  {
+    vsplat(reg, memory);
+  }
+  // Splat the scalar link U(mu) into a SIMD-typed matrix UU, then form phi from UU and chi with mult().
+  static accelerator_inline void multLink(SiteHalfSpinor &phi,
+                                          const SiteDoubledGaugeField &U,
+                                          const SiteHalfSpinor &chi,
+                                          int mu)
+  {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+  }
+  static accelerator_inline void multLinkAdd(SiteHalfSpinor &phi,
+                                             const SiteDoubledGaugeField &U,
+                                             const SiteHalfSpinor &chi,
+                                             int mu)
+  {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mac(&phi(), &UU(), &chi()); // NOTE(review): same as multLink but via mac(); presumably accumulates into phi - confirm
+  }
+  // Copy the link field U into component mu of the doubled gauge field U_ds, one local site at a time.
+  inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
+  {
+    GridBase *GaugeGrid = U_ds.Grid();
+    thread_loop( (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++), {
+
+        SiteScalarGaugeLink   ScalarU;
+        SiteDoubledGaugeField ScalarUds;
+
+        Coordinate lcoor;
+        GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+        peekLocalSite(ScalarUds, U_ds, lcoor);   // read-modify-write: keep the other components of this site
+
+        peekLocalSite(ScalarU, U, lcoor);
+        ScalarUds(mu) = ScalarU();
+        pokeLocalSite(ScalarUds, U_ds, lcoor);   // write the site back; without this U_ds was never updated
+    });
+  }
+  inline void DoubleStore(GridBase *GaugeGrid,
+                          DoubledGaugeField &UUUds, // for Naik term
+                          DoubledGaugeField &Uds,
+                          const GaugeField &Uthin,
+                          const GaugeField &Ufat)
+  {
+    // Fills Uds from the fat links (1-hop) and UUUds from 3-link products of the thin links, both times the staggered phases.
+    GridBase * InputGrid = Uthin.Grid();
+    conformable(InputGrid,Ufat.Grid());
+
+    GaugeLinkField U(InputGrid);
+    GaugeLinkField UU(InputGrid);
+    GaugeLinkField UUU(InputGrid);
+    GaugeLinkField Udag(InputGrid);
+    GaugeLinkField UUUdag(InputGrid);
+
+    for (int mu = 0; mu < Nd; mu++) {
+
+      // Staggered Phase.
+      Lattice<iScalar<vInteger> > coor(InputGrid);
+      Lattice<iScalar<vInteger> > x(InputGrid); LatticeCoordinate(x,0);
+      Lattice<iScalar<vInteger> > y(InputGrid); LatticeCoordinate(y,1);
+      Lattice<iScalar<vInteger> > z(InputGrid); LatticeCoordinate(z,2);
+      Lattice<iScalar<vInteger> > t(InputGrid); LatticeCoordinate(t,3);
+
+      Lattice<iScalar<vInteger> > lin_z(InputGrid); lin_z=x+y;
+      Lattice<iScalar<vInteger> > lin_t(InputGrid); lin_t=x+y+z;
+
+      ComplexField phases(InputGrid);   phases=1.0;
+      // Sign flips where the parity is odd: mu=1 uses x, mu=2 uses x+y, mu=3 uses x+y+z; mu=0 stays +1.
+      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
+      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
+      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
+
+      // 1 hop based on fat links
+      U      = PeekIndex<LorentzIndex>(Ufat, mu);
+      Udag   = adj( Cshift(U, mu, -1));
+
+      U    = U    *phases;
+      Udag = Udag *phases;
+
+      InsertGaugeField(Uds,U,mu);
+      InsertGaugeField(Uds,Udag,mu+4);
+
+      // 3 hop based on thin links. Crazy huh ?
+      U  = PeekIndex<LorentzIndex>(Uthin, mu);
+      UU = Gimpl::CovShiftForward(U,mu,U);
+      UUU= Gimpl::CovShiftForward(U,mu,UU);
+
+      UUUdag = adj( Cshift(UUU, mu, -3));
+
+      UUU    = UUU    *phases;
+      UUUdag = UUUdag *phases;
+
+      InsertGaugeField(UUUds,UUU,mu);
+      InsertGaugeField(UUUds,UUUdag,mu+4);
+
+    }
+  }
+  // Force insertion is not implemented for this Impl: both variants below just assert(0).
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+    assert(0);
+  }
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+    assert (0);
+  }
+};
+using StaggeredVec5dImplR = StaggeredVec5dImpl<vComplex,  FundamentalRepresentation >;  // "Real": whichever precision vComplex is
+using StaggeredVec5dImplF = StaggeredVec5dImpl<vComplexF, FundamentalRepresentation >;  // single precision (float)
+using StaggeredVec5dImplD = StaggeredVec5dImpl<vComplexD, FundamentalRepresentation >;  // double precision
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.cc
@@ -1,242 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-
-    Copyright (C) 2017
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// *NOT* EO
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
-{
-  FermionField temp(out.Grid());
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerNo);
-
-  // Clover term
-  Mooee(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
-{
-  FermionField temp(out.Grid());
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerYes);
-
-  // Clover term
-  MooeeDag(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-{
-  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu.Grid();
-  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
-
-  // Compute the field strength terms mu>nu
-  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
-
-  // Compute the Clover Operator acting on Colour and Spin
-  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
-
-  int lvol = _Umu.Grid()->lSites();
-  int DimRep = Impl::Dimension;
-
-  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-
-  Coordinate lcoor;
-  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
-
-  for (int site = 0; site < lvol; site++)
-  {
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = Zero();
-    //if (csw!=0){
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++){
-	    auto zz =  Qx()(j, k)(a, b);
-            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
-	  }
-    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
-
-    EigenInvCloverOp = EigenCloverOp.inverse();
-    //std::cout << EigenInvCloverOp << std::endl;
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
-    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
-    //  }
-    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
-  }
-
-  // Separate the even and odd parts
-  pickCheckerboard(Even, CloverTermEven, CloverTerm);
-  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
-
-  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
-  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerNo, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerYes, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerNo, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerYes, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
-{
-  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
-
-  if (dag)
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    {
-      if (in.Checkerboard() == Odd)
-      {
-        Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
-      }
-      else
-      {
-        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
-      }
-      out = *Clover * in;
-    }
-    else
-    {
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
-    }
-  }
-  else
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    {
-
-      if (in.Checkerboard() == Odd)
-      {
-        //  std::cout << "Calling clover term Odd" << std::endl;
-        Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
-      }
-      else
-      {
-        //  std::cout << "Calling clover term Even" << std::endl;
-        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
-      }
-      out = *Clover * in;
-      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
-    }
-    else
-    {
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
-    }
-  }
-
-} // MooeeInternal
-
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
-{
-  assert(0);
-}
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
-{
-  assert(0); // not implemented yet
-}
-
-FermOpTemplateInstantiate(WilsonCloverFermion);
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion.cc
@@ -1,596 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
-int WilsonFermionStatic::HandOptDslash;
-
-/////////////////////////////////
-// Constructor and gauge import
-/////////////////////////////////
-
-template <class Impl>
-WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-                                   GridRedBlackCartesian &Hgrid, RealD _mass,
-                                   const ImplParams &p,
-                                   const WilsonAnisotropyCoefficients &anis)
-  : 
-    Kernels(p),
-    _grid(&Fgrid),
-    _cbgrid(&Hgrid),
-    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
-    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
-    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
-    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
-    Umu(&Fgrid),
-    UmuEven(&Hgrid),
-    UmuOdd(&Hgrid),
-      _tmp(&Hgrid),
-      anisotropyCoeff(anis)
-{
-  // Allocate the required comms buffer
-  ImportGauge(_Umu);
-  if  (anisotropyCoeff.isAnisotropic){
-    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
-  } else {
-    diag_mass = 4.0 + mass;
-  }
-
-
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) 
-{
-  GaugeField HUmu(_Umu.Grid());
-
-  //Here multiply the anisotropy coefficients
-  if (anisotropyCoeff.isAnisotropic)
-  {
-
-    for (int mu = 0; mu < Nd; mu++)
-    {
-      GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
-      if (mu != anisotropyCoeff.t_direction)
-        U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
-
-      PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
-    }
-  }
-  else
-  {
-    HUmu = _Umu * (-0.5);
-  }
-  Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
-  pickCheckerboard(Even, UmuEven, Umu);
-  pickCheckerboard(Odd, UmuOdd, Umu);
-}
-
-/////////////////////////////
-// Implement the interface
-/////////////////////////////
-
-template <class Impl>
-RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerNo);
-  return axpy_norm(out, diag_mass, in, out);
-}
-
-template <class Impl>
-RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerYes);
-  return axpy_norm(out, diag_mass, in, out);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerNo);
-  } else {
-    DhopOE(in, out, DaggerNo);
-  }
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerYes);
-  } else {
-    DhopOE(in, out, DaggerYes);
-  }
-}
-  
-template <class Impl>
-void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  typename FermionField::scalar_type scal(diag_mass);
-  out = scal * in;
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Mooee(in, out);
-}
-
-template<class Impl>
-void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  out = (1.0/(diag_mass))*in;
-}
-  
-template<class Impl>
-void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  MooeeInv(in,out);
-}
-template<class Impl>
-void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
-{  
-  typedef typename FermionField::vector_type vector_type;
-  typedef typename FermionField::scalar_type ScalComplex;
-  typedef Lattice<iSinglet<vector_type> > LatComplex;
-  
-  // what type LatticeComplex 
-  conformable(_grid,out.Grid());
-  
-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
-  
-  Coordinate latt_size   = _grid->_fdimensions;
-  
-  FermionField   num  (_grid); num  = Zero();
-  LatComplex    wilson(_grid); wilson= Zero();
-  LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);
-  
-  LatComplex denom(_grid); denom= Zero();
-  LatComplex kmu(_grid); 
-  ScalComplex ci(0.0,1.0);
-  // momphase = n * 2pi / L
-  for(int mu=0;mu<Nd;mu++) {
-    
-    LatticeCoordinate(kmu,mu);
-    
-    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-    
-    kmu = TwoPiL * kmu;
-    kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
-    
-    wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
-    
-    num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);    // derivative term
-    
-    denom=denom + sin(kmu)*sin(kmu);
-  }
-  
-  wilson = wilson + _m;     // 2 sin^2 k/2 + m
-  
-  num   = num + wilson*in;     // -i gmu sin k + 2 sin^2 k/2 + m
-  
-  denom= denom+wilson*wilson; // sin^2 k + (2 sin^2 k/2 + m)^2
-  
-  denom= one/denom;
-  
-  out = num*denom; // [ -i gmu sin k + 2 sin^2 k/2 + m] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ]
-  
-}
-  
-
-///////////////////////////////////
-// Internal
-///////////////////////////////////
-
-template <class Impl>
-void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
-                                        GaugeField &mat, const FermionField &A,
-                                        const FermionField &B, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-
-  Compressor compressor(dag);
-
-  FermionField Btilde(B.Grid());
-  FermionField Atilde(B.Grid());
-  Atilde = A;
-
-  st.HaloExchange(B, compressor);
-
-  for (int mu = 0; mu < Nd; mu++) {
-    ////////////////////////////////////////////////////////////////////////
-    // Flip gamma (1+g)<->(1-g) if dag
-    ////////////////////////////////////////////////////////////////////////
-    int gamma = mu;
-    if (!dag) gamma += Nd;
-
-    ////////////////////////
-    // Call the single hop
-    ////////////////////////
-    auto U_v = U.View();
-    auto B_v = B.View();
-    auto Btilde_v = Btilde.View();
-    auto st_v = st.View();
-    thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++) ,{
-      Kernels::DhopDirK(st_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu, gamma);
-    });
-
-    //////////////////////////////////////////////////
-    // spin trace outer product
-    //////////////////////////////////////////////////
-    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
-  }
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U.Grid(), _grid);
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-
-  mat.Checkerboard() = U.Checkerboard();
-
-  DerivInternal(Stencil, Umu, mat, U, V, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U.Grid(), _cbgrid);
-  conformable(U.Grid(), V.Grid());
-  //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
-  // Motivation: look at the SchurDiff operator
-  
-  assert(V.Checkerboard() == Even);
-  assert(U.Checkerboard() == Odd);
-  mat.Checkerboard() = Odd;
-
-  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-  conformable(U.Grid(), _cbgrid);
-  conformable(U.Grid(), V.Grid());
-  //conformable(U.Grid(), mat.Grid());
-
-  assert(V.Checkerboard() == Odd);
-  assert(U.Checkerboard() == Even);
-  mat.Checkerboard() = Even;
-
-  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
-  conformable(in.Grid(), _grid);  // verifies full grid
-  conformable(in.Grid(), out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-
-  assert(in.Checkerboard() == Even);
-  out.Checkerboard() = Odd;
-
-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-
-  assert(in.Checkerboard() == Odd);
-  out.Checkerboard() = Even;
-
-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
-  DhopDir(in, out, dir, disp);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
-  int skip = (disp == 1) ? 0 : 1;
-  int dirdisp = dir + skip * 4;
-  int gamma = dir + (1 - skip) * 4;
-
-  DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
-};
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
-{
-  Compressor compressor(dag);
-
-  Stencil.HaloExchange(in, compressor);
-  auto in_v = in.View();
-  auto out_v = in.View();
-  auto Umu_v = Umu.View();
-  auto Stencil_v = Stencil.View();
-  thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-    Kernels::DhopDirK(Stencil_v, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dirdisp, gamma);
-  });
-};
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
-                                       DoubledGaugeField &U,
-                                       const FermionField &in,
-                                       FermionField &out, int dag) {
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  else
-#endif 
-    DhopInternalSerial(st,lo,U,in,out,dag);
-
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-						      DoubledGaugeField &U,
-						      const FermionField &in,
-						      FermionField &out, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
-  Compressor compressor(dag);
-  int len =  U.Grid()->oSites();
-  const int LLs =  1;
-
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-      auto U_v   = U.View();
-      auto in_v  = in.View();
-      auto out_v = out.View();
-      auto st_v  = st.View();
-      int Opt = WilsonKernelsStatic::Opt;
-
-      if (dag == DaggerYes) {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } 
-
-    } else {
-      st.CommunicateThreaded();
-    }
-  }  //pragma
-
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  =  st.View();
-    int Opt = WilsonKernelsStatic::Opt;
-    if (dag == DaggerYes) {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    } else {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    }
-  }
-#else
-  assert(0);
-#endif
-};
-
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
-                                       DoubledGaugeField &U,
-                                       const FermionField &in,
-                                       FermionField &out, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-  Compressor compressor(dag);
-  st.HaloExchange(in, compressor);
-
-  int Opt = WilsonKernelsStatic::Opt;
-  auto U_v  = U.View();
-  auto in_v = in.View();
-  auto out_v= out.View();
-  auto st_v = st.View();
-  if (dag == DaggerYes) {
-    accelerator_loop( sss,in_v, {
-      Kernels::DhopSiteDag(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-    });
-  } else {
-    accelerator_loop( sss,in_v, {
-      Kernels::DhopSite(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-    });
-  }
-};
-/*Change ends */
-
-/*******************************************************************************
- * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current 
- * sequentially.
- ******************************************************************************/
-template <class Impl>
-void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-                                                   PropagatorField &q_in_2,
-                                                   PropagatorField &q_out,
-                                                   Current curr_type,
-                                                   unsigned int mu)
-{
-  Gamma g5(Gamma::Algebra::Gamma5);
-  conformable(_grid, q_in_1.Grid());
-  conformable(_grid, q_in_2.Grid());
-  conformable(_grid, q_out.Grid());
-  PropagatorField tmp1(_grid), tmp2(_grid);
-  q_out = Zero();
-
-  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
-  // Inefficient comms method but not performance critical.
-  tmp1 = Cshift(q_in_1, mu, 1);
-  tmp2 = Cshift(q_in_2, mu, 1);
-  auto tmp1_v  =  tmp1.View();
-  auto tmp2_v  =  tmp2.View();
-  auto q_in_1_v=q_in_1.View();
-  auto q_in_2_v=q_in_2.View();
-  auto q_out_v = q_out.View();
-  auto Umu_v   =   Umu.View();
-  thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
-      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
-					       q_in_2_v[sU],
-					       q_out_v[sU],
-					       Umu_v, sU, mu);
-      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
-					       tmp2_v[sU],
-					       q_out_v[sU],
-					       Umu_v, sU, mu);
-  });
-}
-
-
-template <class Impl>
-void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
-                                              PropagatorField &q_out,
-                                              Current curr_type,
-                                              unsigned int mu,
-                                              unsigned int tmin, 
-                                              unsigned int tmax,
-					      ComplexField &lattice_cmplx)
-{
-  conformable(_grid, q_in.Grid());
-  conformable(_grid, q_out.Grid());
-
-  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
-  Complex i(0.0,1.0);
-  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
-  unsigned int tshift = (mu == Tp) ? 1 : 0;
-  unsigned int LLt    = GridDefaultLatt()[Tp];
-
-  q_out = Zero();
-  LatticeInteger coords(_grid);
-  LatticeCoordinate(coords, Tp);
-
-  // Need q(x + mu) and q(x - mu).
-  tmp    = Cshift(q_in, mu, 1);
-  tmpFwd = tmp*lattice_cmplx;
-  tmp    = lattice_cmplx*q_in;
-  tmpBwd = Cshift(tmp, mu, -1);
-
-  auto coords_v = coords.View();
-  auto tmpFwd_v = tmpFwd.View();
-  auto tmpBwd_v = tmpBwd.View();
-  auto Umu_v    = Umu.View();
-  auto q_out_v  = q_out.View();
-
-  thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
-
-    // Compute the sequential conserved current insertion only if our simd
-    // object contains a timeslice we need.
-    vInteger t_mask   = ((coords_v[sU] >= tmin) &&
-			 (coords_v[sU] <= tmax));
-    Integer timeSlices = Reduce(t_mask);
-
-    if (timeSlices > 0) {
-      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
-					  q_out_v[sU], 
-					  Umu_v, sU, mu, t_mask);
-    }
-
-    // Repeat for backward direction.
-    t_mask     = ((coords_v[sU] >= (tmin + tshift)) && 
-		  (coords_v[sU] <= (tmax + tshift)));
-    
-    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-    unsigned int t0 = 0;
-    if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords_v[sU] == t0 ));
-    
-    timeSlices = Reduce(t_mask);
-
-    if (timeSlices > 0) {
-      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
-					  q_out_v[sU], 
-					  Umu_v, sU, mu, t_mask);
-    }
-  });
-}
-
-FermOpTemplateInstantiate(WilsonFermion);
-AdjointFermOpTemplateInstantiate(WilsonFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonFermion);
-GparityFermOpTemplateInstantiate(WilsonFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonFermion5D.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.cc
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -227,8 +227,8 @@ public:
 			   Current curr_type,
 			   unsigned int mu,
 			   unsigned int tmin,
-                             unsigned int tmax,
-			     ComplexField &lattice_cmplx);
+			   unsigned int tmax,
+			   ComplexField &lattice_cmplx);
 };

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -90,15 +90,7 @@ public:
    auto UU = coalescedRead(U(mu));
    mult(&phi(), &UU, &chi());
  }
-      
-#ifdef GPU_VEC
-  static accelerator_inline void copyLinkGpu(int lane,
-					     SiteDoubledGaugeField & UU,
-					     const SiteDoubledGaugeField &U)
-  {
-    auto U_l   = extractLane(lane,U);
-    insertLane(lane,UU,U_l);
-  }
+
  static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -108,17 +100,6 @@ public:
    auto U_l   = extractLane(lane,U(mu));
    phi() =  U_l * chi();
  }
-#else
-  static accelerator_inline void multLinkGpu(int lane,
-					     SiteHalfSpinor &phi,
-					     const SiteDoubledGaugeField &U,
-					     const SiteHalfSpinor &chi,
-					     int mu) 
-  {
-    auto U_l   = U(mu);
-    phi() =  U_l * chi();
-  }
-#endif
    
  static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
--- a/Grid/qcd/action/fermion/WilsonKernels.cc
+++ b/Grid/qcd/action/fermion/WilsonKernels.cc
@@ -1,445 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
-int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
-
-////////////////////////////////////////////
-// Generic implementation; move to different file?
-////////////////////////////////////////////
-  
-#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in[SE->_offset]);			\
-    }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st);	\
-  Recon(result, Uchi);
-  
-#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in[SE->_offset]);			\
-    }								\
-  } else if ( st.same_node[Dir] ) {				\
-      chi_p = &buf[SE->_offset];				\
-  }								\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    chi_p = &buf[SE->_offset];					\
-    Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-    nmu++;							\
-  }
-
-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
-  if (gamma == Dir) {						\
-    if (SE->_is_local && SE->_permute) {			\
-      spProj(tmp, in[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else if (SE->_is_local) {					\
-      spProj(chi, in[SE->_offset]);			\
-    } else {							\
-      chi = buf[SE->_offset];					\
-    }								\
-    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-  ////////////////////////////////////////////////////////////////////
-  // All legs kernels ; comms then compute
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-							 SiteHalfSpinor *buf, int sF,
-							 int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
-						      SiteHalfSpinor *buf, int sF,
-						      int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-  ////////////////////////////////////////////////////////////////////
-  // Interior kernels
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
-							    SiteHalfSpinor *buf, int sF,
-							    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  result=Zero();
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
-							 SiteHalfSpinor *buf, int sF,
-							 int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  result=Zero();
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-////////////////////////////////////////////////////////////////////
-// Exterior kernels
-////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
-							    SiteHalfSpinor *buf, int sF,
-							    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  //  SiteHalfSpinor tmp;
-  //  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=Zero();
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out[sF] = out[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
-							 SiteHalfSpinor *buf, int sF,
-							 int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-  //  SiteHalfSpinor tmp;
-  //  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=Zero();
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out[sF] = out[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
-						int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteSpinor result;
-  SiteHalfSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-
-  SE = st.GetEntry(ptype, dir, sF);
-  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
-  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
-  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
-  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
-  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
-  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
-  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
-  vstream(out[sF], result);
-}
-
-/*******************************************************************************
- * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current 
- * sequentially. Common to both 4D and 5D.
- ******************************************************************************/
-// N.B. Functions below assume a -1/2 factor within U.
-#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
-#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteFwd
- * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_1 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeFieldView &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-  SitePropagator result, tmp;
-  Gamma g5(Gamma::Algebra::Gamma5);
-
-  Impl::multLinkProp(tmp, U[sU], q_in_1, mu);
-
-  result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
-
-  if (switch_sign) {
-    q_out -= result;
-  } else {
-    q_out += result;
-  }
-}
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteBwd
- * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_2 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeFieldView &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-  SitePropagator result, tmp;
-  Gamma g5(Gamma::Algebra::Gamma5);
-
-  Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd);
-
-  result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
-  if (switch_sign) {
-    q_out += result;
-  } else {
-    q_out -= result;
-  }
-}
-
-// G-parity requires more specialised implementation.
-#define NO_CURR_SITE(Impl) \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeFieldView &U,         \
-                                                  unsigned int sU,              \
-                                                  unsigned int mu,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-} \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeFieldView &U,         \
-                                                  unsigned int mu,              \
-                                                  unsigned int sU,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-}
-
-NO_CURR_SITE(GparityWilsonImplF);
-NO_CURR_SITE(GparityWilsonImplD);
-NO_CURR_SITE(GparityWilsonImplFH);
-NO_CURR_SITE(GparityWilsonImplDF);
-
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeFieldView &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-  SitePropagator result;
-  
-  Impl::multLinkProp(result, U[sU], q_in, mu);
-  result = WilsonCurrentFwd(result, mu);
-
-  // Zero any unwanted timeslice entries.
-  result = predicatedWhere(t_mask, result, 0.*result);
-  
-  if (switch_sign) {
-    q_out -= result;
-  } else {
-    q_out += result;
-  }
-}
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in -ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeFieldView &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-  SitePropagator result;
-  Impl::multLinkProp(result, U[sU], q_in, mu + Nd);
-  result = WilsonCurrentBwd(result, mu);
-
-  // Zero any unwanted timeslice entries.
-  result = predicatedWhere(t_mask, result, 0.*result);
-  
-  if (switch_sign) {
-    q_out += result;
-  } else {
-    q_out -= result;
-  }
-}
-
-FermOpTemplateInstantiate(WilsonKernels);
-AdjointFermOpTemplateInstantiate(WilsonKernels);
-TwoIndexFermOpTemplateInstantiate(WilsonKernels);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -107,7 +107,7 @@ private:
 					 int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-						 int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
  static accelerator void GenericDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
 						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -1,125 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-///////////////////////////////////////////////////////////
-// Default to no assembler implementation
-///////////////////////////////////////////////////////////
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
-#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
-
-#define INSTANTIATE_ASM(A) \
-template void WilsonKernels<A>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
-
-//INSTANTIATE_ASM(WilsonImplF);
-//INSTANTIATE_ASM(WilsonImplD);
-INSTANTIATE_ASM(GparityWilsonImplF);
-INSTANTIATE_ASM(GparityWilsonImplD);
-//INSTANTIATE_ASM(ZWilsonImplF);
-//INSTANTIATE_ASM(ZWilsonImplD);
-//INSTANTIATE_ASM(DomainWallVec5dImplF);
-//INSTANTIATE_ASM(DomainWallVec5dImplD);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplF);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplD);
-
-//INSTANTIATE_ASM(WilsonImplFH);
-//INSTANTIATE_ASM(WilsonImplDF);
-//INSTANTIATE_ASM(ZWilsonImplFH);
-//INSTANTIATE_ASM(ZWilsonImplDF);
-INSTANTIATE_ASM(GparityWilsonImplFH);
-INSTANTIATE_ASM(GparityWilsonImplDF);
-//INSTANTIATE_ASM(DomainWallVec5dImplFH);
-//INSTANTIATE_ASM(DomainWallVec5dImplDF);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
@@ -1,650 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#if defined(AVX512) 
-    ///////////////////////////////////////////////////////////
-    // If we are AVX512 specialise the single precision routine
-    ///////////////////////////////////////////////////////////
-#include <simd/Intel512wilson.h>
-#include <simd/Intel512single.h>
-    
-static Vector<vComplexF> signsF;
-
-  template<typename vtype>    
-  int setupSigns(Vector<vtype>& signs ){
-    Vector<vtype> bother(2);
-    signs = bother;
-    vrsign(signs[0]);
-    visign(signs[1]);
-    return 1;
-  }
-
-  static int signInitF = setupSigns(signsF);
-
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];  
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS
-#undef MAYBEPERM
-#undef MULT_2SPIN
-	
-
-
-///////////////////////////////////////////////////////////
-// If we are AVX512 specialise the double precision routine
-///////////////////////////////////////////////////////////
-
-#include <simd/Intel512double.h>
-    
-static Vector<vComplexD> signsD; // double-precision sign table, filled by setupSigns() below
-static int signInitD = setupSigns(signsD); // the int exists only so that setupSigns(signsD) runs during static initialisation
-    
-#define MAYBEPERM(A,perm) if (perm) { A ; } // execute the permute statement A only when perm is non-zero
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) // multiply macro used by the specialisations that follow (redefined further down for the Ls-vectorised ones)
-#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];  // declares a local pointer to the first element of signsD
-
-
-#define INTERIOR_AND_EXTERIOR    
-#undef  INTERIOR
-#undef  EXTERIOR
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS
-#undef MAYBEPERM
-#undef MULT_2SPIN
-
-#endif //AVX512
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,198 +0,0 @@
-#ifdef KERNEL_DAG // dagger kernel: legs 0-3 map to the XP/YP/ZP/TP macros, legs 4-7 to the XM/YM/ZM/TM macros
-#define DIR0_PROJMEM(base) XP_PROJMEM(base);
-#define DIR1_PROJMEM(base) YP_PROJMEM(base);
-#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
-#define DIR3_PROJMEM(base) TP_PROJMEM(base);
-#define DIR4_PROJMEM(base) XM_PROJMEM(base);
-#define DIR5_PROJMEM(base) YM_PROJMEM(base);
-#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
-#define DIR7_PROJMEM(base) TM_PROJMEM(base);
-#define DIR0_RECON   XP_RECON // leg 0 uses the plain *_RECON macro; legs 1-7 use the *_RECON_ACCUM variants
-#define DIR1_RECON   YP_RECON_ACCUM
-#define DIR2_RECON   ZP_RECON_ACCUM
-#define DIR3_RECON   TP_RECON_ACCUM
-#define DIR4_RECON   XM_RECON_ACCUM
-#define DIR5_RECON   YM_RECON_ACCUM
-#define DIR6_RECON   ZM_RECON_ACCUM
-#define DIR7_RECON   TM_RECON_ACCUM
-#else // non-dagger kernel: the P and M macro families are swapped relative to the branch above
-#define DIR0_PROJMEM(base) XM_PROJMEM(base);
-#define DIR1_PROJMEM(base) YM_PROJMEM(base);
-#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
-#define DIR3_PROJMEM(base) TM_PROJMEM(base);
-#define DIR4_PROJMEM(base) XP_PROJMEM(base);
-#define DIR5_PROJMEM(base) YP_PROJMEM(base);
-#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
-#define DIR7_PROJMEM(base) TP_PROJMEM(base);
-#define DIR0_RECON   XM_RECON // leg 0 uses the plain *_RECON macro; legs 1-7 use the *_RECON_ACCUM variants
-#define DIR1_RECON   YM_RECON_ACCUM
-#define DIR2_RECON   ZM_RECON_ACCUM
-#define DIR3_RECON   TM_RECON_ACCUM
-#define DIR4_RECON   XP_RECON_ACCUM
-#define DIR5_RECON   YP_RECON_ACCUM
-#define DIR6_RECON   ZP_RECON_ACCUM
-#define DIR7_RECON   TP_RECON_ACCUM
-#endif // KERNEL_DAG
-
-////////////////////////////////////////////////////////////////////////////////
-// Comms then compute kernel -- all eight legs are handled in a single pass
-////////////////////////////////////////////////////////////////////////////////
-#ifdef INTERIOR_AND_EXTERIOR
-
-#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-      basep = st.GetPFInfo(nent,plocal); nent++;			\
-      if ( local ) {							\
-	LOAD64(%r10,isigns);						\
-	PROJ(base);							\
-	MAYBEPERM(PERMUTE_DIR,perm);					\
-      } else {								\
-	LOAD_CHI(base);							\
-      }									\
-      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
-      PREFETCH_CHIMU(base);						\
-      MULT_2SPIN_DIR_PF(Dir,basep);					\
-      LOAD64(%r10,isigns);						\
-      RECON;								\
-
-#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  PF_GAUGE(Xp);								\
-  PREFETCH1_CHIMU(base);						\
-  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) 
-
-#define RESULT(base,basep) SAVE_RESULT(base,basep);
-
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Pre comms kernel (local/same-node legs only) -- prefetch like normal because it is mostly right
-////////////////////////////////////////////////////////////////////////////////
-#ifdef INTERIOR
-
-#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-      basep = st.GetPFInfo(nent,plocal); nent++;			\
-      if ( local ) {							\
-	LOAD64(%r10,isigns);						\
-	PROJ(base);							\
-	MAYBEPERM(PERMUTE_DIR,perm);					\
-      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\
-      if ( local || st.same_node[Dir] ) {				\
-	MULT_2SPIN_DIR_PF(Dir,basep);					\
-	LOAD64(%r10,isigns);						\
-	RECON;								\
-      }									\
-      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
-      PREFETCH_CHIMU(base);						\
-
-#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  PF_GAUGE(Xp);								\
-  PREFETCH1_CHIMU(base);						\
-  { ZERO_PSI; }								\
-  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) 
-
-#define RESULT(base,basep) SAVE_RESULT(base,basep);
-
-#endif
-////////////////////////////////////////////////////////////////////////////////
-// Post comms kernel (only legs that are neither local nor same-node)
-////////////////////////////////////////////////////////////////////////////////
-#ifdef EXTERIOR
-
-
-#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  if((!local)&&(!st.same_node[Dir]) ) {					\
-    LOAD_CHI(base);							\
-    MULT_2SPIN_DIR_PF(Dir,base);					\
-    LOAD64(%r10,isigns);						\
-    RECON;								\
-    nmu++;								\
-  }									
-
-#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  nmu=0;								\
-  { ZERO_PSI;}								\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  if((!local)&&(!st.same_node[Dir]) ) {					\
-    LOAD_CHI(base);							\
-    MULT_2SPIN_DIR_PF(Dir,base);					\
-    LOAD64(%r10,isigns);						\
-    RECON;								\
-    nmu++;								\
-  }
-
-#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
-
-#endif
-{ // Shared function body: this file is textually #included right after each AsmDhopSite* specialisation signature
-  int nmu; // number of legs processed; only set and read by the EXTERIOR variants of ASM_LEG_XP/ASM_LEG/RESULT
-  int local,perm, ptype; // per-leg outputs filled in by st.GetInfo()
-  uint64_t base; // address returned by st.GetInfo(); later reused for the address of out[ss]
-  uint64_t basep; // address returned by st.GetPFInfo() (presumably a prefetch target -- confirm)
-  const uint64_t plocal =(uint64_t) & in[0]; // address of the first input element, handed to GetInfo/GetPFInfo
-
-  COMPLEX_SIGNS(isigns); // expands to the isigns declaration supplied by the including file
-  MASK_REGS;
-  int nmax=U.oSites(); // wrap-around bound for the next-site index ssn
-  for(int site=0;site<Ns;site++) { // Ns consecutive sites per call, starting at ssU
-#ifndef EXTERIOR
-    // Site reordering through lo.Reorder(ssU) is disabled; ssU is used directly.
-    int sU =ssU;
-    int ssn=ssU+1;     if(ssn>=nmax) ssn=0; // next site, wrapping to 0 at nmax
-    // Likewise the next-site index is used without lo.Reorder(ssn).
-    int sUn=ssn;
-    LOCK_GAUGE(0);
-#else
-    int sU =ssU;
-    int ssn=ssU+1;     if(ssn>=nmax) ssn=0; // next site, wrapping to 0 at nmax
-    int sUn=ssn;
-#endif
-    for(int s=0;s<Ls;s++) { // inner index s runs over 0..Ls-1 for the current site
-      ss =sU*Ls+s; // linear index of (site sU, inner index s)
-      ssn=sUn*Ls+s; // same inner index on the next site
-      int  ent=ss*8;// 8 = 2*Ndim stencil entries per index; ent advances once per st.GetInfo() call
-      int nent=ssn*8; // entry counter for the next site, advanced once per st.GetPFInfo() call
-
-   ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); // first leg: the _XP variant also performs the initial GetInfo for Xp
-      ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
-      ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
-      ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
-
-      ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
-      ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
-      ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
-      ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
-
-#ifdef EXTERIOR
-      if (nmu==0) break; // no leg was processed: leave the s loop without storing. NOTE(review): this also skips the remaining s values -- confirm that is intended
-      //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
-#endif
-      base = (uint64_t) &out[ss]; // destination address for this index
-      basep= st.GetPFInfo(nent,plocal); nent++;
-      RESULT(base,basep); // SAVE_RESULT in the combined/INTERIOR passes, conditional ADD_RESULT in the EXTERIOR pass
-    }
-    ssU++; // advance to the next site
-    UNLOCK_GAUGE(0); // NOTE(review): issued in every pass, while LOCK_GAUGE(0) is only issued when EXTERIOR is not defined -- confirm
-  }
-}
-
-#undef DIR0_PROJMEM
-#undef DIR1_PROJMEM
-#undef DIR2_PROJMEM
-#undef DIR3_PROJMEM
-#undef DIR4_PROJMEM
-#undef DIR5_PROJMEM
-#undef DIR6_PROJMEM
-#undef DIR7_PROJMEM
-#undef DIR0_RECON
-#undef DIR1_RECON
-#undef DIR2_RECON
-#undef DIR3_RECON
-#undef DIR4_RECON
-#undef DIR5_RECON
-#undef DIR6_RECON
-#undef DIR7_RECON
-#undef ASM_LEG
-#undef ASM_LEG_XP
-#undef RESULT
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
@@ -1,161 +0,0 @@
-{
-  int locala,perma, ptypea;
-  int localb,permb, ptypeb;
-  uint64_t basea, baseb;
-  const uint64_t plocal =(uint64_t) & in._odata[0];
-
-  //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
-
-  MASK_REGS;
-
-  for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);  
-  for(int s=0;s<Ls;s++) {
-  ss=sU*Ls+s;
-  ////////////////////////////////
-  // Xp
-  ////////////////////////////////
-  int ent=ss*8;// 2*Ndim
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-
-  if ( locala ) {
-    LOAD64(%r10,isigns);
-    XM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFXP(Xp,baseb);
-  }
-  LOAD64(%r10,isigns);
-  XM_RECON;
-
-  ////////////////////////////////
-  // Yp
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFYP(Yp,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zp
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFZP(Zp,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tp
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFTP(Tp,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Xm
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFXM(Xm,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  XP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Ym
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFYM(Ym,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zm
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFZM(Zm,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tm
-  ////////////////////////////////
-  basea = (uint64_t)&out._odata[ss];
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
-  {
-    MULT_2SPIN_DIR_PFTM(Tm,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TP_RECON_ACCUM;
-
-  SAVE_RESULT(&out._odata[ss],baseb);
-
-  } 
-  ssU++;
-  }
-}
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
@@ -1,187 +0,0 @@
-{
-  int locala,perma, ptypea;
-  int localb,permb, ptypeb;
-  int localc,permc, ptypec;
-  uint64_t basea, baseb, basec;
-  uint64_t basex;
-  const uint64_t plocal =(uint64_t) & in._odata[0];
-
-  //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
-
-  MASK_REGS;
-
-  for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);
-
-  for(int s=0;s<Ls;s++) {
-  ss     =sU*Ls+s;
-
-  ////////////////////////////////
-  // Xp
-  ////////////////////////////////
-  int ent=ss*8;// 2*Ndim
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-
-  basex = basea;
-
-  label(FX(XP) );
-  if ( locala ) {
-    LOAD64(%r10,isigns);
-    XM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFXP(Xp,baseb);
-  }
-  LOAD64(%r10,isigns);
-  XM_RECON;
-
-  ////////////////////////////////
-  // Yp
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  label(FX(YP) );
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFYP(Yp,basec);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zp
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  label(FX(ZP) );
-  if ( localc ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR1,permc);
-  } else { 
-    LOAD_CHI(basec);
-  }
-  {
-    MULT_2SPIN_DIR_PFZP(Zp,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tp
-  ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  label(FX(TP) );
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR0,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFTP(Tp,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Xm
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  label(FX(XM) );
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR3,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFXM(Xm,basec);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  XP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Ym
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  label(FX(YM) );
-  if ( localc ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR2,permc);
-  } else { 
-    LOAD_CHI(basec);
-  }
-  {
-    MULT_2SPIN_DIR_PFYM(Ym,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zm
-  ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  label(FX(ZM) );
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFZM(Zm,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tm
-  ////////////////////////////////
-  basea = (uint64_t)&out._odata[ss];
-  PREFETCH_CHIMU(basea);
-  label(FX(TM) );
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFTM(Tm,basec);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TP_RECON_ACCUM;
-
-  //  PREFETCH_CHIMU(basex);
-  label(FX(SAV) );
-  SAVE_RESULT(&out._odata[ss]);
-  
-  }
-  ssU++;
-  }
-}
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
@@ -1,150 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#if defined(QPX) 
-
-    ///////////////////////////////////////////////////////////
-    // If we are QPX specialise the single precision routine
-    ///////////////////////////////////////////////////////////
-
-#include <simd/IBM_qpx.h>
-#include <simd/IBM_qpx_single.h>
-  
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
-#define COMPLEX_SIGNS(isigns) 
-
-#define INTERIOR_AND_EXTERIOR    
-#undef  INTERIOR
-#undef  EXTERIOR
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-#undef MAYBEPERM
-#undef MULT_2SPIN
-	
-///////////////////////////////////////////////////////////
-// DP routines
-///////////////////////////////////////////////////////////
-
-#include <simd/IBM_qpx_double.h>
-    
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
-
-/////////////////////////////////////////////////////////////////
-// XYZT Vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-      
-
-/////////////////////////////////////////////////////////////////
-// XYZT Vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-	
-#undef MAYBEPERM
-#undef MULT_2SPIN
-
-#endif 
--- a/Grid/qcd/action/fermion/WilsonKernelsGpu.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsGpu.cc
@@ -1,378 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc
-
-Copyright (C) 2018
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-//////////////////////////////////////////////////////////////
-// Gpu implementation; thread loop is implicit ; move to header
-//////////////////////////////////////////////////////////////
-accelerator_inline void synchronise(void) 
-{
-#ifdef __CUDA_ARCH__
-  __syncthreads();
-#endif
-  return;
-}
-accelerator_inline int get_my_lanes(int Nsimd) 
-{
-#ifdef __CUDA_ARCH__
-  return 1;
-#else 
-  return Nsimd;
-#endif
-}
-accelerator_inline int get_my_lane_offset(int Nsimd) 
-{
-#ifdef __CUDA_ARCH__
-  return ( (threadIdx.x) % Nsimd);
-#else
-  return 0;
-#endif
-}
-
-accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
-{
-#ifdef __CUDA_ARCH__
-  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); 
-  uint4 * mem_pun  = (uint4 *)mem; // force 128 bit loads
-  uint4 * chip_pun = (uint4 *)&chip;
-  * chip_pun = * mem_pun;
-#else 
-  chip = *mem;
-#endif
-  return;
-}
-
-#ifdef GPU_VEC
-#if 1
-#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
-  if (SE._is_local) {							\
-    int mask = Nsimd >> (ptype + 1);					\
-    int plane= SE._permute ? (lane ^ mask) : lane;			\
-    auto in_l = extractLane(plane,in[SE._offset+s]);			\
-    spProj(chi,in_l);							\
-  } else {								\
-    chi  = extractLane(lane,buf[SE._offset+s]);				\
-  }									\
-  synchronise();
-#else 
-#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
-  { int mask = Nsimd >> (ptype + 1);					\
-  int plane= SE._permute ? (lane ^ mask) : lane;			\
-  auto in_l = extractLane(plane,in[SE._offset+s]);			\
-  spProj(chi,in_l); }							
-#endif
-#else 
-#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
-  if (SE._is_local) {							\
-    auto in_t = in[SE._offset+s];					\
-    if (SE._permute) {							\
-      spProj(tmp, in_t);						\
-      permute(chi, tmp, ptype);						\
-    } else {								\
-      spProj(chi, in_t);						\
-    }									\
-  } else {								\
-    chi  = buf[SE._offset+s];						\
-  }									\
-  synchronise();
-#endif
-
-template <class Impl>
-accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-							    SiteHalfSpinor *buf, int Ls, int s,
-							    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-#ifdef GPU_VEC
-  typename SiteHalfSpinor::scalar_object chi;
-  typename SiteHalfSpinor::scalar_object Uchi;
-  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteHalfSpinor tmp;
-  SiteSpinor   result;
-#endif
-  typedef typename SiteSpinor::scalar_type scalar_type;
-  typedef typename SiteSpinor::vector_type vector_type;
-  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
-
-  uint64_t lane_offset= get_my_lane_offset(Nsimd);
-  uint64_t lanes      = get_my_lanes(Nsimd);
-
-  StencilEntry *SE_mem;
-  StencilEntry SE; 
-
-  int ptype;
-  uint64_t ssF = Ls * sU;
-  uint64_t sF  = ssF + s;
-#ifndef __CUDA_ARCH__
-  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
-#else
-  int lane = lane_offset; {
-#endif
-    SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
-    spReconXp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
-    accumReconYp(result, Uchi);
-      
-    SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
-    accumReconZp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
-    accumReconTp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
-    accumReconXm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
-    accumReconYm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
-    accumReconZm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
-    accumReconTm(result, Uchi);
-
-#ifdef GPU_VEC
-    insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
-  }
-}
-
-template <class Impl>
-accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
-						  SiteHalfSpinor *buf,  int Ls, int s,
-						  int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-#ifdef GPU_VEC
-  typename SiteHalfSpinor::scalar_object chi;
-  typename SiteHalfSpinor::scalar_object Uchi;
-  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteHalfSpinor tmp;
-  SiteSpinor   result;
-#endif
-  typedef typename SiteSpinor::scalar_type scalar_type;
-  typedef typename SiteSpinor::vector_type vector_type;
-  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
-
-  uint64_t lane_offset= get_my_lane_offset(Nsimd);
-  uint64_t lanes      = get_my_lanes(Nsimd);
-
-  //  printf (" sU %d s %d Nsimd %d lanes %ld lane_off %ld\n",sU, s, Nsimd, lanes, lane_offset);
-
-  StencilEntry *SE_mem;
-  StencilEntry SE;
-  int ptype;
-  // Forces some degree of coalesce on the table look ups
-  // Could also use wide load instructions on the data structure
-  uint64_t ssF = Ls * sU;
-  uint64_t sF  = ssF + s;
-
-#ifndef __CUDA_ARCH__
-  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
-#else
-  int lane = lane_offset; {
-#endif
-    SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); 
-    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
-    spReconXm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
-    accumReconYm(result, Uchi);
-      
-    SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
-    accumReconZm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
-    accumReconTm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
-    accumReconXp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
-    accumReconYp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
-    accumReconZp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); 
-    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
-    accumReconTp(result, Uchi);
-
-#ifdef GPU_VEC
-    insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
-  }
-
-};
-
-// Template specialise Gparity to empty for now
-#define GPU_EMPTY(A)							\
-  template <>								\
-accelerator_inline void							\
-WilsonKernels<A>::GpuDhopSite(StencilView &st,				\
-			      SiteDoubledGaugeField &U,			\
-			      SiteHalfSpinor *buf, int Ls, int sF,	\
-			      int sU,					\
-			      const FermionFieldView &in,		\
-			      FermionFieldView &out) { assert(0);};	\
-  template <>								\
-  accelerator_inline void							\
-  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,			\
-				   DoubledGaugeFieldView &U,		\
-				   SiteHalfSpinor *buf, int Ls,int sF,	\
-				   int sU,				\
-				   const FermionFieldView &in,		\
-				   FermionFieldView &out) { assert(0);};
-
-GPU_EMPTY(GparityWilsonImplF);
-GPU_EMPTY(GparityWilsonImplFH);
-GPU_EMPTY(GparityWilsonImplD);
-GPU_EMPTY(GparityWilsonImplDF);
-
-template <class Impl>
-void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
-				     int Ls, int Nsite, const FermionField &in, FermionField &out,
-				     int interior,int exterior) 
-{
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  //	  uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   s  = cur%Ls;
-	  //	  uint64_t   sF = cur;         
-	  cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        WilsonKernels<Impl>::DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
-  template <class Impl>
-  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
-				     int Ls, int Nsite, const FermionField &in, FermionField &out,
-				     int interior,int exterior) 
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
-
-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  // uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   s  = cur%Ls;
-	  //uint64_t   sF = cur;         
-	  cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        WilsonKernels<Impl>::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
-
-
-/*
-GPU_EMPTY(DomainWallVec5dImplF);
-GPU_EMPTY(DomainWallVec5dImplFH);
-GPU_EMPTY(DomainWallVec5dImplD);
-GPU_EMPTY(DomainWallVec5dImplDF);
-GPU_EMPTY(ZDomainWallVec5dImplF);
-GPU_EMPTY(ZDomainWallVec5dImplFH);
-GPU_EMPTY(ZDomainWallVec5dImplD);
-GPU_EMPTY(ZDomainWallVec5dImplDF);
-*/
-
-FermOpTemplateInstantiate(WilsonKernels);
-AdjointFermOpTemplateInstantiate(WilsonKernels);
-TwoIndexFermOpTemplateInstantiate(WilsonKernels);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHand.cc
@@ -1,654 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-#define REGISTER
-
-#define LOAD_CHIMU \
-  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=ref()(0)(0);\
-    Chimu_01=ref()(0)(1);\
-    Chimu_02=ref()(0)(2);\
-    Chimu_10=ref()(1)(0);\
-    Chimu_11=ref()(1)(1);\
-    Chimu_12=ref()(1)(2);\
-    Chimu_20=ref()(2)(0);\
-    Chimu_21=ref()(2)(1);\
-    Chimu_22=ref()(2)(2);\
-    Chimu_30=ref()(3)(0);\
-    Chimu_31=ref()(3)(1);\
-    Chimu_32=ref()(3)(2);}
-
-#define LOAD_CHI\
-  {const SiteHalfSpinor &ref(buf[offset]);	\
-    Chi_00 = ref()(0)(0);\
-    Chi_01 = ref()(0)(1);\
-    Chi_02 = ref()(0)(2);\
-    Chi_10 = ref()(1)(0);\
-    Chi_11 = ref()(1)(1);\
-    Chi_12 = ref()(1)(2);}
-
-// To splat or not to splat depends on the implementation
-#define MULT_2SPIN(A)\
-  {auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));	\
-   Impl::loadLinkElement(U_10,ref()(1,0));	\
-   Impl::loadLinkElement(U_20,ref()(2,0));	\
-   Impl::loadLinkElement(U_01,ref()(0,1));	\
-   Impl::loadLinkElement(U_11,ref()(1,1));	\
-   Impl::loadLinkElement(U_21,ref()(2,1));	\
-    UChi_00 = U_00*Chi_00;\
-    UChi_10 = U_00*Chi_10;\
-    UChi_01 = U_10*Chi_00;\
-    UChi_11 = U_10*Chi_10;\
-    UChi_02 = U_20*Chi_00;\
-    UChi_12 = U_20*Chi_10;\
-    UChi_00+= U_01*Chi_01;\
-    UChi_10+= U_01*Chi_11;\
-    UChi_01+= U_11*Chi_01;\
-    UChi_11+= U_11*Chi_11;\
-    UChi_02+= U_21*Chi_01;\
-    UChi_12+= U_21*Chi_11;\
-    Impl::loadLinkElement(U_00,ref()(0,2));	\
-    Impl::loadLinkElement(U_10,ref()(1,2));	\
-    Impl::loadLinkElement(U_20,ref()(2,2));	\
-    UChi_00+= U_00*Chi_02;\
-    UChi_10+= U_00*Chi_12;\
-    UChi_01+= U_10*Chi_02;\
-    UChi_11+= U_10*Chi_12;\
-    UChi_02+= U_20*Chi_02;\
-    UChi_12+= U_20*Chi_12;}
-
-
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
-
-//      hspin(0)=fspin(0)+timesI(fspin(3));
-//      hspin(1)=fspin(1)+timesI(fspin(2));
-#define XP_PROJ \
-    Chi_00 = Chimu_00+timesI(Chimu_30);\
-    Chi_01 = Chimu_01+timesI(Chimu_31);\
-    Chi_02 = Chimu_02+timesI(Chimu_32);\
-    Chi_10 = Chimu_10+timesI(Chimu_20);\
-    Chi_11 = Chimu_11+timesI(Chimu_21);\
-    Chi_12 = Chimu_12+timesI(Chimu_22);
-
-#define YP_PROJ \
-    Chi_00 = Chimu_00-Chimu_30;\
-    Chi_01 = Chimu_01-Chimu_31;\
-    Chi_02 = Chimu_02-Chimu_32;\
-    Chi_10 = Chimu_10+Chimu_20;\
-    Chi_11 = Chimu_11+Chimu_21;\
-    Chi_12 = Chimu_12+Chimu_22;
-
-#define ZP_PROJ \
-  Chi_00 = Chimu_00+timesI(Chimu_20);		\
-  Chi_01 = Chimu_01+timesI(Chimu_21);		\
-  Chi_02 = Chimu_02+timesI(Chimu_22);		\
-  Chi_10 = Chimu_10-timesI(Chimu_30);		\
-  Chi_11 = Chimu_11-timesI(Chimu_31);		\
-  Chi_12 = Chimu_12-timesI(Chimu_32);
-
-#define TP_PROJ \
-  Chi_00 = Chimu_00+Chimu_20;		\
-  Chi_01 = Chimu_01+Chimu_21;		\
-  Chi_02 = Chimu_02+Chimu_22;		\
-  Chi_10 = Chimu_10+Chimu_30;		\
-  Chi_11 = Chimu_11+Chimu_31;		\
-  Chi_12 = Chimu_12+Chimu_32;
-
-
-//      hspin(0)=fspin(0)-timesI(fspin(3));
-//      hspin(1)=fspin(1)-timesI(fspin(2));
-#define XM_PROJ \
-    Chi_00 = Chimu_00-timesI(Chimu_30);\
-    Chi_01 = Chimu_01-timesI(Chimu_31);\
-    Chi_02 = Chimu_02-timesI(Chimu_32);\
-    Chi_10 = Chimu_10-timesI(Chimu_20);\
-    Chi_11 = Chimu_11-timesI(Chimu_21);\
-    Chi_12 = Chimu_12-timesI(Chimu_22);
-
-#define YM_PROJ \
-    Chi_00 = Chimu_00+Chimu_30;\
-    Chi_01 = Chimu_01+Chimu_31;\
-    Chi_02 = Chimu_02+Chimu_32;\
-    Chi_10 = Chimu_10-Chimu_20;\
-    Chi_11 = Chimu_11-Chimu_21;\
-    Chi_12 = Chimu_12-Chimu_22;
-
-#define ZM_PROJ \
-  Chi_00 = Chimu_00-timesI(Chimu_20);		\
-  Chi_01 = Chimu_01-timesI(Chimu_21);		\
-  Chi_02 = Chimu_02-timesI(Chimu_22);		\
-  Chi_10 = Chimu_10+timesI(Chimu_30);		\
-  Chi_11 = Chimu_11+timesI(Chimu_31);		\
-  Chi_12 = Chimu_12+timesI(Chimu_32);
-
-#define TM_PROJ \
-  Chi_00 = Chimu_00-Chimu_20;		\
-  Chi_01 = Chimu_01-Chimu_21;		\
-  Chi_02 = Chimu_02-Chimu_22;		\
-  Chi_10 = Chimu_10-Chimu_30;		\
-  Chi_11 = Chimu_11-Chimu_31;		\
-  Chi_12 = Chimu_12-Chimu_32;
-
-//      fspin(0)=hspin(0);
-//      fspin(1)=hspin(1);
-//      fspin(2)=timesMinusI(hspin(1));
-//      fspin(3)=timesMinusI(hspin(0));
-#define XP_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesMinusI(UChi_10);\
-  result_21 = timesMinusI(UChi_11);\
-  result_22 = timesMinusI(UChi_12);\
-  result_30 = timesMinusI(UChi_00);\
-  result_31 = timesMinusI(UChi_01);\
-  result_32 = timesMinusI(UChi_02);
-
-#define XP_RECON_ACCUM\
-  result_00+=UChi_00;\
-  result_01+=UChi_01;\
-  result_02+=UChi_02;\
-  result_10+=UChi_10;\
-  result_11+=UChi_11;\
-  result_12+=UChi_12;\
-  result_20-=timesI(UChi_10);\
-  result_21-=timesI(UChi_11);\
-  result_22-=timesI(UChi_12);\
-  result_30-=timesI(UChi_00);\
-  result_31-=timesI(UChi_01);\
-  result_32-=timesI(UChi_02);
-
-#define XM_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesI(UChi_10);\
-  result_21 = timesI(UChi_11);\
-  result_22 = timesI(UChi_12);\
-  result_30 = timesI(UChi_00);\
-  result_31 = timesI(UChi_01);\
-  result_32 = timesI(UChi_02);
-
-#define XM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_10);\
-  result_21+= timesI(UChi_11);\
-  result_22+= timesI(UChi_12);\
-  result_30+= timesI(UChi_00);\
-  result_31+= timesI(UChi_01);\
-  result_32+= timesI(UChi_02);
-
-#define YP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_10;\
-  result_21+= UChi_11;\
-  result_22+= UChi_12;\
-  result_30-= UChi_00;\
-  result_31-= UChi_01;\
-  result_32-= UChi_02;
-
-#define YM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_10;\
-  result_21-= UChi_11;\
-  result_22-= UChi_12;\
-  result_30+= UChi_00;\
-  result_31+= UChi_01;\
-  result_32+= UChi_02;
-
-#define ZP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= timesI(UChi_00);			\
-  result_21-= timesI(UChi_01);			\
-  result_22-= timesI(UChi_02);			\
-  result_30+= timesI(UChi_10);			\
-  result_31+= timesI(UChi_11);			\
-  result_32+= timesI(UChi_12);
-
-#define ZM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_00);			\
-  result_21+= timesI(UChi_01);			\
-  result_22+= timesI(UChi_02);			\
-  result_30-= timesI(UChi_10);			\
-  result_31-= timesI(UChi_11);			\
-  result_32-= timesI(UChi_12);
-
-#define TP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_00;			\
-  result_21+= UChi_01;			\
-  result_22+= UChi_02;			\
-  result_30+= UChi_10;			\
-  result_31+= UChi_11;			\
-  result_32+= UChi_12;
-
-#define TM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_00;	\
-  result_21-= UChi_01;	\
-  result_22-= UChi_02;	\
-  result_30-= UChi_10;	\
-  result_31-= UChi_11;	\
-  result_32-= UChi_12;
-
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU;					\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI;					\
-  }						\
-  MULT_2SPIN(DIR);				\
-  RECON;					
-
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU;					\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI;					\
-  }						\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-  }
-
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI;					\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-    nmu++;					\
-  }
-
-#define HAND_RESULT(ss)				\
-  {						\
-    SiteSpinor & ref (out[ss]);		\
-    vstream(ref()(0)(0),result_00);		\
-    vstream(ref()(0)(1),result_01);		\
-    vstream(ref()(0)(2),result_02);		\
-    vstream(ref()(1)(0),result_10);		\
-    vstream(ref()(1)(1),result_11);		\
-    vstream(ref()(1)(2),result_12);		\
-    vstream(ref()(2)(0),result_20);		\
-    vstream(ref()(2)(1),result_21);		\
-    vstream(ref()(2)(2),result_22);		\
-    vstream(ref()(3)(0),result_30);		\
-    vstream(ref()(3)(1),result_31);		\
-    vstream(ref()(3)(2),result_32);		\
-  }
-
-#define HAND_RESULT_EXT(ss)			\
-  if (nmu){					\
-    SiteSpinor & ref (out[ss]);		\
-    ref()(0)(0)+=result_00;		\
-    ref()(0)(1)+=result_01;		\
-    ref()(0)(2)+=result_02;		\
-    ref()(1)(0)+=result_10;		\
-    ref()(1)(1)+=result_11;		\
-    ref()(1)(2)+=result_12;		\
-    ref()(2)(0)+=result_20;		\
-    ref()(2)(1)+=result_21;		\
-    ref()(2)(2)+=result_22;		\
-    ref()(3)(0)+=result_30;		\
-    ref()(3)(1)+=result_31;		\
-    ref()(3)(2)+=result_32;		\
-  }
-
-
-#define HAND_DECLARATIONS(a)			\
-  Simd result_00;				\
-  Simd result_01;				\
-  Simd result_02;				\
-  Simd result_10;				\
-  Simd result_11;				\
-  Simd result_12;				\
-  Simd result_20;				\
-  Simd result_21;				\
-  Simd result_22;				\
-  Simd result_30;				\
-  Simd result_31;				\
-  Simd result_32;				\
-  Simd Chi_00;					\
-  Simd Chi_01;					\
-  Simd Chi_02;					\
-  Simd Chi_10;					\
-  Simd Chi_11;					\
-  Simd Chi_12;					\
-  Simd UChi_00;					\
-  Simd UChi_01;					\
-  Simd UChi_02;					\
-  Simd UChi_10;					\
-  Simd UChi_11;					\
-  Simd UChi_12;					\
-  Simd U_00;					\
-  Simd U_10;					\
-  Simd U_20;					\
-  Simd U_01;					\
-  Simd U_11;					\
-  Simd U_21;
-
-#define ZERO_RESULT				\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-NAMESPACE_BEGIN(Grid);
-
-/**
- * Hand-unrolled hopping term for one site.
- *
- * Visits the eight stencil legs in the order Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm. For every
- * leg the neighbour is either loaded from `in` and spin-projected (local entry,
- * with an optional permute) or loaded as a half spinor from `buf`, then
- * multiplied by the link U[sU](DIR) and reconstructed into the result registers.
- * The first leg assigns (XM_RECON), the remaining legs accumulate, and the
- * result is streamed to out[ss].
- *
- * When GRID_NVCC is defined the body is replaced by assert(0).
- */
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-                                  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  // Permute index per direction, as passed below: T==0, Z==1, Y==2, X==3;
-  // expect 1,2,2,2 simd layout etc...
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-/**
- * Hand-unrolled daggered hopping term for one site.
- *
- * Same leg order and structure as HandDhopSite, but with the opposite
- * projector on every leg (XP/YP/ZP/TP on the forward legs, XM/YM/ZM/TM on the
- * backward legs). The first leg assigns (XP_RECON), the others accumulate, and
- * the result is streamed to out[ss].
- *
- * When GRID_NVCC is defined the body is replaced by assert(0).
- */
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-                                          int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-/**
- * Hand-unrolled hopping term for one site, interior contribution only.
- *
- * The result registers are zeroed first. A leg contributes only when its
- * stencil entry is local or st.same_node[DIR] is set; all other legs are
- * skipped. Every contributing leg accumulates, and the (possibly all-zero)
- * result is always streamed to out[ss].
- *
- * When GRID_NVCC is defined the body is replaced by assert(0).
- */
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-                                     int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  // Permute index per direction, as passed below: T==0, Z==1, Y==2, X==3;
-  // expect 1,2,2,2 simd layout etc...
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-/**
- * Hand-unrolled daggered hopping term for one site, interior contribution only.
- *
- * Mirrors HandDhopSiteInt with the opposite projector on every leg: the
- * result registers are zeroed, legs whose stencil entry is local or whose
- * st.same_node[DIR] flag is set accumulate, and the result is streamed to
- * out[ss].
- *
- * When GRID_NVCC is defined the body is replaced by assert(0).
- */
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-                                             int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-/**
- * Hand-unrolled hopping term for one site, exterior contribution only.
- *
- * The result registers are zeroed first. A leg contributes only when its
- * stencil entry is not local and st.same_node[DIR] is not set; such a leg
- * loads its half spinor from `buf`, accumulates, and bumps `nmu`. The result
- * is added into out[ss] only if at least one leg contributed (nmu != 0);
- * otherwise out[ss] is left untouched.
- *
- * When GRID_NVCC is defined the body is replaced by assert(0).
- */
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-                                     int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  // Direction numbering used by the leg macros: T==0, Z==1, Y==2, X==3;
-  // expect 1,2,2,2 simd layout etc...
-  HAND_DECLARATIONS(ignore);
-
-  // The exterior leg macro only needs the entry pointer, its offset and ptype.
-  int offset, ptype;
-  StencilEntry *SE;
-  int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
-#else
-  assert(0);
-#endif
-}
-
-/**
- * Hand-unrolled daggered hopping term for one site, exterior contribution only.
- *
- * Mirrors HandDhopSiteExt with the opposite projector on every leg: the
- * result registers are zeroed, only legs whose stencil entry is neither local
- * nor flagged in st.same_node[DIR] accumulate (each bumping `nmu`), and the
- * result is added into out[ss] only when nmu != 0.
- *
- * When GRID_NVCC is defined the body is replaced by assert(0).
- */
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-                                             int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  HAND_DECLARATIONS(ignore);
-
-  // The exterior leg macro only needs the entry pointer, its offset and ptype.
-  StencilEntry *SE;
-  int offset, ptype;
-  int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
-#else
-  assert(0);
-#endif
-}
-
-////////////// Wilson ; uses this implementation /////////////////////
-
-// Parameter list shared by all six hand-unrolled kernel entry points.
-#define HAND_KERNEL_PARAMS \
-  StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-  int ss,int sU,const FermionFieldView &in, FermionFieldView &out
-
-// One explicit instantiation of WilsonKernels<A>::KERNEL.
-#define HAND_INSTANTIATE_ONE(A,KERNEL) \
-  template void WilsonKernels<A>::KERNEL(HAND_KERNEL_PARAMS);
-
-// Explicitly instantiate every hand-unrolled kernel for implementation A.
-#define INSTANTIATE_THEM(A) \
-  HAND_INSTANTIATE_ONE(A,HandDhopSite)       \
-  HAND_INSTANTIATE_ONE(A,HandDhopSiteDag)    \
-  HAND_INSTANTIATE_ONE(A,HandDhopSiteInt)    \
-  HAND_INSTANTIATE_ONE(A,HandDhopSiteDagInt) \
-  HAND_INSTANTIATE_ONE(A,HandDhopSiteExt)    \
-  HAND_INSTANTIATE_ONE(A,HandDhopSiteDagExt)
-
-// Wilson, single and double precision, real and complex coefficients
-INSTANTIATE_THEM(WilsonImplF);
-INSTANTIATE_THEM(WilsonImplD);
-INSTANTIATE_THEM(ZWilsonImplF);
-INSTANTIATE_THEM(ZWilsonImplD);
-// 5d-vectorised domain wall
-INSTANTIATE_THEM(DomainWallVec5dImplF);
-INSTANTIATE_THEM(DomainWallVec5dImplD);
-INSTANTIATE_THEM(ZDomainWallVec5dImplF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplD);
-// FH / DF suffixed variants of the above
-INSTANTIATE_THEM(WilsonImplFH);
-INSTANTIATE_THEM(WilsonImplDF);
-INSTANTIATE_THEM(ZWilsonImplFH);
-INSTANTIATE_THEM(ZWilsonImplDF);
-INSTANTIATE_THEM(DomainWallVec5dImplFH);
-INSTANTIATE_THEM(DomainWallVec5dImplDF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
-INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-// Two-index antisymmetric representation
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
@@ -1,943 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-#define REGISTER
-
-#define LOAD_CHIMU_BODY(F)			\
-  Chimu_00=ref(F)(0)(0);			\
-  Chimu_01=ref(F)(0)(1);			\
-  Chimu_02=ref(F)(0)(2);			\
-  Chimu_10=ref(F)(1)(0);			\
-  Chimu_11=ref(F)(1)(1);			\
-  Chimu_12=ref(F)(1)(2);			\
-  Chimu_20=ref(F)(2)(0);			\
-  Chimu_21=ref(F)(2)(1);			\
-  Chimu_22=ref(F)(2)(2);			\
-  Chimu_30=ref(F)(3)(0);			\
-  Chimu_31=ref(F)(3)(1);			\
-  Chimu_32=ref(F)(3)(2)
-
-#define LOAD_CHIMU(DIR,F,PERM)						\
-  { const SiteSpinor & ref (in[offset]); LOAD_CHIMU_BODY(F); }
-
-#define LOAD_CHI_BODY(F)				\
-    Chi_00 = ref(F)(0)(0);\
-    Chi_01 = ref(F)(0)(1);\
-    Chi_02 = ref(F)(0)(2);\
-    Chi_10 = ref(F)(1)(0);\
-    Chi_11 = ref(F)(1)(1);\
-    Chi_12 = ref(F)(1)(2)
-
-#define LOAD_CHI(DIR,F,PERM)					\
-  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
-
-
-//G-parity implementations using in-place intrinsic ops
-
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-//0h,1l -> 1l,0h
-//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
-//Pulled fermion through forwards face, GPBC on upper component
-//Need 0= 0l 1h   1= 1l 0h
-//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
-//Pulled fermion through backwards face, GPBC on lower component
-//Need 0= 1l 0h   1= 0l 1h
-
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(1)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-//0l 0h -> 0h 0l
-//1l 1h, 0h 0l -> 1l 0h, 1h 0l
-#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(0)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-
-
-
-#define LOAD_CHI_SETUP(DIR,F)						\
-  g = F;								\
-  direction = st._directions[DIR];				\
-  distance = st._distances[DIR];				\
-  sl = st._simd_layout[direction];			        \
-  inplace_twist = 0;						\
-  if(SE->_around_the_world && st.parameters.twists[DIR % 4]){		\
-    if(sl == 1){							\
-      g = (F+1) % 2;							\
-    }else{								\
-      inplace_twist = 1;						\
-    }									\
-  }  
-
-#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
-  { const SiteSpinor &ref(in[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHIMU_BODY(g);						\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      }else{								\
-	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      } \
-    } \
-  }
-
-
-#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
-  { const SiteHalfSpinor &ref(buf[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHI_BODY(g);							\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }else{								\
-	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }									\
-    }									\
-  }
-
-
-#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
-#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
-
-// To splat or not to splat depends on the implementation
-#define MULT_2SPIN_BODY \
-  Impl::loadLinkElement(U_00,ref()(0,0));	\
-  Impl::loadLinkElement(U_10,ref()(1,0));	\
-  Impl::loadLinkElement(U_20,ref()(2,0));	\
-  Impl::loadLinkElement(U_01,ref()(0,1));	\
-  Impl::loadLinkElement(U_11,ref()(1,1));	\
-  Impl::loadLinkElement(U_21,ref()(2,1));	\
-  UChi_00 = U_00*Chi_00;			\
-  UChi_10 = U_00*Chi_10;			\
-  UChi_01 = U_10*Chi_00;			\
-  UChi_11 = U_10*Chi_10;			\
-  UChi_02 = U_20*Chi_00;			\
-  UChi_12 = U_20*Chi_10;			\
-  UChi_00+= U_01*Chi_01;			\
-  UChi_10+= U_01*Chi_11;			\
-  UChi_01+= U_11*Chi_01;			\
-  UChi_11+= U_11*Chi_11;			\
-  UChi_02+= U_21*Chi_01;			\
-  UChi_12+= U_21*Chi_11;			\
-  Impl::loadLinkElement(U_00,ref()(0,2));	\
-  Impl::loadLinkElement(U_10,ref()(1,2));	\
-  Impl::loadLinkElement(U_20,ref()(2,2));	\
-  UChi_00+= U_00*Chi_02;			\
-  UChi_10+= U_00*Chi_12;			\
-  UChi_01+= U_10*Chi_02;			\
-  UChi_11+= U_10*Chi_12;			\
-  UChi_02+= U_20*Chi_02;			\
-  UChi_12+= U_20*Chi_12
-
-
-#define MULT_2SPIN(A,F)					\
-  {auto & ref(U[sU](A)); MULT_2SPIN_BODY; }
-
-#define MULT_2SPIN_GPARITY(A,F)				\
-  {auto & ref(U[sU](F)(A)); MULT_2SPIN_BODY; }
-
-
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
-
-//      hspin(0)=fspin(0)+timesI(fspin(3));
-//      hspin(1)=fspin(1)+timesI(fspin(2));
-#define XP_PROJ \
-    Chi_00 = Chimu_00+timesI(Chimu_30);\
-    Chi_01 = Chimu_01+timesI(Chimu_31);\
-    Chi_02 = Chimu_02+timesI(Chimu_32);\
-    Chi_10 = Chimu_10+timesI(Chimu_20);\
-    Chi_11 = Chimu_11+timesI(Chimu_21);\
-    Chi_12 = Chimu_12+timesI(Chimu_22);
-
-#define YP_PROJ \
-    Chi_00 = Chimu_00-Chimu_30;\
-    Chi_01 = Chimu_01-Chimu_31;\
-    Chi_02 = Chimu_02-Chimu_32;\
-    Chi_10 = Chimu_10+Chimu_20;\
-    Chi_11 = Chimu_11+Chimu_21;\
-    Chi_12 = Chimu_12+Chimu_22;
-
-#define ZP_PROJ \
-  Chi_00 = Chimu_00+timesI(Chimu_20);		\
-  Chi_01 = Chimu_01+timesI(Chimu_21);		\
-  Chi_02 = Chimu_02+timesI(Chimu_22);		\
-  Chi_10 = Chimu_10-timesI(Chimu_30);		\
-  Chi_11 = Chimu_11-timesI(Chimu_31);		\
-  Chi_12 = Chimu_12-timesI(Chimu_32);
-
-#define TP_PROJ \
-  Chi_00 = Chimu_00+Chimu_20;		\
-  Chi_01 = Chimu_01+Chimu_21;		\
-  Chi_02 = Chimu_02+Chimu_22;		\
-  Chi_10 = Chimu_10+Chimu_30;		\
-  Chi_11 = Chimu_11+Chimu_31;		\
-  Chi_12 = Chimu_12+Chimu_32;
-
-
-//      hspin(0)=fspin(0)-timesI(fspin(3));
-//      hspin(1)=fspin(1)-timesI(fspin(2));
-#define XM_PROJ \
-    Chi_00 = Chimu_00-timesI(Chimu_30);\
-    Chi_01 = Chimu_01-timesI(Chimu_31);\
-    Chi_02 = Chimu_02-timesI(Chimu_32);\
-    Chi_10 = Chimu_10-timesI(Chimu_20);\
-    Chi_11 = Chimu_11-timesI(Chimu_21);\
-    Chi_12 = Chimu_12-timesI(Chimu_22);
-
-#define YM_PROJ \
-    Chi_00 = Chimu_00+Chimu_30;\
-    Chi_01 = Chimu_01+Chimu_31;\
-    Chi_02 = Chimu_02+Chimu_32;\
-    Chi_10 = Chimu_10-Chimu_20;\
-    Chi_11 = Chimu_11-Chimu_21;\
-    Chi_12 = Chimu_12-Chimu_22;
-
-#define ZM_PROJ \
-  Chi_00 = Chimu_00-timesI(Chimu_20);		\
-  Chi_01 = Chimu_01-timesI(Chimu_21);		\
-  Chi_02 = Chimu_02-timesI(Chimu_22);		\
-  Chi_10 = Chimu_10+timesI(Chimu_30);		\
-  Chi_11 = Chimu_11+timesI(Chimu_31);		\
-  Chi_12 = Chimu_12+timesI(Chimu_32);
-
-#define TM_PROJ \
-  Chi_00 = Chimu_00-Chimu_20;		\
-  Chi_01 = Chimu_01-Chimu_21;		\
-  Chi_02 = Chimu_02-Chimu_22;		\
-  Chi_10 = Chimu_10-Chimu_30;		\
-  Chi_11 = Chimu_11-Chimu_31;		\
-  Chi_12 = Chimu_12-Chimu_32;
-
-//      fspin(0)=hspin(0);
-//      fspin(1)=hspin(1);
-//      fspin(2)=timesMinusI(hspin(1));
-//      fspin(3)=timesMinusI(hspin(0));
-#define XP_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesMinusI(UChi_10);\
-  result_21 = timesMinusI(UChi_11);\
-  result_22 = timesMinusI(UChi_12);\
-  result_30 = timesMinusI(UChi_00);\
-  result_31 = timesMinusI(UChi_01);\
-  result_32 = timesMinusI(UChi_02);
-
-#define XP_RECON_ACCUM\
-  result_00+=UChi_00;\
-  result_01+=UChi_01;\
-  result_02+=UChi_02;\
-  result_10+=UChi_10;\
-  result_11+=UChi_11;\
-  result_12+=UChi_12;\
-  result_20-=timesI(UChi_10);\
-  result_21-=timesI(UChi_11);\
-  result_22-=timesI(UChi_12);\
-  result_30-=timesI(UChi_00);\
-  result_31-=timesI(UChi_01);\
-  result_32-=timesI(UChi_02);
-
-#define XM_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesI(UChi_10);\
-  result_21 = timesI(UChi_11);\
-  result_22 = timesI(UChi_12);\
-  result_30 = timesI(UChi_00);\
-  result_31 = timesI(UChi_01);\
-  result_32 = timesI(UChi_02);
-
-#define XM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_10);\
-  result_21+= timesI(UChi_11);\
-  result_22+= timesI(UChi_12);\
-  result_30+= timesI(UChi_00);\
-  result_31+= timesI(UChi_01);\
-  result_32+= timesI(UChi_02);
-
-#define YP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_10;\
-  result_21+= UChi_11;\
-  result_22+= UChi_12;\
-  result_30-= UChi_00;\
-  result_31-= UChi_01;\
-  result_32-= UChi_02;
-
-#define YM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_10;\
-  result_21-= UChi_11;\
-  result_22-= UChi_12;\
-  result_30+= UChi_00;\
-  result_31+= UChi_01;\
-  result_32+= UChi_02;
-
-#define ZP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= timesI(UChi_00);			\
-  result_21-= timesI(UChi_01);			\
-  result_22-= timesI(UChi_02);			\
-  result_30+= timesI(UChi_10);			\
-  result_31+= timesI(UChi_11);			\
-  result_32+= timesI(UChi_12);
-
-#define ZM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_00);			\
-  result_21+= timesI(UChi_01);			\
-  result_22+= timesI(UChi_02);			\
-  result_30-= timesI(UChi_10);			\
-  result_31-= timesI(UChi_11);			\
-  result_32-= timesI(UChi_12);
-
-#define TP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_00;			\
-  result_21+= UChi_01;			\
-  result_22+= UChi_02;			\
-  result_30+= UChi_10;			\
-  result_31+= UChi_11;			\
-  result_32+= UChi_12;
-
-#define TM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_00;	\
-  result_21-= UChi_01;	\
-  result_22-= UChi_02;	\
-  result_30-= UChi_10;	\
-  result_31-= UChi_11;	\
-  result_32-= UChi_12;
-
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-  }						\
-  MULT_2SPIN_IMPL(DIR,F);			\
-  RECON;					
-
-
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-  }						\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN_IMPL(DIR,F);			\
-    RECON;					\
-  }
-
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  perm   = SE->_permute;				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-    MULT_2SPIN_IMPL(DIR,F);			\
-    RECON;					\
-    nmu++;					\
-  }
-
-#define HAND_RESULT(ss,F)			\
-  {						\
-    SiteSpinor & ref (out[ss]);		\
-    vstream(ref(F)(0)(0),result_00);		\
-    vstream(ref(F)(0)(1),result_01);		\
-    vstream(ref(F)(0)(2),result_02);		\
-    vstream(ref(F)(1)(0),result_10);		\
-    vstream(ref(F)(1)(1),result_11);		\
-    vstream(ref(F)(1)(2),result_12);		\
-    vstream(ref(F)(2)(0),result_20);		\
-    vstream(ref(F)(2)(1),result_21);		\
-    vstream(ref(F)(2)(2),result_22);		\
-    vstream(ref(F)(3)(0),result_30);		\
-    vstream(ref(F)(3)(1),result_31);		\
-    vstream(ref(F)(3)(2),result_32);		\
-  }
-
-#define HAND_RESULT_EXT(ss,F)			\
-  if (nmu){					\
-    SiteSpinor & ref (out[ss]);		\
-    ref(F)(0)(0)+=result_00;		\
-    ref(F)(0)(1)+=result_01;		\
-    ref(F)(0)(2)+=result_02;		\
-    ref(F)(1)(0)+=result_10;		\
-    ref(F)(1)(1)+=result_11;		\
-    ref(F)(1)(2)+=result_12;		\
-    ref(F)(2)(0)+=result_20;		\
-    ref(F)(2)(1)+=result_21;		\
-    ref(F)(2)(2)+=result_22;		\
-    ref(F)(3)(0)+=result_30;		\
-    ref(F)(3)(1)+=result_31;		\
-    ref(F)(3)(2)+=result_32;		\
-  }
-
-
-#define HAND_DECLARATIONS(a)			\
-  Simd result_00;				\
-  Simd result_01;				\
-  Simd result_02;				\
-  Simd result_10;				\
-  Simd result_11;				\
-  Simd result_12;				\
-  Simd result_20;				\
-  Simd result_21;				\
-  Simd result_22;				\
-  Simd result_30;				\
-  Simd result_31;				\
-  Simd result_32;				\
-  Simd Chi_00;					\
-  Simd Chi_01;					\
-  Simd Chi_02;					\
-  Simd Chi_10;					\
-  Simd Chi_11;					\
-  Simd Chi_12;					\
-  Simd UChi_00;					\
-  Simd UChi_01;					\
-  Simd UChi_02;					\
-  Simd UChi_10;					\
-  Simd UChi_11;					\
-  Simd UChi_12;					\
-  Simd U_00;					\
-  Simd U_10;					\
-  Simd U_20;					\
-  Simd U_01;					\
-  Simd U_11;					\
-  Simd U_21;
-
-#define ZERO_RESULT				\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl> void  accelerator
-WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> accelerator
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
-#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> void accelerator
-WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> accelerator
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
-#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
-  ZERO_RESULT;							\
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_RESULT(ss,F)
-  
-  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> void accelerator
-WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset, perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;
-
-#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-
-  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-  perm++;
-#endif
-}
-
-template<class Impl>
-accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset, perm, ptype;
-  int nmu=0;
-
-#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-
-  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-  perm++;
-#endif
-}
-
-  ////////////////////////////////////////////////
-  // Specialise Gparity to simple implementation
-  ////////////////////////////////////////////////
-#define HAND_SPECIALISE_EMPTY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st,	       	\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st,	       	\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-
-
-#ifdef GRID_NVCC
-#define HAND_SPECIALISE_GPARITY(IMPL) HAND_SPECIALISE_EMPTY(IMPL)
-#else
-#define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-					    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    StencilEntry *SE;							\
-    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    int nmu=0;								\
-    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    int nmu=0;								\
-    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }
-#endif
-
-HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
-
-  
-////////////// Wilson ; uses this implementation /////////////////////
-
-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); 
-
-//INSTANTIATE_THEM(GparityWilsonImplF);
-//INSTANTIATE_THEM(GparityWilsonImplD);
-//INSTANTIATE_THEM(GparityWilsonImplFH);
-//INSTANTIATE_THEM(GparityWilsonImplDF);
-//INSTANTIATE_THEM(DomainWallVec5dImplFH);
-//INSTANTIATE_THEM(DomainWallVec5dImplDF);
-//INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
-//INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonTMFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.cc
@@ -1,97 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * BF sequence
- *
- void bfmbase<Float>::MooeeInv(Fermion_t psi, 
- Fermion_t chi, 
- int dag, int cb)
-
- double m    = this->mass;
- double tm   = this->twistedmass;
- double mtil = 4.0+this->mass;
-
- double sq = mtil*mtil + tm*tm;
-
- double a = mtil/sq;
- double b = -tm /sq;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-
- void bfmbase<Float>::Mooee(Fermion_t psi, 
- Fermion_t chi, 
- int dag,int cb)
- double a = 4.0+this->mass;
- double b = this->twistedmass;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-*/
-
-template<class Impl>
-void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = -this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = -tm /sq;
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = tm /sq;
-  axpibg5x(out,in,a,b);
-}
-
-FermOpTemplateInstantiate(WilsonTMFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
@@ -1,433 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
-						   GaugeField            &_Umu,
-						   GridCartesian         &FiveDimGrid,
-						   GridRedBlackCartesian &FiveDimRedBlackGrid,
-						   GridCartesian         &FourDimGrid,
-						   GridRedBlackCartesian &FourDimRedBlackGrid,
-						   RealD _mq1, RealD _mq2, RealD _mq3,
-						   RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
-  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-			    _shift, _pm, _M5, 1.0, 0.0, p)
-{
-  RealD eps = 1.0;
-  Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
-  assert(zdata->n == this->Ls);
-
-  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-
-  Approx::zolotarev_free(zdata);
-}
-
-/***************************************************************
- * Additional EOFA operators only called outside the inverter.
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  int Ls = this->Ls;
-
-  Din = Zero();
-  if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
-  else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-  else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
-  else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-}
-
-// This is just the identity for DWF
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-// This is just the identity for DWF
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-/*****************************************************************************************************/
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  FermionField Din(psi.Grid());
-
-  this->Meooe5D(psi, Din);
-  this->DW(Din, chi, DaggerNo);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  this->M5D(psi, chi);
-  return(norm2(chi));
-}
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  FermionField Din(psi.Grid());
-
-  this->DW(psi, Din, DaggerYes);
-  this->MeooeDag5D(Din, chi);
-  this->M5Ddag(psi, chi);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  return(norm2(chi));
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD shift = this->shift;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  Coeff_t shiftp(0.0), shiftm(0.0);
-  if(shift != 0.0){
-    if(pm == 1){ shiftp = shift*(mq3-mq2); }
-    else{ shiftm = -shift*(mq3-mq2); }
-  }
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5D(psi, chi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD shift = this->shift;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  Coeff_t shiftp(0.0), shiftm(0.0);
-  if(shift != 0.0){
-    if(pm == 1){ shiftp = shift*(mq3-mq2); }
-    else{ shiftm = -shift*(mq3-mq2); }
-  }
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5Ddag(psi, chi, chi, lower, diag, upper);
-}
-
-// half checkerboard operations
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  for(int s=0; s<Ls; s++){
-    upper[s] = -this->cee[s];
-    lower[s] = -this->cee[s];
-  }
-  upper[Ls-1] = this->dm;
-  lower[0]    = this->dp;
-
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  for(int s=0; s<Ls; s++){
-    upper[s] = -this->cee[s];
-    lower[s] = -this->cee[s];
-  }
-  upper[Ls-1] = this->dp;
-  lower[0]    = this->dm;
-
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-//Zolo
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-  RealD shift = this->shift;
-
-  ////////////////////////////////////////////////////////
-  // Constants for the preconditioned matrix Cayley form
-  ////////////////////////////////////////////////////////
-  this->bs.resize(Ls);
-  this->cs.resize(Ls);
-  this->aee.resize(Ls);
-  this->aeo.resize(Ls);
-  this->bee.resize(Ls);
-  this->beo.resize(Ls);
-  this->cee.resize(Ls);
-  this->ceo.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){
-    this->bee[i] = 4.0 - this->M5 + 1.0;
-    this->cee[i] = 1.0;
-  }
-
-  for(int i=0; i<Ls; ++i){
-    this->aee[i] = this->cee[i];
-    this->bs[i] = this->beo[i] = 1.0;
-    this->cs[i] = this->ceo[i] = 0.0;
-  }
-
-  //////////////////////////////////////////
-  // EOFA shift terms
-  //////////////////////////////////////////
-  if(pm == 1){
-    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-    this->dm = mq1*this->cee[Ls-1];
-  } else if(this->pm == -1) {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-  } else {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1];
-  }
-
-  //////////////////////////////////////////
-  // LDU decomposition of eeoo
-  //////////////////////////////////////////
-  this->dee.resize(Ls+1);
-  this->lee.resize(Ls);
-  this->leem.resize(Ls);
-  this->uee.resize(Ls);
-  this->ueem.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){
-
-    if(i < Ls-1){
-
-      this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-      this->leem[i] = this->dm/this->bee[i];
-      for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-      this->dee[i] = this->bee[i];
-
-      this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-      this->ueem[i] = this->dp / this->bee[0];
-      for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
-    } else {
-
-      this->lee[i]  = 0.0;
-      this->leem[i] = 0.0;
-      this->uee[i]  = 0.0;
-      this->ueem[i] = 0.0;
-
-    }
-  }
-
-  {
-    Coeff_t delta_d = 1.0 / this->bee[0];
-    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-  }
-
-  int inv = 1;
-  this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-  this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-}
-
-// Recompute Cayley-form coefficients for different shift
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						       Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  int Ls = this->Ls;
-
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-
-  for(int s=0; s<Ls-1; s++){
-    Pminus(s,s+1) = -this->cee[s];
-  }
-
-  for(int s=0; s<Ls-1; s++){
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-
-  Pplus (0,Ls-1) = this->dp;
-  Pminus(Ls-1,0) = this->dm;
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-
-#if(0)
-  std::cout << GridLogMessage << "Pplus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pplus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-  std::cout << GridLogMessage << "Pminus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pminus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-#endif
-
-  if(inv) {
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else {
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type*) &Vp;
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-FermOpTemplateInstantiate(DomainWallEOFAFermion);
-GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
@@ -1,255 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-// Pminus fowards
-// Pplus  backwards..
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  int Ls = this->Ls;
-  GridBase* grid = psi_i.Grid();
-  auto phi = phi_i.View();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-  
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    for(int s=0; s<Ls; s++){
-      auto tmp = psi[0];
-      if(s==0) {
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5m(tmp, psi[ss+0]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  int Ls = this->Ls;
-
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    auto tmp = psi[0];
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5p(tmp, psi[ss+0]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi=psi_i.View();
-  auto chi=chi_i.View();
-  int Ls = this->Ls;
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
-    }
-    spProj5m(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  int Ls = this->Ls;
-
-  assert(psi.Checkerboard() == psi.Checkerboard());
-
-  Vector<Coeff_t> ueec(Ls);
-  Vector<Coeff_t> deec(Ls+1);
-  Vector<Coeff_t> leec(Ls);
-  Vector<Coeff_t> ueemc(Ls);
-  Vector<Coeff_t> leemc(Ls);
-
-  for(int s=0; s<ueec.size(); s++){
-    ueec[s]  = conjugate(this->uee[s]);
-    deec[s]  = conjugate(this->dee[s]);
-    leec[s]  = conjugate(this->lee[s]);
-    ueemc[s] = conjugate(this->ueem[s]);
-    leemc[s] = conjugate(this->leem[s]);
-  }
-  deec[Ls] = conjugate(this->dee[Ls]);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss] = psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-    }
-    spProj5p(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
@@ -1,613 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-/*
- * Dense matrix versions of routines
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v==LLs-1) ? 0     : v+1;
-      int vm = (v==0)     ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-						   int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs;
-	  int lex = s2 + LLs*site;
-
-	  if( s2==0 && l==0 ){
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-	      SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-	    }}
-	}}
-
-      {
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-	  if((s2+l)==0) {
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-	  a0 = a0 + incr;
-	  a1 = a1 + incr;
-                        a2 = a2 + sizeof(typename Simd::scalar_type);
-	}
-      }
-
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-};
-
-// Z-mobius version
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-};
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  Vector<iSinglet<Simd> > Matp;
-  Vector<iSinglet<Simd> > Matm;
-  Vector<iSinglet<Simd> > *_Matp;
-  Vector<iSinglet<Simd> > *_Matm;
-
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if(inv && dag){
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  }
-
-  if(inv && (!dag)){
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    thread_loop((auto site=0; site<vol; site++){
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
@@ -1,497 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-					   GaugeField            &_Umu,
-					   GridCartesian         &FiveDimGrid,
-					   GridRedBlackCartesian &FiveDimRedBlackGrid,
-					   GridCartesian         &FourDimGrid,
-					   GridRedBlackCartesian &FourDimRedBlackGrid,
-					   RealD _mq1, RealD _mq2, RealD _mq3,
-					   RealD _shift, int _pm, RealD _M5,
-					   RealD _b, RealD _c, const ImplParams &p) :
-  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-			    _shift, _pm, _M5, _b, _c, p)
-{
-  int Ls = this->Ls;
-
-  RealD eps = 1.0;
-  Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
-  assert(zdata->n == this->Ls);
-
-  std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
-    ",c=" << _c << ") with Ls=" << Ls << std::endl;
-  this->SetCoefficientsTanh(zdata, _b, _c);
-  std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
-    ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
-    ",pm=" << _pm << ")" << std::endl;
-
-  Approx::zolotarev_free(zdata);
-
-  if(_shift != 0.0){
-    SetCoefficientsPrecondShiftOps();
-  } else {
-    Mooee_shift.resize(Ls, 0.0);
-    MooeeInv_shift_lc.resize(Ls, 0.0);
-    MooeeInv_shift_norm.resize(Ls, 0.0);
-    MooeeInvDag_shift_lc.resize(Ls, 0.0);
-    MooeeInvDag_shift_norm.resize(Ls, 0.0);
-  }
-}
-
-/****************************************************************
- * Additional EOFA operators only called outside the inverter.  
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  int Ls = this->Ls;
-  RealD alpha = this->alpha;
-
-  Din = Zero();
-  if((sign == 1) && (dag == 0)) { // \Omega_{+}
-    for(int s=0; s<Ls; ++s){
-      axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
-    }
-  } else if((sign == -1) && (dag == 0)) { // \Omega_{-}
-    for(int s=0; s<Ls; ++s){
-      axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
-    }
-  } else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
-    for(int sp=0; sp<Ls; ++sp){
-      axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
-    }
-  } else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
-    for(int sp=0; sp<Ls; ++sp){
-      axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
-    }
-  }
-}
-
-// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
-// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  int Ls    = this->Ls;
-  RealD b   = 0.5 * ( 1.0 + this->alpha );
-  RealD c   = 0.5 * ( 1.0 - this->alpha );
-  RealD mq1 = this->mq1;
-
-  for(int s=0; s<Ls; ++s){
-    if(s == 0) {
-      axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
-      axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
-    } else if(s == (Ls-1)) {
-      axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
-      axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
-    } else {
-      axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
-      axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
-    }
-  }
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-  RealD m = this->mq1;
-  RealD c = 0.5 * this->alpha;
-  RealD d = 0.5;
-
-  RealD DtInv_p(0.0), DtInv_m(0.0);
-  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
-  FermionField tmp(this->FermionGrid());
-
-  for(int s=0; s<Ls; ++s){
-    for(int sp=0; sp<Ls; ++sp){
-
-      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-
-      if(sp == 0){
-	axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
-	axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
-      } else {
-	axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
-	axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
-      }
-
-    }}
-}
-
-/*****************************************************************************************************/
-
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  FermionField Din(psi.Grid());
-
-  this->Meooe5D(psi, Din);
-  this->DW(Din, chi, DaggerNo);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  this->M5D(psi, chi);
-  return(norm2(chi));
-}
-
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  FermionField Din(psi.Grid());
-
-  this->DW(psi, Din, DaggerYes);
-  this->MeooeDag5D(Din, chi);
-  this->M5Ddag(psi, chi);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  return(norm2(chi));
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
-
-  // no shift term
-  if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
-
-  // fused M + shift operation
-  else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
-
-  // no shift term
-  if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
-
-  // fused M + shift operation
-  else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
-}
-
-// half checkerboard operations
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  // coefficients of Mooee
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int s=0; s<Ls; s++){
-    upper[s] = -this->cee[s];
-    lower[s] = -this->cee[s];
-  }
-  upper[Ls-1] *= -this->mq1;
-  lower[0]    *= -this->mq1;
-
-  // no shift term
-  if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
-
-  // fused M + shift operation
-  else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  // coefficients of MooeeDag
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int s=0; s<Ls; s++){
-    if(s==0) {
-      upper[s] = -this->cee[s+1];
-      lower[s] = this->mq1*this->cee[Ls-1];
-    } else if(s==(Ls-1)) {
-      upper[s] = this->mq1*this->cee[0];
-      lower[s] = -this->cee[s-1];
-    } else {
-      upper[s] = -this->cee[s+1];
-      lower[s] = -this->cee[s-1];
-    }
-  }
-
-  // no shift term
-  if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
-
-  // fused M + shift operation
-  else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
-}
-
-/****************************************************************************************/
-
-// Computes coefficients for applying Cayley preconditioned shift operators
-//  (Mooee + \Delta) --> Mooee_shift
-//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-// For the latter two cases, the operation takes the form
-//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-template<class Impl>
-void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD alpha = this->alpha;
-  RealD k     = this->k;
-  RealD mq1   = this->mq1;
-  RealD shift = this->shift;
-
-  // Initialize
-  Mooee_shift.resize(Ls);
-  MooeeInv_shift_lc.resize(Ls);
-  MooeeInv_shift_norm.resize(Ls);
-  MooeeInvDag_shift_lc.resize(Ls);
-  MooeeInvDag_shift_norm.resize(Ls);
-
-  // Construct Mooee_shift
-  int idx(0);
-  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
-    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-  for(int s=0; s<Ls; ++s){
-    idx = (pm == 1) ? (s) : (Ls-1-s);
-    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-  }
-
-  // Tridiagonal solve for MooeeInvDag_shift_lc
-  {
-    Coeff_t m(0.0);
-    Vector<Coeff_t> d = Mooee_shift;
-    Vector<Coeff_t> u(Ls,0.0);
-    Vector<Coeff_t> y(Ls,0.0);
-    Vector<Coeff_t> q(Ls,0.0);
-    if(pm == 1){ u[0] = 1.0; }
-    else{ u[Ls-1] = 1.0; }
-
-    // Tridiagonal matrix algorithm + Sherman-Morrison formula
-    //
-    // We solve
-    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-    // where Mooee' is the tridiagonal part of Mooee_{+}, and
-    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-    // so that the outer-product u \otimes v gives the (0,Ls-1)
-    // entry of Mooee_{+}.
-    //
-    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-    // and then construct the solution to the original system
-    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-    if(pm == 1){
-      for(int s=1; s<Ls; ++s){
-	m = -this->cee[s] / this->bee[s-1];
-	d[s] -= m*d[s-1];
-	u[s] -= m*u[s-1];
-      }
-    }
-    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-    for(int s=Ls-2; s>=0; --s){
-      if(pm == 1){
-	y[s] = d[s] / this->bee[s];
-	q[s] = u[s] / this->bee[s];
-      } else {
-	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-      }
-    }
-
-    // Construct MooeeInvDag_shift_lc
-    for(int s=0; s<Ls; ++s){
-      if(pm == 1){
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-      } else {
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-      }
-    }
-
-    // Compute remaining coefficients
-    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
-    for(int s=0; s<Ls; ++s){
-
-      // MooeeInv_shift_lc
-      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
-      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
-
-      // MooeeInv_shift_norm
-      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
-
-      // MooeeInvDag_shift_norm
-      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
-     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
-	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-    }
-  }
-}
-
-// Recompute coefficients for a different value of shift constant
-template<class Impl>
-void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-  if(new_shift != 0.0){
-    SetCoefficientsPrecondShiftOps();
-  } else {
-    int Ls = this->Ls;
-    Mooee_shift.resize(Ls,0.0);
-    MooeeInv_shift_lc.resize(Ls,0.0);
-    MooeeInv_shift_norm.resize(Ls,0.0);
-    MooeeInvDag_shift_lc.resize(Ls,0.0);
-    MooeeInvDag_shift_norm.resize(Ls,0.0);
-  }
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						   Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  int Ls = this->Ls;
-
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-
-  for(int s=0; s<Ls-1; s++){
-    Pminus(s,s+1) = -this->cee[s];
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-
-  Pplus (0,Ls-1) = this->mq1*this->cee[0];
-  Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-
-  if(this->shift != 0.0){
-    RealD c = 0.5 * this->alpha;
-    RealD d = 0.5;
-    RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-    if(this->pm == 1) {
-      for(int s=0; s<Ls; ++s){
-	Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-      }
-    } else {
-      for(int s=0; s<Ls; ++s){
-	Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-      }
-    }
-  }
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-
-  if(inv) {
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else {
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type*) &Vp;
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-FermOpTemplateInstantiate(MobiusEOFAFermion);
-GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermioncache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermioncache.h
@@ -1,445 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Fifth-dimension hopping term (cache-friendly layout, Ls contiguous sites per block).
-// For every s in [0,Ls):
-//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
-// where the s+1 / s-1 neighbours wrap around inside the block.
-// Also bumps the M5Dcalls counter and accumulates elapsed time in M5Dtime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    for(int s=0; s<Ls; s++){
-      // Neighbour selection: the s==0 rule takes precedence over the
-      // s==Ls-1 rule, exactly as in an if / else-if chain.
-      const bool first = (s==0);
-      const bool last  = !first && (s==(Ls-1));
-      const int  up    = last  ? ss+0    : ss+s+1;
-      const int  down  = first ? ss+Ls-1 : ss+s-1;
-
-      auto proj = psi[0]; // scratch spinor, overwritten by the projectors
-
-      spProj5m(proj, psi[up]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
-
-      spProj5p(proj, psi[down]);
-      chi[ss+s] = chi[ss+s] + lower[s]*proj;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Fifth-dimension hopping term with the shift contribution fused in.
-// For every s in [0,Ls):
-//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
-//          + shift_coeffs[s]*P(psi[shift_s])
-// with wrap-around neighbours, shift_s = Ls-1 and P = spProj5p when pm == 1,
-// otherwise shift_s = 0 and P = spProj5m.
-// Also bumps the M5Dcalls counter and accumulates elapsed time in M5Dtime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					Vector<Coeff_t> &shift_coeffs)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  // s-slice that feeds the shift operator
-  int shift_s = (this->pm == 1) ? (Ls-1) : 0;
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    for(int s=0; s<Ls; s++){
-      // Neighbour selection: the s==0 rule takes precedence over the
-      // s==Ls-1 rule, exactly as in an if / else-if chain.
-      const bool first = (s==0);
-      const bool last  = !first && (s==(Ls-1));
-      const int  up    = last  ? ss+0    : ss+s+1;
-      const int  down  = first ? ss+Ls-1 : ss+s-1;
-
-      auto proj = psi[0]; // scratch spinor, overwritten by the projectors
-
-      spProj5m(proj, psi[up]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
-
-      spProj5p(proj, psi[down]);
-      chi[ss+s] = chi[ss+s] + lower[s]*proj;
-
-      // Shift term: chirality of the projector follows pm.
-      if(this->pm == 1){
-	spProj5p(proj, psi[ss+shift_s]);
-      } else {
-	spProj5m(proj, psi[ss+shift_s]);
-      }
-      chi[ss+s] = chi[ss+s] + shift_coeffs[s]*proj;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Daggered fifth-dimension hopping term (cache-friendly layout).
-// Same structure as M5D with the two projectors exchanged. For every s in [0,Ls):
-//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
-// where the s+1 / s-1 neighbours wrap around inside the block.
-// Also bumps the M5Dcalls counter and accumulates elapsed time in M5Dtime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    auto proj = psi[0]; // scratch spinor shared by all s of this block
-    for(int s=0; s<Ls; s++){
-      // Neighbour selection: the s==0 rule takes precedence over the
-      // s==Ls-1 rule, exactly as in an if / else-if chain.
-      const bool first = (s==0);
-      const bool last  = !first && (s==(Ls-1));
-      const int  up    = last  ? ss+0    : ss+s+1;
-      const int  down  = first ? ss+Ls-1 : ss+s-1;
-
-      spProj5p(proj, psi[up]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
-
-      spProj5m(proj, psi[down]);
-      chi[ss+s] = chi[ss+s] + lower[s]*proj;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Daggered fifth-dimension hopping term with the shift contribution fused in.
-// For every s in [0,Ls):
-//   chi[s] gets diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
-//   (neighbours wrap around inside the block), and
-//   chi[shift_s] += shift_coeffs[s]*P(psi[s])
-// with shift_s = Ls-1 and P = spProj5p when pm == 1, otherwise shift_s = 0 and
-// P = spProj5m. Unlike M5D_shift, the shift term here scatters every psi[s]
-// into the single slice shift_s.
-// Also bumps the M5Dcalls counter and accumulates elapsed time in M5Dtime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					   Vector<Coeff_t> &shift_coeffs)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    // Cleared up front: when shift_s == Ls-1 the shift terms of s < Ls-1 are
-    // accumulated into this slice before its own hopping update runs.
-    chi[ss+Ls-1] = Zero();
-    auto tmp = psi[0];
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5p(tmp, psi[ss+0]);
-	// Adds to the existing value (not an overwrite) so that shift terms
-	// already accumulated in chi[ss+Ls-1] are kept.
-	chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-      // Shift term: project psi[s] with the chirality selected by pm and
-      // accumulate it into the shift slice.
-      if(this->pm == 1){ spProj5p(tmp, psi[ss+s]); }
-      else{ spProj5m(tmp, psi[ss+s]); }
-      chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Apply the inverse of the s-direction block to psi_i, writing the result to chi_i.
-// When this->shift is non-zero the work is delegated to MooeeInv_shift.
-// Otherwise each block of Ls sites is processed in four in-place sweeps over chi,
-// using the lee, leem, dee, ueem and uee coefficient arrays.
-// Also bumps the MooeeInvCalls counter and accumulates elapsed time in MooeeInvTime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &chi_i)
-{
-  // Dispatch first, as MooeeInvDag does: MooeeInv_shift sets the checkerboard and
-  // builds its own views, so doing that here beforehand would be wasted work.
-  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
-
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    auto tmp = psi[0]; // scratch spinor, overwritten by the projectors
-
-    // Apply (L^{\prime})^{-1}
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
-    }
-    // The last slice is rescaled only after the loop above has read it.
-    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// Inverse of the s-direction block including the shift contribution.
-// Runs the same sequence of sweeps as the unshifted MooeeInv and, in addition,
-// adds MooeeInv_shift_norm[s] * P(sum_j MooeeInv_shift_lc[j]*psi[j]) to every
-// chi[s], where P is spProj5p when pm == 1 and spProj5m otherwise.
-// Also bumps the MooeeInvCalls counter and accumulates elapsed time in MooeeInvTime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    // Scratch spinors: tmp1 for projections, tmp2 for the linear combination
-    // of psi slices, tmp2_spProj for its chiral projection.
-    auto tmp1        = psi[0];
-    auto tmp2        = psi[0];
-    auto tmp2_spProj = psi[0];
-
-    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    tmp2 = MooeeInv_shift_lc[0]*psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-      tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
-    }
-    if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else{ spProj5m(tmp2_spProj, tmp2); }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
-    }
-    // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-    // tmp1 is taken from chi[Ls-1] BEFORE the shift term is added; the U^{-1}
-    // sweep below consumes this unshifted projection.
-    spProj5m(tmp1, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-
-    // Apply U^{-1} and add shift term
-    for(int s=Ls-2; s>=0; s--){
-      // tmp1 holds spProj5m of the slice above as it was before its shift term.
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-      // Refresh tmp1 for the next iteration before adding this slice's shift term.
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// Apply the daggered inverse of the s-direction block to psi_i, writing to chi_i.
-// When this->shift is non-zero the work is delegated to MooeeInvDag_shift.
-// Otherwise each block of Ls sites is processed in four in-place sweeps over chi,
-// using the uee, ueem, dee, leem and lee coefficient arrays.
-// Also bumps the MooeeInvCalls counter and accumulates elapsed time in MooeeInvTime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionField &chi_i)
-{
-  // A non-zero shift is handled entirely by the dedicated variant.
-  if(this->shift != 0.0){
-    MooeeInvDag_shift(psi_i,chi_i);
-    return;
-  }
-
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    const int last = ss+Ls-1; // index of the final s-slice of this block
-    auto proj = psi[0];       // scratch spinor, overwritten by the projectors
-
-    // Apply (U^{\prime})^{-dag}: upward sweep, slice lo feeds slice lo+1
-    chi[ss] = psi[ss];
-    for(int lo=0; lo<Ls-1; lo++){
-      spProj5m(proj, chi[ss+lo]);
-      chi[ss+lo+1] = psi[ss+lo+1] - this->uee[lo]*proj;
-    }
-
-    // U_m^{-\dag}: fold every lower slice into the last one
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(proj, chi[ss+s]);
-      chi[last] = chi[last] - this->ueem[s]*proj;
-    }
-
-    // L_m^{-\dag} D^{-dag}: rescale the lower slices, then the last one
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(proj, chi[last]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*proj;
-    }
-    chi[last] = (1.0/this->dee[Ls-1])*chi[last];
-
-    // Apply L^{-dag}: downward sweep, slice hi feeds slice hi-1
-    for(int hi=Ls-1; hi>0; hi--){
-      spProj5p(proj, chi[ss+hi]);
-      chi[ss+hi-1] = chi[ss+hi-1] - this->lee[hi-1]*proj;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// Daggered inverse of the s-direction block including the shift contribution.
-// Runs the same sequence of sweeps as the unshifted MooeeInvDag and, in addition,
-// adds MooeeInvDag_shift_norm[s] * P(sum_j MooeeInvDag_shift_lc[j]*psi[j]) to
-// every chi[s], where P is spProj5p when pm == 1 and spProj5m otherwise.
-// Also bumps the MooeeInvCalls counter and accumulates elapsed time in MooeeInvTime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  int Ls = this->Ls;
-
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    // Scratch spinors: tmp1 for projections, tmp2 for the linear combination
-    // of psi slices, tmp2_spProj for its chiral projection.
-    auto tmp1        = psi[0];
-    auto tmp2        = psi[0];
-    auto tmp2_spProj = psi[0];
-
-    // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
-    chi[ss] = psi[ss];
-    tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
-      tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
-    }
-    if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else{ spProj5m(tmp2_spProj, tmp2); }
-
-    // U_m^{-\dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
-    }
-
-    // L_m^{-\dag} D^{-dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
-    }
-    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-    // tmp1 is taken from chi[Ls-1] BEFORE the shift term is added; the L^{-dag}
-    // sweep below consumes this unshifted projection.
-    spProj5p(tmp1, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
-
-    // Apply L^{-dag}
-    for(int s=Ls-2; s>=0; s--){
-      // tmp1 holds spProj5p of the slice above as it was before its shift term.
-      chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
-      // Refresh tmp1 for the next iteration before adding this slice's shift term.
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// Per-implementation instantiations of the routines in this file, compiled only
-// when MOBIUS_EOFA_DPERP_CACHE is defined. INSTANTIATE_DPERP_MOBIUS_EOFA is
-// defined elsewhere; presumably it explicitly instantiates the member templates
-// above for the given Impl -- confirm against its definition.
-#ifdef MOBIUS_EOFA_DPERP_CACHE
-
-// Wilson, G-parity Wilson and ZWilson implementations with F / D suffixes
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-// The same three families with FH / DF suffixes
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
@@ -1,998 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * Dense matrix versions of routines
- */
-// Inverse of the s-direction block: forwards to MooeeInternal with
-// DaggerNo / InverseYes.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-// Shifted inverse: issues exactly the same MooeeInternal call as MooeeInv.
-// NOTE(review): presumably the shift is already contained in the matrices
-// MooeeInternal applies -- confirm against its definition.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-// Daggered inverse of the s-direction block: forwards to MooeeInternal with
-// DaggerYes / InverseYes.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-// Shifted daggered inverse: issues exactly the same MooeeInternal call as
-// MooeeInvDag.
-// NOTE(review): presumably the shift is already contained in the matrices
-// MooeeInternal applies -- confirm against its definition.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-// Fifth-dimension hopping term for the layout in which the s-direction is
-// split across SIMD lanes: LLs = grid->_rdimensions[0] vectors per block and
-// Ls/LLs == Simd::Nsimd() (asserted below).
-// For each vector index v in [0,LLs):
-//   spin 0,1 of chi[v] = d[v]*phi[v] + l[v]*(spin 0,1 of psi[v-1])
-//   spin 2,3 of chi[v] = d[v]*phi[v] + u[v]*(spin 2,3 of psi[v+1])
-// with v+1 / v-1 wrapping around in LLs; a wrapped neighbour is additionally
-// passed through a lane rotation. The active code path hard-codes three
-// colour components (see assert(Nc == 3)).
-// Also bumps the M5Dcalls counter and accumulates elapsed time in M5Dtime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  // Lane-packed copies of the upper / lower / diag coefficients, one SIMD
-  // word per vector index.
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  // Scalar slot o*nsimd+i (vector o, lane i) receives the coefficient of
-  // s = o + i*LLs.
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  // The unrolled path below addresses colour indices 0..2 explicitly.
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    // Disabled alternative written with the generic projector / rotate /
-    // reconstruct helpers; never compiled.
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      // NOTE(review): for the final block ss+v+LLs lies beyond the range the
-      // loop iterates over -- confirm forming this operand is harmless.
-      vprefetch(psi[ss+v+LLs]);
-
-      // Cyclic neighbours within the block
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      // Spin components 2,3 of the v+1 neighbour (three colours each)
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      // Spin components 0,1 of the v-1 neighbour (three colours each)
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      // vp wrapped around: rotate the lanes of the v+1 neighbour.
-      // Presumably tRotate<2> moves the data by one complex element so the
-      // value comes from the adjacent lane -- confirm against Optimization::Rotate.
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // vm wrapped around: rotate the lanes of the v-1 neighbour by the
-      // complementary amount (2*Nsimd-2).
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Spin 0,1 rows combine d*phi with l*hm; spin 2,3 rows combine d*phi with u*hp.
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      // Write the twelve spin-colour components of chi[ss+v]
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Fifth-dimension hopping term with the shift contribution fused in, for the
-// layout in which the s-direction is split across SIMD lanes
-// (LLs = grid->_rdimensions[0] vectors per block, Ls/LLs == Simd::Nsimd()).
-// For each vector index v in [0,LLs):
-//   spin 0,1 of chi[v] = d[v]*phi[v] + l[v]*(spin 0,1 of psi[v-1])  [+ s[v]*hs when pm != 1]
-//   spin 2,3 of chi[v] = d[v]*phi[v] + u[v]*(spin 2,3 of psi[v+1])  [+ s[v]*hs when pm == 1]
-// where hs is read from psi[LLs-1] (spin 2,3) when pm == 1 and from psi[0]
-// (spin 0,1) otherwise, and is lane-rotated under the conditions below.
-// The active code path hard-codes three colour components (see assert(Nc == 3)).
-// Also bumps the M5Dcalls counter and accumulates elapsed time in M5Dtime.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
-					FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					Vector<Coeff_t>& shift_coeffs)
-{
-  // Disabled alternative: plain M5D followed by one axpby per s for the
-  // shift term; never compiled.
-#if 0
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-
-  this->M5D(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  // Lane-packed copies of the upper / lower / diag / shift coefficients, one
-  // SIMD word per vector index.
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  // Scalar slot o*nsimd+i (vector o, lane i) receives the coefficient of
-  // s = o + i*LLs.
-  // NOTE(review): the loop-local `int s` shadows the coefficient vector `s`
-  // declared above; inside this loop the vector is reachable only through s_p.
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  // The unrolled path below addresses colour indices 0..2 explicitly.
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    // Source of the shift term: vector LLs-1, spin 2,3 when pm == 1;
-    // vector 0, spin 0,1 otherwise. Loaded once per block.
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      // NOTE(review): for the final block ss+v+LLs lies beyond the range the
-      // loop iterates over -- confirm forming this operand is harmless.
-      vprefetch(psi[ss+v+LLs]);
-
-      // Cyclic neighbours within the block
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      // Spin components 2,3 of the v+1 neighbour (three colours each)
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      // Spin components 0,1 of the v-1 neighbour (three colours each)
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      // vp wrapped around: rotate the lanes of the v+1 neighbour.
-      // Presumably tRotate<2> moves the data by one complex element so the
-      // value comes from the adjacent lane -- confirm against Optimization::Rotate.
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // pm == 1: vs is LLs-1, so this fires only on the last v of the block.
-      // hs_* are rotated in place (they live outside the v loop).
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      // vm wrapped around: rotate the lanes of the v-1 neighbour by the
-      // complementary amount (2*Nsimd-2).
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // pm == -1: vs is 0, so this fires only on the first v of the block.
-      // NOTE(review): because hs_* are rotated in place, this rotation stays
-      // in effect for every later v, whereas the pm == 1 rotation above only
-      // affects the last v -- confirm this asymmetry is intended.
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      // Spin 0,1 rows: d*phi + l*hm, plus s*hs only in the pm != 1 arm (the
-      // trailing "+ s*hs" belongs to the third operand of the conditional).
-      // Spin 2,3 rows: d*phi + u*hp, plus s*hs only in the pm == 1 arm.
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      // Write the twelve spin-colour components of chi[ss+v]
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				     Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					   Vector<Coeff_t>& shift_coeffs)
-{
-#if 0
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-  this->M5Ddag(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-					       int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs;
-	  int lex = s2 + LLs*site;
-
-	  if( s2==0 && l==0 ){
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-	}}
-
-      {
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-
-	  if((s2+l)==0) {
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-
-	  a0 = a0 + incr;
-	  a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type);
-	}
-      }
-
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-};
-
-// Z-mobius version
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-};
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  Vector<iSinglet<Simd>>   Matp;
-  Vector<iSinglet<Simd>>   Matm;
-  Vector<iSinglet<Simd>>* _Matp;
-  Vector<iSinglet<Simd>>* _Matm;
-
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if(inv && dag){
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  }
-
-  if(inv && (!dag)){
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef MOBIUS_EOFA_DPERP_VEC
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
@@ -1,242 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-
-    Copyright (C) 2017
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// *NOT* EO
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
-{
-  FermionField temp(out.Grid());
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerNo);
-
-  // Clover term
-  Mooee(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
-{
-  FermionField temp(out.Grid());
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerYes);
-
-  // Clover term
-  MooeeDag(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-{
-  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu.Grid();
-  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
-
-  // Compute the field strength terms mu>nu
-  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
-
-  // Compute the Clover Operator acting on Colour and Spin
-  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
-
-  int lvol = _Umu.Grid()->lSites();
-  int DimRep = Impl::Dimension;
-
-  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-
-  Coordinate lcoor;
-  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
-
-  for (int site = 0; site < lvol; site++)
-  {
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = Zero();
-    //if (csw!=0){
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++){
-	    auto zz =  Qx()(j, k)(a, b);
-            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
-	  }
-    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
-
-    EigenInvCloverOp = EigenCloverOp.inverse();
-    //std::cout << EigenInvCloverOp << std::endl;
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
-    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
-    //  }
-    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
-  }
-
-  // Separate the even and odd parts
-  pickCheckerboard(Even, CloverTermEven, CloverTerm);
-  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
-
-  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
-  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerNo, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerYes, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerNo, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
-{
-  this->MooeeInternal(in, out, DaggerYes, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
-{
-  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
-
-  if (dag)
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    {
-      if (in.Checkerboard() == Odd)
-      {
-        Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
-      }
-      else
-      {
-        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
-      }
-      out = *Clover * in;
-    }
-    else
-    {
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
-    }
-  }
-  else
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    {
-
-      if (in.Checkerboard() == Odd)
-      {
-        //  std::cout << "Calling clover term Odd" << std::endl;
-        Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
-      }
-      else
-      {
-        //  std::cout << "Calling clover term Even" << std::endl;
-        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
-      }
-      out = *Clover * in;
-      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
-    }
-    else
-    {
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
-    }
-  }
-
-} // MooeeInternal
-
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
-{
-  assert(0);
-}
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
-{
-  assert(0); // not implemented yet
-}
-
-FermOpTemplateInstantiate(WilsonCloverFermion);
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -386,11 +386,9 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
-#endif
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
@@ -401,111 +399,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
-#ifdef GRID_OMP
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();

+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  double ctime=0;
-  double ptime=0;
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);

-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  auto U_v   = U.View();
-  auto in_v  = in.View();
-  auto out_v = out.View();
-  int Opt = WilsonKernelsStatic::Opt;
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  { 
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid = tid - ncomms;
-      int n = U.Grid()->oSites();
-      int chunk = n / nthreads;
-      int rem = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-	myblock = ttid * chunk + ttid;
-	myn = chunk+1;
-      } else {
-	myblock = ttid*chunk + rem;
-	myn = chunk;
-      }
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
      
-      // do the compute
-      if (dag == DaggerYes) {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
- 	  Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
-	}
-      } else {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
-	  Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
-	}
-      }
-      ptime = usecond() - start;
-    }
-    {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
+  DhopComputeTime-=usecond();
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
+  DhopComputeTime+=usecond();

-  // First to enter, last to leave timing
-  st.CollateThreads();
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  DhopCommTime   +=usecond();

+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
-    });
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
-    });
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
-#else 
-  assert(0);
-#endif
 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-					 DoubledGaugeField & U,
-					 const FermionField &in, FermionField &out,int dag)
+						    DoubledGaugeField & U,
+						    const FermionField &in, 
+						    FermionField &out,int dag)
 {
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
@@ -515,24 +472,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
  DhopCommTime+=usecond();
  
  DhopComputeTime-=usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-
-  auto U_v = U.View();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
-    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
-    //      int sU = ss;
-    //      int sF = LLs * sU;
-    //      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    //    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
-    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
-    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
-    //      int sU = ss;
-    //      int sF = LLs * sU;
-    //      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    //    }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -375,78 +375,47 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 						      const FermionField &in,
 						      FermionField &out, int dag) {
  assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
+
  Compressor compressor(dag);
  int len =  U.Grid()->oSites();
-  const int LLs =  1;

+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
+  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
  st.HaloGather(in,compressor);
+  st.CommunicateBegin(requests);
+
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-      auto U_v   = U.View();
-      auto in_v  = in.View();
-      auto out_v = out.View();
-      auto st_v  = st.View();
-      int Opt = WilsonKernelsStatic::Opt;

-      if (dag == DaggerYes) {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } 
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt;
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } 

-    } else {
-      st.CommunicateThreaded();
-    }
-  }  //pragma
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  st.CommsMerge(compressor);

-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  =  st.View();
-    int Opt = WilsonKernelsStatic::Opt;
-    if (dag == DaggerYes) {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    } else {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    }
+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
-#else
-  assert(0);
-#endif
 };


--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
@@ -73,7 +73,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  return;
 }

-#ifdef GPU_VEC
+#if 1
 #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  if (SE._is_local) {							\
    int mask = Nsimd >> (ptype + 1);					\
@@ -96,7 +96,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
      spProj(chi, in_t);						\
    }									\
  } else {								\
-    chi  = buf[SE._offset+s];						\
+    chi  = extractLane(lane,buf[SE._offset+s]); /* non-local leg: per-lane read from buf */	\
  }									\
  synchronise();
 #endif
@@ -106,15 +106,9 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
 							    SiteHalfSpinor *buf, int Ls, int s,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
-#ifdef __CUDA_ARCH__
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor     result;
-#endif

  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
@@ -173,11 +167,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTm(result, Uchi);
-#ifdef GPU_VEC
-  insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
+    insertLane (lane,out[sF],result);
  }
 }

@@ -186,15 +176,10 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
 							 SiteHalfSpinor *buf,  int Ls, int s,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
-#ifdef __CUDA_ARCH__
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor     result;
-#endif
+
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
@@ -255,11 +240,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);

-#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
  }
 };

@@ -287,6 +268,25 @@ GPU_EMPTY(GparityWilsonImplFH);
 GPU_EMPTY(GparityWilsonImplD);
 GPU_EMPTY(GparityWilsonImplDF);

+#define KERNEL_CALL(A) \
+      const uint64_t nsimd = Simd::Nsimd(); \
+      const uint64_t    NN = Nsite*Ls*nsimd;\
+      accelerator_loopN( sss, NN, {         \
+	  uint64_t cur  = sss;              \
+	  cur = cur / nsimd;                \
+	  uint64_t   s  = cur%Ls;           \
+	  cur = cur / Ls;                   \
+	  uint64_t   sU = cur;              \
+	  WilsonKernels<Impl>::A(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);\
+      });
+ 
+#define HOST_CALL(A) \
+  accelerator_loopN( ss, Ls*Nsite, {					\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v);	\
+  });
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -297,25 +297,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    auto out_v = out.View();
    auto st_v  =  st.View();

-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-#define KERNEL_CALL(A) \
-      const uint64_t nsimd = Simd::Nsimd(); \
-      const uint64_t    NN = Nsite*Ls*nsimd;\
-      accelerator_loopN( sss, NN, {         \
-	  uint64_t cur  = sss;              \
-	  cur = cur / nsimd;                \
-	  uint64_t   s  = cur%Ls;           \
-	  cur = cur / Ls;                   \
-	  uint64_t   sU = cur; 
-	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        WilsonKernels<Impl>::GenericDhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
+   if( interior && exterior ) { 
+     if (Opt == WilsonKernelsStatic::OptGpu) {
+       KERNEL_CALL(GpuDhopSite);
+     } else {
+       HOST_CALL(GenericDhopSite);
+     }
+   } else if( interior ) {
+     HOST_CALL(GenericDhopSiteInt);
+   } else if( exterior ) { 
+     HOST_CALL(GenericDhopSiteExt);
+   }
+
  }
  template <class Impl>
  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -327,25 +320,16 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    auto out_v = out.View();
    auto st_v  = st.View();

-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  //	  uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   s  = cur%Ls;
-	  //	  uint64_t   sF = cur;         
-	  cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-	int sF = Ls * sU;
-	WilsonKernels<Impl>::GenericDhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
+    if( interior && exterior ) { 
+      if (Opt == WilsonKernelsStatic::OptGpu) {
+	KERNEL_CALL(GpuDhopSiteDag);
+      } else {
+	HOST_CALL(GenericDhopSiteDag);
+      }
+    } else if( interior ) {
+      HOST_CALL(GenericDhopSiteDagInt);
+    } else if( exterior ) { 
+      HOST_CALL(GenericDhopSiteDagExt);
    }
  }

--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -267,7 +267,6 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  int ptype;

  SE = st.GetEntry(ptype, dir, sF);
-  //  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
  if (gamma == Xp) {						
    if (SE->_is_local ) {					
      int perm= SE->_permute;					
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
@@ -1,97 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * BF sequence
- *
- void bfmbase<Float>::MooeeInv(Fermion_t psi, 
- Fermion_t chi, 
- int dag, int cb)
-
- double m    = this->mass;
- double tm   = this->twistedmass;
- double mtil = 4.0+this->mass;
-
- double sq = mtil*mtil + tm*tm;
-
- double a = mtil/sq;
- double b = -tm /sq;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-
- void bfmbase<Float>::Mooee(Fermion_t psi, 
- Fermion_t chi, 
- int dag,int cb)
- double a = 4.0+this->mass;
- double b = this->twistedmass;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-*/
-
-template<class Impl>
-void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = -this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = -tm /sq;
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = tm /sq;
-  axpibg5x(out,in,a,b);
-}
-
-FermOpTemplateInstantiate(WilsonTMFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/CayleyFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/CayleyFermion5DInstantiation.cc
@@ -0,0 +1,45 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/CayleyFermion5DInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
+
+			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
+			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME: Break these out into separate files to accelerate parallel make
+FermOpTemplateInstantiate(CayleyFermion5D);
+GparityFermOpTemplateInstantiate(CayleyFermion5D);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/ContinuedFractionFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ContinuedFractionFermion5DInstantiation.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
+#include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+FermOpTemplateInstantiate(ContinuedFractionFermion5D);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/DomainWallEOFAFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/DomainWallEOFAFermionInstantiation.cc
@@ -0,0 +1,45 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+FermOpTemplateInstantiate(DomainWallEOFAFermion);
+GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermion5DInstantiation.cc
@@ -0,0 +1,46 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+#include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h>
+#include <Grid/perfmon/PerfCount.h>
+
+NAMESPACE_BEGIN(Grid);
+  
+// S-direction is INNERMOST and takes no part in the parity.
+const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
+const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+
+FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
+FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
+  
+NAMESPACE_END(Grid);
+
+
+
--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#include <Grid.h>
+#include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
+const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+
+FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/PartialFractionFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/PartialFractionFermion5DInstantiation.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
+#include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+ 
+FermOpTemplateInstantiate(PartialFractionFermion5D);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/StaggeredKernelsInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredKernelsInstantiation.cc
@@ -0,0 +1,43 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/StaggeredKernelsInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h>
+
+
+NAMESPACE_BEGIN(Grid);
+
+int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
+int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
+
+FermOpStaggeredTemplateInstantiate(StaggeredKernels);
+FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc
+
+    Copyright (C) 2017
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+// Macros are defined outside this file; presumably each expands to explicit instantiations of WilsonCloverFermion (confirm).
+FermOpTemplateInstantiate(WilsonCloverFermion);
+AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
+//GparityFermOpTemplateInstantiate(WilsonCloverFermion);  // G-parity variant is left disabled; the reason is not stated here
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+// Macros are defined outside this file; presumably each expands to explicit instantiations of WilsonFermion5D (confirm).
+FermOpTemplateInstantiate(WilsonFermion5D);
+GparityFermOpTemplateInstantiate(WilsonFermion5D);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
@@ -0,0 +1,46 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+// WilsonFermionStatic definitions: directions is 0..3 twice, displacements is four +1 then four -1 (presumably paired by index; confirm).
+const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
+const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
+int WilsonFermionStatic::HandOptDslash; // no initialiser here: a static-storage int is zero-initialised
+// Macros are defined outside this file; presumably each expands to explicit instantiations of WilsonFermion (confirm).
+FermOpTemplateInstantiate(WilsonFermion);
+AdjointFermOpTemplateInstantiate(WilsonFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonFermion);
+GparityFermOpTemplateInstantiate(WilsonFermion);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// TODO: move these WilsonKernelsStatic definitions (initialised to OptGeneric / CommsAndCompute) out of this file.
+int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
+int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
+
+// FIXME: break these instantiations out (presumably into separate source files) so that a parallel make can build them concurrently.
+FermOpTemplateInstantiate(WilsonKernels);
+GparityFermOpTemplateInstantiate(WilsonKernels); // Specialisation in Gparity forces instantiation
+AdjointFermOpTemplateInstantiate(WilsonKernels);
+TwoIndexFermOpTemplateInstantiate(WilsonKernels);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonTMFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTMFermionInstantiation.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/qcd/action/fermion/instantiation/WilsonTMFermionInstantiation.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+// Macro is defined outside this file; presumably it expands to explicit instantiations of WilsonTMFermion (confirm).
+FermOpTemplateInstantiate(WilsonTMFermion);
+
+NAMESPACE_END(Grid);