Compiles GPU and CPU, still gives good performance on CPU

2025-08-02 20:57:06 +01:00 · 2019-06-05 13:28:16 +01:00
parent 18d3cde29a
commit 0ee6e77cbc
71 changed files with 1512 additions and 33769 deletions
--- a/Grid/qcd/action/fermion/CayleyFermion5D.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc
@@ -1,668 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Construct by handing the gauge field, the four grids, M5 and the implementation
-// parameters to the WilsonFermion5D base; only the mass is stored by this class here.
-template<class Impl>
-CayleyFermion5D<Impl>::CayleyFermion5D(
-    GaugeField            &_Umu,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mass, RealD _M5, const ImplParams &p)
-  : WilsonFermion5D<Impl>(_Umu,
-                          FiveDimGrid, FiveDimRedBlackGrid,
-                          FourDimGrid, FourDimRedBlackGrid, _M5, p),
-    mass(_mass)
-{}
-
-///////////////////////////////////////////////////////////////
-// Physical surface field utilities
-///////////////////////////////////////////////////////////////
-template<class Impl>  
-void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-{
-  // Export slice 0 of a 5d solution after combining it with slice Ls-1; grids are validated before any copy.
-  conformable(solution5d.Grid(),this->FermionGrid());
-  conformable(exported4d.Grid(),this->GaugeGrid());
-  FermionField tmp(this->FermionGrid());
-  tmp = solution5d;
-  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);          // slice 0 of tmp from slice 0 via the pminus variant
-  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, this->Ls-1); // accumulate slice Ls-1 via the pplus variant
-  ExtractSlice(exported4d, tmp, 0, 0);                                  // only slice 0 of tmp is exported
-}
-template<class Impl>  // chi[s] accumulates the pminus variant of psi[s] and the pplus variant of psi[s+1], cyclic in s
-void CayleyFermion5D<Impl>::P(const FermionField &psi, FermionField &chi)
-{
-  chi = Zero();
-  for (int dst = 0; dst < this->Ls; ++dst) {
-    const int next = (dst + 1 == this->Ls) ? 0 : dst + 1;  // cyclic successor of dst
-    axpby_ssp_pminus(chi, 1.0, chi, 1.0, psi, dst, dst);
-    axpby_ssp_pplus (chi, 1.0, chi, 1.0, psi, dst, next);
-  }
-}
-template<class Impl>  // chi[s] accumulates the pminus variant of psi[s] and the pplus variant of psi[s-1], cyclic in s
-void CayleyFermion5D<Impl>::Pdag(const FermionField &psi, FermionField &chi)
-{
-  chi = Zero();
-  for (int dst = 0; dst < this->Ls; ++dst) {
-    const int prev = (dst == 0) ? this->Ls - 1 : dst - 1;  // cyclic predecessor of dst
-    axpby_ssp_pminus(chi, 1.0, chi, 1.0, psi, dst, dst);
-    axpby_ssp_pplus (chi, 1.0, chi, 1.0, psi, dst, prev);
-  }
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solution5d,FermionField &exported4d)
-{
-  // Export slice 0 of a 5d field after combining it with slice Ls-1; grids are validated before any copy.
-  conformable(solution5d.Grid(),this->FermionGrid());
-  conformable(exported4d.Grid(),this->GaugeGrid());
-  FermionField tmp(this->FermionGrid());
-  tmp = solution5d;
-  axpby_ssp_pplus (tmp, 0., solution5d, 1., solution5d, 0, 0);          // slice 0 of tmp from slice 0 via the pplus variant
-  axpby_ssp_pminus(tmp, 1., tmp       , 1., solution5d, 0, this->Ls-1); // accumulate slice Ls-1 via the pminus variant
-  ExtractSlice(exported4d, tmp, 0, 0);                                  // only slice 0 of tmp is exported
-}
-template<class Impl>  // 4d field -> 5d field: inserted on slices 0 and Ls-1, then passed through the pplus / pminus variants
-void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,FermionField &imported5d)
-{
-  const int last = this->Ls - 1;
-  FermionField walls(this->FermionGrid());
-  conformable(imported5d.Grid(), this->FermionGrid());
-  conformable(input4d.Grid(),    this->GaugeGrid());
-  walls = Zero();
-  InsertSlice(input4d, walls, 0,    0);
-  InsertSlice(input4d, walls, last, 0);
-  axpby_ssp_pplus (walls, 0., walls, 1., walls, 0,    0);
-  axpby_ssp_pminus(walls, 0., walls, 1., walls, last, last);
-  imported5d = walls;  // the wall field is returned unchanged
-}
-
-template<class Impl>  // same wall construction as ImportUnphysicalFermion, but the result is passed through Dminus
-void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-{
-  const int last = this->Ls - 1;
-  FermionField walls(this->FermionGrid());
-  conformable(imported5d.Grid(), this->FermionGrid());
-  conformable(input4d.Grid(),    this->GaugeGrid());
-  walls = Zero();
-  InsertSlice(input4d, walls, 0,    0);
-  InsertSlice(input4d, walls, last, 0);
-  axpby_ssp_pplus (walls, 0., walls, 1., walls, 0,    0);
-  axpby_ssp_pminus(walls, 0., walls, 1., walls, last, last);
-  Dminus(walls, imported5d);
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
-{
-  // chi = (1 - cs[s] D_W) psi, applied slice by slice in the fifth dimension
-  FermionField dw_psi(this->FermionGrid());
-  this->DW(psi, dw_psi, DaggerNo);
-
-  const int nslice = this->Ls;
-  for (int slice = 0; slice < nslice; ++slice) {
-    axpby_ssp(chi, Coeff_t(1.0), psi, -cs[slice], dw_psi, slice, slice);
-  }
-}
-template<class Impl>  
-void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
-{
-  // chi = psi + conjugate(-cs[s]) * (daggered D_W applied to psi), slice by slice
-  FermionField dw_dag_psi(this->FermionGrid());
-  this->DW(psi, dw_dag_psi, DaggerYes);
-
-  const int nslice = this->Ls;
-  for (int slice = 0; slice < nslice; ++slice) {
-    axpby_ssp(chi, Coeff_t(1.0), psi, conjugate(-cs[slice]), dw_dag_psi, slice, slice);
-  }
-}
-
-template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void) // Log call counts, time per call and derived rates for M5D and MooeeInv
-{
-  this->Report(); // base-class report is emitted first
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; // Ls times the Nd default lattice extents
-  RealD NP     = this->_FourDimGrid->_Nprocessors; // divisor for the per-rank figures below
-  if ( M5Dcalls > 0 ) { // section is skipped when no M5D call was counted (also avoids dividing by zero calls)
-    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
-
-    // Flops = 10.0*(Nc*Ns) *Ls*vol
-    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-
-    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
-    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
-    // write = 1
-    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9; // bytes moved by one call, scaled to GB
-    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
-  }
-
-  if ( MooeeInvCalls > 0 ) { // likewise skipped when no MooeeInv call was counted
-
-    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_NVCC
-    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#else
-    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex (volume already contains one factor of Ls)
-    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#endif
-  }
-
-}
-template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
-{
-  // Reset the base-class counters, then the statistics kept by this class.
-  this->ZeroCounters();
-
-  // M5D statistics
-  M5Dflops      = 0;  M5Dcalls      = 0;  M5Dtime      = 0;
-  // MooeeInv statistics
-  MooeeInvFlops = 0;  MooeeInvCalls = 0;  MooeeInvTime = 0;
-}
-
-template<class Impl>  // fifth-dimension hop with diag 1, off-diagonals -1, and mass on lower[0] / upper[Ls-1]
-void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
-{
-  const int last = this->Ls - 1;
-  Vector<Coeff_t> diag (last + 1,  1.0);
-  Vector<Coeff_t> lower(last + 1, -1.0);  lower[0]    = mass;
-  Vector<Coeff_t> upper(last + 1, -1.0);  upper[last] = mass;
-  M5D(psi, chi, chi, lower, diag, upper);  // chi is both the phi input and the output
-}
-template<class Impl>  // M5D with diag bs and off-diagonals cs; lower[0] and upper[Ls-1] are scaled by -mass
-void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
-{
-  const int last = this->Ls - 1;
-  Vector<Coeff_t> diag  = bs;
-  Vector<Coeff_t> lower = cs;
-  Vector<Coeff_t> upper = cs;
-  lower[0]    = -mass*lower[0];
-  upper[last] = -mass*upper[last];
-  M5D(psi, psi, Din, lower, diag, upper);
-}
-// FIXME Redundant with the above routine; check this and eliminate
-template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
-{
-  const int last = this->Ls - 1;
-  Vector<Coeff_t> diag = beo;
-  Vector<Coeff_t> upper(last + 1);
-  Vector<Coeff_t> lower(last + 1);
-  // Both off-diagonal bands start as -ceo; the two boundary entries are then scaled by -mass.
-  for (int s = 0; s <= last; ++s) {
-    upper[s] = lower[s] = -ceo[s];
-  }
-  upper[last] = -mass*upper[last];
-  lower[0]    = -mass*lower[0];
-  M5D(psi, psi, chi, lower, diag, upper);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-{
-  const int last = this->Ls - 1;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(last + 1);
-  Vector<Coeff_t> lower(last + 1);
-  // Both off-diagonal bands start as -cee; the two boundary entries are then scaled by -mass.
-  for (int s = 0; s <= last; ++s) {
-    upper[s] = lower[s] = -cee[s];
-  }
-  upper[last] = -mass*upper[last];
-  lower[0]    = -mass*lower[0];
-  M5D(psi, psi, chi, lower, diag, upper);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-{
-  const int nslice = this->Ls;
-  const int last   = nslice - 1;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(nslice);
-  Vector<Coeff_t> lower(nslice);
-  // Assemble the bands for M5Ddag one slice at a time, conjugating as we go.
-  for (int s = 0; s < nslice; ++s) {
-    // upper band: a last slice that is not also the first takes the mass term
-    if ( s != 0 && s == last ) {
-      upper[s] = mass*cee[0];
-    } else {
-      upper[s] = -cee[s+1];
-    }
-    // lower band: the first slice takes the mass term
-    if ( s == 0 ) {
-      lower[s] = mass*cee[last];
-    } else {
-      lower[s] = -cee[s-1];
-    }
-    // conjugate this slice's three entries
-    diag[s]  = conjugate(diag[s]);
-    upper[s] = conjugate(upper[s]);
-    lower[s] = conjugate(lower[s]);
-  }
-  M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-template<class Impl>  // daggered fifth-dimension hop: diag 1, off-diagonals -1, boundary entries scaled by -mass
-void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
-{
-  const int last = this->Ls - 1;
-  Vector<Coeff_t> diag (last + 1,  1.0);
-  Vector<Coeff_t> lower(last + 1, -1.0);
-  Vector<Coeff_t> upper(last + 1, -1.0);
-  lower[0]    = -mass*lower[0];
-  upper[last] = -mass*upper[last];
-  M5Ddag(psi, chi, chi, lower, diag, upper);  // chi is both the phi input and the output
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
-{
-  const int last = this->Ls - 1;
-  Vector<Coeff_t> diag  = bs;
-  Vector<Coeff_t> upper = cs;
-  Vector<Coeff_t> lower = cs;
-  for (int s = 0; s <= last; ++s) {
-    if ( s != 0 && s == last ) {   // upper band: a last slice distinct from the first takes the mass term
-      upper[s] = -mass*cs[0];
-    } else {
-      upper[s] = cs[s+1];
-    }
-    // lower band: the first slice takes the mass term
-    if ( s == 0 ) {
-      lower[s] = -mass*cs[last];
-    } else {
-      lower[s] = cs[s-1];
-    }
-    diag[s]  = conjugate(diag[s]);
-    upper[s] = conjugate(upper[s]);
-    lower[s] = conjugate(lower[s]);
-  }
-  M5Ddag(psi, psi, Din, lower, diag, upper);
-}
-
-template<class Impl>
-RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
-{
-  FermionField coeff_psi(psi.Grid());
-
-  // 1. 5d coefficient stage (Meooe5D) into the scratch field
-  Meooe5D(psi, coeff_psi);
-  // 2. D_W on the scratch field, result in chi
-  this->DW(coeff_psi, chi, DaggerNo);
-  // 3. combine chi and psi with unit weights: ((b D_W + D_w hop terms +1) on s-diag
-  axpby(chi, 1.0, 1.0, chi, psi);
-  // 4. fifth-dimension hop; the two-argument M5D uses chi as both phi input and output
-  M5D(psi, chi);
-  return norm2(chi);
-}
-
-template<class Impl>
-RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
-{
-  // Under adjoint
-  //   D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
-  //   D2- P+     D2+            P-D1-^dag D2+dag
-  FermionField dw_dag_psi(psi.Grid());
-  // 1. daggered D_W on psi
-  this->DW(psi, dw_dag_psi, DaggerYes);
-  // 2. daggered 5d coefficient stage, written into chi
-  MeooeDag5D(dw_dag_psi, chi);
-  // 3. daggered fifth-dimension hop; chi is both the phi input and the output
-  M5Ddag(psi, chi);
-  // 4. combine chi and psi with unit weights: ((b D_W + D_w hop terms +1) on s-diag
-  axpby(chi, 1.0, 1.0, chi, psi);
-  const RealD chi_norm2 = norm2(chi);
-  return chi_norm2;
-}
-
-// half checkerboard operations
-template<class Impl>
-void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-{
-  // 5d coefficient stage into the scratch field, then DhopEO for an odd source and DhopOE otherwise
-  Meooe5D(psi, this->tmp());
-  if ( psi.Checkerboard() != Odd ) {
-    this->DhopOE(this->tmp(), chi, DaggerNo);
-  } else {
-    this->DhopEO(this->tmp(), chi, DaggerNo);
-  }
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-{
-  // Daggered 4d dslash into the scratch field: DhopEO for an odd source, DhopOE otherwise
-  if ( psi.Checkerboard() != Odd ) {
-    this->DhopOE(psi, this->tmp(), DaggerYes);
-  } else {
-    this->DhopEO(psi, this->tmp(), DaggerYes);
-  }
-  MeooeDag5D(this->tmp(), chi);  // then the daggered 5d coefficient stage
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::Mdir(const FermionField &psi, FermionField &chi, int dir, int disp)
-{
-  Meo5D(psi, this->tmp());                     // 5d coefficient stage into the scratch field
-  this->DhopDir(this->tmp(), chi, dir, disp);  // then the 4d dslash fragment for (dir, disp)
-}
-// force terms; five routines; default to Dhop on diagonal
-template<class Impl>
-void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  FermionField Din(V.Grid());
-  // Meooe5D is applied to U in the daggered case and to V otherwise, before DhopDeriv.
-  if ( dag != DaggerNo ) {
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-    Meooe5D(U, Din);
-    this->DhopDeriv(mat, Din, V, dag);
-    return;
-  }
-  //      U d/du [D_w D5] V = U d/du DW D5 V
-  Meooe5D(V, Din);
-  this->DhopDeriv(mat, U, Din, dag);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  FermionField Din(V.Grid());
-  // Meooe5D is applied to U in the daggered case and to V otherwise, before DhopDerivOE.
-  if ( dag != DaggerNo ) {
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-    Meooe5D(U, Din);
-    this->DhopDerivOE(mat, Din, V, dag);
-    return;
-  }
-  //      U d/du [D_w D5] V = U d/du DW D5 V
-  Meooe5D(V, Din);
-  this->DhopDerivOE(mat, U, Din, dag);
-}
-template<class Impl>
-void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  FermionField Din(V.Grid());
-  // Meooe5D is applied to U in the daggered case and to V otherwise, before DhopDerivEO.
-  if ( dag != DaggerNo ) {
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-    Meooe5D(U, Din);
-    this->DhopDerivEO(mat, Din, V, dag);
-    return;
-  }
-  //      U d/du [D_w D5] V = U d/du DW D5 V
-  Meooe5D(V, Din);
-  this->DhopDerivEO(mat, U, Din, dag);
-}
-  
-// Tanh
-template<class Impl>  // copy zdata->gamma into a coefficient vector and pass 1.0 as the zolo_hi argument
-void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata, RealD b, RealD c)
-{
-  Vector<Coeff_t> gamma(this->Ls);
-  for (int slice = 0; slice < this->Ls; ++slice) { gamma[slice] = zdata->gamma[slice]; }
-  SetCoefficientsInternal(1.0, gamma, b, c);
-}
-//Zolo
-template<class Impl>  // copy zdata->gamma into a coefficient vector and forward the caller's zolo_hi
-void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi, Approx::zolotarev_data *zdata, RealD b, RealD c)
-{
-  Vector<Coeff_t> gamma(this->Ls);
-  for (int slice = 0; slice < this->Ls; ++slice) { gamma[slice] = zdata->gamma[slice]; }
-  SetCoefficientsInternal(zolo_hi, gamma, b, c);
-}
-// Common coefficient setup called by both SetCoefficientsTanh and SetCoefficientsZolotarev
-template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
-{
-  int Ls=this->Ls;
-
-  ///////////////////////////////////////////////////////////
-  // The Cayley coeffs (unprec)
-  ///////////////////////////////////////////////////////////
-  assert(gamma.size()==Ls); // one gamma entry per fifth-dimension slice
-
-  omega.resize(Ls);
-  bs.resize(Ls);
-  cs.resize(Ls);
-  as.resize(Ls);
-  
-  // 
-  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
-  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
-  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
-  //
-  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
-  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
-  //
-  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
-  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
-  //
-  // So 
-  //
-  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
-  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
-  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-  //
-  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
-  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
-  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-  // 
-    
-  double bpc = b+c;
-  double bmc = b-c;
-  _b = b;
-  _c = c;
-  _gamma  = gamma; // Save the parameters so we can change mass later.
-  _zolo_hi= zolo_hi;
-  for(int i=0; i < Ls; i++){
-    as[i] = 1.0;
-    omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
-    assert(omega[i]!=Coeff_t(0.0)); // omega is a divisor in the next two lines
-    bs[i] = 0.5*(bpc/omega[i] + bmc);
-    cs[i] = 0.5*(bpc/omega[i] - bmc);
-  }
-
-  ////////////////////////////////////////////////////////
-  // Constants for the preconditioned matrix Cayley form
-  ////////////////////////////////////////////////////////
-  bee.resize(Ls);
-  cee.resize(Ls);
-  beo.resize(Ls);
-  ceo.resize(Ls);
-  
-  for(int i=0;i<Ls;i++){
-    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    assert(bee[i]!=Coeff_t(0.0)); // bee entries are used as divisors in the LDU factors below
-    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
-    beo[i]=as[i]*bs[i];
-    ceo[i]=-as[i]*cs[i];
-  }
-  aee.resize(Ls);
-  aeo.resize(Ls);
-  for(int i=0;i<Ls;i++){
-    aee[i]=cee[i]; // aee / aeo are plain copies of cee / ceo
-    aeo[i]=ceo[i];
-  }
-  
-  //////////////////////////////////////////
-  // LDU decomposition of eeoo
-  //////////////////////////////////////////
-  dee.resize(Ls);
-  lee.resize(Ls);
-  leem.resize(Ls);
-  uee.resize(Ls);
-  ueem.resize(Ls);
-  
-  for(int i=0;i<Ls;i++){
-    
-    dee[i] = bee[i]; // diagonal factor; the last entry is corrected by delta_d after this loop
-    
-    if ( i < Ls-1 ) {
-
-      assert(bee[i]!=Coeff_t(0.0));
-      assert(bee[0]!=Coeff_t(0.0));
-      
-      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      
-      leem[i]=mass*cee[Ls-1]/bee[0]; // mass-dependent factor: starts from the corner term, then accumulates a product
-      for(int j=0;j<i;j++) {
-	assert(bee[j+1]!=Coeff_t(0.0));
-	leem[i]*= aee[j]/bee[j+1];
-      }
-      
-      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      
-      ueem[i]=mass; // mass-dependent factor built as a running product of cee/bee ratios
-      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
-      ueem[i]*= aee[0]/bee[0];
-      
-    } else { 
-      lee[i] =0.0; // the last slice has no off-diagonal factors
-      leem[i]=0.0;
-      uee[i] =0.0;
-      ueem[i]=0.0;
-    }
-  }
-	
-  { 
-    Coeff_t delta_d=mass*cee[Ls-1]; // mass-dependent correction added to the last diagonal entry
-    for(int j=0;j<Ls-1;j++) {
-      assert(bee[j] != Coeff_t(0.0));
-      delta_d *= cee[j]/bee[j];
-    }
-    dee[Ls-1] += delta_d;
-  }  
-
-  int inv=1; // request the inverted matrices
-  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);       // dag = 0
-  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); // dag = 1
-}
-
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
-						 Vector<iSinglet<Simd> > & Matp,
-						 Vector<iSinglet<Simd> > & Matm)
-{
-  int Ls=this->Ls;
-  // Builds dense Ls x Ls matrices from bee/cee/mass, optionally inverts and adjoints them, and packs them into Matp/Matm.
-  GridBase *grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if ( LLs == Ls ) {
-    return; // Not vectorised in 5th direction
-  }
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-  
-  for(int s=0;s<Ls;s++){
-    Pplus(s,s) = bee[s]; // both matrices share the bee diagonal
-    Pminus(s,s)= bee[s];
-  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pminus(s,s+1) = -cee[s]; // super-diagonal of Pminus
-  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pplus(s+1,s) = -cee[s+1]; // sub-diagonal of Pplus
-  }
-  Pplus (0,Ls-1) = mass*cee[0];    // corner entries carry the mass
-  Pminus(Ls-1,0) = mass*cee[Ls-1];
-  
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-  
-  if ( inv ) {
-    PplusMat =Pplus.inverse();
-    PminusMat=Pminus.inverse();
-  } else { 
-    PplusMat =Pplus;
-    PminusMat=Pminus;
-  }
-  
-  if(dag){
-    PplusMat.adjointInPlace(); // the adjoint is taken after the optional inversion
-    PminusMat.adjointInPlace();
-  }
-  
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd=Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-  // Each packed word gathers, for column s2, the rows l*LLs+s1 with l running over the Nsimd lanes.
-  for(int s2=0;s2<Ls;s2++){
-    for(int s1=0;s1<LLs;s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type *)&Vp; // lane-wise scalar access to the SIMD words
-      scalar_type *sm = (scalar_type *)&Vm;
-      for(int l=0;l<Nsimd;l++){
-	if ( switcheroo<Coeff_t>::iscomplex() ) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else { 
-	  // real coefficients: the real part is written into both components of the scalar
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-
-FermOpTemplateInstantiate(CayleyFermion5D);        // presumably explicit instantiations of the class template; macro is defined elsewhere
-GparityFermOpTemplateInstantiate(CayleyFermion5D); // presumably the same for the G-parity implementations; confirm against the macro
-
-NAMESPACE_END(Grid);
-
-
--- a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -1,247 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dcache.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-// Pminus forwards  (spProj5m acts on the s+1 neighbour, weighted by upper[s])
-// Pplus  backwards (spProj5p acts on the s-1 neighbour, weighted by lower[s])
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
-				const FermionField &phi_i, 
-				FermionField &chi_i,
-				Vector<Coeff_t> &lower,
-				Vector<Coeff_t> &diag,
-				Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard(); // output inherits the input checkerboard
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls =this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-  // 10 = 3 complex mult + 2 complex add
-  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond(); // start of the timed region; the matching += is after the loop
-
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-    for(int s=0;s<Ls;s++){
-      auto tmp = psi[0]; // scratch site object; overwritten by each projection below
-      if ( s==0 ) { // first slice: the backward neighbour wraps to Ls-1
-	spProj5m(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5p(tmp,psi[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) { // last slice: the forward neighbour wraps to 0
-	spProj5m(tmp,psi[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5p(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { // interior slice: plain s+1 / s-1 neighbours
-	spProj5m(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5p(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  });
-  M5Dtime+=usecond();
-}
-
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
-				   const FermionField &phi_i, 
-				   FermionField &chi_i,
-				   Vector<Coeff_t> &lower,
-				   Vector<Coeff_t> &diag,
-				   Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard(); // output inherits the input checkerboard
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls =this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-  // Same stencil as M5D with the projectors swapped: spProj5p on s+1, spProj5m on s-1.
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;           // shares the call and time counters with M5D
-  M5Dtime-=usecond();
-
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-    auto tmp = psi[0]; // scratch site object; overwritten by each projection below
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) { // first slice: the backward neighbour wraps to Ls-1
-	spProj5p(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi[ss+Ls-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else if ( s==(Ls-1)) { // last slice: the forward neighbour wraps to 0
-	spProj5p(tmp,psi[ss+0]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      } else { // interior slice: plain s+1 / s-1 neighbours
-	spProj5p(tmp,psi[ss+s+1]);
-	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
-
-	spProj5m(tmp,psi[ss+s-1]);
-	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
-      }
-    }
-  });
-  M5Dtime+=usecond();
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard(); // output inherits the input checkerboard
-  GridBase *grid=psi_i.Grid();
-  // Applies the inverse through the precomputed lee/leem/dee/ueem/uee factors, one block of Ls sites at a time.
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls=this->Ls;
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond(); // start of the timed region; the matching += is after the loop
-
-  thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-    auto tmp = psi[0]; // scratch site object; overwritten by each projection below
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss]=psi[ss]; // chi[0]=psi[0]
-    for(int s=1;s<Ls;s++){ // forward sweep: each slice uses the already-updated previous slice
-      spProj5p(tmp,chi[ss+s-1]);  
-      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
-    }
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp,chi[ss+s]);    
-      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
-    }
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-      spProj5p(tmp,chi[ss+Ls-1]); 
-      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
-    }	
-    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1]; // last slice is scaled only after the loop above has read it
-      
-    // Apply U^{-1}
-    for (int s=Ls-2;s>=0;s--){ // backward sweep: each slice uses the already-updated next slice
-      spProj5m(tmp,chi[ss+s+1]);  
-      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
-    }
-  });
-
-  MooeeInvTime+=usecond();
-
-}
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard(); // output inherits the input checkerboard
-  GridBase *grid=psi_i.Grid();
-  int Ls=this->Ls;
-  // Adjoint of MooeeInv: the same sweeps with conjugated factors and the roles of the L and U factors exchanged.
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  // The original compared psi with itself, which can never fail; check the output view against the input.
-  assert(chi.Checkerboard() == psi.Checkerboard());
-
-  MooeeInvCalls++;          // shares the call and time counters with MooeeInv
-  MooeeInvTime-=usecond();
-
-  thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
-
-    auto tmp = psi[0]; // scratch site object; overwritten by each projection below
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss]=psi[ss];
-    for (int s=1;s<Ls;s++){ // forward sweep: each slice uses the already-updated previous slice
-      spProj5m(tmp,chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s]-conjugate(uee[s-1])*tmp;
-    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-      spProj5p(tmp,chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - conjugate(ueem[s])*tmp;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      spProj5m(tmp,chi[ss+Ls-1]);
-      chi[ss+s] = conjugate(1.0/dee[s])*chi[ss+s]-conjugate(leem[s]/dee[Ls-1])*tmp;
-    }	
-    chi[ss+Ls-1]= conjugate(1.0/dee[Ls-1])*chi[ss+Ls-1]; // last slice is scaled only after the loop above has read it
-  
-    // Apply L^{-dagger}
-    for (int s=Ls-2;s>=0;s--){ // backward sweep: each slice uses the already-updated next slice
-      spProj5p(tmp,chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - conjugate(lee[s])*tmp;
-    }
-  });
-
-  MooeeInvTime+=usecond();
-
-}
-
-#ifdef CAYLEY_DPERP_CACHE // the list below is only compiled when this macro is defined
-INSTANTIATE_DPERP(WilsonImplF); // INSTANTIATE_DPERP is defined elsewhere; presumably instantiates the kernels above per implementation
-INSTANTIATE_DPERP(WilsonImplD);
-INSTANTIATE_DPERP(GparityWilsonImplF);
-INSTANTIATE_DPERP(GparityWilsonImplD);
-INSTANTIATE_DPERP(ZWilsonImplF);
-INSTANTIATE_DPERP(ZWilsonImplD);
-// second group: the FH / DF variants of the same six implementations
-INSTANTIATE_DPERP(WilsonImplFH);
-INSTANTIATE_DPERP(WilsonImplDF);
-INSTANTIATE_DPERP(GparityWilsonImplFH);
-INSTANTIATE_DPERP(GparityWilsonImplDF);
-INSTANTIATE_DPERP(ZWilsonImplFH);
-INSTANTIATE_DPERP(ZWilsonImplDF);
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc
@@ -1,284 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-// Pminus acts on the forward (s+1) neighbour, Pplus on the backward (s-1) one.
-//
-// For every 4d site and SIMD lane, and for each s in [0,Ls):
-//   chi[s] = diag[s]*phi[s] + upper[s]*P_-(psi[s+1]) + lower[s]*P_+(psi[s-1])
-// where s+1 and s-1 wrap around at the ends of the fifth dimension.
-// The call is counted in M5Dcalls and timed into M5Dtime.
-template<class Impl>
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
-				const FermionField &phi_i,
-				FermionField &chi_i,
-				Vector<Coeff_t> &lower,
-				Vector<Coeff_t> &diag,
-				Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid5d = psi_i.Grid();
-
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  // Plain pointers to the coefficients for use inside the kernel body.
-  Coeff_t *lo = &lower[0];
-  Coeff_t *dg = &diag[0];
-  Coeff_t *up = &upper[0];
-
-  int Ls = this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // One work item per (4d site, SIMD lane).
-  const uint64_t nsimd   = grid5d->Nsimd();
-  const uint64_t sites4d = nsimd * grid5d->oSites() / Ls;
-
-  // 10 = 3 complex mult + 2 complex add
-  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  accelerator_loopN( item, sites4d ,{
-    uint64_t lane = item % nsimd;
-    uint64_t base = Ls * (item / nsimd); // first outer site of this s-line
-
-    for(int s=0;s<Ls;s++){
-      // Periodic neighbours in the fifth dimension.
-      int s_fwd = (s==Ls-1) ? 0    : s+1;
-      int s_bwd = (s==0   ) ? Ls-1 : s-1;
-
-      auto out = extractLane(lane,phi[base+s]);
-      out = dg[s]*out;
-
-      auto nbr = extractLane(lane,psi[base+s_fwd]);
-      spProj5m(nbr,nbr);
-      out += up[s]*nbr;
-
-      nbr = extractLane(lane,psi[base+s_bwd]);
-      spProj5p(nbr,nbr);
-      out += lo[s]*nbr;
-
-      insertLane(lane,chi[base+s],out);
-    }
-  });
-  M5Dtime+=usecond();
-}
-
-// Daggered counterpart of M5D: the two chiral projectors are exchanged.
-//
-// For every 4d site and SIMD lane, and for each s in [0,Ls):
-//   chi[s] = diag[s]*phi[s] + upper[s]*P_+(psi[s+1]) + lower[s]*P_-(psi[s-1])
-// where s+1 and s-1 wrap around at the ends of the fifth dimension.
-// The call is counted in M5Dcalls and timed into M5Dtime.
-template<class Impl>
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
-				   const FermionField &phi_i,
-				   FermionField &chi_i,
-				   Vector<Coeff_t> &lower,
-				   Vector<Coeff_t> &diag,
-				   Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid5d = psi_i.Grid();
-
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  // Plain pointers to the coefficients for use inside the kernel body.
-  Coeff_t *lo = &lower[0];
-  Coeff_t *dg = &diag[0];
-  Coeff_t *up = &upper[0];
-
-  int Ls = this->Ls;
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // One work item per (4d site, SIMD lane).
-  const uint64_t nsimd   = grid5d->Nsimd();
-  const uint64_t sites4d = nsimd * grid5d->oSites() / Ls;
-
-  // 10 = 3 complex mult + 2 complex add
-  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  accelerator_loopN( item, sites4d ,{
-    uint64_t lane = item % nsimd;
-    uint64_t base = Ls * (item / nsimd); // first outer site of this s-line
-
-    for(int s=0;s<Ls;s++){
-      // Periodic neighbours in the fifth dimension.
-      int s_fwd = (s==Ls-1) ? 0    : s+1;
-      int s_bwd = (s==0   ) ? Ls-1 : s-1;
-
-      auto out = extractLane(lane,phi[base+s]);
-      out = dg[s]*out;
-
-      auto nbr = extractLane(lane,psi[base+s_fwd]);
-      spProj5p(nbr,nbr);
-      out += up[s]*nbr;
-
-      nbr = extractLane(lane,psi[base+s_bwd]);
-      spProj5m(nbr,nbr);
-      out += lo[s]*nbr;
-
-      insertLane(lane,chi[base+s],out);
-    }
-  });
-  M5Dtime+=usecond();
-}
-/* Applies the inverse of the s-diagonal block, chi = Mooee^{-1} psi, with one
- * (4d site, SIMD lane) pair handled per accelerator_loopN iteration.
- *
- * The solve is a forward sweep over s followed by a backward sweep, using the
- * member coefficient arrays lee, leem, uee, ueem and dee.  The forward sweep
- * parks its intermediate slices in chi; the backward sweep reads them back and
- * overwrites them with the final result.
- *
- *   psi_i  input field (read only)
- *   chi_i  output field; its checkerboard is set to that of psi_i
- *
- * Bumps MooeeInvCalls and accumulates the elapsed usecond() time in MooeeInvTime.
- *
- * NOTE(review): lee_v[Ls-2] is indexed unconditionally, so this looks like it
- * assumes Ls >= 2 -- confirm against callers.
- */
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  Coeff_t *lee_v  = &lee[0]; // raw pointers to the member coefficient arrays, used inside the kernel body
-  Coeff_t *leem_v = &leem[0];
-  Coeff_t *uee_v  = &uee[0];
-  Coeff_t *ueem_v = &ueem[0];
-  Coeff_t *dee_v  = &dee[0];
-  
-  int Ls=this->Ls;
-  const uint64_t nsimd = grid->Nsimd();
-  const uint64_t sites4d = nsimd * grid->oSites() / Ls; // number of (s-line, lane) work items
-  
-  typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
-  
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-  
-  accelerator_loopN( sss, sites4d ,{
-    uint64_t lane = sss % nsimd;
-    uint64_t ss   = Ls * (sss / nsimd); // first outer site of this s-line
-    ScalarSiteSpinor res, tmp, acc;
-    
-    // X = Nc*Ns
-    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
-    // Apply (L^{\prime})^{-1} L_m^{-1}
-    res = extractLane(lane,psi[ss]);
-    spProj5m(tmp,res);
-    acc = leem_v[0]*tmp; // acc collects the leem-weighted P_- parts that are subtracted from the last slice
-    spProj5p(tmp,res); // tmp now carries the P_+ part needed by the next slice
-    insertLane(lane,chi[ss],res);
-    
-    for(int s=1;s<Ls-1;s++){
-      res = extractLane(lane,psi[ss+s]);
-      res -= lee_v[s-1]*tmp;
-      spProj5m(tmp,res);
-      acc += leem_v[s]*tmp;
-      spProj5p(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-    res = extractLane(lane,psi[ss+Ls-1]);
-    res = res - lee_v[Ls-2]*tmp - acc;
-    
-    // Apply U_m^{-1} D^{-1} U^{-1}
-    res = (1.0/dee_v[Ls-1])*res;
-    insertLane(lane,chi[ss+Ls-1],res);
-    spProj5p(acc,res); // acc is reused: P_+ part of the finished last slice, fixed for the whole backward sweep
-    spProj5m(tmp,res); // tmp: P_- part of the slice just finished, refreshed every step
-    for (int s=Ls-2;s>=0;s--){
-      res = extractLane(lane,chi[ss+s]); // intermediate value stored by the forward sweep
-      res = (1.0/dee_v[s])*res - uee_v[s]*tmp - ueem_v[s]*acc;
-      spProj5m(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-  });
-  
-  MooeeInvTime+=usecond();
-
-}
-/* Applies the dagger of the inverse s-diagonal block, chi = Mooee^{-dagger} psi,
- * with one (4d site, SIMD lane) pair handled per accelerator_loopN iteration.
- *
- * Same two-sweep structure as MooeeInv, but with the roles of the upper and
- * lower coefficient arrays and of the two chiral projectors exchanged, and
- * every coefficient complex-conjugated.  The forward sweep parks its
- * intermediate slices in chi; the backward sweep overwrites them.
- *
- *   psi_i  input field (read only)
- *   chi_i  output field; its checkerboard is set to that of psi_i
- *
- * Bumps MooeeInvCalls and accumulates the elapsed usecond() time in MooeeInvTime.
- *
- * NOTE(review): uee_v[Ls-2] is indexed unconditionally, so this looks like it
- * assumes Ls >= 2 -- confirm against callers.
- */
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  Coeff_t *lee_v  = &lee[0]; // raw pointers to the member coefficient arrays, used inside the kernel body
-  Coeff_t *leem_v = &leem[0];
-  Coeff_t *uee_v  = &uee[0];
-  Coeff_t *ueem_v = &ueem[0];
-  Coeff_t *dee_v  = &dee[0];
-  
-  int Ls=this->Ls;
-  const uint64_t nsimd = grid->Nsimd();
-  const uint64_t sites4d = nsimd * grid->oSites() / Ls; // number of (s-line, lane) work items
-  
-  typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
-  
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-  
-  accelerator_loopN( sss, sites4d ,{
-    uint64_t lane = sss % nsimd;
-    uint64_t ss   = Ls * (sss / nsimd); // first outer site of this s-line
-    ScalarSiteSpinor res, tmp, acc;
-    
-    // X = Nc*Ns
-    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
-    // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
-    res = extractLane(lane,psi[ss]);
-    spProj5p(tmp,res);
-    acc = conjugate(ueem_v[0])*tmp; // acc collects the conj(ueem)-weighted P_+ parts that are subtracted from the last slice
-    spProj5m(tmp,res); // tmp now carries the P_- part needed by the next slice
-    insertLane(lane,chi[ss],res);
-    
-    for(int s=1;s<Ls-1;s++){
-      res = extractLane(lane,psi[ss+s]);
-      res -= conjugate(uee_v[s-1])*tmp;
-      spProj5p(tmp,res);
-      acc += conjugate(ueem_v[s])*tmp;
-      spProj5m(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-    res = extractLane(lane,psi[ss+Ls-1]);
-    res = res - conjugate(uee_v[Ls-2])*tmp - acc;
-    
-    // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
-    res = conjugate(1.0/dee_v[Ls-1])*res;
-    insertLane(lane,chi[ss+Ls-1],res);
-    spProj5m(acc,res); // acc is reused: P_- part of the finished last slice, fixed for the whole backward sweep
-    spProj5p(tmp,res); // tmp: P_+ part of the slice just finished, refreshed every step
-    for (int s=Ls-2;s>=0;s--){
-      res = extractLane(lane,chi[ss+s]); // intermediate value stored by the forward sweep
-      res = conjugate(1.0/dee_v[s])*res - conjugate(lee_v[s])*tmp - conjugate(leem_v[s])*acc;
-      spProj5p(tmp,res);
-      insertLane(lane,chi[ss+s],res);
-    }
-  });
-  
-  MooeeInvTime+=usecond();
-  
-}
-/* Instantiation list for this translation unit, compiled only when
- * CAYLEY_DPERP_GPU is defined.  INSTANTIATE_DPERP is defined outside this
- * chunk; presumably it explicitly instantiates the routines above for the
- * named Impl -- TODO confirm.
- */
-#ifdef CAYLEY_DPERP_GPU
-INSTANTIATE_DPERP(WilsonImplF);
-INSTANTIATE_DPERP(WilsonImplD);
-INSTANTIATE_DPERP(GparityWilsonImplF);
-INSTANTIATE_DPERP(GparityWilsonImplD);
-INSTANTIATE_DPERP(ZWilsonImplF);
-INSTANTIATE_DPERP(ZWilsonImplD);
-
-INSTANTIATE_DPERP(WilsonImplFH);
-INSTANTIATE_DPERP(WilsonImplDF);
-INSTANTIATE_DPERP(GparityWilsonImplFH);
-INSTANTIATE_DPERP(GparityWilsonImplDF);
-INSTANTIATE_DPERP(ZWilsonImplFH);
-INSTANTIATE_DPERP(ZWilsonImplDF);
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -1,838 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * Dense matrix versions of routines
- *
- * MooeeInvDag: forwards to MooeeInternal with the dagger and inverse flags
- * both set (DaggerYes, InverseYes); psi is the input, chi the output.
- */
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
-}
-/* MooeeInv: forwards to MooeeInternal with the inverse flag set and no dagger
- * (DaggerNo, InverseYes); psi is the input, chi the output.
- */
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
-}
-template<class Impl>  // M5D for the layout where the s direction is spread across SIMD lanes (s = o + lane*LLs)
-void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i, // neighbour (s+1 / s-1) input
-				const FermionField &phi_i, // input multiplied by diag
-				FermionField &chi_i, // output; checkerboard copied from psi_i
-				Vector<Coeff_t> &lower, // per-s coefficient of the s-1 neighbour term
-				Vector<Coeff_t> &diag, // per-s coefficient of the phi term
-				Vector<Coeff_t> &upper) // per-s coefficient of the s+1 neighbour term
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0]; // reduced extent of dimension 0; asserted below to satisfy Ls/LLs==nsimd
-  const int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs); // SIMD-packed copies of upper/lower/diag, filled through the scalar pointers below
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o+i*LLs; // s value held by lane i of outer slot o
-      int ss = o*nsimd+i; // flat scalar position of that lane inside the packed arrays
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  assert(Nc==3); // the unrolled kernel below names colour components 0..2 explicitly
-
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
-#if 0 // disabled variant of the same update written with the generic projector / rotate helpers
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0;v<LLs;v++){
-
-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
-
-      spProj5m(hp,psi[ss+vp]);
-      spProj5p(hm,psi[ss+vm]);
-
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
-	
-      hp=0.5*hp;
-      hm=0.5*hm;
-
-      spRecon5m(fp,hp);
-      spRecon5p(fm,hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v]     +u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-#else
-    for(int v=0;v<LLs;v++){
-      
-      vprefetch(psi[ss+v+LLs]); // same slot, LLs sites further on
-
-      int vp= (v==LLs-1) ? 0     : v+1; // next outer slot, wrapping at the end
-      int vm= (v==0    ) ? LLs-1 : v-1; // previous outer slot, wrapping at the start
-	
-      Simd hp_00 = psi[ss+vp]()(2)(0); // spin components 2,3 of the vp neighbour
-      Simd hp_01 = psi[ss+vp]()(2)(1); 
-      Simd hp_02 = psi[ss+vp]()(2)(2); 
-      Simd hp_10 = psi[ss+vp]()(3)(0); 
-      Simd hp_11 = psi[ss+vp]()(3)(1); 
-      Simd hp_12 = psi[ss+vp]()(3)(2); 
-	
-      Simd hm_00 = psi[ss+vm]()(0)(0); // spin components 0,1 of the vm neighbour
-      Simd hm_01 = psi[ss+vm]()(0)(1); 
-      Simd hm_02 = psi[ss+vm]()(0)(2); 
-      Simd hm_10 = psi[ss+vm]()(1)(0); 
-      Simd hm_11 = psi[ss+vm]()(1)(1); 
-      Simd hm_12 = psi[ss+vm]()(1)(2); 
-
-      if ( vp<=v ) { // wrapped: the s+1 neighbour sits in another lane. NOTE(review): rotation count 2 presumably means one complex element -- confirm
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      if ( vm>=v ) { // wrapped the other way: rotate by the complementary amount
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      // Spin 0,1 of the result: d*phi + l*hm.  Spin 2,3 of the result: d*phi + u*hp.
-      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
-      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
-      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-      vstream(chi[ss+v]()(0)(0),p_00);
-      vstream(chi[ss+v]()(0)(1),p_01);
-      vstream(chi[ss+v]()(0)(2),p_02);
-      vstream(chi[ss+v]()(1)(0),p_10);
-      vstream(chi[ss+v]()(1)(1),p_11);
-      vstream(chi[ss+v]()(1)(2),p_12);
-      vstream(chi[ss+v]()(2)(0),p_20);
-      vstream(chi[ss+v]()(2)(1),p_21);
-      vstream(chi[ss+v]()(2)(2),p_22);
-      vstream(chi[ss+v]()(3)(0),p_30);
-      vstream(chi[ss+v]()(3)(1),p_31);
-      vstream(chi[ss+v]()(3)(2),p_32);
-
-    }
-#endif
-  });
-  M5Dtime+=usecond();
-}
-
-// Daggered M5D for the layout where the s direction is spread across SIMD lanes
-// (s = o + lane*LLs, with LLs outer slots per s-line).
-//
-// For each outer slot v the result is
-//   spin 0,1 :  d[v]*phi + u[v]*(spin 0,1 of psi at the next slot)
-//   spin 2,3 :  d[v]*phi + l[v]*(spin 2,3 of psi at the previous slot)
-// where a neighbour that wraps around the slot range is lane-rotated first.
-//
-//   psi_i  neighbour input          phi_i  input multiplied by diag
-//   chi_i  output; checkerboard copied from psi_i
-//   lower / diag / upper  per-s coefficients, repacked into SIMD words here
-//
-// The call is counted in M5Dcalls and timed into M5Dtime.
-template<class Impl>  
-void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
-				   const FermionField &phi_i, 
-				   FermionField &chi_i,
-				   Vector<Coeff_t> &lower,
-				   Vector<Coeff_t> &diag,
-				   Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard()=psi_i.Checkerboard();
-  GridBase *grid=psi_i.Grid();
-  auto psi=psi_i.View();
-  auto phi=phi_i.View();
-  auto chi=chi_i.View();
-  int Ls   = this->Ls;
-  int LLs  = grid->_rdimensions[0];
-  const int nsimd= Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs==nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type * u_p = (scalar_type *)&u[0];
-  scalar_type * l_p = (scalar_type *)&l[0];
-  scalar_type * d_p = (scalar_type *)&d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o+i*LLs;   // s value held by lane i of outer slot o
-      int ss = o*nsimd+i; // flat scalar position of that lane inside the packed arrays
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  // The unrolled kernel below names colour components 0..2 explicitly, so guard
-  // it the same way M5D does.
-  assert(Nc==3);
-
-  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
-#if 0
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0;v<LLs;v++){
-
-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
-
-      spProj5p(hp,psi[ss+vp]);
-      spProj5m(hm,psi[ss+vm]);
-
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
-      
-      hp=hp*0.5;
-      hm=hm*0.5;
-      spRecon5p(fp,hp);
-      spRecon5m(fm,hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-#else
-    for(int v=0;v<LLs;v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      // Neighbouring outer slots, wrapping at both ends.
-      int vp= (v==LLs-1) ? 0     : v+1;
-      int vm= (v==0    ) ? LLs-1 : v-1;
-	
-      Simd hp_00 = psi[ss+vp]()(0)(0); 
-      Simd hp_01 = psi[ss+vp]()(0)(1); 
-      Simd hp_02 = psi[ss+vp]()(0)(2); 
-      Simd hp_10 = psi[ss+vp]()(1)(0); 
-      Simd hp_11 = psi[ss+vp]()(1)(1); 
-      Simd hp_12 = psi[ss+vp]()(1)(2); 
-	
-      Simd hm_00 = psi[ss+vm]()(2)(0); 
-      Simd hm_01 = psi[ss+vm]()(2)(1); 
-      Simd hm_02 = psi[ss+vm]()(2)(2); 
-      Simd hm_10 = psi[ss+vm]()(3)(0); 
-      Simd hm_11 = psi[ss+vm]()(3)(1); 
-      Simd hm_12 = psi[ss+vm]()(3)(2); 
-
-      // A wrapped neighbour lives in a different SIMD lane, so rotate the lanes.
-      if ( vp<=v ) {
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      if ( vm>=v ) {
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
-      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
-
-      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
-      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-
-      vstream(chi[ss+v]()(0)(0),p_00);
-      vstream(chi[ss+v]()(0)(1),p_01);
-      vstream(chi[ss+v]()(0)(2),p_02);
-      vstream(chi[ss+v]()(1)(0),p_10);
-      vstream(chi[ss+v]()(1)(1),p_11);
-      vstream(chi[ss+v]()(1)(2),p_12);
-      vstream(chi[ss+v]()(2)(0),p_20);
-      vstream(chi[ss+v]()(2)(1),p_21);
-      vstream(chi[ss+v]()(2)(2),p_22);
-      vstream(chi[ss+v]()(3)(0),p_30);
-      vstream(chi[ss+v]()(3)(1),p_31);
-      vstream(chi[ss+v]()(3)(2),p_32);
-    }
-#endif
-  });
-  M5Dtime+=usecond();
-}
-
-
-#ifdef AVX512 
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
-#include <simd/Intel512single.h>
-#endif 
-/* Applies the dense s-direction matrices at one 4d `site`:
- *   spin components 0,1 of chi  <-  Matp applied to spin components 0,1 of psi
- *   spin components 2,3 of chi  <-  Matm applied to spin components 2,3 of psi
- * For each output slot s1 the sum runs over every input slot s2 and SIMD lane l
- * (s = s2 + l*LLs), using matrix entry [LLs*s + s1] and lane l of psi
- * broadcast across the vector.
- *
- * Without AVX512 the generic loop below is used; with AVX512 the same sum is
- * written as inline assembly over fixed zmm registers.
- *
- * NOTE(review): the asm statements name zmm registers directly, declare no
- * register clobbers, and rely on the accumulators surviving from one asm
- * statement to the next -- confirm this is safe with the compilers in use.
- */
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField &chi_i,
-					     int LLs, int site,
-					     Vector<iSinglet<Simd> > &Matp,
-					     Vector<iSinglet<Simd> > &Matm)
-{
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-	  int s=s2+l*LLs; // s index of lane l in input slot s2
-	  int lex=s2+LLs*site; // outer site holding input slot s2
-	
-	  if ( s2==0 && l==0) { // first term of the sum for this s1: reset the accumulators
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-	
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
-	    }}
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	    }}
-
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
-	      SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
-	    }}
-
-	}}
-      {
-	int lex = s1+LLs*site; // outer site holding output slot s1
-	for(int sp=0;sp<2;sp++){
-	  for(int co=0;co<Nc;co++){
-	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0   %%zmm13
-#define BCAST1   %%zmm14
-#define BCAST2   %%zmm15
-#define BCAST3   %%zmm16
-#define BCAST4   %%zmm17
-#define BCAST5   %%zmm18
-#define BCAST6   %%zmm19
-#define BCAST7   %%zmm20
-#define BCAST8   %%zmm21
-#define BCAST9   %%zmm22
-#define BCAST10  %%zmm23
-#define BCAST11  %%zmm24
-
-    int incr=LLs*LLs*sizeof(iSinglet<Simd>); // byte step between matrix entries of consecutive lanes (s grows by LLs per lane)
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	int lex=s2+LLs*site;
-	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t)&psi[lex];
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	  if ( (s2+l)==0 ) { // first term for this s1: multiply (VMULMEM) instead of accumulate
-	    asm (
-		 VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
-		 VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
-		 VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
-		 VBCASTCDUP(0,%2,BCAST0)   
-		 VBCASTCDUP(1,%2,BCAST1)   
-		 VBCASTCDUP(2,%2,BCAST2)   
-		 VBCASTCDUP(3,%2,BCAST3)   
-		 VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
-		 VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
-		 VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
-		 VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
-		 VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
-		 VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
-		 VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
-		 VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
-		 VMULMEM (0,%1,BCAST8,Chi_22)         
-		 VMULMEM (0,%1,BCAST9,Chi_30)
-		 VMULMEM (0,%1,BCAST10,Chi_31)       
-		 VMULMEM (0,%1,BCAST11,Chi_32)
-		 : : "r" (a0), "r" (a1), "r" (a2)  );
-	  } else { 
-	    asm (
-		 VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
-		 VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
-		 VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
-		 VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
-		 VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
-		 VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
-		 VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
-		 VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
-		 VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
-		 VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
-		 VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
-		 VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
-		 : : "r" (a0), "r" (a1), "r" (a2)  );
-	  }
-	  a0 = a0+incr;
-	  a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type); // step to the next lane's scalar inside psi[lex]
-	}}
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      }
-    }
-  }
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-};
-
-// Z-mobius version
-//
-// Applies the dense s-direction matrices at one 4d `site`:
-//   spin components 0,1 of chi  <-  Matp applied to spin components 0,1 of psi
-//   spin components 2,3 of chi  <-  Matm applied to spin components 2,3 of psi
-// For each output slot s1 the sum runs over every input slot s2 and SIMD lane l
-// (s = s2 + l*LLs), using matrix entry [LLs*s + s1] and lane l of psi broadcast
-// across the vector.  The matrix entries are combined with a full Simd multiply.
-//
-// Without AVX512 the generic loop is used; with AVX512 the same sum is written
-// as inline assembly over fixed zmm registers.
-//
-// NOTE(review): the asm statements name zmm and general registers directly,
-// declare no clobbers, and rely on register contents surviving between separate
-// asm statements -- confirm this is safe with the compilers in use.
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField &chi_i,
-					      int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
-{
-#ifndef AVX512
-  {
-    auto psi = psi_i.View();
-    auto chi = chi_i.View();
-
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-
-	  int s=s2+l*LLs;       // s index of lane l in input slot s2
-	  int lex=s2+LLs*site;  // outer site holding input slot s2
-	
-	  // First term of the sum for this s1: reset the accumulators.
-	  if ( s2==0 && l==0) {
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-	
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
-	    }}
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
-	    }}
-
-	  for(int sp=0;sp<2;sp++){
-	    for(int co=0;co<Nc;co++){
-	      SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
-	      SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
-	    }}
-
-
-	}}
-      {
-	int lex = s1+LLs*site;  // outer site holding output slot s1
-	for(int sp=0;sp<2;sp++){
-	  for(int co=0;co<Nc;co++){
-	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  {
-    auto psi = psi_i.View();
-    auto chi = chi_i.View();
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %zmm0
-#define Chi_01 %zmm1
-#define Chi_02 %zmm2
-#define Chi_10 %zmm3
-#define Chi_11 %zmm4
-#define Chi_12 %zmm5
-#define Chi_20 %zmm6
-#define Chi_21 %zmm7
-#define Chi_22 %zmm8
-#define Chi_30 %zmm9
-#define Chi_31 %zmm10
-#define Chi_32 %zmm11
-#define pChi_00 %%zmm0
-#define pChi_01 %%zmm1
-#define pChi_02 %%zmm2
-#define pChi_10 %%zmm3
-#define pChi_11 %%zmm4
-#define pChi_12 %%zmm5
-#define pChi_20 %%zmm6
-#define pChi_21 %%zmm7
-#define pChi_22 %%zmm8
-#define pChi_30 %%zmm9
-#define pChi_31 %%zmm10
-#define pChi_32 %%zmm11
-
-#define BCAST_00   %zmm12
-#define  SHUF_00   %zmm13
-#define BCAST_01   %zmm14
-#define  SHUF_01   %zmm15
-#define BCAST_02   %zmm16
-#define  SHUF_02   %zmm17
-#define BCAST_10   %zmm18
-#define  SHUF_10   %zmm19
-#define BCAST_11   %zmm20
-#define  SHUF_11   %zmm21
-#define BCAST_12   %zmm22
-#define  SHUF_12   %zmm23
-
-#define Mp  %zmm24
-#define Mps %zmm25
-#define Mm  %zmm26
-#define Mms %zmm27
-#define N 8
-    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
-    for(int s1=0;s1<LLs;s1++){ 
-      for(int s2=0;s2<LLs;s2++){ 
-	int lex=s2+LLs*site;
-	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t)&psi[lex];
-	for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-	  if ( (s2+l)==0 ) {
-	    LOAD64(%r8,a0);
-	    LOAD64(%r9,a1);
-	    LOAD64(%r10,a2);
-	    asm (
-		 VLOAD(0,%r8,Mp)// i r
-		 VLOAD(0,%r9,Mm)
-		 VSHUF(Mp,Mps)  // r i 
-		 VSHUF(Mm,Mms)
-		 VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
-		 VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
-
-		 VMULIDUP(0*N,%r10,Mps,Chi_00)
-		 VMULIDUP(1*N,%r10,Mps,Chi_01)
-		 VMULIDUP(2*N,%r10,Mps,Chi_02)
-		 VMULIDUP(3*N,%r10,Mps,Chi_10)
-		 VMULIDUP(4*N,%r10,Mps,Chi_11)
-		 VMULIDUP(5*N,%r10,Mps,Chi_12)
-
-		 VMULIDUP(6*N ,%r10,Mms,Chi_20)
-		 VMULIDUP(7*N ,%r10,Mms,Chi_21)
-		 VMULIDUP(8*N ,%r10,Mms,Chi_22)
-		 VMULIDUP(9*N ,%r10,Mms,Chi_30)
-		 VMULIDUP(10*N,%r10,Mms,Chi_31)
-		 VMULIDUP(11*N,%r10,Mms,Chi_32)
-
-		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
-		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
-		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-		 VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
-		 VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
-		 VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
-		 VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
-		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-		 );
-	  } else { 
-	    LOAD64(%r8,a0);
-	    LOAD64(%r9,a1);
-	    LOAD64(%r10,a2);
-	    asm (
-		 VLOAD(0,%r8,Mp)
-		 VSHUF(Mp,Mps)
-
-		 VLOAD(0,%r9,Mm)
-		 VSHUF(Mm,Mms)
-
-		 VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
-		 VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
-		 VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
-		 VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
-		 VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
-		 VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
-
-		 VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
-		 VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
-		 VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
-		 VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
-		 VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
-		 VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
-
-		 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
-		 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
-		 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
-		 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
-		 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
-		 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
-
-		 VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
-		 VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
-		 VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
-		 VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
-		 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
-		 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
-		 );
-	  }
-	  a0 = a0+incr;
-	  a1 = a1+incr;
-	a2 = a2+sizeof(typename Simd::scalar_type);
-	}}
-      {
-	int lexa = s1+LLs*site;
-	/*
-	  SiteSpinor tmp;
-	  asm (
-	  VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	  VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	  VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	  VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	  : : "r" ((uint64_t)&tmp) : "memory" );
-	*/
-
-	asm (
-	     VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
-	     VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
-	     VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
-	     VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-	//      if ( 1 || (site==0) ) { 
-	//	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
-	//      }
-      }
-    }
-  }
-  // Undefine every helper macro introduced above so that none of them (in
-  // particular the very generic names N, Mp and Mm) leaks into later code.
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef pChi_00
-#undef pChi_01
-#undef pChi_02
-#undef pChi_10
-#undef pChi_11
-#undef pChi_12
-#undef pChi_20
-#undef pChi_21
-#undef pChi_22
-#undef pChi_30
-#undef pChi_31
-#undef pChi_32
-
-#undef BCAST_00
-#undef SHUF_00
-#undef BCAST_01
-#undef SHUF_01
-#undef BCAST_02
-#undef SHUF_02
-#undef BCAST_10
-#undef SHUF_10
-#undef BCAST_11
-#undef SHUF_11
-#undef BCAST_12
-#undef SHUF_12
-
-#undef Mp
-#undef Mps
-#undef Mm
-#undef Mms
-#undef N
-
-#endif
-}
-
-
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
-{
-  chi.Checkerboard()=psi.Checkerboard();
-
-  int Ls=this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  
-  Vector<iSinglet<Simd> >  Matp;
-  Vector<iSinglet<Simd> >  Matm;
-  Vector<iSinglet<Simd> >  *_Matp;
-  Vector<iSinglet<Simd> >  *_Matm;
-  
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if ( inv && dag ) { 
-    _Matp = &MatpInvDag;
-    _Matm = &MatmInvDag;
-  }
-  if ( inv && (!dag) ) { 
-    _Matp = &MatpInv;
-    _Matm = &MatmInv;
-  } 
-  if ( !inv ) {
-    MooeeInternalCompute(dag,inv,Matp,Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-  assert(_Matp->size()==Ls*LLs);
-
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-  if ( switcheroo<Coeff_t>::iscomplex() ) {
-    thread_loop( (auto site=0;site<vol;site++),{
-      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
-    });
-  } else { 
-    thread_loop( (auto site=0;site<vol;site++),{
-      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
-    });
-  }
-  MooeeInvTime+=usecond();
-}
-
-INSTANTIATE_DPERP(DomainWallVec5dImplD);
-INSTANTIATE_DPERP(DomainWallVec5dImplF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP(ZDomainWallVec5dImplFH);
-
-template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-template void CayleyFermion5D<DomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -1,320 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
-{
-  SetCoefficientsZolotarev(1.0/scale,zdata);
-}
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
-{
-  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-  int Ls = this->Ls;
-  assert(zdata->db==Ls);// Beta has Ls coeffs
-
-  R=(1+this->mass)/(1-this->mass);
-
-  Beta.resize(Ls);
-  cc.resize(Ls);
-  cc_d.resize(Ls);
-  sqrt_cc.resize(Ls);
-  for(int i=0; i < Ls ; i++){
-    Beta[i] = zdata -> beta[i];
-    cc[i] = 1.0/Beta[i];
-    cc_d[i]=std::sqrt(cc[i]);
-  }
-    
-  cc_d[Ls-1]=1.0;
-  for(int i=0; i < Ls-1 ; i++){
-    sqrt_cc[i]= std::sqrt(cc[i]*cc[i+1]);
-  }    
-  sqrt_cc[Ls-2]=std::sqrt(cc[Ls-2]);
-
-
-  ZoloHiInv =1.0/zolo_hi;
-  dw_diag = (4.0-this->M5)*ZoloHiInv;
-    
-  See.resize(Ls);
-  Aee.resize(Ls);
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    Aee[s] = sign * Beta[s] * dw_diag;
-    sign   = - sign;
-  }
-  Aee[Ls-1] += R;
-    
-  See[0] = Aee[0];
-  for(int s=1;s<Ls;s++){
-    See[s] = Aee[s] - 1.0/See[s-1];
-  }
-  for(int s=0;s<Ls;s++){
-    std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
-  }
-}
-
-
-
-template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
-{
-  int Ls = this->Ls;
-
-  FermionField D(psi.Grid());
-
-  this->DW(psi,D,DaggerNo); 
-
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    if ( s==0 ) {
-      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-    } else if ( s==(Ls-1) ){
-      RealD R=(1.0+mass)/(1.0-mass);
-      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
-      ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
-    } else {
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
-      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-    }
-    sign=-sign; 
-  }
-  return norm2(chi);
-}
-template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
-{
-  // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
-  // The rest of matrix is symmetric.
-  // Can ignore "dag"
-  return M(psi,chi);
-}
-template<class Impl>
-void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  int Ls = this->Ls;
-
-  this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
-
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    if ( s==(Ls-1) ){
-      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-    } else {
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-    }
-    sign=-sign; 
-  }
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-{
-  int Ls = this->Ls;
-
-  // Apply 4d dslash
-  if ( psi.Checkerboard() == Odd ) {
-    this->DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
-  } else {
-    this->DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
-  }
-      
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    if ( s==(Ls-1) ){
-      ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
-    } else {
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
-    }
-    sign=-sign; 
-  }
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-{
-  this->Meooe(psi,chi);
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-{
-  int Ls = this->Ls;
-
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    if ( s==0 ) {
-      ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
-    } else if ( s==(Ls-1) ){
-      // Drop the CC here.
-      double R=(1+mass)/(1-mass);
-      ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
-      ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
-    } else {
-      ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
-      axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
-    }
-    sign=-sign; 
-  }
-}
-
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-{
-  this->Mooee(psi,chi);
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-{
-  int Ls = this->Ls;
-
-  // Apply Linv
-  axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); 
-  for(int s=1;s<Ls;s++){
-    axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
-  }
-  // Apply Dinv
-  for(int s=0;s<Ls;s++){
-    ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
-  }
-  // Apply Uinv = (Linv)^T
-  axpby_ssp(chi,1.0/cc_d[Ls-1],chi,0.0,chi,Ls-1,Ls-1);
-  for(int s=Ls-2;s>=0;s--){
-    axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
-  }
-}
-template<class Impl>
-void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-{
-  this->MooeeInv(psi,chi);
-}
-
-// force terms; five routines; default to Dhop on diagonal
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    if ( s==(Ls-1) ){
-      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-    } else {
-      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-    }
-    sign=-sign; 
-  }
-  this->DhopDeriv(mat,D,V,DaggerNo); 
-};
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    if ( s==(Ls-1) ){
-      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-    } else {
-      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-    }
-    sign=-sign; 
-  }
-  this->DhopDerivOE(mat,D,V,DaggerNo); 
-};
-template<class Impl>
-void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int sign=1;
-  for(int s=0;s<Ls;s++){
-    if ( s==(Ls-1) ){
-      ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
-    } else {
-      ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
-    }
-    sign=-sign; 
-  }
-  this->DhopDerivEO(mat,D,V,DaggerNo); 
-};
-    
-// Constructors
-template<class Impl>
-ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
-							     GaugeField &_Umu,
-							     GridCartesian         &FiveDimGrid,
-							     GridRedBlackCartesian &FiveDimRedBlackGrid,
-							     GridCartesian         &FourDimGrid,
-							     GridRedBlackCartesian &FourDimRedBlackGrid,
-							     RealD _mass,RealD M5,const ImplParams &p) :
-  WilsonFermion5D<Impl>(_Umu,
-			FiveDimGrid, FiveDimRedBlackGrid,
-			FourDimGrid, FourDimRedBlackGrid,M5,p),
-  mass(_mass)
-{
-  int Ls = this->Ls;
-  assert((Ls&0x1)==1); // Odd Ls required
-}
-
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      int Ls = this->Ls;
-      conformable(solution5d.Grid(),this->FermionGrid());
-      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
-    }
-    template<class Impl>
-    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      int Ls = this->Ls;
-      conformable(imported5d.Grid(),this->FermionGrid());
-      conformable(input4d.Grid()   ,this->GaugeGrid());
-      FermionField tmp(this->FermionGrid());
-      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
-      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
-      this->Dminus(tmp,imported5d);
-    }
-
-FermOpTemplateInstantiate(ContinuedFractionFermion5D);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
@@ -1,433 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
-						   GaugeField            &_Umu,
-						   GridCartesian         &FiveDimGrid,
-						   GridRedBlackCartesian &FiveDimRedBlackGrid,
-						   GridCartesian         &FourDimGrid,
-						   GridRedBlackCartesian &FourDimRedBlackGrid,
-						   RealD _mq1, RealD _mq2, RealD _mq3,
-						   RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
-  AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-			    FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-			    _shift, _pm, _M5, 1.0, 0.0, p)
-{
-  RealD eps = 1.0;
-  Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
-  assert(zdata->n == this->Ls);
-
-  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-
-  Approx::zolotarev_free(zdata);
-}
-
-/***************************************************************
- * Additional EOFA operators only called outside the inverter.
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  int Ls = this->Ls;
-
-  Din = Zero();
-  if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
-  else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-  else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
-  else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-}
-
-// This is just the identity for DWF
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-// This is just the identity for DWF
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
-
-/*****************************************************************************************************/
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  FermionField Din(psi.Grid());
-
-  this->Meooe5D(psi, Din);
-  this->DW(Din, chi, DaggerNo);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  this->M5D(psi, chi);
-  return(norm2(chi));
-}
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  FermionField Din(psi.Grid());
-
-  this->DW(psi, Din, DaggerYes);
-  this->MeooeDag5D(Din, chi);
-  this->M5Ddag(psi, chi);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  return(norm2(chi));
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD shift = this->shift;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  Coeff_t shiftp(0.0), shiftm(0.0);
-  if(shift != 0.0){
-    if(pm == 1){ shiftp = shift*(mq3-mq2); }
-    else{ shiftm = -shift*(mq3-mq2); }
-  }
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5D(psi, chi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD shift = this->shift;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-
-  // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-  Coeff_t shiftp(0.0), shiftm(0.0);
-  if(shift != 0.0){
-    if(pm == 1){ shiftp = shift*(mq3-mq2); }
-    else{ shiftm = -shift*(mq3-mq2); }
-  }
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  this->M5Ddag(psi, chi, chi, lower, diag, upper);
-}
-
-// half checkerboard operations
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  for(int s=0; s<Ls; s++){
-    upper[s] = -this->cee[s];
-    lower[s] = -this->cee[s];
-  }
-  upper[Ls-1] = this->dm;
-  lower[0]    = this->dp;
-
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  for(int s=0; s<Ls; s++){
-    upper[s] = -this->cee[s];
-    lower[s] = -this->cee[s];
-  }
-  upper[Ls-1] = this->dp;
-  lower[0]    = this->dm;
-
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-//Zolo
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-  RealD shift = this->shift;
-
-  ////////////////////////////////////////////////////////
-  // Constants for the preconditioned matrix Cayley form
-  ////////////////////////////////////////////////////////
-  this->bs.resize(Ls);
-  this->cs.resize(Ls);
-  this->aee.resize(Ls);
-  this->aeo.resize(Ls);
-  this->bee.resize(Ls);
-  this->beo.resize(Ls);
-  this->cee.resize(Ls);
-  this->ceo.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){
-    this->bee[i] = 4.0 - this->M5 + 1.0;
-    this->cee[i] = 1.0;
-  }
-
-  for(int i=0; i<Ls; ++i){
-    this->aee[i] = this->cee[i];
-    this->bs[i] = this->beo[i] = 1.0;
-    this->cs[i] = this->ceo[i] = 0.0;
-  }
-
-  //////////////////////////////////////////
-  // EOFA shift terms
-  //////////////////////////////////////////
-  if(pm == 1){
-    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-    this->dm = mq1*this->cee[Ls-1];
-  } else if(this->pm == -1) {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-  } else {
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1];
-  }
-
-  //////////////////////////////////////////
-  // LDU decomposition of eeoo
-  //////////////////////////////////////////
-  this->dee.resize(Ls+1);
-  this->lee.resize(Ls);
-  this->leem.resize(Ls);
-  this->uee.resize(Ls);
-  this->ueem.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){
-
-    if(i < Ls-1){
-
-      this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-      this->leem[i] = this->dm/this->bee[i];
-      for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-      this->dee[i] = this->bee[i];
-
-      this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-      this->ueem[i] = this->dp / this->bee[0];
-      for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
-    } else {
-
-      this->lee[i]  = 0.0;
-      this->leem[i] = 0.0;
-      this->uee[i]  = 0.0;
-      this->ueem[i] = 0.0;
-
-    }
-  }
-
-  {
-    Coeff_t delta_d = 1.0 / this->bee[0];
-    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-  }
-
-  int inv = 1;
-  this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-  this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-}
-
-// Recompute Cayley-form coefficients for different shift
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						       Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  int Ls = this->Ls;
-
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-
-  for(int s=0; s<Ls-1; s++){
-    Pminus(s,s+1) = -this->cee[s];
-  }
-
-  for(int s=0; s<Ls-1; s++){
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-
-  Pplus (0,Ls-1) = this->dp;
-  Pminus(Ls-1,0) = this->dm;
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-
-#if(0)
-  std::cout << GridLogMessage << "Pplus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pplus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-  std::cout << GridLogMessage << "Pminus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pminus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-#endif
-
-  if(inv) {
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else {
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type*) &Vp;
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-FermOpTemplateInstantiate(DomainWallEOFAFermion);
-GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
@@ -1,255 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-// Pminus forwards
-// Pplus  backwards..
-/*
- * For every block of Ls consecutive outer sites and every slice s in the block:
- *   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
- * with the s+1 / s-1 neighbours wrapping around inside the block.
- * chi takes its checkerboard from psi; phi and psi must share a checkerboard.
- * Bumps the M5Dcalls counter and accumulates elapsed time into M5Dtime.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  const int Ls = this->Ls;
-  GridBase* grid = psi_i.Grid();
-  auto phi = phi_i.View();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // one iteration per block of Ls sites
-    for(int s=0; s<Ls; s++){
-      // Neighbour slices. The s==0 rule is tested first so that it wins over the
-      // s==Ls-1 rule when both hold.
-      const int s_up = ((s != 0) && (s == Ls-1)) ? 0 : s+1;
-      const int s_dn = (s == 0) ? Ls-1 : s-1;
-
-      auto proj = psi[0]; // scratch spinor of the right type; overwritten by the projectors
-
-      spProj5m(proj, psi[ss+s_up]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
-
-      spProj5p(proj, psi[ss+s_dn]);
-      chi[ss+s] = chi[ss+s] + lower[s]*proj;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-/*
- * Same structure as M5D with the two chiral projectors exchanged:
- *   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
- * for every slice s of every block of Ls consecutive outer sites (neighbours wrap
- * around inside the block). Shares the M5Dcalls / M5Dtime counters with M5D.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  const int Ls = this->Ls;
-
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // one iteration per block of Ls sites
-    auto proj = psi[0]; // scratch spinor, reused for every slice of this block
-    for(int s=0; s<Ls; s++){
-      // Neighbour slices. The s==0 rule is tested first so that it wins over the
-      // s==Ls-1 rule when both hold.
-      const int s_up = ((s != 0) && (s == Ls-1)) ? 0 : s+1;
-      const int s_dn = (s == 0) ? Ls-1 : s-1;
-
-      spProj5p(proj, psi[ss+s_up]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*proj;
-
-      spProj5m(proj, psi[ss+s_dn]);
-      chi[ss+s] = chi[ss+s] + lower[s]*proj;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-/*
- * Applies the factorised inverse to psi, block of Ls outer sites by block, writing chi.
- * The four sweeps below run in the order (L')^{-1}, L_m^{-1}, U_m^{-1} D^{-1}, U^{-1}
- * using the lee / leem / ueem / dee / uee coefficient tables of this operator.
- * Bumps MooeeInvCalls and accumulates elapsed time into MooeeInvTime.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  const int Ls = this->Ls;
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // one iteration per block of Ls sites
-
-    const int last = ss+Ls-1; // site index of the final slice of this block
-    auto projA = psi[0];      // scratch spinors, overwritten by the projectors
-    auto projB = psi[0];
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-
-    // (L^{\prime})^{-1}: chi[0] = psi[0], then each slice uses the previous result
-    chi[ss] = psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5p(projA, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*projA;
-    }
-
-    // L_m^{-1}: fold the spProj5m part of slices 0..Ls-2 into the last slice
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(projA, chi[ss+s]);
-      chi[last] = chi[last] - this->leem[s]*projA;
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(projA, chi[last]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*projA;
-    }
-    // projA still holds the spProj5p part of the last slice from the loop above
-    spProj5m(projB, chi[last]);
-    chi[last] = (1.0/this->dee[Ls])*projA + (1.0/this->dee[Ls-1])*projB;
-
-    // U^{-1}: backward sweep from slice Ls-2 down to 0
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(projA, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->uee[s]*projA;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-/*
- * Daggered counterpart of MooeeInv: applies the factorised inverse with conjugated
- * coefficients and with the roles of the upper/lower tables and of the two chiral
- * projectors exchanged. Works block of Ls outer sites by block, reading psi and
- * writing chi. Shares the MooeeInvCalls / MooeeInvTime counters with MooeeInv.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
-{
-  // chi inherits psi's checkerboard, so no separate checkerboard assertion is needed
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  int Ls = this->Ls;
-
-  // Complex-conjugated copies of the coefficient tables (dee carries Ls+1 entries)
-  Vector<Coeff_t> ueec(Ls);
-  Vector<Coeff_t> deec(Ls+1);
-  Vector<Coeff_t> leec(Ls);
-  Vector<Coeff_t> ueemc(Ls);
-  Vector<Coeff_t> leemc(Ls);
-
-  for(int s=0; s<Ls; s++){
-    ueec[s]  = conjugate(this->uee[s]);
-    deec[s]  = conjugate(this->dee[s]);
-    leec[s]  = conjugate(this->lee[s]);
-    ueemc[s] = conjugate(this->ueem[s]);
-    leemc[s] = conjugate(this->leem[s]);
-  }
-  deec[Ls] = conjugate(this->dee[Ls]);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // one iteration per block of Ls sites
-
-    auto tmp1 = psi[0]; // scratch spinors, overwritten by the projectors
-    auto tmp2 = psi[0];
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss] = psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-    }
-    // tmp1 still holds the spProj5m part of the last slice from the loop above
-    spProj5p(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// Instantiations of the routines in this file, one per Impl listed below; only compiled
-// when DOMAIN_WALL_EOFA_DPERP_CACHE is defined. INSTANTIATE_DPERP_DWF_EOFA is defined
-// outside this file.
-#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@@ -1,613 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-/*
- * Dense matrix versions of routines
- */
-// Forwards to MooeeInternal with both the dagger and the inverse flags enabled.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  const int dagFlag = DaggerYes;
-  const int invFlag = InverseYes;
-  this->MooeeInternal(psi, chi, dagFlag, invFlag);
-}
-
-// Forwards to MooeeInternal with the inverse flag enabled and the dagger flag disabled.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  const int dagFlag = DaggerNo;
-  const int invFlag = InverseYes;
-  this->MooeeInternal(psi, chi, dagFlag, invFlag);
-}
-
-/*
- * M5D for the layout in which the fifth dimension is spread across SIMD lanes.
- * For every block of LLs consecutive outer sites and every v in the block:
- *   spin components 0,1 :  chi(v) = d(v)*phi(v) + l(v)*psi(vm)
- *   spin components 2,3 :  chi(v) = d(v)*phi(v) + u(v)*psi(vp)
- * where vp / vm are the cyclic next / previous sites inside the block and d, l, u are
- * the diag / lower / upper coefficients repacked into SIMD words. When the neighbour
- * index wraps around, the loaded SIMD words are rotated before use.
- * Requires Ls/LLs == Simd::Nsimd() and Nc == 3 (both asserted).
- * Bumps M5Dcalls and accumulates elapsed time into M5Dtime.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  // SIMD-packed copies of the coefficient vectors, one SIMD word per outer index
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  // Coefficient s = o + i*LLs is stored as scalar element i of SIMD word o
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  // The unrolled kernel below hard-codes colour indices 0..2
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-    // Disabled variant of the same update written with the generic
-    // spProj / rotate / spRecon helpers; kept in the source but never compiled.
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      // prefetch hint for the site one block (LLs sites) ahead
-      vprefetch(psi[ss+v+LLs]);
-
-      // cyclic neighbours inside the block
-      int vp = (v==LLs-1) ? 0     : v+1;
-      int vm = (v==0)     ? LLs-1 : v-1;
-
-      // spin components 2,3 of the "next" site, all three colours
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      // spin components 0,1 of the "previous" site, all three colours
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      // vp wrapped around (vp <= v): rotate the SIMD words.
-      // NOTE(review): the template argument 2 presumably counts real elements,
-      // i.e. one complex lane -- confirm against Optimization::Rotate.
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // vm wrapped around (vm >= v): rotate by 2*Nsimd-2, the complement of the rotation above
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      // p_<spin><colour>: spins 0,1 combine with l*hm, spins 2,3 combine with u*hp
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      // write the twelve spin-colour components of chi at this site
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-/*
- * M5Ddag for the layout in which the fifth dimension is spread across SIMD lanes.
- * For every block of LLs consecutive outer sites and every v in the block:
- *   spin components 0,1 :  chi(v) = d(v)*phi(v) + u(v)*psi(vp)
- *   spin components 2,3 :  chi(v) = d(v)*phi(v) + l(v)*psi(vm)
- * where vp / vm are the cyclic next / previous sites inside the block and d, l, u are
- * the diag / lower / upper coefficients repacked into SIMD words. When the neighbour
- * index wraps around, the loaded SIMD words are rotated before use.
- * Requires Ls/LLs == Simd::Nsimd() and Nc == 3 (both asserted).
- * Shares the M5Dcalls / M5Dtime counters with M5D.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  // SIMD-packed copies of the coefficient vectors, one SIMD word per outer index
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  // Coefficient s = o + i*LLs is stored as scalar element i of SIMD word o
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  // The unrolled kernel below hard-codes colour indices 0..2 (same guard as in M5D)
-  assert(Nc == 3);
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-    // Disabled variant of the same update written with the generic
-    // spProj / rotate / spRecon helpers; kept in the source but never compiled.
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      // prefetch hint for the site one block (LLs sites) ahead
-      vprefetch(psi[ss+v+LLs]);
-
-      // cyclic neighbours inside the block
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      // spin components 0,1 of the "next" site, all three colours
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      // spin components 2,3 of the "previous" site, all three colours
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      // vp wrapped around (vp <= v): rotate the SIMD words.
-      // NOTE(review): the template argument 2 presumably counts real elements,
-      // i.e. one complex lane -- confirm against Optimization::Rotate.
-      if (vp <= v){
-        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // vm wrapped around (vm >= v): rotate by 2*Nsimd-2, the complement of the rotation above
-      if(vm >= v){
-        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // p_<spin><colour>: spins 0,1 combine with u*hp, spins 2,3 combine with l*hm
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      // write the twelve spin-colour components of chi at this site
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-/*
- * Dense s-space matrix multiply for one site index `site` (real-coefficient path).
- * For every output slice s1 in [0,LLs):
- *   chi[s1 + LLs*site], spin 0,1 = sum over (s2,l) Matp[LLs*s+s1] * lane l of psi[s2 + LLs*site], spin 0,1
- *   chi[s1 + LLs*site], spin 2,3 = sum over (s2,l) Matm[LLs*s+s1] * lane l of psi[s2 + LLs*site], spin 2,3
- * with s = s2 + l*LLs, s2 in [0,LLs) and l running over the SIMD lanes.
- * Without AVX512 this is written with vbroadcast / real_madd / vstream; with AVX512 the
- * same accumulation is done by inline assembly built from the Intel512 macro headers.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-						   int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-        for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-          int s = s2 + l*LLs;
-          int lex = s2 + LLs*site;
-
-          // first contribution to this output slice: reset the accumulators
-          if( s2==0 && l==0 ){
-            SiteChiP=Zero();
-            SiteChiM=Zero();
-          }
-
-          // replicate lane l of spin components 0,1 across the SIMD word
-          for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-          // same for spin components 2,3
-          for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-          for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-        }}
-
-      {
-        int lex = s1 + LLs*site;
-        for(int sp=0; sp<2;  sp++){
-          for(int co=0; co<Nc; co++){
-            vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-            vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-          }}
-      }
-    }
-
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-    // Accumulators Chi_<spin><colour> and broadcast scratch registers.
-    // NOTE(review): the asm statements below list no zmm clobbers and rely on
-    // zmm1..zmm12 keeping their contents from one asm statement to the next --
-    // verify with the compiler in use.
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    // byte stride between the matrix entries of consecutive lanes:
-    // (LLs*s+s1) advances by LLs*LLs elements when l increases by one
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-        int lex = s2 + LLs*site;
-        uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-        uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-        uint64_t a2 = (uint64_t) &psi[lex];
-
-        for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-          if((s2+l)==0) {
-            // first contribution: multiply (VMULMEM) to initialise the accumulators
-            asm(
-                VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-                VPREFETCH1(12,%2)             VPREFETCH1(13,%2)
-                VPREFETCH1(14,%2)             VPREFETCH1(15,%2)
-                VBCASTCDUP(0,%2,BCAST0)
-                VBCASTCDUP(1,%2,BCAST1)
-                VBCASTCDUP(2,%2,BCAST2)
-                VBCASTCDUP(3,%2,BCAST3)
-                VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-                VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-                VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-                VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-                VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-                VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-                VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-                VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-                VMULMEM(0,%1,BCAST8,Chi_22)
-                VMULMEM(0,%1,BCAST9,Chi_30)
-                VMULMEM(0,%1,BCAST10,Chi_31)
-                VMULMEM(0,%1,BCAST11,Chi_32)
-                : : "r" (a0), "r" (a1), "r" (a2)                            );
-          } else {
-            // later contributions: multiply-add (VMADDMEM) into the accumulators
-            asm(
-                VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-                VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-                VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-                VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-                VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-                VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-                VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-                VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-                VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-                VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-                VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-                VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-                : : "r" (a0), "r" (a1), "r" (a2)                            );
-          }
-          // advance to the next lane: next matrix entry, next scalar of psi
-          a0 = a0 + incr;
-          a1 = a1 + incr;
-          a2 = a2 + sizeof(typename Simd::scalar_type);
-        }
-      }
-
-      {
-        // store the twelve accumulators into chi for output slice s1
-        int lexa = s1+LLs*site;
-        asm (
-             VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-             VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-             VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-             VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-             : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-}
-
-// Z-mobius version.
-// Not implemented for EOFA: prints an error message and terminates the process with
-// exit(-1). None of the arguments is used.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-}
-
-/*
- * Applies the dense s-space matrices to psi, writing chi, one 4d site at a time.
- *   inv && dag   : uses the stored MatpInvDag / MatmInvDag
- *   inv && !dag  : uses the stored MatpInv / MatmInv
- *   !inv         : builds the matrices for this call with MooeeInternalCompute
- * Complex coefficient types are routed to MooeeInternalZAsm, real ones to
- * MooeeInternalAsm. Calls and time are accounted in MooeeInvCalls / MooeeInvTime
- * regardless of the inv flag.
- */
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  // Local storage, filled only on the !inv path
-  Vector<iSinglet<Simd> > Matp;
-  Vector<iSinglet<Simd> > Matm;
-
-  // Matrices actually applied; exactly one branch below assigns both pointers
-  Vector<iSinglet<Simd> > *MatpPtr = nullptr;
-  Vector<iSinglet<Simd> > *MatmPtr = nullptr;
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    MatpPtr = &Matp;
-    MatmPtr = &Matm;
-  } else if(dag){
-    MatpPtr = &this->MatpInvDag;
-    MatmPtr = &this->MatmInvDag;
-  } else {
-    MatpPtr = &this->MatpInv;
-    MatmPtr = &this->MatmInv;
-  }
-
-  assert(MatpPtr->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *MatpPtr, *MatmPtr);
-    });
-  } else {
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *MatpPtr, *MatmPtr);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-// Instantiations of the routines in this file for the Vec5d Impls; only compiled when
-// DOMAIN_WALL_EOFA_DPERP_VEC is defined. INSTANTIATE_DPERP_DWF_EOFA is defined outside
-// this file; MooeeInternal is instantiated explicitly below for the same Impl list.
-#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
@@ -0,0 +1,267 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
+public:
+
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  static const int Nhcs = Options::Nhcs;
+      
+  typedef typename Options::_Coeff_t Coeff_t;      
+  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
+  
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
+  typedef Lattice<SiteSpinor>          FermionField;
+  typedef Lattice<SitePropagator>      PropagatorField;
+
+  /////////////////////////////////////////////////
+  // Make the doubled gauge field a *scalar*
+  /////////////////////////////////////////////////
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef Lattice<SiteDoubledGaugeField>                      DoubledGaugeField;
+      
+  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+  
+  ImplParams Params;
+
+  DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+      
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    vsplat(reg, memory);  // forwards to vsplat: fills the Simd word reg from the value in memory
+  }
+
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, int mu, StencilEntry *SE,  // SE and St are not used in this body
+					  StencilView &St) 
+  {
+#ifdef GPU_VEC
+    // Gauge link is scalarised: the stored link U(mu) is passed to mult() as it is
+    mult(&phi(), &U(mu), &chi());
+#else
+    SiteGaugeLink UU;  // local link filled from the scalar doubled field before the multiply
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));  // vsplat colour element (i,j) of link mu into UU
+      }
+    }
+    mult(&phi(), &UU(), &chi());  // presumably phi = UU * chi -- confirm against mult()
+#endif
+  }
+#ifdef GPU_VEC
+  static accelerator_inline void copyLinkGpu(int lane,
+					     SiteDoubledGaugeField & UU,
+					     const SiteDoubledGaugeField &U)
+  {
+    UU = U;  // whole-object assignment of the doubled gauge site; 'lane' is not used
+  }
+  static accelerator_inline void multLinkGpu(int lane,
+					     typename SiteHalfSpinor::scalar_object &phi,
+					     const SiteDoubledGaugeField &U,
+					     const typename SiteHalfSpinor::scalar_object &chi,
+					     int mu) 
+  {
+#if 1
+    typedef typename ExtractTypeMap<typename Simd::scalar_type>::extract_type extract_type;
+    // Copy link mu into a local word by word, then set phi() = U_l() * chi(); 'lane' is not used.
+    SiteScalarGaugeLink U_l;
+    // U is a const reference, so the source pointer stays const rather than casting it away.
+    const extract_type * U_mem  = (const extract_type *) &U(mu);
+    extract_type       * U_stack= (extract_type *) &U_l;
+    const int nwords = sizeof(U_l)/sizeof(extract_type);  // int bound avoids a signed/unsigned compare
+    for(int w=0;w<nwords;w++) U_stack[w] = U_mem[w];
+
+    phi() =  U_l() * chi();
+#else
+    auto U_l = U(mu);
+    // Disabled alternative: take the link by value and multiply.
+    phi() =  U_l * chi();
+#endif
+  }
+#else
+  static accelerator_inline void multLinkGpu(int lane,
+					     SiteHalfSpinor &phi,
+					     const SiteDoubledGaugeField &U,
+					     const SiteHalfSpinor &chi,
+					     int mu) 
+  {
+    // Variant compiled when GPU_VEC is not defined: multiply chi() by link mu; 'lane' is not used.
+    phi() =  U(mu) * chi();
+  }
+#endif
+
+  static accelerator_inline void multLinkProp(SitePropagator &phi,
+					      const SiteDoubledGaugeField &U,
+					      const SitePropagator &chi,int mu) 
+  { // Propagator counterpart of multLink: build a local link from U(mu), then call mult().
+    SiteGaugeLink splatLink;
+    for (int row = 0; row < Dimension; ++row) {
+      for (int col = 0; col < Dimension; ++col) {
+        vsplat(splatLink()()(row, col), U(mu)()(row, col));  // one colour element at a time
+      }
+    }
+    mult(&phi(), &splatLink(), &chi());
+  }
+
+  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
+  {
+    SiteScalarGaugeField  ScalarUmu;
+    SiteDoubledGaugeField ScalarUds;
+    // Uadj holds, per direction mu, adj(Cshift(U_mu, mu, -1)) built from the input field Umu.
+    GaugeLinkField U(Umu.Grid());
+    GaugeField  Uadj(Umu.Grid());
+    for (int mu = 0; mu < Nd; mu++) {
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+      U = adj(Cshift(U, mu, -1));
+      PokeIndex<LorentzIndex>(Uadj, U, mu);
+    }
+    // Per local site: pack Umu into entries [0,Nd) and Uadj into [Nd,2*Nd) of the scalar doubled site.
+    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+      Coordinate lcoor;
+      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+      // Bound and offset use Nd, the extent of SiteScalarGaugeField, like the loop above.
+      peekLocalSite(ScalarUmu, Umu, lcoor);
+      for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
+      // The shifted adjoint links fill the second half.
+      peekLocalSite(ScalarUmu, Uadj, lcoor);
+      for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
+      
+      pokeLocalSite(ScalarUds, Uds, lcoor);
+    }
+  }
+      
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
+  {
+    assert(0);  // stub: no implementation here; any call trips this assert when assertions are enabled
+  }
+
+  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+    assert(0);  // stub: no implementation here; any call trips this assert when assertions are enabled
+  } 
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0);  // stub: no implementation here; any call trips this assert when assertions are enabled
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0);  // stub: no implementation here; any call trips this assert when assertions are enabled
+  }
+
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+    // Stub: the only live statement is assert(0); the implementation below is commented out.
+    assert(0);
+    // Following lines to be revised after Peter's addition of half prec
+    // missing put lane...
+    /*
+      typedef decltype(traceIndex<SpinIndex>(outerProduct(Btilde[0], Atilde[0]))) result_type;
+      unsigned int LLs = Btilde.Grid()->_rdimensions[0];
+      conformable(Atilde.Grid(),Btilde.Grid());
+      GridBase* grid = mat.Grid();
+      GridBase* Bgrid = Btilde.Grid();
+      unsigned int dimU = grid->Nd();
+      unsigned int dimF = Bgrid->Nd();
+      GaugeLinkField tmp(grid); 
+      tmp = Zero();
+    
+      // FIXME 
+      // Current implementation works, thread safe, probably suboptimal
+      // Passing through the local coordinate for grid transformation
+      // the force grid is in general very different from the Ls vectorized grid
+
+      for (int so = 0; so < grid->oSites(); so++) {
+      std::vector<typename result_type::scalar_object> vres(Bgrid->Nsimd());
+      std::vector<int> ocoor;  grid->oCoorFromOindex(ocoor,so); 
+      for (int si = 0; si < tmp.Grid()->iSites(); si++){
+      typename result_type::scalar_object scalar_object; scalar_object = Zero();
+      std::vector<int> local_coor;      
+      std::vector<int> icoor; grid->iCoorFromIindex(icoor,si);
+      grid->InOutCoorToLocalCoor(ocoor, icoor, local_coor);
+      for (int s = 0; s < LLs; s++) {
+      std::vector<int> slocal_coor(dimF);
+      slocal_coor[0] = s;
+      for (int s4d = 1; s4d< dimF; s4d++) slocal_coor[s4d] = local_coor[s4d-1];
+      int sF = Bgrid->oIndexReduced(slocal_coor);  
+      assert(sF < Bgrid->oSites());
+
+      extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), vres); 
+      // sum across the 5d dimension
+      for (auto v : vres) scalar_object += v;  
+      }
+      tmp[so].putlane(scalar_object, si);
+      }
+      }
+      PokeIndex<LorentzIndex>(mat, tmp, mu);
+    */
+  }
+};
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // vComplex (whichever precision that is), CoeffReal options
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float, CoeffReal options
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double, CoeffReal options
+// Same three SIMD types with Options = CoeffRealHalfComms
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // vComplex (whichever precision that is)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
+// Z-prefixed aliases: Options = CoeffComplex
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // vComplex (whichever precision that is)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
+// Z-prefixed aliases: Options = CoeffComplexHalfComms
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // vComplex (whichever precision that is)
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
@@ -1,625 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#include <Grid.h>
-
-NAMESPACE_BEGIN(Grid);
-
-const std::vector<int> 
-ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> 
-ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
-
-/////////////////////////////////
-// Constructor and gauge import
-/////////////////////////////////
-
-
-template <class Impl>
-ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
-							 RealD _mass,
-							 RealD _c1, RealD _c2,RealD _u0,
-							 const ImplParams &p)
-  : Kernels(p),
-    _grid(&Fgrid),
-    _cbgrid(&Hgrid),
-    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
-    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
-    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
-    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
-    Umu(&Fgrid),
-    UmuEven(&Hgrid),
-    UmuOdd(&Hgrid),
-    UUUmu(&Fgrid),
-    UUUmuEven(&Hgrid),
-    UUUmuOdd(&Hgrid) ,
-    _tmp(&Hgrid)
-{
-  int vol4;
-  int LLs=1;
-  c1=_c1;
-  c2=_c2;
-  u0=_u0;
-  vol4= _grid->oSites();
-  Stencil.BuildSurfaceList(LLs,vol4);
-  vol4= _cbgrid->oSites();
-  StencilEven.BuildSurfaceList(LLs,vol4);
-  StencilOdd.BuildSurfaceList(LLs,vol4);
-}
-
-template <class Impl>
-ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
-							 GridRedBlackCartesian &Hgrid, RealD _mass,
-							 RealD _c1, RealD _c2,RealD _u0,
-							 const ImplParams &p)
-  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
-{
-  ImportGauge(_Uthin,_Ufat);
-}
-
-////////////////////////////////////////////////////////////
-// Momentum space propagator should be 
-// https://arxiv.org/pdf/hep-lat/9712010.pdf
-//
-// mom space action.
-//   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
-//
-// must track through staggered flavour/spin reduction in literature to 
-// turn to free propagator for the one component chi field, a la page 4/5
-// of above link to implmement fourier based solver.
-////////////////////////////////////////////////////////////
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
-{
-  /////////////////////////////////////////////////////////////////
-  // Trivial import; phases and fattening and such like preapplied
-  /////////////////////////////////////////////////////////////////
-  GaugeLinkField U(GaugeGrid());
-
-  for (int mu = 0; mu < Nd; mu++) {
-
-    U = PeekIndex<LorentzIndex>(_Utriple, mu);
-    PokeIndex<LorentzIndex>(UUUmu, U, mu );
-
-    U = adj( Cshift(U, mu, -3));
-    PokeIndex<LorentzIndex>(UUUmu, -U, mu+4 );
-
-    U = PeekIndex<LorentzIndex>(_Ufat, mu);
-    PokeIndex<LorentzIndex>(Umu, U, mu);
-
-    U = adj( Cshift(U, mu, -1));
-    PokeIndex<LorentzIndex>(Umu, -U, mu+4);
-
-  }
-  CopyGaugeCheckerboards();
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) 
-{
-
-  Umu   = _U;
-  UUUmu = _UUU;
-  CopyGaugeCheckerboards();
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
-{
-  pickCheckerboard(Even, UmuEven,  Umu);
-  pickCheckerboard(Odd,  UmuOdd ,  Umu);
-  pickCheckerboard(Even, UUUmuEven,UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) 
-{
-  GaugeLinkField U(GaugeGrid());
-
-  ////////////////////////////////////////////////////////
-  // Double Store should take two fields for Naik and one hop separately.
-  ////////////////////////////////////////////////////////
-  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
-
-  ////////////////////////////////////////////////////////
-  // Apply scale factors to get the right fermion Kinetic term
-  // Could pass coeffs into the double store to save work.
-  // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) 
-  ////////////////////////////////////////////////////////
-  for (int mu = 0; mu < Nd; mu++) {
-
-    U = PeekIndex<LorentzIndex>(Umu, mu);
-    PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
-    
-    U = PeekIndex<LorentzIndex>(Umu, mu+4);
-    PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
-
-    U = PeekIndex<LorentzIndex>(UUUmu, mu);
-    PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu );
-    
-    U = PeekIndex<LorentzIndex>(UUUmu, mu+4);
-    PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
-  }
-
-  CopyGaugeCheckerboards();
-}
-
-/////////////////////////////
-// Implement the interface
-/////////////////////////////
-
-template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerNo);
-  return axpy_norm(out, mass, in, out);
-}
-
-template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerYes);
-  return axpy_norm(out, mass, in, out);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerNo);
-  } else {
-    DhopOE(in, out, DaggerNo);
-  }
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerYes);
-  } else {
-    DhopOE(in, out, DaggerYes);
-  }
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  typename FermionField::scalar_type scal(mass);
-  out = scal * in;
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  Mooee(in, out);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  out = (1.0 / (mass)) * in;
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
-						 FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  MooeeInv(in, out);
-}
-
-///////////////////////////////////
-// Internal
-///////////////////////////////////
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, 
-						   GaugeField & mat,
-						   const FermionField &A, const FermionField &B, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-
-  Compressor compressor;
-
-  FermionField Btilde(B.Grid());
-  FermionField Atilde(B.Grid());
-  Atilde = A;
-
-  st.HaloExchange(B, compressor);
-
-  for (int mu = 0; mu < Nd; mu++) {
-
-    ////////////////////////
-    // Call the single hop
-    ////////////////////////
-    auto U_v   = U.View();
-    auto UUU_v = UUU.View();
-    auto B_v   = B.View();
-    auto Btilde_v   = Btilde.View();
-    thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++), {
-      Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
-    });
-
-    // Force in three link terms
-    //
-    //    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
-    //
-    // dU_ac(x)/dt = i p_ab U_bc(x)
-    //
-    // => dS_f/dt = dS_f/dU_ac(x) . dU_ac(x)/dt =  i p_ab U_bc(x) dS_f/dU_ac(x) 
-    //
-    // One link: form fragments S_f = A U B 
-    //
-    //         write Btilde = U(x) B(x+mu)
-    //
-    // mat+= TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
-    // 
-    // Three link: form fragments S_f = A UUU B 
-    //
-    // mat+= outer ( A, UUUB) <-- Best take DhopDeriv with one linke or identity matrix
-    // mat+= outer ( AU, UUB) <-- and then use covariant cshift?
-    // mat+= outer ( AUU, UB) <-- Returned from call to DhopDir
-
-    assert(0);// need to figure out the force interface with a blasted three link term.
-    
-  }
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
-  conformable(U.Grid(), _grid);
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-
-  mat.Checkerboard() = U.Checkerboard();
-
-  DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
-  conformable(U.Grid(), _cbgrid);
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-
-  assert(V.Checkerboard() == Even);
-  assert(U.Checkerboard() == Odd);
-  mat.Checkerboard() = Odd;
-
-  DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
-  conformable(U.Grid(), _cbgrid);
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-
-  assert(V.Checkerboard() == Odd);
-  assert(U.Checkerboard() == Even);
-  mat.Checkerboard() = Even;
-
-  DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
-{
-  DhopCalls+=2;
-  conformable(in.Grid(), _grid);  // verifies full grid
-  conformable(in.Grid(), out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
-{
-  DhopCalls+=1;
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-
-  assert(in.Checkerboard() == Even);
-  out.Checkerboard() = Odd;
-
-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
-{
-  DhopCalls+=1;
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-
-  assert(in.Checkerboard() == Odd);
-  out.Checkerboard() = Even;
-
-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
-  DhopDir(in, out, dir, disp);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
-
-  Compressor compressor;
-  Stencil.HaloExchange(in, compressor);
-  auto Umu_v   =   Umu.View();
-  auto UUUmu_v = UUUmu.View();
-  auto in_v    =  in.View();
-  auto out_v   = out.View();
-  thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) , {
-      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
-  });
-};
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
-						  DoubledGaugeField &U,
-						  DoubledGaugeField &UUU,
-						  const FermionField &in,
-						  FermionField &out, int dag) 
-{
-#ifdef GRID_OMP
-  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
-  else
-#endif
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-								 DoubledGaugeField &U,
-								 DoubledGaugeField &UUU,
-								 const FermionField &in,
-								 FermionField &out, int dag) 
-{
-#ifdef GRID_OMP
-  Compressor compressor; 
-  int len =  U.Grid()->oSites();
-  const int LLs =  1;
-
-  DhopTotalTime   -= usecond();
-
-  DhopFaceTime    -= usecond();
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime    += usecond();
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime    -= usecond();
-#pragma omp parallel 
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-
-      // do the compute
-      auto U_v   = U.View();
-      auto UUU_v = UUU.View();
-      auto in_v  = in.View();
-      auto out_v = out.View();
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
-        }
-      }
-    } else {
-      st.CommunicateThreaded();
-    }
-  }
-  DhopComputeTime    += usecond();
-
-  // First to enter, last to leave timing
-  DhopFaceTime    -= usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime    -= usecond();
-
-  DhopComputeTime2    -= usecond();
-  {
-    auto U_v   = U.View();
-    auto UUU_v = UUU.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    if (dag == DaggerYes) {
-      int sz=st.surface_list.size();
-      thread_loop( (int ss = 0; ss < sz; ss++) ,{
-	int sU = st.surface_list[ss];
-	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
-      });
-    } else {
-      int sz=st.surface_list.size();
-      thread_loop( (int ss = 0; ss < sz; ss++) ,{
-	int sU = st.surface_list[ss];
-	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
-      });
-    }
-  }
-  DhopComputeTime2    += usecond();
-#else
-  assert(0);
-#endif
-}
-
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
-							     DoubledGaugeField &U,
-							     DoubledGaugeField &UUU,
-							     const FermionField &in,
-							     FermionField &out, int dag) 
-{
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-
-  DhopTotalTime   -= usecond();
-
-  DhopCommTime    -= usecond();
-  Compressor compressor;
-  st.HaloExchange(in, compressor);
-  DhopCommTime    += usecond();
-
-  auto U_v   =   U.View();
-  auto UUU_v = UUU.View();
-  auto in_v  =  in.View();
-  auto out_v = out.View();
-  DhopComputeTime -= usecond();
-  if (dag == DaggerYes) {
-    thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++), {
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
-    });
-  } else {
-    thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++), {
-      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
-    });
-  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-};
-
-  ////////////////////////////////////////////////////////////////
-  // Reporting
-  ////////////////////////////////////////////////////////////////
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::Report(void) 
-{
-  Coordinate latt = _grid->GlobalDimensions();
-  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _grid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime   = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-
-//////////////////////////////////////////////////////// 
-// Conserved current - not yet implemented.
-////////////////////////////////////////////////////////
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-							      PropagatorField &q_in_2,
-							      PropagatorField &q_out,
-							      Current curr_type,
-							      unsigned int mu)
-{
-  assert(0);
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
-                                                         PropagatorField &q_out,
-                                                         Current curr_type,
-                                                         unsigned int mu, 
-                                                         unsigned int tmin,
-                                              unsigned int tmax,
-					      ComplexField &lattice_cmplx)
-{
-  assert(0);
-
-}
-
-
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
-
-//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-//TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -1,672 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
-#include <Grid/perfmon/PerfCount.h>
-
-NAMESPACE_BEGIN(Grid);
-  
-// Stencil geometry: 16 hops given as (direction, displacement) pairs.
-// The s-direction is innermost and takes no part in the parity, so only
-// directions 1..4 appear; each is visited with displacements +1, -1, +3, -3.
-const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({
-    1, 2, 3, 4,    // paired with displacement +1
-    1, 2, 3, 4,    // paired with displacement -1
-    1, 2, 3, 4,    // paired with displacement +3
-    1, 2, 3, 4});  // paired with displacement -3
-
-const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({
-     1,  1,  1,  1,
-    -1, -1, -1, -1,
-     3,  3,  3,  3,
-    -3, -3, -3, -3});
-
-/* Grid-only constructor (the original one-line note here read "5d lattice for DWF.").
- * Stores the four grid pointers, builds the full, even-source and odd-source stencils on
- * the 5d grids from the static directions/displacements tables, constructs the doubled
- * gauge fields and Lebesgue orderings on the 4d grids, checks the grid layouts with
- * assert(), and finally builds the surface list of each stencil.
- * No gauge field is imported here. */
-template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
-							     GridRedBlackCartesian &FiveDimRedBlackGrid,
-							     GridCartesian         &FourDimGrid,
-							     GridRedBlackCartesian &FourDimRedBlackGrid,
-							     RealD _mass,
-							     RealD _c1,RealD _c2, RealD _u0,
-							     const ImplParams &p) :
-  Kernels(p),
-  _FiveDimGrid        (&FiveDimGrid),
-  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid        (&FourDimGrid),
-  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (&FiveDimGrid,npoint,Even,directions,displacements,p), // full 5d grid
-  StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
-  StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
-  mass(_mass),
-  c1(_c1),
-  c2(_c2),
-  u0(_u0),
-  Umu(&FourDimGrid), // one-link doubled field and its checkerboards
-  UmuEven(&FourDimRedBlackGrid),
-  UmuOdd (&FourDimRedBlackGrid),
-  UUUmu(&FourDimGrid), // three-link doubled field and its checkerboards
-  UUUmuEven(&FourDimRedBlackGrid),
-  UUUmuOdd(&FourDimRedBlackGrid),
-  Lebesgue(&FourDimGrid),
-  LebesgueEvenOdd(&FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid)
-{
-
-  // some assertions: grid ranks, and which dimension is checkerboarded
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
-  assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
-
-  // extent of fifth dim (dimension 0), which must not be spread over processors
-  Ls=FiveDimGrid._fdimensions[0];
-  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FiveDimGrid._processors[0]         ==1);
-  assert(FiveDimRedBlackGrid._processors[0] ==1);
-
-  // Other dimensions must match the decomposition of the four-D fields 
-  for(int d=0;d<4;d++){
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
-
-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
-
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
-  }
-
-  if (Impl::LsVectorised) { // all SIMD lanes must lie along s; the 4d layouts must be 1
-
-    int nsimd = Simd::Nsimd();
-    
-    // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
-    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
-
-    for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]==1);
-      assert(FourDimRedBlackGrid._simd_layout[d]==1);
-      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
-    }
-
-  } else { // otherwise no SIMD lanes may lie along s
-    
-    // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-    assert(FiveDimGrid._simd_layout[0]        ==1);
-
-  }
-  int LLs = FiveDimGrid._rdimensions[0]; // NOTE(review): presumably the SIMD-reduced local s extent -- confirm
-  int vol4= FourDimGrid.oSites();
-  Stencil.BuildSurfaceList(LLs,vol4);
-
-  vol4=FourDimRedBlackGrid.oSites(); // the even and odd stencils share the half-grid site count
-  StencilEven.BuildSurfaceList(LLs,vol4);
-  StencilOdd.BuildSurfaceList(LLs,vol4);
-}
-// Split the full-grid doubled gauge fields (one-link Umu and three-link
-// UUUmu) into their even and odd checkerboard copies.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
-{
-  // one-link field
-  pickCheckerboard(Even, UmuEven, Umu);
-  pickCheckerboard(Odd,  UmuOdd,  Umu);
-
-  // three-link field
-  pickCheckerboard(Even, UUUmuEven, UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd,  UUUmu);
-}
-// Convenience constructor: delegates all grid setup to the grid-only
-// constructor and then imports the thin/fat gauge fields via ImportGauge.
-template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(
-    GaugeField &_Uthin, GaugeField &_Ufat,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mass,
-    RealD _c1, RealD _c2, RealD _u0,
-    const ImplParams &p)
-  : ImprovedStaggeredFermion5D(FiveDimGrid, FiveDimRedBlackGrid,
-                               FourDimGrid, FourDimRedBlackGrid,
-                               _mass, _c1, _c2, _u0, p)
-{
-  ImportGauge(_Uthin, _Ufat);
-}
-
-///////////////////////////////////////////////////
-// For MILC use: import a ready-made three-link field (_Utriple) and
-// one-link field (_Ufat) as they are. Phases, fattening and the like are
-// taken to have been applied already by the caller.
-///////////////////////////////////////////////////
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,
-                                                         const GaugeField &_Ufat)
-{
-  for (int mu = 0; mu < Nd; mu++) {
-
-    // Three-link term: the link itself goes to slot mu, and minus the adjoint
-    // of the link shifted by -3 along mu goes to slot mu+4.
-    auto link = PeekIndex<LorentzIndex>(_Utriple, mu);
-    Impl::InsertGaugeField(UUUmu, link, mu);
-
-    link = adj( Cshift(link, mu, -3));
-    Impl::InsertGaugeField(UUUmu, -link, mu+4);
-
-    // One-link term: same pattern with a shift of -1.
-    link = PeekIndex<LorentzIndex>(_Ufat, mu);
-    Impl::InsertGaugeField(Umu, link, mu);
-
-    link = adj( Cshift(link, mu, -1));
-    Impl::InsertGaugeField(Umu, -link, mu+4);
-  }
-
-  // Refresh the even/odd copies of both doubled fields.
-  CopyGaugeCheckerboards();
-}
-// Import already doubled gauge fields verbatim (phases, fattening and the
-// like pre-applied by the caller), then refresh the checkerboard copies.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,
-                                                         const DoubledGaugeField &_U)
-{
-  Umu   = _U;     // one-link field
-  UUUmu = _UUU;   // three-link field
-
-  CopyGaugeCheckerboards();
-}
-// Build the doubled one-link (Umu) and three-link (UUUmu) fields from the
-// thin and fat links via Impl::DoubleStore, rescale them so the hopping term
-// reads 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ), and refresh the checkerboards.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
-{
-  // Double store takes the Naik and the one-hop fields separately.
-  Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
-
-  // Scale factors that give the right fermion kinetic term.
-  // They could be passed into the double store to save work.
-  const auto fwd_one   =  0.5*c1/u0;
-  const auto bwd_one   = -0.5*c1/u0;
-  const auto fwd_three =  0.5*c2/u0/u0/u0;
-  const auto bwd_three = -0.5*c2/u0/u0/u0;
-
-  for (int mu = 0; mu < Nd; mu++) {
-
-    // one-link field: slots mu and mu+4
-    auto link = PeekIndex<LorentzIndex>(Umu, mu);
-    PokeIndex<LorentzIndex>(Umu, link*fwd_one, mu );
-
-    link = PeekIndex<LorentzIndex>(Umu, mu+4);
-    PokeIndex<LorentzIndex>(Umu, link*bwd_one, mu+4);
-
-    // three-link field: slots mu and mu+4
-    link = PeekIndex<LorentzIndex>(UUUmu, mu);
-    PokeIndex<LorentzIndex>(UUUmu, link*fwd_three, mu );
-
-    link = PeekIndex<LorentzIndex>(UUUmu, mu+4);
-    PokeIndex<LorentzIndex>(UUUmu, link*bwd_three, mu+4);
-  }
-
-  CopyGaugeCheckerboards();
-}
-// Apply one hopping term of the full-grid stencil, selected by dir5 and disp,
-// to `in` and write the result to `out` through Kernels::DhopDirKernel.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
-{
-  // The innermost fifth dimension is dropped, so `dir` follows the ordering
-  // of the "directions" table that was handed to the stencil.
-  int dir = dir5-1;
-
-  Compressor compressor;
-  Stencil.HaloExchange(in,compressor);
-
-  auto Umu_v   = Umu.View();
-  auto UUUmu_v = UUUmu.View();
-  auto in_v    = in.View();
-  auto out_v   = out.View();
-
-  // One pass over the sites of the gauge grid; every s slice of each is visited.
-  // NOTE(review): the s extent used here is Ls, whereas DhopInternalSerialComms
-  // passes in.Grid()->_rdimensions[0] to its kernels -- confirm this is
-  // intended for Ls-vectorised layouts.
-  thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
-    int sU=ss;
-    for(int s=0;s<Ls;s++){
-      int sF = s+Ls*sU;
-      Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
-    }
-  });
-}
-
-// Force-term helper. The multi-rhs staggered solver has no force terms, so
-// this is a stub that always trips assert(0).
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DerivInternal(
-    StencilImpl & st,
-    DoubledGaugeField & U,
-    DoubledGaugeField & UUU,
-    GaugeField &mat,
-    const FermionField &A,
-    const FermionField &B,
-    int dag)
-{
-  assert(0); // No force terms in multi-rhs solver staggered
-}
-
-// Full-grid force term: unsupported for this operator; always trips assert(0).
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(
-    GaugeField &mat,
-    const FermionField &A,
-    const FermionField &B,
-    int dag)
-{
-  assert(0); // unsupported
-}
-
-// EO force term: unsupported for this operator; always trips assert(0).
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(
-    GaugeField &mat,
-    const FermionField &A,
-    const FermionField &B,
-    int dag)
-{
-  assert(0); // unsupported
-}
-
-
-// OE force term: unsupported for this operator; always trips assert(0).
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(
-    GaugeField &mat,
-    const FermionField &A,
-    const FermionField &B,
-    int dag)
-{
-  assert(0); // unsupported
-}
-
-/*CHANGE */
-// Dispatch Dhop to the overlapped-comms or the serial-comms implementation.
-// The overlapped variant is chosen only in GRID_OMP builds, and only when
-// StaggeredKernelsStatic::Comms equals CommsAndCompute.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-                                                    DoubledGaugeField & U,DoubledGaugeField & UUU,
-                                                    const FermionField &in, FermionField &out,int dag)
-{
-#ifdef GRID_OMP
-  const bool overlap = ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute );
-  if ( overlap ) {
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
-    return;
-  }
-#endif
-  DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
-}
-
-// Dhop with halo communication overlapped with the interior compute.
-//
-// Only available in GRID_OMP builds; otherwise the body is assert(0).
-// Inside one OpenMP parallel region the first `ncomms` threads run
-// st.CommunicateThreaded(), while each remaining thread takes a contiguous
-// block of the 4d sites of U and runs the site kernel with
-// (interior=1, exterior=0). After the region the faces are merged and the
-// sites in st.surface_list are revisited with (interior=0, exterior=1).
-//
-// Timers: DhopFaceTime covers the gather and merge steps; DhopCommTime and
-// DhopComputeTime receive the per-thread maximum measured inside the parallel
-// region; DhopComputeTime2 covers the surface pass; DhopTotalTime covers the
-// whole call.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
-								   DoubledGaugeField & U,DoubledGaugeField & UUU,
-								   const FermionField &in, FermionField &out,int dag)
-{
-#ifdef GRID_OMP
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-
-  Compressor compressor; 
-
-  int LLs = in.Grid()->_rdimensions[0];
-
-  // Account for the whole call, as DhopInternalSerialComms does. Report()
-  // divides by DhopTotalTime, which this path previously never updated.
-  DhopTotalTime -= usecond();
-
-  DhopFaceTime-=usecond();
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();
-
-  double ctime=0;
-  double ptime=0;
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms); // at least one thread must be left for compute
-    if (tid >= ncomms) {
-      // Compute thread: split the n sites into nthreads contiguous blocks,
-      // the first `rem` blocks holding one extra site.
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = U.Grid()->oSites(); // 4d vol
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-
-      // do the compute
-      auto   U_v  =   U.View();
-      auto UUU_v  = UUU.View();
-      auto  in_v  =  in.View();
-      auto out_v  = out.View();
-
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-	  // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
-        }
-      }
-      ptime = usecond() - start;
-    } else {
-      // Communication thread.
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
-  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
-
-  // First to enter, last to leave timing
-  st.CollateThreads();
-
-  DhopFaceTime-=usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
-
-  // Surface pass: exterior contributions only, on the sites in surface_list.
-  DhopComputeTime2-=usecond();
-
-  auto   U_v  =   U.View();
-  auto UUU_v  = UUU.View();
-  auto  in_v  =  in.View();
-  auto out_v  = out.View();
-  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
-    });
-  } else {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
-    });
-  }
-  DhopComputeTime2+=usecond();
-
-  DhopTotalTime += usecond();
-#else
-  assert(0);
-#endif
-
-}
-
-// Dhop with blocking communication: a complete halo exchange, then one
-// threaded pass over all sites of U's grid calling the (dagger) site kernel.
-// Accumulates DhopCommTime, DhopComputeTime and DhopTotalTime.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-                                                               DoubledGaugeField & U,DoubledGaugeField & UUU,
-                                                               const FermionField &in, FermionField &out,int dag)
-{
-  Compressor compressor;
-  int LLs = in.Grid()->_rdimensions[0];
-
-  DhopTotalTime -= usecond();
-
-  // Communication phase.
-  DhopCommTime -= usecond();
-  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
-
-  // Compute phase. Dhop takes the 4d grid from U, and makes a 5d index for fermion.
-  DhopComputeTime -= usecond();
-  auto   U_v  =   U.View();
-  auto UUU_v  = UUU.View();
-  auto  in_v  =  in.View();
-  auto out_v  = out.View();
-  if (dag != DaggerYes) {
-    thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
-      int sU=ss;
-      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
-    });
-  } else {
-    thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++), {
-      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
-    });
-  }
-  DhopComputeTime += usecond();
-
-  DhopTotalTime   += usecond();
-}
-/*CHANGE END*/
-
-/* ORG
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-						    DoubledGaugeField & U,DoubledGaugeField & UUU,
-						    const FermionField &in, FermionField &out,int dag)
-{
-  Compressor compressor;
-  int LLs = in.Grid()->_rdimensions[0];
-
-  DhopTotalTime -= usecond();
-  DhopCommTime -= usecond();
-  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
-  
-  DhopComputeTime -= usecond();
-  auto U_v   =   U.View();
-  auto UUU_v = UUU.View();
-  auto out_v = out.View();
-  auto in_v  =  in.View();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  if (dag == DaggerYes) {
-    thread_loop(  (int ss = 0; ss < U.Grid()->oSites(); ss++), {
-      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
-    });
-  } else {
-    thread_loop(  (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
-      int sU=ss;
-      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
-    });
-  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-}
-*/
-
-
-// Hopping term taking an Even-parity field to an Odd-parity one. `in` must
-// live on the red-black fermion grid with Even parity; `out` is tagged Odd.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
-{
-  DhopCalls+=1;
-
-  conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
-  conformable(in.Grid(),out.Grid());            // drops the cb check
-  assert(in.Checkerboard()==Even);
-
-  out.Checkerboard() = Odd;
-
-  // Even-source stencil together with the odd-site gauge links.
-  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
-}
-// Hopping term taking an Odd-parity field to an Even-parity one. `in` must
-// live on the red-black fermion grid with Odd parity; `out` is tagged Even.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
-{
-  DhopCalls+=1;
-
-  conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
-  conformable(in.Grid(),out.Grid());            // drops the cb check
-  assert(in.Checkerboard()==Odd);
-
-  out.Checkerboard() = Even;
-
-  // Odd-source stencil together with the even-site gauge links.
-  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
-}
-// Hopping term on the full (unchecker-boarded) fermion grid. Counted as two
-// calls in DhopCalls; `out` inherits the checkerboard tag of `in`.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
-{
-  DhopCalls+=2;
-
-  conformable(in.Grid(),FermionGrid()); // verifies full grid
-  conformable(in.Grid(),out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  // Full-grid stencil, ordering and gauge links.
-  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
-}
-
-// Print the Dhop call count, the per-call total/comm/compute times and the
-// derived flop-rate estimates, followed by the reports of the three stencils.
-//
-// The volume is Ls times the product of the GridDefaultLatt() extents. The
-// compute time used for the flop rate is summed over the communicator with
-// GlobalSum and divided by the processor count. That averaging is done on a
-// local copy so the running DhopComputeTime counter is left untouched and
-// keeps accumulating consistently after a report.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Report(void) 
-{
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _FourDimGrid->_Nprocessors;
-  RealD NN = _FourDimGrid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time over ranks without modifying the counter itself.
-  RealD AvgComputeTime = DhopComputeTime;
-  _FourDimGrid->GlobalSum(AvgComputeTime);
-  AvgComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/AvgComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-// Reset the Dhop call count and every Dhop timer of this operator, then the
-// counters of the three stencils. DhopComputeTime2 is included because
-// DhopInternalOverlappedComms accumulates the surface pass into it.
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
-{
-  DhopCalls        = 0;
-  DhopTotalTime    = 0;
-  DhopCommTime     = 0;
-  DhopComputeTime  = 0;
-  DhopComputeTime2 = 0;
-  DhopFaceTime     = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-/////////////////////////////////////////////////////////////////////////
-// General operator interface. The SAME mass is used on all slices.
-/////////////////////////////////////////////////////////////////////////
-
-// Single-direction term: forwards unchanged to DhopDir.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
-{
-  this->DhopDir(in, out, dir, disp);
-}
-// Full operator: applies Dhop to `in`, then combines the result with
-// mass*in through axpy_norm and returns what axpy_norm returns.
-template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-
-  this->Dhop(in, out, DaggerNo);
-
-  return axpy_norm(out, mass, in, out);
-}
-
-// Daggered operator: same as M but with Dhop called with DaggerYes.
-template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-
-  this->Dhop(in, out, DaggerYes);
-
-  return axpy_norm(out, mass, in, out);
-}
-
-// Off-diagonal (parity-changing) part: picks DhopEO for an Odd input and
-// DhopOE for anything else.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
-{
-  if (in.Checkerboard() != Odd) {
-    DhopOE(in, out, DaggerNo);
-    return;
-  }
-  DhopEO(in, out, DaggerNo);
-}
-// Daggered off-diagonal part: picks DhopEO for an Odd input and DhopOE for
-// anything else, both with DaggerYes.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
-{
-  if (in.Checkerboard() != Odd) {
-    DhopOE(in, out, DaggerYes);
-    return;
-  }
-  DhopEO(in, out, DaggerYes);
-}
-
-// Diagonal (parity-preserving) part: out = mass * in.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-
-  // Convert the mass to the field's scalar type before multiplying.
-  typename FermionField::scalar_type scal(mass);
-  out = scal * in;
-}
-
-// Daggered diagonal part: delegates to Mooee.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-  this->Mooee(in, out);
-}
-
-// Inverse of the diagonal part: out = (1/mass) * in.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-
-  const auto inv_mass = 1.0 / (mass);
-  out = inv_mass * in;
-}
-
-// Daggered inverse of the diagonal part: delegates to MooeeInv.
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-  this->MooeeInv(in, out);
-}
-
-////////////////////////////////////////////////////////
-// Conserved current contraction: placeholder only.
-// No implementation exists yet; any call trips assert(0).
-////////////////////////////////////////////////////////
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(
-    PropagatorField &q_in_1,
-    PropagatorField &q_in_2,
-    PropagatorField &q_out,
-    Current curr_type,
-    unsigned int mu)
-{
-  assert(0); // not yet implemented
-}
-
-// Sequential conserved current: placeholder only; any call trips assert(0).
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(
-    PropagatorField &q_in,
-    PropagatorField &q_out,
-    Current curr_type,
-    unsigned int mu,
-    unsigned int tmin,
-    unsigned int tmax,
-    ComplexField &lattice_cmplx)
-{
-  assert(0); // not yet implemented
-}
-
-FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D); // NOTE(review): macro defined elsewhere; presumably emits the explicit instantiations for the staggered Impl types -- confirm
-FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D); // NOTE(review): presumably the same for the Ls-vectorised (Vec5d) Impl types -- confirm
-  
-NAMESPACE_END(Grid); // closes the namespace opened by NAMESPACE_BEGIN(Grid) near the top of this file
-
-
-
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
@@ -1,497 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Construct a Mobius EOFA operator. All grid and mass arguments are forwarded
-// to AbstractEOFAFermion. The body then sets the tanh coefficients from
-// Approx::higham data, logs the parameters, and either computes the
-// preconditioned shift coefficients (non-zero shift) or sizes the shift
-// coefficient vectors to Ls zeros.
-template<class Impl>
-MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-    GaugeField            &_Umu,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mq1, RealD _mq2, RealD _mq3,
-    RealD _shift, int _pm, RealD _M5,
-    RealD _b, RealD _c, const ImplParams &p)
-  : AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-                              FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-                              _shift, _pm, _M5, _b, _c, p)
-{
-  int Ls = this->Ls;
-
-  // Approximation data for Ls points, requested with eps = 1.
-  RealD eps = 1.0;
-  Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
-  assert(zdata->n == this->Ls);
-
-  std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b
-            << ",c=" << _c << ") with Ls=" << Ls << std::endl;
-  this->SetCoefficientsTanh(zdata, _b, _c);
-  std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1
-            << ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift
-            << ",pm=" << _pm << ")" << std::endl;
-
-  // The approximation data is no longer needed once the coefficients are set.
-  Approx::zolotarev_free(zdata);
-
-  if(_shift == 0.0){
-    // No shift: the shift coefficient vectors are Ls zeros.
-    Mooee_shift.resize(Ls, 0.0);
-    MooeeInv_shift_lc.resize(Ls, 0.0);
-    MooeeInv_shift_norm.resize(Ls, 0.0);
-    MooeeInvDag_shift_lc.resize(Ls, 0.0);
-    MooeeInvDag_shift_norm.resize(Ls, 0.0);
-  } else {
-    SetCoefficientsPrecondShiftOps();
-  }
-}
-
-/****************************************************************
- * Additional EOFA operators, called only outside the inverter.
- * Speed is not essential here, so plain axpby-style
- * implementations are used.
- ***************************************************************/
-
-// Applies \Omega_{+/-} (dag == 0) or its dagger (dag == 1) to psi, writing
-// Din. Din is zeroed first; any other (sign, dag) combination leaves it zero.
-// NOTE(review): the last two axpby_ssp arguments look like the destination
-// and source s-slices; confirm against the axpby_ssp declaration.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  int Ls = this->Ls;
-  RealD alpha = this->alpha;
-
-  Din = Zero();
-
-  if(dag == 0) {
-    if(sign == 1) { // \Omega_{+}
-      for(int s=0; s<Ls; ++s){
-        axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
-      }
-    } else if(sign == -1) { // \Omega_{-}
-      for(int s=0; s<Ls; ++s){
-        axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
-      }
-    }
-  } else if(dag == 1) {
-    if(sign == 1) { // \Omega_{+}^{\dagger}
-      for(int sp=0; sp<Ls; ++sp){
-        axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
-      }
-    } else if(sign == -1) { // \Omega_{-}^{\dagger}
-      for(int sp=0; sp<Ls; ++sp){
-        axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
-      }
-    }
-  }
-}
-
-// Operator relating the usual Ddwf to TWQCD's EOFA Dirac operator
-// (arXiv:1706.05843, Eqn. 6). It also relates the preconditioned and
-// unpreconditioned systems described in Appendix B.2 of that paper.
-//
-// For each slice s two calls are made: a pminus call combining b*psi[s] with
-// a neighbour term, then a pplus call accumulating a second neighbour term
-// into chi[s]. Interior neighbours are s+1 and s-1 with coefficient -c. The
-// first slice takes its pplus term from slice Ls-1 and the last slice takes
-// its pminus term from slice 0, both with coefficient mq1*c.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  int Ls    = this->Ls;
-  RealD b   = 0.5 * ( 1.0 + this->alpha );
-  RealD c   = 0.5 * ( 1.0 - this->alpha );
-  RealD mq1 = this->mq1;
-
-  for(int s=0; s<Ls; ++s){
-    const bool first = (s == 0);
-    const bool last  = (!first) && (s == (Ls-1));
-
-    // Interior defaults.
-    RealD c_minus = -c;  int s_minus = s+1;
-    RealD c_plus  = -c;  int s_plus  = s-1;
-
-    // Boundary slices wrap around with the mass factor.
-    if(first) { c_plus  = mq1*c;  s_plus  = Ls-1; }
-    if(last)  { c_minus = mq1*c;  s_minus = 0;    }
-
-    axpby_ssp_pminus(chi, b, psi, c_minus, psi, s, s_minus);
-    axpby_ssp_pplus (chi, 1.0, chi, c_plus, psi, s, s_plus);
-  }
-}
-
-template<class Impl> // Presumably the inverse of Dtilde, going by the name; see the NOTE(review) entries in the body.
-void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-  RealD m = this->mq1;
-  RealD c = 0.5 * this->alpha;
-  RealD d = 0.5;
-  /* NOTE(review): `chi` is never written anywhere in this function; every
-   * result goes into the local `tmp`, which is discarded on return. As
-   * written the call therefore has no visible effect on its output argument.
-   * Confirm whether a final `chi = tmp;` is missing or whether this routine
-   * is dead code. */
-  RealD DtInv_p(0.0), DtInv_m(0.0);
-  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls); // common denominator of the mass-dependent terms below
-  FermionField tmp(this->FermionGrid());
-
-  for(int s=0; s<Ls; ++s){
-    for(int sp=0; sp<Ls; ++sp){
-      // Matrix elements (s,sp): DtInv_p is fed to the pplus call and DtInv_m to the pminus call.
-      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-      /* NOTE(review): in the sp == 0 branch both calls pass 0.0 as the
-       * coefficient of the existing tmp. If axpby_ssp_pminus overwrites slice
-       * s the way Dtilde's usage suggests, the second call discards the pplus
-       * term just written -- confirm whether it should pass 1.0. */
-      if(sp == 0){
-	axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
-	axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
-      } else {
-	axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
-	axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
-      }
-
-    }}
-}
-
-/*****************************************************************************************************/
-
-// Unpreconditioned operator. Applies Meooe5D then DW to psi, adds psi to the
-// result, applies the two-argument M5D(psi, chi), and returns norm2(chi).
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  FermionField work(psi.Grid());
-
-  this->Meooe5D(psi, work);
-  this->DW(work, chi, DaggerNo);
-
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  this->M5D(psi, chi);
-
-  return norm2(chi);
-}
-
-// Daggered unpreconditioned operator. Applies DW with DaggerYes then
-// MeooeDag5D to psi, applies the two-argument M5Ddag(psi, chi), adds psi to
-// the result, and returns norm2(chi).
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  FermionField work(psi.Grid());
-
-  this->DW(psi, work, DaggerYes);
-  this->MeooeDag5D(work, chi);
-
-  this->M5Ddag(psi, chi);
-
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  return norm2(chi);
-}
-
-/********************************************************************
- * Performance critical fermion operators, called inside the inverter
- ********************************************************************/
-
-// Fifth-dimension term with unit diagonal and -1 off-diagonals, except for
-// upper[Ls-1] and lower[0] which are set to mq1. Uses the fused shift kernel
-// whenever the shift is non-zero.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = this->mq1;
-  lower[0]    = this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-  } else {
-    // no shift term
-    this->M5D(psi, chi, chi, lower, diag, upper);
-  }
-}
-
-// Daggered fifth-dimension term. Same coefficient vectors as M5D: unit
-// diagonal, -1 off-diagonals, with upper[Ls-1] and lower[0] set to mq1.
-// Uses the fused shift kernel whenever the shift is non-zero.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = this->mq1;
-  lower[0]    = this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-  } else {
-    // no shift term
-    this->M5Ddag(psi, chi, chi, lower, diag, upper);
-  }
-}
-
-// half checkerboard operations
-// Apply Mooee to psi (psi is both hopping and diagonal source), writing chi.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  // coefficients of Mooee: diagonal copied from bee, both bands from -cee
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int s=0; s<Ls; s++){
-    const Coeff_t band = -this->cee[s];
-    upper[s] = band;
-    lower[s] = band;
-  }
-
-  // the last upper entry and the first lower entry pick up a factor -mq1
-  upper[Ls-1] *= -this->mq1;
-  lower[0]    *= -this->mq1;
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-// Apply the daggered Mooee to psi (psi is both hopping and diagonal source),
-// writing chi.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  // coefficients of MooeeDag: the bands use the neighbouring cee entries,
-  // with the s==0 lower entry and the s==Ls-1 upper entry built from mq1.
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int s=0; s<Ls; s++){
-    const bool first = (s == 0);
-    const bool last  = (!first) && (s == (Ls-1)); // s==0 takes precedence, as in an if/else-if chain
-
-    if(last){
-      upper[s] = this->mq1*this->cee[0];
-    } else {
-      upper[s] = -this->cee[s+1];
-    }
-
-    if(first){
-      lower[s] = this->mq1*this->cee[Ls-1];
-    } else {
-      lower[s] = -this->cee[s-1];
-    }
-  }
-
-  if(this->shift != 0.0){
-    // fused M + shift operation
-    this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-    return;
-  }
-
-  // no shift term
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-// Computes coefficients for applying Cayley preconditioned shift operators
-//  (Mooee + \Delta) --> Mooee_shift
-//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-// For the latter two cases, the operation takes the form
-//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-//
-// Inputs are read from the object: Ls, pm, alpha, k, mq1, shift, bee[] and cee[].
-// Outputs are the five member tables named above; each is resized to Ls
-// entries and every entry is overwritten.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD alpha = this->alpha;
-  RealD k     = this->k;
-  RealD mq1   = this->mq1;
-  RealD shift = this->shift;
-
-  // Initialize
-  Mooee_shift.resize(Ls);
-  MooeeInv_shift_lc.resize(Ls);
-  MooeeInv_shift_norm.resize(Ls);
-  MooeeInvDag_shift_lc.resize(Ls);
-  MooeeInvDag_shift_norm.resize(Ls);
-
-  // Construct Mooee_shift
-  // N is a common prefactor whose sign is +1 for pm == 1 and -1 otherwise.
-  // For pm == 1 the table is filled in ascending s, otherwise in reversed order.
-  int idx(0);
-  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
-    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-  for(int s=0; s<Ls; ++s){
-    idx = (pm == 1) ? (s) : (Ls-1-s);
-    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-  }
-
-  // Tridiagonal solve for MooeeInvDag_shift_lc
-  {
-    Coeff_t m(0.0);
-    Vector<Coeff_t> d = Mooee_shift;   // right-hand side, modified in place by the elimination
-    Vector<Coeff_t> u(Ls,0.0);         // unit vector: entry 0 for pm == 1, entry Ls-1 otherwise
-    Vector<Coeff_t> y(Ls,0.0);
-    Vector<Coeff_t> q(Ls,0.0);
-    if(pm == 1){ u[0] = 1.0; }
-    else{ u[Ls-1] = 1.0; }
-
-    // Tridiagonal matrix algorithm + Sherman-Morrison formula
-    //
-    // We solve
-    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-    // where Mooee' is the tridiagonal part of Mooee_{+}, and
-    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-    // so that the outer-product u \otimes v gives the (0,Ls-1)
-    // entry of Mooee_{+}.
-    //
-    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-    // and then construct the solution to the original system
-    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-
-    // Forward elimination is only performed for pm == 1; it updates both
-    // right-hand sides (d and u) with the same multiplier m.
-    if(pm == 1){
-      for(int s=1; s<Ls; ++s){
-	m = -this->cee[s] / this->bee[s-1];
-	d[s] -= m*d[s-1];
-	u[s] -= m*u[s-1];
-      }
-    }
-
-    // Back substitution from s = Ls-1 down to 0. For pm == 1 each entry is a
-    // plain division by bee[s]; otherwise the s+1 entry is coupled in via cee[s].
-    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-    for(int s=Ls-2; s>=0; --s){
-      if(pm == 1){
-	y[s] = d[s] / this->bee[s];
-	q[s] = u[s] / this->bee[s];
-      } else {
-	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-      }
-    }
-
-    // Construct MooeeInvDag_shift_lc
-    // (combines y and q; the pm == 1 branch uses cee[0] with the Ls-1 entries,
-    //  the other branch uses cee[Ls-1] with the 0 entries)
-    for(int s=0; s<Ls; ++s){
-      if(pm == 1){
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-      } else {
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-      }
-    }
-
-    // Compute remaining coefficients
-    // N is reused here as the common divisor of both *_norm tables.
-    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
-    for(int s=0; s<Ls; ++s){
-
-      // MooeeInv_shift_lc
-      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
-      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
-
-      // MooeeInv_shift_norm
-      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
-
-      // MooeeInvDag_shift_norm
-      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
-     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
-	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-    }
-  }
-}
-
-// Recompute the shift-operator coefficient tables for a different value of
-// the shift constant.
-//
-//   new_shift : value stored into this->shift.
-//
-// For a non-zero shift the tables are rebuilt by SetCoefficientsPrecondShiftOps().
-// For a zero shift every table is sized to Ls and every entry is set to zero.
-// resize(Ls, 0.0) on its own does not achieve that: it only value-fills
-// elements that are newly appended, so a table that already holds Ls entries
-// from an earlier non-zero shift would keep its stale values. The entries
-// are therefore cleared explicitly.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-  if(new_shift != 0.0){
-    SetCoefficientsPrecondShiftOps();
-    return;
-  }
-
-  int Ls = this->Ls;
-  Mooee_shift.resize(Ls);
-  MooeeInv_shift_lc.resize(Ls);
-  MooeeInv_shift_norm.resize(Ls);
-  MooeeInvDag_shift_lc.resize(Ls);
-  MooeeInvDag_shift_norm.resize(Ls);
-
-  // Overwrite every entry, including ones that existed before the resize
-  for(int s=0; s<Ls; ++s){
-    Mooee_shift[s]            = 0.0;
-    MooeeInv_shift_lc[s]      = 0.0;
-    MooeeInv_shift_norm[s]    = 0.0;
-    MooeeInvDag_shift_lc[s]   = 0.0;
-    MooeeInvDag_shift_norm[s] = 0.0;
-  }
-}
-
-// Build the dense Ls x Ls matrices Pplus and Pminus and pack them, lane by
-// lane, into the SIMD tables Matp and Matm.
-//
-//   dag  : when non-zero the matrices are replaced by their adjoints
-//   inv  : when non-zero the matrices are replaced by their inverses
-//   Matp : output, resized to Ls*LLs entries, filled from the Pplus matrix
-//   Matm : output, resized to Ls*LLs entries, filled from the Pminus matrix
-//
-// When the reduced 5th-direction extent LLs equals Ls the function returns
-// immediately and leaves Matp/Matm untouched.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						   Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  int Ls = this->Ls;
-
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  // Both matrices share the diagonal bee[s]
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-
-  // Pminus gets the super-diagonal, Pplus the sub-diagonal
-  for(int s=0; s<Ls-1; s++){
-    Pminus(s,s+1) = -this->cee[s];
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-
-  // Corner entries proportional to mq1
-  Pplus (0,Ls-1) = this->mq1*this->cee[0];
-  Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-
-  // Shift term: added to the last column of Pplus (pm == 1) or to the first
-  // column of Pminus (otherwise)
-  if(this->shift != 0.0){
-    RealD c = 0.5 * this->alpha;
-    RealD d = 0.5;
-    RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-    if(this->pm == 1) {
-      for(int s=0; s<Ls; ++s){
-	Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-      }
-    } else {
-      for(int s=0; s<Ls; ++s){
-	Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-      }
-    }
-  }
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-
-  // Inversion (if requested) is applied before the adjoint (if requested)
-  if(inv) {
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else {
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  // Pack: table entry (LLs*s2 + s1) holds, in lane l, matrix element
-  // (l*LLs + s1, s2). The Simd temporaries are filled through scalar
-  // pointers into their storage.
-  for(int s2=0; s2<Ls; s2++){
-    for(int s1=0; s1<LLs; s1++){
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type*) &Vp;
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real
-	  // the real part is written into both components of the scalar
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-FermOpTemplateInstantiate(MobiusEOFAFermion);
-GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
@@ -1,998 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * Dense matrix versions of routines
- */
-// Inverse of Mooee, delegated to MooeeInternal (no dagger, inverse requested).
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerNo;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-// Shifted variant of MooeeInv; forwards to MooeeInternal with the same flags
-// as MooeeInv (no dagger, inverse requested).
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerNo;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-// Daggered inverse of Mooee, delegated to MooeeInternal (dagger and inverse
-// both requested).
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerYes;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-// Shifted variant of MooeeInvDag; forwards to MooeeInternal with the same
-// flags as MooeeInvDag (dagger and inverse both requested).
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-{
-  const auto dagger_flag  = DaggerYes;
-  const auto inverse_flag = InverseYes;
-  this->MooeeInternal(psi, chi, dagger_flag, inverse_flag);
-}
-
-// Vectorised 5d hopping kernel.
-//
-//   psi_i : field supplying the s+1 / s-1 neighbour terms
-//   phi_i : field multiplied by the diagonal coefficients
-//   chi_i : output field; its checkerboard is set to that of psi_i
-//   lower, diag, upper : per-s coefficients, indexed 0..Ls-1
-//
-// For each 5th-direction slot v the active code path writes
-//   spin 0,1 : diag*phi + lower*(spin 0,1 of the v-1 neighbour of psi)
-//   spin 2,3 : diag*phi + upper*(spin 2,3 of the v+1 neighbour of psi)
-// Requires Ls/LLs == Simd::Nsimd() and Nc == 3 (both asserted). Also updates
-// the M5Dcalls / M5Dtime counters.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  // Scatter the per-s coefficients into the SIMD tables: scalar slot
-  // (o*nsimd + i) receives the coefficient for s = o + i*LLs.
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-    // Compiled-out variant written with the projector / rotate helpers
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      // Cyclic neighbours of slot v within the block of LLs slots
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      // Spin components 2,3 of the v+1 neighbour
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      // Spin components 0,1 of the v-1 neighbour
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      // The neighbour index wrapped around: rotate the SIMD lanes
-      // (presumably by one complex lane, matching rotate(hp,hp,1) in the
-      // compiled-out variant -- TODO confirm tRotate's units)
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // Same for the downward wrap, rotating the other way
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Vectorised 5d hopping kernel fused with the shift term.
-//
-//   psi_i : field supplying the neighbour terms and the shift source slot
-//   phi_i : field multiplied by the diagonal coefficients
-//   chi_i : output field; its checkerboard is set to that of psi_i
-//   lower, diag, upper : per-s band coefficients, indexed 0..Ls-1
-//   shift_coeffs       : per-s coefficients of the shift term
-//
-// Same structure as the unshifted kernel, plus one extra term
-// shift_coeffs*hs. For pm == 1 the term is added to spin 2,3 with hs taken
-// from spin 2,3 of slot LLs-1; otherwise it is added to spin 0,1 with hs
-// taken from spin 0,1 of slot 0.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
-					FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					Vector<Coeff_t>& shift_coeffs)
-{
-#if 0
-  // Compiled-out reference: plain M5D followed by a per-s axpby of the shift term
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-
-  this->M5D(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  // Scatter the per-s coefficients into the SIMD tables: scalar slot
-  // (o*nsimd + i) receives the coefficient for s = o + i*LLs.
-  // Note: the loop-local int s shadows the vector s declared above; s_p was
-  // taken before the loop, so the writes still land in that vector's storage.
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    // Shift source: slot LLs-1 / spin 2,3 for pm == 1, slot 0 / spin 0,1 otherwise.
-    // Loaded once per block of LLs slots, outside the v-loop.
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      // Cyclic neighbours of slot v within the block of LLs slots
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      // Spin components 2,3 of the v+1 neighbour
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      // Spin components 0,1 of the v-1 neighbour
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      // Upward neighbour wrapped around: rotate the SIMD lanes
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // pm == 1: vs == LLs-1, so this fires only on the last v of the block
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      // Downward neighbour wrapped around: rotate the other way
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // pm == -1: vs == 0, so this fires only at v == 0.
-      // NOTE(review): hs_* lives outside the v-loop, so the rotation applied
-      // here stays in effect for every later v of the block -- confirm that
-      // this matches the compiled-out reference implementation above.
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      // Spin 0,1: the shift term is added only when pm != 1
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-      // Spin 2,3: the shift term is added only when pm == 1
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-// Vectorised daggered 5d hopping kernel.
-//
-//   psi_i : field supplying the s+1 / s-1 neighbour terms
-//   phi_i : field multiplied by the diagonal coefficients
-//   chi_i : output field; its checkerboard is set to that of psi_i
-//   lower, diag, upper : per-s coefficients, indexed 0..Ls-1
-//
-// Compared with the non-daggered kernel the spin halves are swapped: for
-// each 5th-direction slot v the active code path writes
-//   spin 0,1 : diag*phi + upper*(spin 0,1 of the v+1 neighbour of psi)
-//   spin 2,3 : diag*phi + lower*(spin 2,3 of the v-1 neighbour of psi)
-// Requires Ls/LLs == Simd::Nsimd() (asserted). Also updates the
-// M5Dcalls / M5Dtime counters.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				     Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  // Scatter the per-s coefficients into the SIMD tables: scalar slot
-  // (o*nsimd + i) receives the coefficient for s = o + i*LLs.
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-    // Compiled-out variant written with the projector / rotate helpers
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      // Cyclic neighbours of slot v within the block of LLs slots
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      // Spin components 0,1 of the v+1 neighbour
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      // Spin components 2,3 of the v-1 neighbour
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      // Upward neighbour wrapped around: rotate the SIMD lanes
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      // Downward neighbour wrapped around: rotate the other way
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					   Vector<Coeff_t>& shift_coeffs)
-{
-#if 0
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-  this->M5Ddag(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-					       int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  {
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs;
-	  int lex = s2 + LLs*site;
-
-	  if( s2==0 && l==0 ){
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-	}}
-
-      {
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-  }
-#else
-  {
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>);
-
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-
-	  if((s2+l)==0) {
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-
-	  a0 = a0 + incr;
-	  a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type);
-	}
-      }
-
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-};
-
-// Z-mobius version
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-};
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  Vector<iSinglet<Simd>>   Matp;
-  Vector<iSinglet<Simd>>   Matm;
-  Vector<iSinglet<Simd>>* _Matp;
-  Vector<iSinglet<Simd>>* _Matm;
-
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if(inv && dag){
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  }
-
-  if(inv && (!dag)){
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef MOBIUS_EOFA_DPERP_VEC
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -1,452 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  // this does both dag and undag but is trivial; make a common helper routing
-  int Ls = this->Ls;
-
-  this->DhopDir(psi,chi,dir,disp);
-
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-  }
-  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-
-}
-template<class Impl>
-void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  int Ls = this->Ls;
-  if ( psi.Checkerboard() == Odd ) {
-    this->DhopEO(psi,chi,DaggerNo);
-  } else {
-    this->DhopOE(psi,chi,DaggerNo);
-  }
-
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
-    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
-  }
-  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-}
-
-template<class Impl>
-void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  // again dag and undag are trivially related
-  int sign = dag ? (-1) : 1;
-  int Ls = this->Ls;
-      
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-	
-    int s = 2*b;
-    RealD pp = p[nblock-1-b];
-    RealD qq = q[nblock-1-b];
-	
-    // Do each 2x2 block aligned at s and multiplies Dw site diagonal by G5 so Hw
-    ag5xpby_ssp(chi,-dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s  ,s+1); 
-    ag5xpby_ssp(chi, dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s+1,s);
-    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-  }
-      
-  {
-    RealD R=(1+mass)/(1-mass);
-    //R g5 psi[Ls-1] + p[0] H
-    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
-	
-    for(int b=0;b<nblock;b++){
-      int s = 2*b+1;
-      RealD pp = p[nblock-1-b];
-      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-    }
-  }
-}
-
-template<class Impl>
-void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  int sign = dag ? (-1) : 1;
-  int Ls = this->Ls;
-
-  FermionField tmp(psi.Grid());
-      
-  ///////////////////////////////////////////////////////////////////////////////////////
-  //Linv
-  ///////////////////////////////////////////////////////////////////////////////////////
-  int nblock=(Ls-1)/2;
-
-  axpy(chi,0.0,psi,psi); // Identity piece
-      
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    RealD pp = p[nblock-1-b];
-    RealD qq = q[nblock-1-b];
-    RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-    RealD coeff2=sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-    axpby_ssp  (chi,1.0,chi,coeff1,psi,Ls-1,s);
-    axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
-  }
-      
-  ///////////////////////////////////////////////////////////////////////////////////////
-  //Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
-  // Compute Seeinv (coeff of gamma5)
-  ///////////////////////////////////////////////////////////////////////////////////////
-  RealD R=(1+mass)/(1-mass);
-  RealD Seeinv = R + p[nblock]*dw_diag/amax;
-  for(int b=0;b<nblock;b++){
-    Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
-  }    
-  Seeinv = 1.0/Seeinv;
-      
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    RealD pp = p[nblock-1-b];
-    RealD qq = q[nblock-1-b];
-    RealD coeff1=dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-    RealD coeff2=amax*sqrt(qq) / ( dw_diag*dw_diag + amax*amax* qq);
-    ag5xpby_ssp  (tmp,-coeff1,chi,coeff2,chi,s,s+1);
-    ag5xpby_ssp  (tmp, coeff1,chi,coeff2,chi,s+1,s);
-  }
-  ag5xpby_ssp  (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
-      
-  ///////////////////////////////////////////////////////////////////////////////////////
-  // Uinv
-  ///////////////////////////////////////////////////////////////////////////////////////
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    RealD pp = p[nblock-1-b];
-    RealD qq = q[nblock-1-b];
-    RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
-    RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
-    axpby_ssp  (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1);
-    axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
-  }
-  axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
-}
-
-template<class Impl>
-void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
-{
-  FermionField D(psi.Grid());
-  
-  int Ls = this->Ls;
-  int sign = dag ? (-1) : 1;
-
-  // For partial frac Hw case (b5=c5=1) chroma quirkily computes
-  //
-  // Conventions for partfrac appear to be a mess.
-  // Tony's Nara lectures have
-  //
-  // BlockDiag(  H/p_i  1             | 1       )    
-  //          (  1      p_i H / q_i^2 | 0       )  
-  //           ---------------------------------
-  //           ( -1      0                | R  +p0 H  )
-  //
-  //Chroma     ( -2H    2sqrt(q_i)    |   0         )
-  //           (2 sqrt(q_i)   2H      |  2 sqrt(p_i) )
-  //           ---------------------------------
-  //           ( 0     -2 sqrt(p_i)   |  2 R gamma_5 + p0 2H
-  //
-  // Edwards/Joo/Kennedy/Wenger
-  //
-  // Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
-  // incorporate the approx scale factor. This is obtained by propagating the
-  // scale on "H" out to the off diagonal elements as follows:
-  //
-  // BlockDiag(  H/p_i  1             | 1       ) 
-  //          (  1      p_i H / q_i^2 | 0       )  
-  //           ---------------------------------
-  //          ( -1      0                | R  + p_0 H  )
-  //
-  // becomes:
-  // BlockDiag(  H/ sp_i  1               | 1             ) 
-  //          (  1      sp_i H / s^2q_i^2 | 0             )  
-  //           ---------------------------------
-  //           ( -1      0                | R + p_0/s H   )
-  //
-  //
-  // This is implemented in Chroma by
-  //           p0' = p0/approxMax
-  //           p_i' = p_i*approxMax
-  //           q_i' = q_i*approxMax*approxMax
-  //
-  // After the equivalence transform is applied the matrix becomes
-  // 
-  //Chroma     ( -2H    sqrt(q'_i)    |   0         )
-  //           (sqrt(q'_i)   2H       |   sqrt(p'_i) )
-  //           ---------------------------------
-  //           ( 0     -sqrt(p'_i)    |  2 R gamma_5 + p'0 2H
-  //
-  //     =     ( -2H    sqrt(q_i)amax    |   0              )
-  //           (sqrt(q_i)amax   2H       |   sqrt(p_i*amax) )
-  //           ---------------------------------
-  //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
-  //
-
-  this->DW(psi,D,DaggerNo); 
-
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-	
-    int s = 2*b;
-    double pp = p[nblock-1-b];
-    double qq = q[nblock-1-b];
-	
-    // Do each 2x2 block aligned at s and
-    ag5xpby_ssp(chi,-1.0*scale,D,amax*sqrt(qq)*scale,psi, s  ,s+1); // Multiplies Dw by G5 so Hw
-    ag5xpby_ssp(chi, 1.0*scale,D,amax*sqrt(qq)*scale,psi, s+1,s);
-	
-    // Pick up last column
-    axpby_ssp  (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
-  }
-	
-  {
-    double R=(1+this->mass)/(1-this->mass);
-    //R g5 psi[Ls] + p[0] H
-    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
-	
-    for(int b=0;b<nblock;b++){
-      int s = 2*b+1;
-      double pp = p[nblock-1-b];
-      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
-    }
-  }
-
-}
-
-template<class Impl>
-RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
-{
-  M_internal(in,out,DaggerNo);
-  return norm2(out);
-}
-template<class Impl>
-RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
-{
-  M_internal(in,out,DaggerYes);
-  return norm2(out);
-}
-
-template<class Impl>
-void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
-{
-  Meooe_internal(in,out,DaggerNo);
-}
-template<class Impl>
-void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
-{
-  Meooe_internal(in,out,DaggerYes);
-}
-template<class Impl>
-void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
-{
-  Mooee_internal(in,out,DaggerNo);
-}
-template<class Impl>
-void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
-{
-  Mooee_internal(in,out,DaggerYes);
-}
-
-template<class Impl>
-void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
-{
-  MooeeInv_internal(in,out,DaggerNo);
-}
-template<class Impl>
-void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
-{
-  MooeeInv_internal(in,out,DaggerYes);
-}
-
-
-// force terms; five routines; default to Dhop on diagonal
-template<class Impl>
-void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-  }
-  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-  this->DhopDeriv(mat,D,V,DaggerNo); 
-};
-template<class Impl>
-void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-  }
-  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-  this->DhopDerivOE(mat,D,V,DaggerNo); 
-};
-template<class Impl>
-void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
-  int Ls = this->Ls;
-
-  FermionField D(V.Grid());
-
-  int nblock=(Ls-1)/2;
-  for(int b=0;b<nblock;b++){
-    int s = 2*b;
-    ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
-    ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
-  }
-  ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
-
-  this->DhopDerivEO(mat,D,V,DaggerNo); 
-};
-
-template<class Impl>
-void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
-  SetCoefficientsZolotarev(1.0/scale,zdata);
-}
-template<class Impl>
-void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
-
-  // check on degree matching
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-  int Ls = this->Ls;
-
-  assert(Ls == (2*zdata->da -1) );
-
-  // Part frac
-  //      RealD R;
-  R=(1+mass)/(1-mass);
-  dw_diag = (4.0-this->M5);
-
-  //      std::vector<RealD> p; 
-  //      std::vector<RealD> q;
-  p.resize(zdata->da);
-  q.resize(zdata->dd);
-	
-  for(int n=0;n<zdata->da;n++){
-    p[n] = zdata -> alpha[n];
-  }
-  for(int n=0;n<zdata->dd;n++){
-    q[n] = -zdata -> ap[n];
-  }
-      
-  scale= part_frac_chroma_convention ? 2.0 : 1.0; // Chroma conventions annoy me
-
-  amax=zolo_hi;
-}
-
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
-    {
-      int Ls = this->Ls;
-      conformable(solution5d.Grid(),this->FermionGrid());
-      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
-    }
-    template<class Impl>
-    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
-    {
-      int Ls = this->Ls;
-      conformable(imported5d.Grid(),this->FermionGrid());
-      conformable(input4d.Grid()   ,this->GaugeGrid());
-      FermionField tmp(this->FermionGrid());
-      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
-      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
-      this->Dminus(tmp,imported5d);
-    }
-
-// Constructors
-template<class Impl>
-PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
-							 GridCartesian         &FiveDimGrid,
-							 GridRedBlackCartesian &FiveDimRedBlackGrid,
-							 GridCartesian         &FourDimGrid,
-							 GridRedBlackCartesian &FourDimRedBlackGrid,
-							 RealD _mass,RealD M5,
-							 const ImplParams &p) :
-  WilsonFermion5D<Impl>(_Umu,
-			FiveDimGrid, FiveDimRedBlackGrid,
-			FourDimGrid, FourDimRedBlackGrid,M5,p),
-  mass(_mass)
-
-{
-  int Ls = this->Ls;
-
-  assert((Ls&0x1)==1); // Odd Ls required
-  int nrational=Ls-1;
-
-
-  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);
-
-  // NB: chroma uses a cast to "float" for the zolotarev range(!?).
-  // this creates a real difference in the operator which I do not like but we can replicate here
-  // to demonstrate compatibility
-  //      RealD eps = (zolo_lo / zolo_hi);
-  //      zdata = bfm_zolotarev(eps,nrational,0);
-      
-  SetCoefficientsTanh(zdata,1.0);
-
-  Approx::zolotarev_free(zdata);
-
-}
- 
-FermOpTemplateInstantiate(PartialFractionFermion5D);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/StaggeredKernels.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernels.cc
@@ -1,294 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/StaggeredKernels.cc
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Kernel selector read by StaggeredKernels<Impl>::DhopSite; defaults to the generic kernel.
-int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-// Communication mode; defaults to CommsAndCompute.
-int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
-
-// One stencil leg, all neighbours.
-// Looks up the stencil entry for direction Dir+skew at site sF, points chi_p at
-// the neighbour spinor (in[] directly, in[] permuted into chi, or buf[] when the
-// entry is not local) and then calls multLink(Uchi, U[sU], *chi_p, Dir).
-// The expansion uses the caller's locals by name: SE, st, ptype, sF, sU, chi,
-// chi_p, Uchi, in and buf.
-// This is a multi-statement macro with no do{}while(0) wrapper, so it must only
-// be used as a complete statement inside a braced block.
-#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);		\
-    } else {							\
-      chi_p = &in[SE->_offset];				\
-    }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  multLink(Uchi, U[sU], *chi_p, Dir);			
-
-// One stencil leg, interior contributions only.
-// Same neighbour selection as the plain leg macro, except that buf[] is only
-// read when st.same_node[Dir] is set; multLink is skipped entirely when the
-// entry is neither local nor on the same node (chi_p is left unchanged then).
-// Uses the caller's locals by name: SE, st, ptype, sF, sU, chi, chi_p, Uchi,
-// in and buf. Multi-statement macro: use only as a complete statement.
-#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);		\
-    } else {							\
-      chi_p = &in[SE->_offset];				\
-    }								\
-  } else if ( st.same_node[Dir] ) {				\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], *chi_p, Dir);			\
-  }
-
-// One stencil leg, exterior contributions only.
-// Acts only when the entry is neither local nor on the same node: increments
-// the caller's nmu counter, reads the neighbour from buf[] and calls multLink.
-// Uses the caller's locals by name: SE, st, ptype, sF, sU, chi_p, Uchi, nmu
-// and buf. Multi-statement macro: use only as a complete statement.
-#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    nmu++;							\
-    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U[sU], *chi_p, Dir);			\
-  }
-
-// Constructor: hands the implementation parameters straight to the base class.
-template <class Impl>
-StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p)
-  : Base(p)
-{
-}
-
-////////////////////////////////////////////////////////////////////////////////////
-// Generic implementation; move to different file?
-// Int, Ext, Int+Ext cases for comms overlap
-////////////////////////////////////////////////////////////////////////////////////
-// Generic kernel, interior and exterior together.
-// For each s in [0,LLs) at site sU, applies sixteen stencil legs: eight with U
-// at stencil offset 0 and eight with UUU at stencil offset 8. The first leg
-// goes through Impl::multLink, the remaining fifteen through Impl::multLinkAdd.
-// The result is negated when dag is non-zero and streamed to out[sF].
-// `lo` is not used by this kernel.
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
-                                             DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-                                             SiteSpinor *buf, int LLs, int sU,
-                                             const FermionFieldView &in, FermionFieldView &out, int dag)
-{
-  // These names are referenced directly by GENERIC_STENCIL_LEG.
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-
-  const int skewU   = 0; // stencil offset used with U
-  const int skewUUU = 8; // stencil offset used with UUU
-
-  for (int s = 0; s < LLs; s++) {
-    const int sF = LLs*sU + s;
-
-    GENERIC_STENCIL_LEG(U,Xp,skewU,Impl::multLink);
-    GENERIC_STENCIL_LEG(U,Yp,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zp,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tp,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Xm,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Ym,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zm,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tm,skewU,Impl::multLinkAdd);
-
-    GENERIC_STENCIL_LEG(UUU,Xp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Yp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Xm,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Ym,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zm,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tm,skewUUU,Impl::multLinkAdd);
-
-    if ( dag ) Uchi = - Uchi;
-
-    vstream(out[sF], Uchi);
-  }
-}
-
-  ///////////////////////////////////////////////////
-  // Only contributions from interior of our node
-  ///////////////////////////////////////////////////
-// Generic kernel, interior part only.
-// For each s in [0,LLs) at site sU, starts from Uchi = 0 and accumulates, via
-// Impl::multLinkAdd, only those of the sixteen legs (eight on U at stencil
-// offset 0, eight on UUU at offset 8) that GENERIC_STENCIL_LEG_INT accepts.
-// The sum is negated when dag is non-zero and streamed to out[sF].
-// `lo` is not used by this kernel.
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
-                                                DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-                                                SiteSpinor *buf, int LLs, int sU,
-                                                const FermionFieldView &in, FermionFieldView &out, int dag)
-{
-  // These names are referenced directly by GENERIC_STENCIL_LEG_INT.
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-
-  const int skewU   = 0; // stencil offset used with U
-  const int skewUUU = 8; // stencil offset used with UUU
-
-  for (int s = 0; s < LLs; s++) {
-    const int sF = LLs*sU + s;
-
-    Uchi = Zero();
-
-    GENERIC_STENCIL_LEG_INT(U,Xp,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Yp,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zp,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tp,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Xm,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Ym,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zm,skewU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tm,skewU,Impl::multLinkAdd);
-
-    GENERIC_STENCIL_LEG_INT(UUU,Xp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Yp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tp,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Xm,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Ym,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zm,skewUUU,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tm,skewUUU,Impl::multLinkAdd);
-
-    if ( dag ) Uchi = - Uchi;
-
-    vstream(out[sF], Uchi);
-  }
-}
-
-
-  ///////////////////////////////////////////////////
-  // Only contributions from exterior of our node
-  ///////////////////////////////////////////////////
-// Generic kernel, exterior part only.
-// For each s in [0,LLs) at site sU, starts from Uchi = 0 and accumulates, via
-// Impl::multLinkAdd, only the legs accepted by GENERIC_STENCIL_LEG_EXT (entries
-// that are neither local nor on the same node, read from buf). If at least one
-// such leg contributed for this s, the sum is added to the value already in
-// out[sF], or subtracted from it when dag is non-zero; otherwise out[sF] is
-// left untouched. `lo` and `in` are not used by this kernel.
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU,
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
-  // These names are referenced directly by GENERIC_STENCIL_LEG_EXT.
-  const SiteSpinor *chi_p;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  int nmu;
-  int skew ;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    // Count exterior legs per s. The counter used to be initialised once
-    // before the loop, so after the first s with an exterior leg every later
-    // s performed a redundant out[sF] +/- 0 update.
-    nmu = 0;
-    skew = 0;
-    Uchi=Zero();
-    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
-    skew=8;
-    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
-
-    // Only touch the output when something was actually accumulated.
-    if ( nmu ) { 
-      if ( dag ) { 
-	out[sF] = out[sF] - Uchi;
-      } else { 
-	out[sF] = out[sF] + Uchi;
-      }
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////
-// Driving / wrapping routine to select right kernel
-////////////////////////////////////////////////////////////////////////////////////
-
-// Daggered entry point: forwards every argument to the DhopSite overload that
-// takes an explicit dag flag, with dag set to 1.
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
-                                         DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-                                         SiteSpinor *buf, int LLs, int sU,
-                                         const FermionFieldView &in, FermionFieldView &out,
-                                         int interior, int exterior)
-{
-  const int dag = 1;
-  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, dag, interior, exterior);
-}
-
-// Non-daggered entry point: forwards every argument to the DhopSite overload
-// that takes an explicit dag flag, with dag set to 0.
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
-                                      DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-                                      SiteSpinor *buf, int LLs, int sU,
-                                      const FermionFieldView &in, FermionFieldView &out,
-                                      int interior, int exterior)
-{
-  const int dag = 0;
-  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, dag, interior, exterior);
-}
-
-// Dispatcher: picks the kernel family from the static Opt setting and, within
-// the family, the full / interior-only / exterior-only variant from the
-// interior and exterior flags.
-//  - OptInlineAsm (only compiled when AVX512 is defined) supports the full
-//    kernel only; any other flag combination logs an error and asserts.
-//  - OptHandUnroll and OptGeneric do nothing when both flags are zero.
-//  - Any other Opt value prints a message and asserts.
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
-                                      DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-                                      SiteSpinor *buf, int LLs,
-                                      int sU, const FermionFieldView &in, FermionFieldView &out,
-                                      int dag, int interior, int exterior)
-{
-  const bool do_int = (interior != 0);
-  const bool do_ext = (exterior != 0);
-  const bool whole  = do_int && do_ext;
-
-  switch(Opt) {
-#ifdef AVX512
-  case OptInlineAsm:
-    if ( !whole ) {
-      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
-      assert(0);
-    } else {
-      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-#endif
-  case OptHandUnroll:
-    if      ( whole  ) DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    else if ( do_int ) DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    else if ( do_ext ) DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    break;
-  case OptGeneric:
-    if      ( whole  ) DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    else if ( do_int ) DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    else if ( do_ext ) DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    break;
-  default:
-    std::cout<<"Oops Opt = "<<Opt<<std::endl;
-    assert(0);
-    break;
-  }
-}
-
-// Single-direction kernel: not implemented here. The body is just assert(0),
-// so any call trips the assertion when assertions are enabled.
-// Notes left for a future implementation:
-//  - disp should be one of +1, -1, +3, -3;
-//  - it is still open how "dag" should be handled, because this kernel is
-//    used when working out pU . dS/dU.
-template <class Impl>
-void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st,
-                                           DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-                                           SiteSpinor *buf, int sF, int sU,
-                                           const FermionFieldView &in, FermionFieldView &out,
-                                           int dir, int disp)
-{
-  assert(0);
-}
-
-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
-
-NAMESPACE_END(Grid);
-
-
--- a/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
@@ -1,972 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/StaggeredKernelsAsm.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid.h>
-
-#ifdef AVX512
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
-#endif
-
-// Interleave operations from two directions.
-// This looks just like a 2-spin multiply and reuses the same sequence as the Wilson
-// kernel, but the spin index becomes a mu index instead.
-// Register map for the staggered assembly macros below.
-// Chi_<n><c>  : zmm0..zmm11, spelled with a single '%' for asm strings that take no operands.
-#define Chi_00 %zmm0
-#define Chi_01 %zmm1
-#define Chi_02 %zmm2
-#define Chi_10 %zmm3
-#define Chi_11 %zmm4
-#define Chi_12 %zmm5
-#define Chi_20 %zmm6
-#define Chi_21 %zmm7
-#define Chi_22 %zmm8
-#define Chi_30 %zmm9
-#define Chi_31 %zmm10
-#define Chi_32 %zmm11
-
-// UChi_<n><c> : zmm12..zmm23, the registers the MULT_* macros write their results to.
-#define UChi_00 %zmm12
-#define UChi_01 %zmm13
-#define UChi_02 %zmm14
-#define UChi_10 %zmm15
-#define UChi_11 %zmm16
-#define UChi_12 %zmm17
-#define UChi_20 %zmm18
-#define UChi_21 %zmm19
-#define UChi_22 %zmm20
-#define UChi_30 %zmm21
-#define UChi_31 %zmm22
-#define UChi_32 %zmm23
-
-// pChi_* / pUChi_* : the same registers spelled with '%%', for asm statements
-// that do take operands (where a literal '%' has to be doubled).
-#define pChi_00 %%zmm0
-#define pChi_01 %%zmm1
-#define pChi_02 %%zmm2
-#define pChi_10 %%zmm3
-#define pChi_11 %%zmm4
-#define pChi_12 %%zmm5
-#define pChi_20 %%zmm6
-#define pChi_21 %%zmm7
-#define pChi_22 %%zmm8
-#define pChi_30 %%zmm9
-#define pChi_31 %%zmm10
-#define pChi_32 %%zmm11
-
-#define pUChi_00 %%zmm12
-#define pUChi_01 %%zmm13
-#define pUChi_02 %%zmm14
-#define pUChi_10 %%zmm15
-#define pUChi_11 %%zmm16
-#define pUChi_12 %%zmm17
-#define pUChi_20 %%zmm18
-#define pUChi_21 %%zmm19
-#define pUChi_22 %%zmm20
-#define pUChi_30 %%zmm21
-#define pUChi_31 %%zmm22
-#define pUChi_32 %%zmm23
-
-// Scratch registers zmm24..zmm27.
-#define T0 %zmm24
-#define T1 %zmm25
-#define T2 %zmm26
-#define T3 %zmm27
-
-// Further scratch registers. Beware of aliasing: Z00 (and therefore Z0) is the
-// same register as T2 (zmm26), and Z10 is the same register as T3 (zmm27).
-#define Z00 %zmm26
-#define Z10 %zmm27
-#define Z0 Z00
-#define Z1 %zmm28
-#define Z2 %zmm29
-
-// Z5 and Z6 are not separate registers: they reuse Chi_31 (zmm10) and Chi_32
-// (zmm11), so any macro writing Z5/Z6 overwrites those Chi registers.
-#define Z3 %zmm30
-#define Z4 %zmm31
-#define Z5 Chi_31
-#define Z6 Chi_32
-
-// MULT_ADD_LS(g0,g1,g2,g3)
-// First asm statement: copies the four pointers g0..g3 into r8..r11.
-// Second asm statement (no operands): for each pointer, issues VSHUF followed by
-// VMADDSUBIDUP / VMADDSUBRDUP on offsets 0..8, combining Chi_<n>0..Chi_<n>2 into
-// UChi_<n>0..UChi_<n>2 (n = 0..3 for r8..r11), with T0..T3 as scratch.
-// Every arithmetic step is a VMADDSUB* form, so this presumably accumulates onto
-// the existing UChi contents - confirm against the V* macro definitions, which
-// are not in this file (presumably in the Intel512 headers included above).
-// NOTE(review): the second asm statement assumes r8..r11 still hold the pointers
-// loaded by the first one and lists no clobbers for the zmm registers it writes;
-// the compiler is told about neither dependency.
-#define MULT_ADD_LS(g0,g1,g2,g3)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"						\
-        "movq %2, %%r10 \n\t"						\
-        "movq %3, %%r11 \n\t" :  : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
-  asm (									\
-  VSHUF(Chi_00,T0)      VSHUF(Chi_10,T1)				\
-  VSHUF(Chi_20,T2)      VSHUF(Chi_30,T3)				\
-  VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10)		\
-  VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11)		\
-  VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12)		\
-  VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30)		\
-  VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31)		\
-  VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32)		\
-  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
-  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
-  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
-  VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
-  VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
-  VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
-  VSHUF(Chi_01,T0)	  VSHUF(Chi_11,T1)				\
-  VSHUF(Chi_21,T2)	  VSHUF(Chi_31,T3)				\
-  VMADDSUBIDUP(1,%r8,T0,UChi_00)     VMADDSUBIDUP(1,%r9,T1,UChi_10)	\
-  VMADDSUBIDUP(4,%r8,T0,UChi_01)     VMADDSUBIDUP(4,%r9,T1,UChi_11)	\
-  VMADDSUBIDUP(7,%r8,T0,UChi_02)     VMADDSUBIDUP(7,%r9,T1,UChi_12)	\
-  VMADDSUBIDUP(1,%r10,T2,UChi_20)     VMADDSUBIDUP(1,%r11,T3,UChi_30)	\
-  VMADDSUBIDUP(4,%r10,T2,UChi_21)     VMADDSUBIDUP(4,%r11,T3,UChi_31)	\
-  VMADDSUBIDUP(7,%r10,T2,UChi_22)     VMADDSUBIDUP(7,%r11,T3,UChi_32)	\
-  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
-  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
-  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
-  VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
-  VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
-  VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
-  VSHUF(Chi_02,T0)    VSHUF(Chi_12,T1)					\
-  VSHUF(Chi_22,T2)    VSHUF(Chi_32,T3)					\
-  VMADDSUBIDUP(2,%r8,T0,UChi_00)     VMADDSUBIDUP(2,%r9,T1,UChi_10)     \
-  VMADDSUBIDUP(5,%r8,T0,UChi_01)     VMADDSUBIDUP(5,%r9,T1,UChi_11)     \
-  VMADDSUBIDUP(8,%r8,T0,UChi_02)     VMADDSUBIDUP(8,%r9,T1,UChi_12)     \
-  VMADDSUBIDUP(2,%r10,T2,UChi_20)     VMADDSUBIDUP(2,%r11,T3,UChi_30)     \
-  VMADDSUBIDUP(5,%r10,T2,UChi_21)     VMADDSUBIDUP(5,%r11,T3,UChi_31)     \
-  VMADDSUBIDUP(8,%r10,T2,UChi_22)     VMADDSUBIDUP(8,%r11,T3,UChi_32)     \
-  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
-  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
-  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
-  VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
-  VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
-  VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
-
-// MULT_LS(g0,g1,g2,g3)
-// Same instruction sequence as MULT_ADD_LS, except that the first group of
-// operations on each UChi register uses VMULIDUP instead of VMADDSUBIDUP, so the
-// previous UChi contents are presumably discarded rather than accumulated onto
-// (confirm against the VMULIDUP definition, which is not in this file).
-// First asm statement: copies g0..g3 into r8..r11. Second asm statement (no
-// operands): reads offsets 0..8 from each pointer and writes UChi_00..UChi_32,
-// with T0..T3 as scratch.
-// NOTE(review): the second asm statement assumes r8..r11 survive from the first
-// and lists no clobbers for the zmm registers it writes.
-#define MULT_LS(g0,g1,g2,g3)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"						\
-        "movq %2, %%r10 \n\t"						\
-        "movq %3, %%r11 \n\t" :  : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
-  asm (									\
-  VSHUF(Chi_00,T0)      VSHUF(Chi_10,T1)				\
-  VSHUF(Chi_20,T2)      VSHUF(Chi_30,T3)				\
-  VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10)		\
-  VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11)		\
-  VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12)		\
-  VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30)		\
-  VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31)		\
-  VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32)		\
-  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
-  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
-  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
-  VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
-  VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
-  VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
-  VSHUF(Chi_01,T0)	  VSHUF(Chi_11,T1)				\
-  VSHUF(Chi_21,T2)	  VSHUF(Chi_31,T3)				\
-  VMADDSUBIDUP(1,%r8,T0,UChi_00)     VMADDSUBIDUP(1,%r9,T1,UChi_10)	\
-  VMADDSUBIDUP(4,%r8,T0,UChi_01)     VMADDSUBIDUP(4,%r9,T1,UChi_11)	\
-  VMADDSUBIDUP(7,%r8,T0,UChi_02)     VMADDSUBIDUP(7,%r9,T1,UChi_12)	\
-  VMADDSUBIDUP(1,%r10,T2,UChi_20)     VMADDSUBIDUP(1,%r11,T3,UChi_30)	\
-  VMADDSUBIDUP(4,%r10,T2,UChi_21)     VMADDSUBIDUP(4,%r11,T3,UChi_31)	\
-  VMADDSUBIDUP(7,%r10,T2,UChi_22)     VMADDSUBIDUP(7,%r11,T3,UChi_32)	\
-  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
-  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
-  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
-  VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
-  VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
-  VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
-  VSHUF(Chi_02,T0)    VSHUF(Chi_12,T1)					\
-  VSHUF(Chi_22,T2)    VSHUF(Chi_32,T3)					\
-  VMADDSUBIDUP(2,%r8,T0,UChi_00)     VMADDSUBIDUP(2,%r9,T1,UChi_10)     \
-  VMADDSUBIDUP(5,%r8,T0,UChi_01)     VMADDSUBIDUP(5,%r9,T1,UChi_11)     \
-  VMADDSUBIDUP(8,%r8,T0,UChi_02)     VMADDSUBIDUP(8,%r9,T1,UChi_12)     \
-  VMADDSUBIDUP(2,%r10,T2,UChi_20)     VMADDSUBIDUP(2,%r11,T3,UChi_30)     \
-  VMADDSUBIDUP(5,%r10,T2,UChi_21)     VMADDSUBIDUP(5,%r11,T3,UChi_31)     \
-  VMADDSUBIDUP(8,%r10,T2,UChi_22)     VMADDSUBIDUP(8,%r11,T3,UChi_32)     \
-  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
-  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
-  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
-  VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
-  VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
-  VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
-
-// MULT_ADD_XYZTa(g0,g1)
-// Two-pointer variant built from register loads: copies g0 and g1 into r8 and
-// r9, then repeatedly loads offsets 0..8 from each pointer into Z0..Z5 with
-// VMOVIDUP / VMOVRDUP and folds them into UChi_00..UChi_02 (from r8, with
-// Chi_00..Chi_02) and UChi_10..UChi_12 (from r9, with Chi_10..Chi_12) through
-// VMADDSUB. T0 and T1 hold the VSHUF'ed Chi values.
-// Every arithmetic step is VMADDSUB, so this presumably accumulates onto the
-// existing UChi contents - confirm against the V* macro definitions.
-// Register aliasing to keep in mind: Z0 is T2's register, and Z5 is Chi_31, so
-// Chi_31 is overwritten by this macro.
-// NOTE(review): the second asm statement assumes r8/r9 survive from the first
-// and lists no clobbers for the zmm registers it writes.
-#define MULT_ADD_XYZTa(g0,g1)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"	 :  : "r"(g0), "r"(g1) : "%r8","%r9");\
-	   __asm__ (						\
-	   VSHUF(Chi_00,T0)				\
-	   VSHUF(Chi_10,T1)						\
-	   VMOVIDUP(0,%r8,Z0 )						\
-           VMOVIDUP(3,%r8,Z1 )						\
-           VMOVIDUP(6,%r8,Z2 )						\
-           VMADDSUB(Z0,T0,UChi_00)					\
-	   VMADDSUB(Z1,T0,UChi_01)					\
-	   VMADDSUB(Z2,T0,UChi_02)					\
-									\
-	   VMOVIDUP(0,%r9,Z0 )						\
-           VMOVIDUP(3,%r9,Z1 )						\
-           VMOVIDUP(6,%r9,Z2 )						\
-           VMADDSUB(Z0,T1,UChi_10)					\
-           VMADDSUB(Z1,T1,UChi_11)            \
-           VMADDSUB(Z2,T1,UChi_12)            \
-	   							\
-								\
-	   VMOVRDUP(0,%r8,Z3 )					\
-	   VMOVRDUP(3,%r8,Z4 )					\
-	   VMOVRDUP(6,%r8,Z5 )					\
-           VMADDSUB(Z3,Chi_00,UChi_00)/*rr * ir = ri rr*/	\
-           VMADDSUB(Z4,Chi_00,UChi_01)				\
-           VMADDSUB(Z5,Chi_00,UChi_02)				\
-								\
-	   VMOVRDUP(0,%r9,Z3 )					\
-	   VMOVRDUP(3,%r9,Z4 )					\
-	   VMOVRDUP(6,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_10,UChi_10)				\
-           VMADDSUB(Z4,Chi_10,UChi_11)\
-           VMADDSUB(Z5,Chi_10,UChi_12)				\
-	   							\
-								\
-	   VMOVIDUP(1,%r8,Z0 )					\
-	   VMOVIDUP(4,%r8,Z1 )					\
-	   VMOVIDUP(7,%r8,Z2 )					\
-	   VSHUF(Chi_01,T0)					\
-           VMADDSUB(Z0,T0,UChi_00)				\
-           VMADDSUB(Z1,T0,UChi_01)				\
-           VMADDSUB(Z2,T0,UChi_02)				\
-								\
-	   VMOVIDUP(1,%r9,Z0 )					\
-	   VMOVIDUP(4,%r9,Z1 )					\
-	   VMOVIDUP(7,%r9,Z2 )					\
-	   VSHUF(Chi_11,T1)					\
-           VMADDSUB(Z0,T1,UChi_10)				\
-           VMADDSUB(Z1,T1,UChi_11)				\
-           VMADDSUB(Z2,T1,UChi_12)				\
-								\
-	   VMOVRDUP(1,%r8,Z3 )					\
-	   VMOVRDUP(4,%r8,Z4 )					\
-	   VMOVRDUP(7,%r8,Z5 )					\
-           VMADDSUB(Z3,Chi_01,UChi_00)				\
-           VMADDSUB(Z4,Chi_01,UChi_01)				\
-           VMADDSUB(Z5,Chi_01,UChi_02)				\
-								\
-	   VMOVRDUP(1,%r9,Z3 )					\
-	   VMOVRDUP(4,%r9,Z4 )					\
-	   VMOVRDUP(7,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_11,UChi_10)				\
-           VMADDSUB(Z4,Chi_11,UChi_11)				\
-           VMADDSUB(Z5,Chi_11,UChi_12)				\
-	   							\
-	   VSHUF(Chi_02,T0)					\
-	   VSHUF(Chi_12,T1)					\
-	   VMOVIDUP(2,%r8,Z0 )					\
-	   VMOVIDUP(5,%r8,Z1 )					\
-	   VMOVIDUP(8,%r8,Z2 )					\
-           VMADDSUB(Z0,T0,UChi_00)				\
-           VMADDSUB(Z1,T0,UChi_01)			      \
-           VMADDSUB(Z2,T0,UChi_02)			      \
-	   VMOVIDUP(2,%r9,Z0 )					\
-	   VMOVIDUP(5,%r9,Z1 )					\
-	   VMOVIDUP(8,%r9,Z2 )					\
-           VMADDSUB(Z0,T1,UChi_10)			      \
-           VMADDSUB(Z1,T1,UChi_11)			      \
-           VMADDSUB(Z2,T1,UChi_12)			      \
-	   /*55*/					      \
-	   VMOVRDUP(2,%r8,Z3 )		  \
-	   VMOVRDUP(5,%r8,Z4 )					\
-	   VMOVRDUP(8,%r8,Z5 )				      \
-           VMADDSUB(Z3,Chi_02,UChi_00)			      \
-           VMADDSUB(Z4,Chi_02,UChi_01)			      \
-           VMADDSUB(Z5,Chi_02,UChi_02)			      \
-	   VMOVRDUP(2,%r9,Z3 )		  \
-	   VMOVRDUP(5,%r9,Z4 )					\
-	   VMOVRDUP(8,%r9,Z5 )				      \
-           VMADDSUB(Z3,Chi_12,UChi_10)			      \
-           VMADDSUB(Z4,Chi_12,UChi_11)			      \
-           VMADDSUB(Z5,Chi_12,UChi_12)			      \
-	   /*61 insns*/							);
-
-// MULT_ADD_XYZT(g0,g1)
-// Two-pointer variant built from memory-operand instructions: copies g0 and g1
-// into r8 and r9, then uses VSHUFMEM / VMUL / VMADD to build partial sums in
-// Z1..Z6 and VMADDMEM to update UChi_0x (from r8) and UChi_1x (from r9);
-// the last three lines add Z1..Z6 into UChi_00..UChi_12 with VADD.
-// Side effects on inputs visible here: VIDUP names each Chi_0x / Chi_1x as both
-// of its arguments, so those registers look to be rewritten in place (confirm
-// against the VIDUP definition); Z5/Z6 are Chi_31/Chi_32, and Z00/Z10 are the
-// T2/T3 registers.
-// Naming: here the un-suffixed macro is the memory-operand form and the "a"
-// twin is the register-load form; for MULT_XYZT / MULT_XYZTa it is the other
-// way round.
-// NOTE(review): the second asm statement assumes r8/r9 survive from the first
-// and lists no clobbers for the zmm registers it writes.
-#define MULT_ADD_XYZT(g0,g1)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t"	 :  : "r"(g0), "r"(g1) : "%r8","%r9");\
-  __asm__ (							  \
-  VSHUFMEM(0,%r8,Z00)		   VSHUFMEM(0,%r9,Z10)			\
-  VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)	          \
-   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
-   VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
-   VSHUFMEM(3,%r8,Z00)	      VSHUFMEM(3,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
-   VSHUFMEM(6,%r8,Z00)	      VSHUFMEM(6,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
-   VMADDMEM(0,%r8,T0,UChi_00)  VMADDMEM(0,%r9,T1,UChi_10)		  \
-   VMADDMEM(3,%r8,T0,UChi_01)  VMADDMEM(3,%r9,T1,UChi_11)		  \
-   VMADDMEM(6,%r8,T0,UChi_02)  VMADDMEM(6,%r9,T1,UChi_12)		  \
-   VSHUFMEM(1,%r8,Z00)	      VSHUFMEM(1,%r9,Z10)		  \
-   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
-   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
-   VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
-   VSHUFMEM(4,%r8,Z00)	      VSHUFMEM(4,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
-   VSHUFMEM(7,%r8,Z00)	      VSHUFMEM(7,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
-   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
-   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
-   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
-   VSHUFMEM(2,%r8,Z00)	      VSHUFMEM(2,%r9,Z10)			\
-   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)			\
-   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)			\
-   VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
-   VSHUFMEM(5,%r8,Z00)	      VSHUFMEM(5,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
-   VSHUFMEM(8,%r8,Z00)	      VSHUFMEM(8,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
-   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
-   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)	          \
-   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)	       \
-   VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
-   VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
-   VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
-
-// MULT_XYZT(g0,g1)
-// Non-accumulating counterpart of MULT_ADD_XYZTa: the instruction sequence is
-// the same register-load form, but the first operation on each of
-// UChi_00..UChi_12 is VMUL instead of VMADDSUB, so the previous UChi contents
-// are presumably discarded (confirm against the VMUL definition, which is not
-// in this file).
-// Copies g0 and g1 into r8 and r9, loads offsets 0..8 from each into Z0..Z5
-// with VMOVIDUP / VMOVRDUP, and combines them with Chi_0x (r8) and Chi_1x (r9).
-// Register aliasing to keep in mind: Z0 is T2's register, and Z5 is Chi_31, so
-// Chi_31 is overwritten by this macro.
-// NOTE(review): the second asm statement assumes r8/r9 survive from the first
-// and lists no clobbers for the zmm registers it writes.
-#define MULT_XYZT(g0,g1)					\
-    asm ( "movq %0, %%r8 \n\t"						\
-	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
-	   __asm__ (						\
-	   VSHUF(Chi_00,T0)				\
-	   VSHUF(Chi_10,T1)						\
-	   VMOVIDUP(0,%r8,Z0 )						\
-           VMOVIDUP(3,%r8,Z1 )						\
-           VMOVIDUP(6,%r8,Z2 )						\
-	   /*6*/							\
-           VMUL(Z0,T0,UChi_00)            \
-           VMUL(Z1,T0,UChi_01)            \
-           VMUL(Z2,T0,UChi_02)            \
-	   VMOVIDUP(0,%r9,Z0 )						\
-           VMOVIDUP(3,%r9,Z1 )						\
-           VMOVIDUP(6,%r9,Z2 )						\
-           VMUL(Z0,T1,UChi_10)            \
-           VMUL(Z1,T1,UChi_11)            \
-           VMUL(Z2,T1,UChi_12)            \
-	   VMOVRDUP(0,%r8,Z3 )					\
-	   VMOVRDUP(3,%r8,Z4 )					\
-	   VMOVRDUP(6,%r8,Z5 )					\
-	   /*18*/						\
-           VMADDSUB(Z3,Chi_00,UChi_00)				\
-           VMADDSUB(Z4,Chi_00,UChi_01)\
-           VMADDSUB(Z5,Chi_00,UChi_02) \
-	   VMOVRDUP(0,%r9,Z3 )					\
-	   VMOVRDUP(3,%r9,Z4 )					\
-	   VMOVRDUP(6,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_10,UChi_10)				\
-           VMADDSUB(Z4,Chi_10,UChi_11)\
-           VMADDSUB(Z5,Chi_10,UChi_12)				\
-	   VMOVIDUP(1,%r8,Z0 )					\
-	   VMOVIDUP(4,%r8,Z1 )					\
-	   VMOVIDUP(7,%r8,Z2 )					\
-	   /*28*/						\
-	   VSHUF(Chi_01,T0)					\
-           VMADDSUB(Z0,T0,UChi_00)      \
-           VMADDSUB(Z1,T0,UChi_01)       \
-           VMADDSUB(Z2,T0,UChi_02)        \
-	   VMOVIDUP(1,%r9,Z0 )					\
-	   VMOVIDUP(4,%r9,Z1 )					\
-	   VMOVIDUP(7,%r9,Z2 )					\
-	   VSHUF(Chi_11,T1)					\
-           VMADDSUB(Z0,T1,UChi_10)				\
-           VMADDSUB(Z1,T1,UChi_11)				\
-           VMADDSUB(Z2,T1,UChi_12)        \
-	   VMOVRDUP(1,%r8,Z3 )					\
-	   VMOVRDUP(4,%r8,Z4 )					\
-	   VMOVRDUP(7,%r8,Z5 )					\
-           /*38*/						\
-           VMADDSUB(Z3,Chi_01,UChi_00)    \
-           VMADDSUB(Z4,Chi_01,UChi_01)    \
-           VMADDSUB(Z5,Chi_01,UChi_02)    \
-	   VMOVRDUP(1,%r9,Z3 )					\
-	   VMOVRDUP(4,%r9,Z4 )					\
-	   VMOVRDUP(7,%r9,Z5 )					\
-           VMADDSUB(Z3,Chi_11,UChi_10)				\
-           VMADDSUB(Z4,Chi_11,UChi_11)    \
-           VMADDSUB(Z5,Chi_11,UChi_12)				\
-	   /*48*/						\
-	   VSHUF(Chi_02,T0)					\
-	   VSHUF(Chi_12,T1)					\
-	   VMOVIDUP(2,%r8,Z0 )					\
-	   VMOVIDUP(5,%r8,Z1 )					\
-	   VMOVIDUP(8,%r8,Z2 )					\
-           VMADDSUB(Z0,T0,UChi_00)				\
-           VMADDSUB(Z1,T0,UChi_01)			      \
-           VMADDSUB(Z2,T0,UChi_02)			      \
-	   VMOVIDUP(2,%r9,Z0 )					\
-	   VMOVIDUP(5,%r9,Z1 )					\
-	   VMOVIDUP(8,%r9,Z2 )					\
-           VMADDSUB(Z0,T1,UChi_10)			      \
-           VMADDSUB(Z1,T1,UChi_11)			      \
-           VMADDSUB(Z2,T1,UChi_12)			      \
-	   /*55*/					      \
-	   VMOVRDUP(2,%r8,Z3 )		  \
-	   VMOVRDUP(5,%r8,Z4 )					\
-	   VMOVRDUP(8,%r8,Z5 )				      \
-           VMADDSUB(Z3,Chi_02,UChi_00)			      \
-           VMADDSUB(Z4,Chi_02,UChi_01)			      \
-           VMADDSUB(Z5,Chi_02,UChi_02)			      \
-	   VMOVRDUP(2,%r9,Z3 )		  \
-	   VMOVRDUP(5,%r9,Z4 )					\
-	   VMOVRDUP(8,%r9,Z5 )				      \
-           VMADDSUB(Z3,Chi_12,UChi_10)			      \
-           VMADDSUB(Z4,Chi_12,UChi_11)			      \
-           VMADDSUB(Z5,Chi_12,UChi_12)			      \
-	   /*61 insns*/							);
-
-// MULT_XYZTa(g0,g1)
-// Non-accumulating counterpart of MULT_ADD_XYZT: the same memory-operand
-// sequence, but the first update of each UChi register uses VMULMEM instead of
-// VMADDMEM, so the previous UChi contents are presumably discarded (confirm
-// against the VMULMEM definition, which is not in this file).
-// Copies g0 and g1 into r8 and r9, builds partial sums in Z1..Z6 with
-// VSHUFMEM / VMUL / VMADD, and adds them into UChi_00..UChi_12 with VADD.
-// Side effects on inputs visible here: VIDUP names each Chi_0x / Chi_1x as both
-// of its arguments, so those registers look to be rewritten in place (confirm
-// against the VIDUP definition); Z5/Z6 are Chi_31/Chi_32, and Z00/Z10 are the
-// T2/T3 registers.
-// NOTE(review): the second asm statement assumes r8/r9 survive from the first
-// and lists no clobbers for the zmm registers it writes.
-#define MULT_XYZTa(g0,g1)					\
-  asm ( "movq %0, %%r8 \n\t"					\
-	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
-  __asm__ (							  \
-   VSHUFMEM(0,%r8,Z00)		   VSHUFMEM(0,%r9,Z10)	  \
-   VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)	          \
-   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
-   VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
-   VSHUFMEM(3,%r8,Z00)	      VSHUFMEM(3,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
-   VSHUFMEM(6,%r8,Z00)	      VSHUFMEM(6,%r9,Z10)		  \
-   VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
-   VMULMEM(0,%r8,T0,UChi_00)  VMULMEM(0,%r9,T1,UChi_10)		  \
-   VMULMEM(3,%r8,T0,UChi_01)  VMULMEM(3,%r9,T1,UChi_11)		  \
-   VMULMEM(6,%r8,T0,UChi_02)  VMULMEM(6,%r9,T1,UChi_12)		  \
-   VSHUFMEM(1,%r8,Z00)	      VSHUFMEM(1,%r9,Z10)		  \
-   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
-   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
-   VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
-   VSHUFMEM(4,%r8,Z00)	      VSHUFMEM(4,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
-   VSHUFMEM(7,%r8,Z00)	      VSHUFMEM(7,%r9,Z10)		  \
-   VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
-   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
-   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
-   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
-   VSHUFMEM(2,%r8,Z00)	      VSHUFMEM(2,%r9,Z10)			\
-   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)			\
-   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)			\
-   VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
-   VSHUFMEM(5,%r8,Z00)	      VSHUFMEM(5,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
-   VSHUFMEM(8,%r8,Z00)	      VSHUFMEM(8,%r9,Z10)		  \
-   VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
-   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
-   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)	          \
-   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)	       \
-   VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
-   VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
-   VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
-
-
-// LOAD_CHI(a0,a1,a2,a3)
-// Four separate asm statements, one per pointer. Each copies its pointer into
-// r8 and issues VLOAD for offsets 0, 1 and 2 into the matching Chi row:
-// a0 -> Chi_00..Chi_02, a1 -> Chi_10..Chi_12, a2 -> Chi_20..Chi_22,
-// a3 -> Chi_30..Chi_32. The pChi_* spellings are used because these asm
-// statements take operands.
-// NOTE(review): only r8 is listed as clobbered; the zmm registers that are
-// loaded are neither outputs nor clobbers, so the compiler is unaware of them.
-#define LOAD_CHI(a0,a1,a2,a3)						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_00)						\
-       VLOAD(1,%%r8,pChi_01)						\
-       VLOAD(2,%%r8,pChi_02)						\
-       : : "r" (a0) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_10)						\
-       VLOAD(1,%%r8,pChi_11)						\
-       VLOAD(2,%%r8,pChi_12)						\
-       : : "r" (a1) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_20)						\
-       VLOAD(1,%%r8,pChi_21)						\
-       VLOAD(2,%%r8,pChi_22)						\
-       : : "r" (a2) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_30)						\
-       VLOAD(1,%%r8,pChi_31)						\
-       VLOAD(2,%%r8,pChi_32)						\
-       : : "r" (a3) : "%r8" );						
-
-#define LOAD_CHIa(a0,a1)						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_00)						\
-       VLOAD(1,%%r8,pChi_01)						\
-       VLOAD(2,%%r8,pChi_02)						\
-       : : "r" (a0) : "%r8" );						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VLOAD(0,%%r8,pChi_10)						\
-       VLOAD(1,%%r8,pChi_11)						\
-       VLOAD(2,%%r8,pChi_12)						\
-       : : "r" (a1) : "%r8" );						
-
-#define PF_CHI(a0)							
-#define PF_CHIa(a0)							\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VPREFETCH1(0,%%r8)						\
-       VPREFETCH1(1,%%r8)						\
-       VPREFETCH1(2,%%r8)						\
-       : : "r" (a0) : "%r8" );						\
-
-#define PF_GAUGE_XYZT(a0)							
-#define PF_GAUGE_XYZTa(a0)						\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VPREFETCH1(0,%%r8)						\
-       VPREFETCH1(1,%%r8)						\
-       VPREFETCH1(2,%%r8)						\
-       VPREFETCH1(3,%%r8)						\
-       VPREFETCH1(4,%%r8)						\
-       VPREFETCH1(5,%%r8)						\
-       VPREFETCH1(6,%%r8)						\
-       VPREFETCH1(7,%%r8)						\
-       VPREFETCH1(8,%%r8)						\
-       : : "r" (a0) : "%r8" );						\
-
-#define PF_GAUGE_LS(a0)							
-#define PF_GAUGE_LSa(a0)							\
-  asm (									\
-       "movq %0, %%r8 \n\t"						\
-       VPREFETCH1(0,%%r8)						\
-       VPREFETCH1(1,%%r8)						\
-       : : "r" (a0) : "%r8" );						\
-  
-
-#define REDUCE(out)					\
-  asm (							\
-  VADD(UChi_00,UChi_10,UChi_00)				\
-  VADD(UChi_01,UChi_11,UChi_01)				\
-  VADD(UChi_02,UChi_12,UChi_02)				\
-  VADD(UChi_30,UChi_20,UChi_30)				\
-  VADD(UChi_31,UChi_21,UChi_31)				\
-  VADD(UChi_32,UChi_22,UChi_32)				\
-  VADD(UChi_00,UChi_30,UChi_00)				\
-  VADD(UChi_01,UChi_31,UChi_01)				\
-  VADD(UChi_02,UChi_32,UChi_02)				);	\
-  asm (								\
-       VSTORE(0,%0,pUChi_00)					\
-       VSTORE(1,%0,pUChi_01)					\
-       VSTORE(2,%0,pUChi_02)					\
-       : : "r" (out) : "memory" );
-
-#define nREDUCE(out)							\
-  asm (									\
-       VADD(UChi_00,UChi_10,UChi_00)					\
-       VADD(UChi_01,UChi_11,UChi_01)					\
-       VADD(UChi_02,UChi_12,UChi_02)					\
-       VADD(UChi_30,UChi_20,UChi_30)					\
-       VADD(UChi_31,UChi_21,UChi_31)					\
-       VADD(UChi_32,UChi_22,UChi_32)					\
-       VADD(UChi_00,UChi_30,UChi_00)					\
-       VADD(UChi_01,UChi_31,UChi_01)					\
-       VADD(UChi_02,UChi_32,UChi_02)				);	\
-  asm (VZERO(Chi_00)							\
-       VSUB(UChi_00,Chi_00,UChi_00)					\
-       VSUB(UChi_01,Chi_00,UChi_01)					\
-       VSUB(UChi_02,Chi_00,UChi_02)				);	\
-  asm (								\
-       VSTORE(0,%0,pUChi_00)					\
-       VSTORE(1,%0,pUChi_01)					\
-       VSTORE(2,%0,pUChi_02)					\
-       : : "r" (out) : "memory" );
-
-#define REDUCEa(out)					\
-  asm (							\
-  VADD(UChi_00,UChi_10,UChi_00)				\
-  VADD(UChi_01,UChi_11,UChi_01)				\
-  VADD(UChi_02,UChi_12,UChi_02)	);			\
-  asm (							\
-  VSTORE(0,%0,pUChi_00)					\
-  VSTORE(1,%0,pUChi_01)					\
-  VSTORE(2,%0,pUChi_02)					\
-  : : "r" (out) : "memory" );
-
-// FIXME is sign right in the VSUB ?
-#define nREDUCEa(out)					\
-  asm (							\
-  VADD(UChi_00,UChi_10,UChi_00)				\
-  VADD(UChi_01,UChi_11,UChi_01)				\
-  VADD(UChi_02,UChi_12,UChi_02)	);			\
-  asm (VZERO(Chi_00)							\
-       VSUB(UChi_00,Chi_00,UChi_00)					\
-       VSUB(UChi_01,Chi_00,UChi_01)					\
-       VSUB(UChi_02,Chi_00,UChi_02)				);	\
-  asm (									\
-       VSTORE(0,%0,pUChi_00)				\
-       VSTORE(1,%0,pUChi_01)				\
-       VSTORE(2,%0,pUChi_02)				\
-       : : "r" (out) : "memory" );
-
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_0,Chi_0);\
-      permute##dir(Chi_1,Chi_1);\
-      permute##dir(Chi_2,Chi_2);
-
-NAMESPACE_BEGIN(Grid);
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-					 DoubledGaugeFieldView &U,
-					 DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs,
-					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  assert(0);
-};
-
-
-//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in[o] ; } else { out =(uint64_t) &buf[o]; }
-
-#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }
-
-#define PREPARE_XYZT(X,Y,Z,T,skew,UU)			\
-  PREPARE(X,Y,Z,T,skew,UU);				\
-  PF_GAUGE_XYZT(gauge0);					\
-  PF_GAUGE_XYZT(gauge1);					\
-  PF_GAUGE_XYZT(gauge2);					\
-  PF_GAUGE_XYZT(gauge3);					
-
-#define PREPARE_LS(X,Y,Z,T,skew,UU)			\
-  PREPARE(X,Y,Z,T,skew,UU);				\
-  PF_GAUGE_LS(gauge0);					\
-  PF_GAUGE_LS(gauge1);					\
-  PF_GAUGE_LS(gauge2);					\
-  PF_GAUGE_LS(gauge3);					
-
-#define PREPARE(X,Y,Z,T,skew,UU)					\
-  SE0=st.GetEntry(ptype,X+skew,sF);					\
-  o0 = SE0->_offset;							\
-  l0 = SE0->_is_local;							\
-  p0 = SE0->_permute;							\
-  CONDITIONAL_MOVE(l0,o0,addr0);					\
-  PF_CHI(addr0);							\
-  									\
-  SE1=st.GetEntry(ptype,Y+skew,sF);					\
-  o1 = SE1->_offset;							\
-  l1 = SE1->_is_local;							\
-  p1 = SE1->_permute;							\
-  CONDITIONAL_MOVE(l1,o1,addr1);					\
-  PF_CHI(addr1);							\
-  									\
-  SE2=st.GetEntry(ptype,Z+skew,sF);					\
-  o2 = SE2->_offset;							\
-  l2 = SE2->_is_local;							\
-  p2 = SE2->_permute;							\
-  CONDITIONAL_MOVE(l2,o2,addr2);					\
-  PF_CHI(addr2);							\
-  									\
-  SE3=st.GetEntry(ptype,T+skew,sF);					\
-  o3 = SE3->_offset;							\
-  l3 = SE3->_is_local;							\
-  p3 = SE3->_permute;							\
-  CONDITIONAL_MOVE(l3,o3,addr3);					\
-  PF_CHI(addr3);							\
-  									\
-  gauge0 =(uint64_t)&UU[sU]( X );				\
-  gauge1 =(uint64_t)&UU[sU]( Y );				\
-  gauge2 =(uint64_t)&UU[sU]( Z );				\
-  gauge3 =(uint64_t)&UU[sU]( T ); 
-  
-  // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeFieldView &U,
-								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-   for(int s=0;s<LLs;s++){
-
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) {
-      nREDUCE(addr0);
-    } else { 
-      REDUCE(addr0);
-    }
-   }
-#else 
-    assert(0);
-#endif
-   
-}
-
-#include <simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeFieldView &U,
-								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
-
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHI(addr0,addr1,addr2,addr3);
-    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
-
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) {
-      nREDUCE(addr0);
-    } else { 
-      REDUCE(addr0);
-    }
-  }
-#else 
-  assert(0);
-#endif
-}
-   
-   
-
-
-#define PERMUTE_DIR3 __asm__ (	\
-  VPERM3(Chi_00,Chi_00)	\
-  VPERM3(Chi_01,Chi_01)	\
-  VPERM3(Chi_02,Chi_02)	);
-
-#define PERMUTE_DIR2 __asm__ (	\
-  VPERM2(Chi_10,Chi_10)	\
-  VPERM2(Chi_11,Chi_11)	\
-  VPERM2(Chi_12,Chi_12) );
-
-#define PERMUTE_DIR1 __asm__ (	\
-  VPERM1(Chi_00,Chi_00)	\
-  VPERM1(Chi_01,Chi_01)	\
-  VPERM1(Chi_02,Chi_02)	);
-
-#define PERMUTE_DIR0 __asm__ (			\
-  VPERM0(Chi_10,Chi_10)	\
-  VPERM0(Chi_11,Chi_11)	\
-  VPERM0(Chi_12,Chi_12) );
-
-#define PERMUTE01 \
-  if ( p0 ) { PERMUTE_DIR3; }\
-  if ( p1 ) { PERMUTE_DIR2; }
-
-#define PERMUTE23 \
-  if ( p2 ) { PERMUTE_DIR1; }\
-  if ( p3 ) { PERMUTE_DIR0; }
-
-  // This is the single precision 5th direction vectorised kernel
-
-#include <simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeFieldView &U,
-							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
-							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-  for(int s=0;s<LLs;s++){
-    
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) { 
-      nREDUCEa(addr0);
-    } else { 
-      REDUCEa(addr0);
-    }
-  }
-#else 
-  assert(0);
-#endif
-}
-
-#include <simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-							       DoubledGaugeFieldView &U,
-							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
-							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-#ifdef AVX512
-  uint64_t gauge0,gauge1,gauge2,gauge3;
-  uint64_t addr0,addr1,addr2,addr3;
-  const SiteSpinor *in_p; in_p = &in[0];
-
-  int o0,o1,o2,o3; // offsets
-  int l0,l1,l2,l3; // local 
-  int p0,p1,p2,p3; // perm
-  int ptype;
-  StencilEntry *SE0;
-  StencilEntry *SE1;
-  StencilEntry *SE2;
-  StencilEntry *SE3;
-
-  for(int s=0;s<LLs;s++){
-    
-    int sF=s+LLs*sU;
-    // Xp, Yp, Zp, Tp
-    PREPARE(Xp,Yp,Zp,Tp,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xm,Ym,Zm,Tm,0,U);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
-    LOAD_CHIa(addr0,addr1);
-    PERMUTE01;
-    MULT_ADD_XYZT(gauge0,gauge1);
-    LOAD_CHIa(addr2,addr3);
-    PERMUTE23;
-    MULT_ADD_XYZT(gauge2,gauge3);  
-    
-    addr0 = (uint64_t) &out[sF];
-    if ( dag ) {
-      nREDUCEa(addr0);
-    } else { 
-      REDUCEa(addr0);
-    }
-  }
-#else 
-  assert(0);
-#endif
-}
-
-#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL)			    \
-  template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo,	\
-				  DoubledGaugeFieldView &U,			\
-				  DoubledGaugeFieldView &UUU,		\
-				  SiteSpinor *buf, int LLs,		\
-				  int sU, const FermionFieldView &in, FermionFieldView &out,int dag);
-
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
-//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -1,396 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#define LOAD_CHI(b)		\
-  const SiteSpinor & ref (b[offset]);	\
-    Chi_0=ref()()(0);\
-    Chi_1=ref()()(1);\
-    Chi_2=ref()()(2);
-
-
-// To splat or not to splat depends on the implementation
-#define MULT(A,UChi)				\
-  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
-    UChi ## _0  = U_00*Chi_0;	       \
-    UChi ## _1  = U_10*Chi_0;\
-    UChi ## _2  = U_20*Chi_0;\
-    UChi ## _0 += U_01*Chi_1;\
-    UChi ## _1 += U_11*Chi_1;\
-    UChi ## _2 += U_21*Chi_1;\
-    UChi ## _0 += U_02*Chi_2;\
-    UChi ## _1 += U_12*Chi_2;\
-    UChi ## _2 += U_22*Chi_2;
-
-#define MULT_ADD(U,A,UChi)			\
-  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
-    UChi ## _0 += U_00*Chi_0;	       \
-    UChi ## _1 += U_10*Chi_0;\
-    UChi ## _2 += U_20*Chi_0;\
-    UChi ## _0 += U_01*Chi_1;\
-    UChi ## _1 += U_11*Chi_1;\
-    UChi ## _2 += U_21*Chi_1;\
-    UChi ## _0 += U_02*Chi_2;\
-    UChi ## _1 += U_12*Chi_2;\
-    UChi ## _2 += U_22*Chi_2;
-
-
-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-
-#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);	\
-  offset = SE->_offset;			\
-  local  = SE->_is_local;		\
-  perm   = SE->_permute;		\
-  if ( local ) {						\
-    LOAD_CHI(in);					\
-    if ( perm) {						\
-      PERMUTE_DIR(Perm);					\
-    }								\
-  } else {							\
-    LOAD_CHI(buf);						\
-  }								
-
-#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
-  {								\
-    MULT(Dir,even);						\
-  }
-
-#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even)			\
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
-  {								\
-    MULT_ADD(U,Dir,even);					\
-  }
-
-
-
-#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);			\
-  offset = SE->_offset;					\
-  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ( local ) {					\
-    LOAD_CHI(in);				\
-    if ( perm) {					\
-      PERMUTE_DIR(Perm);				\
-    }							\
-  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
-  }							\
-  if (SE->_is_local || st.same_node[Dir] ) {		\
-    MULT_ADD(U,Dir,even);				\
-  }
-
-#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);			\
-  offset = SE->_offset;					\
-  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
-    { MULT_ADD(U,Dir,even); }					\
-  }								
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
-					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
-					  SiteSpinor *buf, int LLs, int sU, 
-					  const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
-  int offset,local,perm, ptype;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    skew = 0;
-    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
-    HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);   
-    HAND_STENCIL_LEG      (U,Zp,1,skew,even);  
-    HAND_STENCIL_LEG      (U,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG      (U,Xm,3,skew,even);  
-    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
-    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
-    skew = 8;
-    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
-    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG(UUU,Zp,1,skew,even);  
-    HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG(UUU,Xm,3,skew,even);  
-    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
-    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
-    
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else { 
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out[sF],result);
-  }
-}
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
-  int offset, ptype, local, perm;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
-
-    skew = 0;
-    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
-    skew = 8;
-    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
-
-    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else { 
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out[sF],result);
-  }
-}
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out,int dag) 
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
-  int offset, ptype, local, perm;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
-    int nmu=0;
-    skew = 0;
-    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
-    skew = 8;
-    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);  
-    HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
-    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
-    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
-
-    // Add sum of all exterior connected stencil legs
-    if ( nmu ) { 
-      if ( dag ) {
-	result()()(0) = - even_0 - odd_0;
-	result()()(1) = - even_1 - odd_1;
-	result()()(2) = - even_2 - odd_2;
-      } else { 
-	result()()(0) = even_0 + odd_0;
-	result()()(1) = even_1 + odd_1;
-	result()()(2) = even_2 + odd_2;
-      }
-      out[sF] = out[sF] + result;
-    }
-  }
-}
-
-
-#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
-  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-									\
-  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-									\
-  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
-
-NAMESPACE_END(Grid);
-
-
--- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
@@ -0,0 +1,203 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Implementation policy for staggered fermions with the fifth dimension
+// vectorised (LsVectorised = true).  Fermion sites are built on Simd; the doubled
+// gauge field is stored with Simd::scalar_type and splatted into SIMD links on use.
+template <class S, class Representation = FundamentalRepresentation >
+class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+public:
+
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+  static const bool LsVectorised=true;
+  typedef RealD   Coeff_t ;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+
+  // True when the representation dimension equals Nc.  Necessary?
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+  template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+
+  // Make the doubled gauge field a *scalar*
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+  typedef iImplPropagator<Simd>        SitePropagator;
+
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+  typedef Lattice<SitePropagator> PropagatorField;
+
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef Lattice<SiteSpinor>            FermionField;
+
+  typedef StaggeredImplParams ImplParams;
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef typename StencilImpl::View_type StencilView;
+
+  ImplParams Params;
+
+  StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+
+  // Splat one scalar link element from memory into the SIMD register reg.
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    vsplat(reg, memory);
+  }
+
+  // phi = U(mu) * chi: the scalar link U(mu) is splatted element-wise into a SIMD link first.
+  static accelerator_inline void multLink(SiteHalfSpinor &phi, 
+                                          const SiteDoubledGaugeField &U,
+                                          const SiteHalfSpinor &chi, 
+                                          int mu) 
+  {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+  }
+
+  // Same splat as multLink, but the product is passed to mac() to accumulate into phi.
+  static accelerator_inline void multLinkAdd(SiteHalfSpinor &phi, 
+                                             const SiteDoubledGaugeField &U,
+                                             const SiteHalfSpinor &chi, 
+                                             int mu) 
+  {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
+        vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mac(&phi(), &UU(), &chi());
+  }
+
+  // Copy link field U into component mu of the scalar doubled field U_ds, one local
+  // site at a time (peek the site, overwrite component mu, poke it back).
+  inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
+  {
+    GridBase *GaugeGrid = U_ds.Grid();
+    thread_loop( (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++), {
+      SiteScalarGaugeLink   ScalarU;
+      SiteDoubledGaugeField ScalarUds;
+      Coordinate lcoor;
+      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+      peekLocalSite(ScalarUds, U_ds, lcoor);
+      peekLocalSite(ScalarU, U, lcoor);
+      ScalarUds(mu) = ScalarU();
+      // Write the modified site back; without this U_ds is never updated.
+      pokeLocalSite(ScalarUds, U_ds, lcoor);
+    });
+  }
+
+  // Fill the doubled gauge fields: Uds gets the one-hop links built from the fat
+  // links Ufat, UUUds the three-hop (Naik) links built from the thin links Uthin.
+  // Component mu holds the forward link and mu+4 the adjoint of the link shifted
+  // back by the hop length; both are multiplied by the staggered phase for mu.
+  // The GaugeGrid argument is not used by this function.
+  inline void DoubleStore(GridBase *GaugeGrid,
+                          DoubledGaugeField &UUUds, // for Naik term
+                          DoubledGaugeField &Uds,
+                          const GaugeField &Uthin,
+                          const GaugeField &Ufat) 
+  {
+    GridBase * InputGrid = Uthin.Grid();
+    conformable(InputGrid,Ufat.Grid());
+
+    GaugeLinkField U(InputGrid);
+    GaugeLinkField UU(InputGrid);
+    GaugeLinkField UUU(InputGrid);
+    GaugeLinkField Udag(InputGrid);
+    GaugeLinkField UUUdag(InputGrid);
+
+    // The coordinate sums entering the staggered phases do not depend on mu,
+    // so they are built once here rather than on every pass of the loop.
+    Lattice<iScalar<vInteger> > x(InputGrid); LatticeCoordinate(x,0);
+    Lattice<iScalar<vInteger> > y(InputGrid); LatticeCoordinate(y,1);
+    Lattice<iScalar<vInteger> > z(InputGrid); LatticeCoordinate(z,2);
+    Lattice<iScalar<vInteger> > lin_z(InputGrid); lin_z=x+y;
+    Lattice<iScalar<vInteger> > lin_t(InputGrid); lin_t=x+y+z;
+
+    for (int mu = 0; mu < Nd; mu++) {
+      // Staggered phase: stays +1 for mu==0, otherwise flips sign where the coordinate sum is odd.
+      ComplexField phases(InputGrid);	phases=1.0;
+
+      if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases);
+      if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
+      if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
+
+      // 1 hop based on fat links
+      U      = PeekIndex<LorentzIndex>(Ufat, mu);
+      Udag   = adj( Cshift(U, mu, -1));
+      U    = U    *phases;
+      Udag = Udag *phases;
+      InsertGaugeField(Uds,U,mu);
+      InsertGaugeField(Uds,Udag,mu+4);
+
+      // 3 hop based on thin links. Crazy huh ?
+      U  = PeekIndex<LorentzIndex>(Uthin, mu);
+      UU = Gimpl::CovShiftForward(U,mu,U);
+      UUU= Gimpl::CovShiftForward(U,mu,UU);
+      UUUdag = adj( Cshift(UUU, mu, -3));
+      UUU    = UUU    *phases;
+      UUUdag = UUUdag *phases;
+      InsertGaugeField(UUUds,UUU,mu);
+      InsertGaugeField(UUUds,UUUdag,mu+4);
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+    assert(0); // not implemented for this Impl: always fails the assertion
+  }   
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+    assert (0); // not implemented for this Impl: always fails the assertion
+  }
+};
+using StaggeredVec5dImplR = StaggeredVec5dImpl<vComplex,  FundamentalRepresentation >;  // vComplex: whichever precision is the build default
+using StaggeredVec5dImplF = StaggeredVec5dImpl<vComplexF, FundamentalRepresentation >;  // single precision
+using StaggeredVec5dImplD = StaggeredVec5dImpl<vComplexD, FundamentalRepresentation >;  // double precision
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.cc
@@ -1,242 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-
-    Copyright (C) 2017
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// *NOT* EO
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
-{
-  // Not the even-odd form: out = Dhop(in) + Mooee(in); returns norm2(out).
-  FermionField cloverPart(out.Grid());
-  // Hopping (Wilson) term is written straight into out, no dagger.
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerNo);
-
-  // Clover term lands in a scratch field and is then accumulated.
-  Mooee(in, cloverPart);
-  out += cloverPart;
-
-  return norm2(out);
-}
-
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
-{
-  // Daggered operator: out = Dhop^dag(in) + MooeeDag(in); returns norm2(out).
-  FermionField cloverPart(out.Grid());
-  // Daggered hopping (Wilson) term is written straight into out.
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerYes);
-
-  // Daggered clover term lands in a scratch field and is then accumulated.
-  MooeeDag(in, cloverPart);
-  out += cloverPart;
-
-  return norm2(out);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-{
-  // Refreshes everything derived from the gauge field: the base-class Wilson
-  // links, the clover term, its per-site inverse, and the even/odd copies
-  // (plain and adjoint) that MooeeInternal selects from.
-  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu.Grid();
-
-  // Field-strength components, each requested with mu > nu.
-  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
-  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
-
-  // Clover operator in colour and spin. The B components are weighted by csw_r
-  // and the E components by csw_t (anisotropy); diag_mass is added last.
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
-
-  const int localSites = _Umu.Grid()->lSites();
-  const int nColour    = Impl::Dimension;
-  const int matDim     = Ns * nColour;
-
-  // Dense scratch matrices used for the per-site inversion.
-  Eigen::MatrixXcd siteOp    = Eigen::MatrixXcd::Zero(matDim, matDim);
-  Eigen::MatrixXcd siteOpInv = Eigen::MatrixXcd::Zero(matDim, matDim);
-
-  Coordinate lcoor;
-  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
-
-  for (int site = 0; site < localSites; site++) {
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    siteOp = Eigen::MatrixXcd::Zero(matDim, matDim);
-    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = Zero();
-
-    // Flatten (spin j,k ; colour a,b) into row a + j*nColour, column b + k*nColour.
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < nColour; a++)
-          for (int b = 0; b < nColour; b++) {
-            auto zz = Qx()(j, k)(a, b);
-            siteOp(a + j * nColour, b + k * nColour) = std::complex<double>(zz);
-          }
-
-    siteOpInv = siteOp.inverse();
-
-    // Scatter the inverse back into the site object using the same index map.
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < nColour; a++)
-          for (int b = 0; b < nColour; b++)
-            Qxinv()(j, k)(a, b) = siteOpInv(a + j * nColour, b + k * nColour);
-    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
-  }
-
-  // Even/odd copies of the term, its adjoint, its inverse and the inverse's adjoint.
-  pickCheckerboard(Even, CloverTermEven, CloverTerm);
-  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
-  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
-  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  // Clover term as stored: no dagger, no inverse.
-  MooeeInternal(in, out, DaggerNo, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  // Adjoint of the clover term, no inverse.
-  MooeeInternal(in, out, DaggerYes, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  // Inverse of the clover term, no dagger.
-  MooeeInternal(in, out, DaggerNo, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  // Adjoint of the inverse clover term.
-  MooeeInternal(in, out, DaggerYes, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
-{
-  // Multiplies in by one of the clover fields prepared in ImportGauge and
-  // stores the product in out.
-  //
-  //   dag : non-zero selects the adjoint of the clover field
-  //   inv : non-zero selects the inverse clover field
-  //
-  // Field used on a checkerboarded grid (chosen by the parity of in):
-  //
-  //                 Odd                    Even
-  //   plain         CloverTermOdd          CloverTermEven
-  //   inv           CloverTermInvOdd       CloverTermInvEven
-  //   dag           CloverTermDagOdd       CloverTermDagEven
-  //   dag + inv     CloverTermInvDagOdd    CloverTermInvDagEven
-  //
-  // On a full grid CloverTerm / CloverTermInv is used, with adj() taken on the
-  // fly when dag is set.
-  out.Checkerboard() = in.Checkerboard();
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
-
-  CloverFieldType *selected;
-
-  if (!in.Grid()->_isCheckerBoarded) {
-    selected = (inv) ? &CloverTermInv : &CloverTerm;
-    if (dag) {
-      out = adj(*selected) * in;
-    } else {
-      out = *selected * in;
-    }
-    return;
-  }
-
-  const bool oddSource = (in.Checkerboard() == Odd);
-  if (dag) {
-    if (oddSource) {
-      selected = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
-    } else {
-      selected = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
-    }
-  } else {
-    if (oddSource) {
-      selected = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
-    } else {
-      selected = (inv) ? &CloverTermInvEven : &CloverTermEven;
-    }
-  }
-
-  // The checkerboarded fields already carry the adjoint, so a plain product suffices.
-  out = *selected * in;
-
-} // MooeeInternal
-
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
-{
-  assert(0); // stub: no derivative is computed and no argument is touched; any call trips this assertion
-}
-
-// Derivative parts
-template <class Impl>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
-{
-  assert(0); // not implemented yet: no argument is touched and any call trips this assertion
-}
-
-FermOpTemplateInstantiate(WilsonCloverFermion); // macro defined elsewhere; presumably emits the explicit instantiations for the default impls -- confirm
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion); // presumably the adjoint-representation impls -- confirm
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion); // presumably the two-index-representation impls -- confirm
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion.cc
@@ -1,596 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3}); // axis of each of the 8 stencil legs; handed to the Stencil constructors below
-const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1}); // matching displacement: legs 0-3 are +1, legs 4-7 are -1
-int WilsonFermionStatic::HandOptDslash; // defined without an explicit initializer
-
-/////////////////////////////////
-// Constructor and gauge import
-/////////////////////////////////
-
-template <class Impl>
-WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-                                   GridRedBlackCartesian &Hgrid, RealD _mass,
-                                   const ImplParams &p,
-                                   const WilsonAnisotropyCoefficients &anis)
-  : Kernels(p),
-    _grid(&Fgrid),
-    _cbgrid(&Hgrid),
-    Stencil(&Fgrid, npoint, Even, directions, displacements, p),
-    StencilEven(&Hgrid, npoint, Even, directions, displacements, p),  // source is Even
-    StencilOdd(&Hgrid, npoint, Odd, directions, displacements, p),    // source is Odd
-    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
-    Umu(&Fgrid),
-    UmuEven(&Hgrid),
-    UmuOdd(&Hgrid),
-    _tmp(&Hgrid),
-    anisotropyCoeff(anis)
-{
-  // The stencils and gauge storage for the full grid (Fgrid) and the
-  // checkerboarded grid (Hgrid) are built above; import the links next.
-  ImportGauge(_Umu);
-  // Diagonal coefficient. Isotropic: 4 + mass.
-  // Anisotropic: mass + 1 + (Nd-1) * nu / xi_0.
-  if (!anisotropyCoeff.isAnisotropic) {
-    diag_mass = 4.0 + mass;
-  } else {
-    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
-  }
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) 
-{
-  // Scales the incoming links by -1/2, with an extra nu/xi_0 on every direction
-  // other than t_direction when anisotropic, then double-stores the result
-  // into Umu and splits Umu into its even and odd checkerboards.
-  GaugeField HUmu(_Umu.Grid());
-
-  if (!anisotropyCoeff.isAnisotropic) {
-    HUmu = _Umu * (-0.5);
-  } else {
-    // Anisotropic case: rescale direction by direction.
-    for (int dir = 0; dir < Nd; dir++) {
-      GaugeLinkField link = (-0.5)*PeekIndex<LorentzIndex>(_Umu, dir);
-      if (dir != anisotropyCoeff.t_direction) {
-        link *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
-      }
-      PokeIndex<LorentzIndex>(HUmu, link, dir);
-    }
-  }
-
-  // Doubled storage first, then the per-parity copies.
-  Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
-  pickCheckerboard(Even, UmuEven, Umu);
-  pickCheckerboard(Odd, UmuOdd, Umu);
-}
-
-/////////////////////////////
-// Implement the interface
-/////////////////////////////
-
-template <class Impl>
-RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) { // full operator: hopping term combined with diag_mass and in
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerNo); // out now holds the un-daggered hopping term
-  return axpy_norm(out, diag_mass, in, out); // presumably out = diag_mass*in + out, returning the norm of the result -- confirm axpy_norm's contract
-}
-
-template <class Impl>
-RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { // daggered operator: daggered hopping term combined with diag_mass and in
-  out.Checkerboard() = in.Checkerboard();
-  Dhop(in, out, DaggerYes); // out now holds the daggered hopping term
-  return axpy_norm(out, diag_mass, in, out); // presumably out = diag_mass*in + out, returning the norm of the result -- confirm axpy_norm's contract
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) { // parity-changing hop, no dagger
-  if (in.Checkerboard() != Odd) {
-    DhopOE(in, out, DaggerNo);  // any non-Odd source goes through DhopOE
-    return;
-  }
-  DhopEO(in, out, DaggerNo);    // Odd source
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) { // parity-changing hop, daggered
-  if (in.Checkerboard() != Odd) {
-    DhopOE(in, out, DaggerYes);  // any non-Odd source goes through DhopOE
-    return;
-  }
-  DhopEO(in, out, DaggerYes);    // Odd source
-}
-  
-template <class Impl>
-void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) { // out = diag_mass * in
-  out.Checkerboard() = in.Checkerboard();
-  const typename FermionField::scalar_type diagCoeff(diag_mass);  // diag_mass converted to the field's scalar type
-  out = diagCoeff * in;
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) { // daggered diagonal term
-  out.Checkerboard() = in.Checkerboard();
-  Mooee(in, out); // delegates unchanged: Mooee only scales by diag_mass
-}
-
-template<class Impl>
-void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) { // inverse of the diagonal term
-  out.Checkerboard() = in.Checkerboard();
-  out = (1.0/(diag_mass))*in; // scale by the reciprocal of diag_mass; no guard against diag_mass == 0
-}
-  
-template<class Impl>
-void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) { // daggered inverse of the diagonal term
-  out.Checkerboard() = in.Checkerboard();
-  MooeeInv(in,out); // delegates unchanged: MooeeInv only scales by 1/diag_mass
-}
-template<class Impl>
-void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
-{
-  // Free-field Wilson propagator, evaluated site by site on a momentum-space
-  // field:
-  //   out = [ -i gmu sin k + 2 sin^2 k/2 + m ] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ] * in
-  // where k_mu = (2 pi / L_mu) * (n_mu + twist[mu]) and m = _m.
-  // twist is indexed for every mu in [0, Nd), so it needs at least Nd entries.
-  typedef typename FermionField::vector_type vector_type;
-  typedef typename FermionField::scalar_type ScalComplex;
-  typedef Lattice<iSinglet<vector_type> > LatComplex;
-
-  conformable(_grid,out.Grid());
-
-  Gamma::Algebra gammaOf [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
-
-  Coordinate dims = _grid->_fdimensions;
-
-  FermionField numer     (_grid); numer      = Zero();
-  LatComplex   wilsonTerm(_grid); wilsonTerm = Zero();
-  LatComplex   unity     (_grid); unity      = ScalComplex(1.0,0.0);
-  LatComplex   denomSum  (_grid); denomSum   = Zero();
-  LatComplex   k         (_grid);
-  ScalComplex  imagUnit(0.0,1.0);
-
-  for(int mu=0;mu<Nd;mu++) {
-    // Momentum component: lattice coordinate times 2 pi / L, plus the twist.
-    LatticeCoordinate(k,mu);
-    RealD TwoPiL =  M_PI * 2.0/ dims[mu];
-    k = TwoPiL * k;
-    k = k + TwoPiL * unity * twist[mu]; // momentum for twisted boundary conditions
-
-    wilsonTerm = wilsonTerm + 2.0*sin(k*0.5)*sin(k*0.5);              // Wilson term
-    numer      = numer - sin(k)*imagUnit*(Gamma(gammaOf[mu])*in);     // derivative term
-    denomSum   = denomSum + sin(k)*sin(k);
-  }
-
-  // Fold the mass into the Wilson term: 2 sin^2 k/2 + m.
-  wilsonTerm = wilsonTerm + _m;
-
-  numer = numer + wilsonTerm*in;                 // -i gmu sin k + 2 sin^2 k/2 + m
-
-  denomSum = denomSum + wilsonTerm*wilsonTerm;   // sin^2 k + (2 sin^2 k/2 + m)^2
-
-  // Invert the scalar denominator, then scale the numerator field by it.
-  denomSum = unity/denomSum;
-
-  out = numer*denomSum;
-}
-  
-
-///////////////////////////////////
-// Internal
-///////////////////////////////////
-
-template <class Impl>
-void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
-                                        GaugeField &mat, const FermionField &A,
-                                        const FermionField &B, int dag) {
-  // Shared worker for the DhopDeriv* entry points. For every direction mu it
-  // hops B by one stencil leg into a scratch field and passes that field,
-  // together with a copy of A, to Impl::InsertForce4D to fill mat.
-  //
-  // dag must be DaggerNo or DaggerYes; it sets the compressor and picks which
-  // of the two gamma indices (mu or mu + Nd) is used for the hop.
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-
-  Compressor compressor(dag);
-
-  FermionField hopped(B.Grid());
-  FermionField lhs(B.Grid());
-  lhs = A;
-
-  st.HaloExchange(B, compressor);
-
-  for (int mu = 0; mu < Nd; mu++) {
-    // Flip gamma (1+g)<->(1-g) if dag.
-    const int gamma = dag ? mu : mu + Nd;
-
-    // Single hop of B along leg mu; the result is written into hopped.
-    auto U_v      = U.View();
-    auto B_v      = B.View();
-    auto hopped_v = hopped.View();
-    auto st_v     = st.View();
-    thread_loop( (int site = 0; site < B.Grid()->oSites(); site++) ,{
-      Kernels::DhopDirK(st_v, U_v, st.CommBuf(), site, site, B_v, hopped_v, mu, gamma);
-    });
-
-    // Spin trace outer product, stored for direction mu.
-    Impl::InsertForce4D(mat, hopped, lhs, mu);
-  }
-
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { // full-grid derivative; U and V are fermion fields here
-  conformable(U.Grid(), _grid); // U must live on the full grid
-  conformable(U.Grid(), V.Grid());
-  conformable(U.Grid(), mat.Grid());
-  // mat inherits the checkerboard label of U
-  mat.Checkerboard() = U.Checkerboard();
-  // full-grid stencil and full doubled gauge field
-  DerivInternal(Stencil, Umu, mat, U, V, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { // derivative with Odd U and Even V
-  conformable(U.Grid(), _cbgrid); // U must live on the checkerboarded grid
-  conformable(U.Grid(), V.Grid());
-  //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
-  // Motivation: look at the SchurDiff operator
-  
-  assert(V.Checkerboard() == Even); // V is the Even-parity field
-  assert(U.Checkerboard() == Odd); // U is the Odd-parity field
-  mat.Checkerboard() = Odd; // result is labelled Odd
-  // StencilEven is the stencil built for an Even source; links come from UmuOdd
-  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { // derivative with Even U and Odd V
-  conformable(U.Grid(), _cbgrid); // U must live on the checkerboarded grid
-  conformable(U.Grid(), V.Grid());
-  //conformable(U.Grid(), mat.Grid());
-  // mat's grid is deliberately not checked (check above is commented out)
-  assert(V.Checkerboard() == Odd); // V is the Odd-parity field
-  assert(U.Checkerboard() == Even); // U is the Even-parity field
-  mat.Checkerboard() = Even; // result is labelled Even
-  // StencilOdd is the stencil built for an Odd source; links come from UmuEven
-  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) { // hopping term on the full grid
-  conformable(in.Grid(), _grid);  // verifies full grid
-  conformable(in.Grid(), out.Grid());
-  // out keeps the checkerboard label of in
-  out.Checkerboard() = in.Checkerboard();
-  // full-grid stencil, ordering object and doubled gauge field
-  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) { // hop from an Even source to an Odd result
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-  // the source must be Even; the result is labelled Odd
-  assert(in.Checkerboard() == Even);
-  out.Checkerboard() = Odd;
-  // StencilEven is the stencil built for an Even source; links come from UmuOdd
-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) { // hop from an Odd source to an Even result
-  conformable(in.Grid(), _cbgrid);    // verifies half grid
-  conformable(in.Grid(), out.Grid());  // drops the cb check
-  // the source must be Odd; the result is labelled Even
-  assert(in.Checkerboard() == Odd);
-  out.Checkerboard() = Even;
-  // StencilOdd is the stencil built for an Odd source; links come from UmuEven
-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { // single-direction term of the operator
-  DhopDir(in, out, dir, disp); // pure forwarder: no mass term is added here
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
-  // disp == 1 uses stencil leg dir with gamma index dir + 4; any other disp uses leg dir + 4 with gamma index dir.
-  const bool forward = (disp == 1);
-  const int dirdisp  = forward ? dir : dir + 4;
-  const int gamma    = forward ? dir + 4 : dir;
-  DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
-};
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) {
-  // Applies the single stencil leg dirdisp, with gamma index gamma, to in and
-  // stores the result in out. dag is only used to configure the halo compressor.
-  Compressor compressor(dag);
-  Stencil.HaloExchange(in, compressor);
-  auto in_v = in.View();
-  auto out_v = out.View();  // DhopDirK writes through this view, so it must come from out
-  auto Umu_v = Umu.View();
-  auto Stencil_v = Stencil.View();
-  thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-    Kernels::DhopDirK(Stencil_v, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dirdisp, gamma);
-  });
-};
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
-                                       DoubledGaugeField &U,
-                                       const FermionField &in,
-                                       FermionField &out, int dag) { // dispatcher between the two hopping-term implementations
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) // overlapped path exists only in GRID_OMP builds
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  else
-#endif 
-    DhopInternalSerial(st,lo,U,in,out,dag); // used without GRID_OMP, or when Comms is not CommsAndCompute
-  // all arguments are forwarded unchanged
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-						      DoubledGaugeField &U,
-						      const FermionField &in,
-						      FermionField &out, int dag) { // hopping term with comms threads running beside compute threads; lo is not used in this body
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
-  Compressor compressor(dag);
-  int len =  U.Grid()->oSites(); // number of sites split among the compute threads below
-  const int LLs =  1; // NOTE(review): not referenced anywhere else in this function
-  // gather the halo and merge the shared-memory part before entering the parallel region
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1; // -1 is treated as one comms thread
-    assert(nthreads > ncomms); // at least one thread must remain for compute
-    if (tid >= ncomms) { // threads ncomms..nthreads-1 compute; lower ids communicate
-      nthreads -= ncomms; // from here on: number of compute threads
-      int ttid  = tid - ncomms; // index among the compute threads
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) { // the first rem compute threads take one extra site
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-      auto U_v   = U.View();
-      auto in_v  = in.View();
-      auto out_v = out.View();
-      auto st_v  = st.View();
-      int Opt = WilsonKernelsStatic::Opt;
-
-      if (dag == DaggerYes) {
-        for (int sss = myblock; sss < myblock+myn; ++sss) { // this thread's contiguous block of sites
-	  Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0); // trailing (1,0): presumably the interior-only pass -- confirm against the kernel signature
-	  //	  Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) { // this thread's contiguous block of sites
-	  Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0); // trailing (1,0): presumably the interior-only pass -- confirm against the kernel signature
-	  //	  Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } 
-
-    } else {
-      st.CommunicateThreaded(); // comms threads do the communication while the others compute
-    }
-  }  //pragma
-  // second pass over every site once the parallel region (compute and comms) has finished
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  =  st.View();
-    int Opt = WilsonKernelsStatic::Opt;
-    if (dag == DaggerYes) {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1); // trailing (0,1): presumably the exterior-only pass -- confirm
-      });
-    } else {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1); // trailing (0,1): presumably the exterior-only pass -- confirm
-      });
-    }
-  }
-#else
-  assert(0); // without GRID_OMP this function has no implementation
-#endif
-};
-
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
-                                       DoubledGaugeField &U,
-                                       const FermionField &in,
-                                       FermionField &out, int dag) { // hopping term with the full halo exchange done first; lo is not used in this body
-  assert((dag == DaggerNo) || (dag == DaggerYes));
-  Compressor compressor(dag);
-  st.HaloExchange(in, compressor); // halo exchange is issued before any site kernel runs
-  // kernel variant and the views handed to the site kernels
-  int Opt = WilsonKernelsStatic::Opt;
-  auto U_v  = U.View();
-  auto in_v = in.View();
-  auto out_v= out.View();
-  auto st_v = st.View();
-  if (dag == DaggerYes) {
-    accelerator_loop( sss,in_v, { // loop macro defined elsewhere; presumably iterates sss over the sites of in_v -- confirm
-      Kernels::DhopSiteDag(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-    });
-  } else {
-    accelerator_loop( sss,in_v, { // same loop, un-daggered site kernel
-      Kernels::DhopSite(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-    });
-  }
-};
-/*Change ends */
-
-/*******************************************************************************
- * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current 
- * sequentially.
- ******************************************************************************/
-template <class Impl>
-void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-                                                   PropagatorField &q_in_2,
-                                                   PropagatorField &q_out,
-                                                   Current curr_type,
-                                                   unsigned int mu)
-{ // contracts two propagators into q_out along direction mu; curr_type is not used in this body
-  Gamma g5(Gamma::Algebra::Gamma5); // NOTE(review): g5 is not referenced again in this function
-  conformable(_grid, q_in_1.Grid());
-  conformable(_grid, q_in_2.Grid());
-  conformable(_grid, q_out.Grid());
-  PropagatorField tmp1(_grid), tmp2(_grid);
-  q_out = Zero(); // cleared before the site kernels are applied
-
-  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
-  // Inefficient comms method but not performance critical.
-  tmp1 = Cshift(q_in_1, mu, 1);
-  tmp2 = Cshift(q_in_2, mu, 1);
-  auto tmp1_v  =  tmp1.View();
-  auto tmp2_v  =  tmp2.View();
-  auto q_in_1_v=q_in_1.View();
-  auto q_in_2_v=q_in_2.View();
-  auto q_out_v = q_out.View();
-  auto Umu_v   =   Umu.View();
-  thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
-      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
-					       q_in_2_v[sU],
-					       q_out_v[sU],
-					       Umu_v, sU, mu); // forward piece: shifted q1 with unshifted q2
-      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
-					       tmp2_v[sU],
-					       q_out_v[sU],
-					       Umu_v, sU, mu); // backward piece: unshifted q1 with shifted q2, same output site
-  });
-}
-
-
-template <class Impl>
-void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
-                                              PropagatorField &q_out,
-                                              Current curr_type,
-                                              unsigned int mu,
-                                              unsigned int tmin, 
-                                              unsigned int tmax,
-					      ComplexField &lattice_cmplx)
-{ // sequential conserved-current insertion on timeslices tmin..tmax; curr_type is not used in this body
-  conformable(_grid, q_in.Grid());
-  conformable(_grid, q_out.Grid());
-
-  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
-  Complex i(0.0,1.0); // NOTE(review): i is not referenced again in this function
-  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
-  unsigned int tshift = (mu == Tp) ? 1 : 0; // backward time window moves by one only when mu is the time direction
-  unsigned int LLt    = GridDefaultLatt()[Tp]; // NOTE(review): time extent is read from GridDefaultLatt(), not from _grid -- confirm they always agree
-
-  q_out = Zero();
-  LatticeInteger coords(_grid);
-  LatticeCoordinate(coords, Tp); // coords holds the time coordinate of every site
-
-  // Need q(x + mu) and q(x - mu).
-  tmp    = Cshift(q_in, mu, 1);
-  tmpFwd = tmp*lattice_cmplx; // forward: the field multiplies after the shift
-  tmp    = lattice_cmplx*q_in;
-  tmpBwd = Cshift(tmp, mu, -1); // backward: the field multiplies before the shift
-
-  auto coords_v = coords.View();
-  auto tmpFwd_v = tmpFwd.View();
-  auto tmpBwd_v = tmpBwd.View();
-  auto Umu_v    = Umu.View();
-  auto q_out_v  = q_out.View();
-
-  thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
-
-    // Compute the sequential conserved current insertion only if our simd
-    // object contains a timeslice we need.
-    vInteger t_mask   = ((coords_v[sU] >= tmin) &&
-			 (coords_v[sU] <= tmax));
-    Integer timeSlices = Reduce(t_mask);
-
-    if (timeSlices > 0) {
-      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
-					  q_out_v[sU], 
-					  Umu_v, sU, mu, t_mask);
-    }
-
-    // Repeat for backward direction.
-    t_mask     = ((coords_v[sU] >= (tmin + tshift)) && 
-		  (coords_v[sU] <= (tmax + tshift)));
-    
-    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-    unsigned int t0 = 0;
-    if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords_v[sU] == t0 ));
-    
-    timeSlices = Reduce(t_mask);
-
-    if (timeSlices > 0) {
-      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
-					  q_out_v[sU], 
-					  Umu_v, sU, mu, t_mask);
-    }
-  });
-}
-
-FermOpTemplateInstantiate(WilsonFermion); // macro defined elsewhere; presumably emits the explicit instantiations for the default impls -- confirm
-AdjointFermOpTemplateInstantiate(WilsonFermion); // presumably the adjoint-representation impls -- confirm
-TwoIndexFermOpTemplateInstantiate(WilsonFermion); // presumably the two-index-representation impls -- confirm
-GparityFermOpTemplateInstantiate(WilsonFermion); // presumably the G-parity impls -- confirm
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonFermion5D.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.cc
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -227,8 +227,8 @@ public:
 			   Current curr_type,
 			   unsigned int mu,
 			   unsigned int tmin,
-                             unsigned int tmax,
-			     ComplexField &lattice_cmplx);
+			   unsigned int tmax,
+			   ComplexField &lattice_cmplx);
 };

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -90,15 +90,7 @@ public:
    auto UU = coalescedRead(U(mu));
    mult(&phi(), &UU, &chi());
  }
-      
-#ifdef GPU_VEC
-  static accelerator_inline void copyLinkGpu(int lane,
-					     SiteDoubledGaugeField & UU,
-					     const SiteDoubledGaugeField &U)
-  {
-    auto U_l   = extractLane(lane,U);
-    insertLane(lane,UU,U_l);
-  }
+
  static accelerator_inline void multLinkGpu(int lane,
 					     typename SiteHalfSpinor::scalar_object &phi,
 					     const SiteDoubledGaugeField &U,
@@ -108,17 +100,6 @@ public:
    auto U_l   = extractLane(lane,U(mu));
    phi() =  U_l * chi();
  }
-#else
-  static accelerator_inline void multLinkGpu(int lane,
-					     SiteHalfSpinor &phi,
-					     const SiteDoubledGaugeField &U,
-					     const SiteHalfSpinor &chi,
-					     int mu) 
-  {
-    auto U_l   = U(mu);
-    phi() =  U_l * chi();
-  }
-#endif
    
  static accelerator_inline void multLinkProp(SitePropagator &phi,
 					      const SiteDoubledGaugeField &U,
--- a/Grid/qcd/action/fermion/WilsonKernels.cc
+++ b/Grid/qcd/action/fermion/WilsonKernels.cc
@@ -1,445 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
-int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
-
-////////////////////////////////////////////
-// Generic implementation; move to different file?
-////////////////////////////////////////////
-  
-#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in[SE->_offset]);			\
-    }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st);	\
-  Recon(result, Uchi);
-  
-#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if (SE->_is_local) {						\
-    chi_p = &chi;						\
-    if (SE->_permute) {						\
-      spProj(tmp, in[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else {							\
-      spProj(chi, in[SE->_offset]);			\
-    }								\
-  } else if ( st.same_node[Dir] ) {				\
-      chi_p = &buf[SE->_offset];				\
-  }								\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
-  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    chi_p = &buf[SE->_offset];					\
-    Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st);	\
-    Recon(result, Uchi);					\
-    nmu++;							\
-  }
-
-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
-  if (gamma == Dir) {						\
-    if (SE->_is_local && SE->_permute) {			\
-      spProj(tmp, in[SE->_offset]);			\
-      permute(chi, tmp, ptype);					\
-    } else if (SE->_is_local) {					\
-      spProj(chi, in[SE->_offset]);			\
-    } else {							\
-      chi = buf[SE->_offset];					\
-    }								\
-    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);	\
-    Recon(result, Uchi);					\
-  }
-
-  ////////////////////////////////////////////////////////////////////
-  // All legs kernels ; comms then compute
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-							 SiteHalfSpinor *buf, int sF,
-							 int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
-						      SiteHalfSpinor *buf, int sF,
-						      int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
-  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-  ////////////////////////////////////////////////////////////////////
-  // Interior kernels
-  ////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
-							    SiteHalfSpinor *buf, int sF,
-							    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  result=Zero();
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
-							 SiteHalfSpinor *buf, int sF,
-							 int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  result=Zero();
-  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
-  vstream(out[sF], result);
-};
-////////////////////////////////////////////////////////////////////
-// Exterior kernels
-////////////////////////////////////////////////////////////////////
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
-							    SiteHalfSpinor *buf, int sF,
-							    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  //  SiteHalfSpinor tmp;
-  //  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=Zero();
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out[sF] = out[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
-							 SiteHalfSpinor *buf, int sF,
-							 int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-  //  SiteHalfSpinor tmp;
-  //  SiteHalfSpinor chi;
-  SiteHalfSpinor *chi_p;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  result=Zero();
-  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
-  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
-  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
-  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
-  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
-  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
-  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
-  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
-  if ( nmu ) { 
-    out[sF] = out[sF] + result; 
-  }
-};
-
-template <class Impl>
-void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
-						int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
-{
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteSpinor result;
-  SiteHalfSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-
-  SE = st.GetEntry(ptype, dir, sF);
-  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
-  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
-  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
-  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
-  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
-  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
-  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
-  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
-  vstream(out[sF], result);
-}
-
-/*******************************************************************************
- * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current 
- * sequentially. Common to both 4D and 5D.
- ******************************************************************************/
-// N.B. Functions below assume a -1/2 factor within U.
-#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
-#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteFwd
- * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_1 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeFieldView &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-  SitePropagator result, tmp;
-  Gamma g5(Gamma::Algebra::Gamma5);
-
-  Impl::multLinkProp(tmp, U[sU], q_in_1, mu);
-
-  result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
-
-  if (switch_sign) {
-    q_out -= result;
-  } else {
-    q_out += result;
-  }
-}
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteBwd
- * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_2 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
-                                                  const SitePropagator &q_in_1,
-                                                  const SitePropagator &q_in_2,
-                                                  SitePropagator &q_out,
-                                                  DoubledGaugeFieldView &U,
-                                                  unsigned int sU,
-                                                  unsigned int mu,
-                                                  bool switch_sign)
-{
-  SitePropagator result, tmp;
-  Gamma g5(Gamma::Algebra::Gamma5);
-
-  Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd);
-
-  result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
-  if (switch_sign) {
-    q_out += result;
-  } else {
-    q_out -= result;
-  }
-}
-
-// G-parity requires more specialised implementation.
-#define NO_CURR_SITE(Impl) \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeFieldView &U,         \
-                                                  unsigned int sU,              \
-                                                  unsigned int mu,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-} \
-template <> \
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
-                                                  const SitePropagator &q_in_1, \
-                                                  const SitePropagator &q_in_2, \
-                                                  SitePropagator &q_out,        \
-                                                  DoubledGaugeFieldView &U,         \
-                                                  unsigned int mu,              \
-                                                  unsigned int sU,              \
-                                                  bool switch_sign)             \
-{ \
-    assert(0); \
-}
-
-NO_CURR_SITE(GparityWilsonImplF);
-NO_CURR_SITE(GparityWilsonImplD);
-NO_CURR_SITE(GparityWilsonImplFH);
-NO_CURR_SITE(GparityWilsonImplDF);
-
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeFieldView &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-  SitePropagator result;
-  
-  Impl::multLinkProp(result, U[sU], q_in, mu);
-  result = WilsonCurrentFwd(result, mu);
-
-  // Zero any unwanted timeslice entries.
-  result = predicatedWhere(t_mask, result, 0.*result);
-  
-  if (switch_sign) {
-    q_out -= result;
-  } else {
-    q_out += result;
-  }
-}
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in -ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeFieldView &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vInteger t_mask,
-                                                     bool switch_sign)
-{
-  SitePropagator result;
-  Impl::multLinkProp(result, U[sU], q_in, mu + Nd);
-  result = WilsonCurrentBwd(result, mu);
-
-  // Zero any unwanted timeslice entries.
-  result = predicatedWhere(t_mask, result, 0.*result);
-  
-  if (switch_sign) {
-    q_out += result;
-  } else {
-    q_out -= result;
-  }
-}
-
-FermOpTemplateInstantiate(WilsonKernels);
-AdjointFermOpTemplateInstantiate(WilsonKernels);
-TwoIndexFermOpTemplateInstantiate(WilsonKernels);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -107,7 +107,7 @@ private:
 					 int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-						 int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
  static accelerator void GenericDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
 						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -1,125 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-///////////////////////////////////////////////////////////
-// Default to no assembler implementation
-///////////////////////////////////////////////////////////
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-{
-  assert(0);
-}
-
-#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
-#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
-
-#define INSTANTIATE_ASM(A) \
-template void WilsonKernels<A>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
- \
-template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
-                                  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
-
-//INSTANTIATE_ASM(WilsonImplF);
-//INSTANTIATE_ASM(WilsonImplD);
-INSTANTIATE_ASM(GparityWilsonImplF);
-INSTANTIATE_ASM(GparityWilsonImplD);
-//INSTANTIATE_ASM(ZWilsonImplF);
-//INSTANTIATE_ASM(ZWilsonImplD);
-//INSTANTIATE_ASM(DomainWallVec5dImplF);
-//INSTANTIATE_ASM(DomainWallVec5dImplD);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplF);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplD);
-
-//INSTANTIATE_ASM(WilsonImplFH);
-//INSTANTIATE_ASM(WilsonImplDF);
-//INSTANTIATE_ASM(ZWilsonImplFH);
-//INSTANTIATE_ASM(ZWilsonImplDF);
-INSTANTIATE_ASM(GparityWilsonImplFH);
-INSTANTIATE_ASM(GparityWilsonImplDF);
-//INSTANTIATE_ASM(DomainWallVec5dImplFH);
-//INSTANTIATE_ASM(DomainWallVec5dImplDF);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
-//INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
@@ -1,650 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#if defined(AVX512) 
-    ///////////////////////////////////////////////////////////
-    // If we are AVX512 specialise the single precision routine
-    ///////////////////////////////////////////////////////////
-#include <simd/Intel512wilson.h>
-#include <simd/Intel512single.h>
-    
-static Vector<vComplexF> signsF;
-
-  template<typename vtype>    
-  int setupSigns(Vector<vtype>& signs ){
-    Vector<vtype> bother(2);
-    signs = bother;
-    vrsign(signs[0]);
-    visign(signs[1]);
-    return 1;
-  }
-
-  static int signInitF = setupSigns(signsF);
-
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];  
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-#define INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // the four AsmDhopSiteDagExt bodies below are expanded with only EXTERIOR defined (the "Post comms kernel" macro set of WilsonKernelsAsmBody.h)
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-				    
-template<> void 
-WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B)  // expands to nothing for the specialisations that follow
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) // from here on MULT_2SPIN forwards to the _LS variant
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG // with KERNEL_DAG undefined the body header takes the "#else" branch of its DIRn macro table
-#define INTERIOR_AND_EXTERIOR // select the "Comms then compute kernel" macro set of WilsonKernelsAsmBody.h
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // the four AsmDhopSiteInt bodies below are expanded with only INTERIOR defined (the "Pre comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // the four AsmDhopSiteExt bodies below are expanded with only EXTERIOR defined (the "Post comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) // MULT_2SPIN now forwards to the _LSNOPF variant; it is not switched back before the dag kernels further down
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG // with KERNEL_DAG defined the body header takes the first branch of its DIRn macro table
-#define INTERIOR_AND_EXTERIOR // NOTE(review): MULT_2SPIN is still the _LSNOPF variant set for AsmDhopSiteExt above, whereas the undag AsmDhopSite/AsmDhopSiteInt groups used _LS -- confirm this is intended
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // the four AsmDhopSiteDagInt bodies below are expanded with only INTERIOR defined (the "Pre comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // the four AsmDhopSiteDagExt bodies below are expanded with only EXTERIOR defined (the "Post comms kernel" macro set of WilsonKernelsAsmBody.h)
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS // drop the helper macros used by the specialisations above; all three are defined again further down
-#undef MAYBEPERM
-#undef MULT_2SPIN
-	
-
-
-///////////////////////////////////////////////////////////
-// If we are AVX512 specialise the double precision routine
-///////////////////////////////////////////////////////////
-
-#include <simd/Intel512double.h>
-    
-static Vector<vComplexD> signsD;
-static int signInitD = setupSigns(signsD); // the initialiser calls setupSigns(signsD) during static initialisation; signInitD itself is not read in the code visible here
-    
-#define MAYBEPERM(A,perm) if (perm) { A ; } // statement A runs only when perm is non-zero
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];  // declares a local pointer to the first element of signsD
-
-
-#define INTERIOR_AND_EXTERIOR    // NOTE(review): these three mode lines are repeated identically just before the first specialisation below
-#undef  INTERIOR
-#undef  EXTERIOR
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG // with KERNEL_DAG undefined the body header takes the "#else" branch of its DIRn macro table
-#define INTERIOR_AND_EXTERIOR // select the "Comms then compute kernel" macro set of WilsonKernelsAsmBody.h
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // the four AsmDhopSiteInt bodies below are expanded with only INTERIOR defined (the "Pre comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // the four AsmDhopSiteExt bodies below are expanded with only EXTERIOR defined (the "Post comms kernel" macro set of WilsonKernelsAsmBody.h)
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG // with KERNEL_DAG defined the body header takes the first branch of its DIRn macro table
-#define INTERIOR_AND_EXTERIOR // select the "Comms then compute kernel" macro set of WilsonKernelsAsmBody.h
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // the four AsmDhopSiteDagInt bodies below are expanded with only INTERIOR defined (the "Pre comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // the four AsmDhopSiteDagExt bodies below are expanded with only EXTERIOR defined (the "Post comms kernel" macro set of WilsonKernelsAsmBody.h)
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B)  // expands to nothing for the specialisations that follow
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) // from here on MULT_2SPIN forwards to the _LS variant
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG // with KERNEL_DAG undefined the body header takes the "#else" branch of its DIRn macro table
-#define INTERIOR_AND_EXTERIOR // select the "Comms then compute kernel" macro set of WilsonKernelsAsmBody.h
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // the four AsmDhopSiteInt bodies below are expanded with only INTERIOR defined (the "Pre comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // the four AsmDhopSiteExt bodies below are expanded with only EXTERIOR defined (the "Post comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef  MULT_2SPIN
-#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) // MULT_2SPIN now forwards to the _LSNOPF variant; it is not switched back before the dag kernels further down
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG // with KERNEL_DAG defined the body header takes the first branch of its DIRn macro table
-#define INTERIOR_AND_EXTERIOR // NOTE(review): MULT_2SPIN is still the _LSNOPF variant set for AsmDhopSiteExt above, whereas the undag AsmDhopSite/AsmDhopSiteInt groups used _LS -- confirm this is intended
-#undef INTERIOR
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#define INTERIOR // the four AsmDhopSiteDagInt bodies below are expanded with only INTERIOR defined (the "Pre comms kernel" macro set of WilsonKernelsAsmBody.h)
-#undef EXTERIOR
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef INTERIOR_AND_EXTERIOR
-#undef INTERIOR
-#define EXTERIOR // the four AsmDhopSiteDagExt bodies below are expanded with only EXTERIOR defined (the "Post comms kernel" macro set of WilsonKernelsAsmBody.h)
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h> // the included text is the function body for the declarator above
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-template<> void 
-WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-template<> void 
-WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
-#undef COMPLEX_SIGNS // drop the helper macros that the double-precision specialisations above were expanded with
-#undef MAYBEPERM
-#undef MULT_2SPIN
-
-#endif //AVX512
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,198 +0,0 @@
-#ifdef KERNEL_DAG // dagger table: legs 0-3 map to the XP/YP/ZP/TP macros and legs 4-7 to XM/YM/ZM/TM
-#define DIR0_PROJMEM(base) XP_PROJMEM(base);
-#define DIR1_PROJMEM(base) YP_PROJMEM(base);
-#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
-#define DIR3_PROJMEM(base) TP_PROJMEM(base);
-#define DIR4_PROJMEM(base) XM_PROJMEM(base);
-#define DIR5_PROJMEM(base) YM_PROJMEM(base);
-#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
-#define DIR7_PROJMEM(base) TM_PROJMEM(base);
-#define DIR0_RECON   XP_RECON // leg 0 is the only leg mapped to a RECON macro without the _ACCUM suffix
-#define DIR1_RECON   YP_RECON_ACCUM
-#define DIR2_RECON   ZP_RECON_ACCUM
-#define DIR3_RECON   TP_RECON_ACCUM
-#define DIR4_RECON   XM_RECON_ACCUM
-#define DIR5_RECON   YM_RECON_ACCUM
-#define DIR6_RECON   ZM_RECON_ACCUM
-#define DIR7_RECON   TM_RECON_ACCUM
-#else // KERNEL_DAG not defined: same table with the P and M macro families exchanged
-#define DIR0_PROJMEM(base) XM_PROJMEM(base);
-#define DIR1_PROJMEM(base) YM_PROJMEM(base);
-#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
-#define DIR3_PROJMEM(base) TM_PROJMEM(base);
-#define DIR4_PROJMEM(base) XP_PROJMEM(base);
-#define DIR5_PROJMEM(base) YP_PROJMEM(base);
-#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
-#define DIR7_PROJMEM(base) TP_PROJMEM(base);
-#define DIR0_RECON   XM_RECON // again the only non-_ACCUM reconstruction
-#define DIR1_RECON   YM_RECON_ACCUM
-#define DIR2_RECON   ZM_RECON_ACCUM
-#define DIR3_RECON   TM_RECON_ACCUM
-#define DIR4_RECON   XP_RECON_ACCUM
-#define DIR5_RECON   YP_RECON_ACCUM
-#define DIR6_RECON   ZP_RECON_ACCUM
-#define DIR7_RECON   TP_RECON_ACCUM
-#endif // KERNEL_DAG
-
-////////////////////////////////////////////////////////////////////////////////
-// Comms then compute kernel: each ASM_LEG finishes leg Dir and calls st.GetInfo for NxtDir, so ASM_LEG_XP fetches the first leg's info itself
-////////////////////////////////////////////////////////////////////////////////
-#ifdef INTERIOR_AND_EXTERIOR // in this variant MULT_2SPIN_DIR_PF and RECON run for every leg; there is no st.same_node test
-
-#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-      basep = st.GetPFInfo(nent,plocal); nent++;			\
-      if ( local ) {							\
-	LOAD64(%r10,isigns);						\
-	PROJ(base);							\
-	MAYBEPERM(PERMUTE_DIR,perm);					\
-      } else {								\
-	LOAD_CHI(base);							\
-      }									\
-      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
-      PREFETCH_CHIMU(base);						\
-      MULT_2SPIN_DIR_PF(Dir,basep);					\
-      LOAD64(%r10,isigns);						\
-      RECON;								\
-
-#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  PF_GAUGE(Xp);								\
-  PREFETCH1_CHIMU(base);						\
-  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) 
-
-#define RESULT(base,basep) SAVE_RESULT(base,basep); // the result is always written through SAVE_RESULT in this variant
-
-#endif // INTERIOR_AND_EXTERIOR
-
-////////////////////////////////////////////////////////////////////////////////
-// Pre comms kernel -- prefetch like normal because it is mostly right
-////////////////////////////////////////////////////////////////////////////////
-#ifdef INTERIOR // a leg is multiplied and reconstructed only when (local || st.same_node[Dir]); ASM_LEG_XP runs ZERO_PSI first
-
-#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-      basep = st.GetPFInfo(nent,plocal); nent++;			\
-      if ( local ) {							\
-	LOAD64(%r10,isigns);						\
-	PROJ(base);							\
-	MAYBEPERM(PERMUTE_DIR,perm);					\
-      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\
-      if ( local || st.same_node[Dir] ) {				\
-	MULT_2SPIN_DIR_PF(Dir,basep);					\
-	LOAD64(%r10,isigns);						\
-	RECON;								\
-      }									\
-      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
-      PREFETCH_CHIMU(base);						\
-
-#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  PF_GAUGE(Xp);								\
-  PREFETCH1_CHIMU(base);						\
-  { ZERO_PSI; }								\
-  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) 
-
-#define RESULT(base,basep) SAVE_RESULT(base,basep); // the result is always written through SAVE_RESULT in this variant
-
-#endif // INTERIOR
-////////////////////////////////////////////////////////////////////////////////
-// Post comms kernel
-////////////////////////////////////////////////////////////////////////////////
-#ifdef EXTERIOR // a leg is processed only when (!local)&&(!st.same_node[Dir]); nmu counts the legs processed since ASM_LEG_XP reset it
-
-
-#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  if((!local)&&(!st.same_node[Dir]) ) {					\
-    LOAD_CHI(base);							\
-    MULT_2SPIN_DIR_PF(Dir,base);					\
-    LOAD64(%r10,isigns);						\
-    RECON;								\
-    nmu++;								\
-  }									
-
-#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  nmu=0;								\
-  { ZERO_PSI;}								\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
-  if((!local)&&(!st.same_node[Dir]) ) {					\
-    LOAD_CHI(base);							\
-    MULT_2SPIN_DIR_PF(Dir,base);					\
-    LOAD64(%r10,isigns);						\
-    RECON;								\
-    nmu++;								\
-  }
-
-#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} // NOTE(review): basep is unused and base is passed twice to ADD_RESULT -- confirm intended
-
-#endif // EXTERIOR
-{
-  int nmu; // only assigned and read by the EXTERIOR variants of ASM_LEG_XP / ASM_LEG / RESULT
-  int local,perm, ptype; // written by st.GetInfo for the leg about to be processed
-  uint64_t base; // value returned by st.GetInfo, later reused for &out[ss]
-  uint64_t basep; // value returned by st.GetPFInfo
-  const uint64_t plocal =(uint64_t) & in[0]; // address of in[0] as an integer, handed to GetInfo / GetPFInfo
-
-  COMPLEX_SIGNS(isigns); // macro supplied by the including file; the asm legs reference isigns
-  MASK_REGS;
-  int nmax=U.oSites(); // wrap limit for the "next site" index ssn
-  for(int site=0;site<Ns;site++) { // the site loop is bounded by the Ns argument of the enclosing function
-#ifndef EXTERIOR
-    //    int sU =lo.Reorder(ssU);
-    int sU =ssU;
-    int ssn=ssU+1;     if(ssn>=nmax) ssn=0; // index following ssU, wrapping to 0 at nmax
-    //    int sUn=lo.Reorder(ssn);
-    int sUn=ssn;
-    LOCK_GAUGE(0);
-#else
-    int sU =ssU;
-    int ssn=ssU+1;     if(ssn>=nmax) ssn=0; // same computation as the other branch; this branch has no LOCK_GAUGE
-    int sUn=ssn;
-#endif
-    for(int s=0;s<Ls;s++) {
-      ss =sU*Ls+s; // overwrites the ss argument with the index for this (site, s)
-      ssn=sUn*Ls+s; 
-      int  ent=ss*8;// 2*Ndim
-      int nent=ssn*8; // table index passed to st.GetPFInfo, derived from the next site
-
-   ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
-      ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
-      ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
-      ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
-
-      ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
-      ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
-      ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
-      ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
-
-#ifdef EXTERIOR
-      if (nmu==0) break; // no leg met the EXTERIOR condition: leave the s loop without touching out[ss]
-      //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
-#endif
-      base = (uint64_t) &out[ss];
-      basep= st.GetPFInfo(nent,plocal); nent++;
-      RESULT(base,basep);
-    }
-    ssU++;
-    UNLOCK_GAUGE(0); // NOTE(review): executed in every variant, while LOCK_GAUGE(0) above is compiled only when EXTERIOR is not defined
-  }
-}
-
-#undef DIR0_PROJMEM // remove every helper macro defined earlier in this header, so a later inclusion can define them afresh
-#undef DIR1_PROJMEM
-#undef DIR2_PROJMEM
-#undef DIR3_PROJMEM
-#undef DIR4_PROJMEM
-#undef DIR5_PROJMEM
-#undef DIR6_PROJMEM
-#undef DIR7_PROJMEM
-#undef DIR0_RECON
-#undef DIR1_RECON
-#undef DIR2_RECON
-#undef DIR3_RECON
-#undef DIR4_RECON
-#undef DIR5_RECON
-#undef DIR6_RECON
-#undef DIR7_RECON
-#undef ASM_LEG
-#undef ASM_LEG_XP
-#undef RESULT
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
@@ -1,161 +0,0 @@
-{
-  int locala,perma, ptypea;
-  int localb,permb, ptypeb;
-  uint64_t basea, baseb;
-  const uint64_t plocal =(uint64_t) & in._odata[0];
-
-  //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
-
-  MASK_REGS;
-
-  for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);  
-  for(int s=0;s<Ls;s++) {
-  ss=sU*Ls+s;
-  ////////////////////////////////
-  // Xp
-  ////////////////////////////////
-  int ent=ss*8;// 2*Ndim
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-
-  if ( locala ) {
-    LOAD64(%r10,isigns);
-    XM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFXP(Xp,baseb);
-  }
-  LOAD64(%r10,isigns);
-  XM_RECON;
-
-  ////////////////////////////////
-  // Yp
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFYP(Yp,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zp
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFZP(Zp,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tp
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFTP(Tp,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Xm
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFXM(Xm,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  XP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Ym
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFYM(Ym,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zm
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFZM(Zm,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tm
-  ////////////////////////////////
-  basea = (uint64_t)&out._odata[ss];
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
-  {
-    MULT_2SPIN_DIR_PFTM(Tm,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TP_RECON_ACCUM;
-
-  SAVE_RESULT(&out._odata[ss],baseb);
-
-  } 
-  ssU++;
-  }
-}
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
@@ -1,187 +0,0 @@
-{
-  int locala,perma, ptypea;
-  int localb,permb, ptypeb;
-  int localc,permc, ptypec;
-  uint64_t basea, baseb, basec;
-  uint64_t basex;
-  const uint64_t plocal =(uint64_t) & in._odata[0];
-
-  //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
-
-  MASK_REGS;
-
-  for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);
-
-  for(int s=0;s<Ls;s++) {
-  ss     =sU*Ls+s;
-
-  ////////////////////////////////
-  // Xp
-  ////////////////////////////////
-  int ent=ss*8;// 2*Ndim
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-
-  basex = basea;
-
-  label(FX(XP) );
-  if ( locala ) {
-    LOAD64(%r10,isigns);
-    XM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFXP(Xp,baseb);
-  }
-  LOAD64(%r10,isigns);
-  XM_RECON;
-
-  ////////////////////////////////
-  // Yp
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  label(FX(YP) );
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFYP(Yp,basec);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zp
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  label(FX(ZP) );
-  if ( localc ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR1,permc);
-  } else { 
-    LOAD_CHI(basec);
-  }
-  {
-    MULT_2SPIN_DIR_PFZP(Zp,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tp
-  ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  label(FX(TP) );
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR0,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFTP(Tp,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TM_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Xm
-  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  label(FX(XM) );
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR3,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFXM(Xm,basec);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  XP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Ym
-  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  label(FX(YM) );
-  if ( localc ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR2,permc);
-  } else { 
-    LOAD_CHI(basec);
-  }
-  {
-    MULT_2SPIN_DIR_PFYM(Ym,basea);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  YP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Zm
-  ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  label(FX(ZM) );
-  if ( locala ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
-  } else { 
-    LOAD_CHI(basea);
-  }
-  {
-    MULT_2SPIN_DIR_PFZM(Zm,baseb);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  ZP_RECON_ACCUM;
-
-  ////////////////////////////////
-  // Tm
-  ////////////////////////////////
-  basea = (uint64_t)&out._odata[ss];
-  PREFETCH_CHIMU(basea);
-  label(FX(TM) );
-  if ( localb ) {
-    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
-  } else { 
-    LOAD_CHI(baseb);
-  }
-  {
-    MULT_2SPIN_DIR_PFTM(Tm,basec);
-  }
-  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-  TP_RECON_ACCUM;
-
-  //  PREFETCH_CHIMU(basex);
-  label(FX(SAV) );
-  SAVE_RESULT(&out._odata[ss]);
-  
-  }
-  ssU++;
-  }
-}
--- a/Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
@@ -1,150 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-#if defined(QPX) 
-
-    ///////////////////////////////////////////////////////////
-    // If we are QPX specialise the single precision routine
-    ///////////////////////////////////////////////////////////
-
-#include <simd/IBM_qpx.h>
-#include <simd/IBM_qpx_single.h>
-  
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
-#define COMPLEX_SIGNS(isigns) 
-
-#define INTERIOR_AND_EXTERIOR    
-#undef  INTERIOR
-#undef  EXTERIOR
-  
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
-/////////////////////////////////////////////////////////////////
-// XYZT vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, single
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, single
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-#undef MAYBEPERM
-#undef MULT_2SPIN
-	
-///////////////////////////////////////////////////////////
-// DP routines
-///////////////////////////////////////////////////////////
-
-#include <simd/IBM_qpx_double.h>
-    
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
-
-/////////////////////////////////////////////////////////////////
-// XYZT Vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-      
-
-/////////////////////////////////////////////////////////////////
-// XYZT Vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-
-#undef MAYBEPERM
-#undef MULT_2SPIN
-#define MAYBEPERM(A,B) 
-#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, undag Kernel, double
-/////////////////////////////////////////////////////////////////
-#undef KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-				    
-/////////////////////////////////////////////////////////////////
-// Ls vectorised, dag Kernel, double
-/////////////////////////////////////////////////////////////////
-#define KERNEL_DAG
-template<> void 
-WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-/////////////////////////////////////////////////////////////////
-	
-#undef MAYBEPERM
-#undef MULT_2SPIN
-
-#endif 
--- a/Grid/qcd/action/fermion/WilsonKernelsGpu.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsGpu.cc
@@ -1,378 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc
-
-Copyright (C) 2018
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-//////////////////////////////////////////////////////////////
-// Gpu implementation; thread loop is implicit ; move to header
-//////////////////////////////////////////////////////////////
-accelerator_inline void synchronise(void) 
-{
-#ifdef __CUDA_ARCH__
-  __syncthreads();
-#endif
-  return;
-}
-accelerator_inline int get_my_lanes(int Nsimd) 
-{
-#ifdef __CUDA_ARCH__
-  return 1;
-#else 
-  return Nsimd;
-#endif
-}
-accelerator_inline int get_my_lane_offset(int Nsimd) 
-{
-#ifdef __CUDA_ARCH__
-  return ( (threadIdx.x) % Nsimd);
-#else
-  return 0;
-#endif
-}
-
-accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
-{
-#ifdef __CUDA_ARCH__
-  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); 
-  uint4 * mem_pun  = (uint4 *)mem; // force 128 bit loads
-  uint4 * chip_pun = (uint4 *)&chip;
-  * chip_pun = * mem_pun;
-#else 
-  chip = *mem;
-#endif
-  return;
-}
-
-#ifdef GPU_VEC
-#if 1
-#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
-  if (SE._is_local) {							\
-    int mask = Nsimd >> (ptype + 1);					\
-    int plane= SE._permute ? (lane ^ mask) : lane;			\
-    auto in_l = extractLane(plane,in[SE._offset+s]);			\
-    spProj(chi,in_l);							\
-  } else {								\
-    chi  = extractLane(lane,buf[SE._offset+s]);				\
-  }									\
-  synchronise();
-#else 
-#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
-  { int mask = Nsimd >> (ptype + 1);					\
-  int plane= SE._permute ? (lane ^ mask) : lane;			\
-  auto in_l = extractLane(plane,in[SE._offset+s]);			\
-  spProj(chi,in_l); }							
-#endif
-#else 
-#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
-  if (SE._is_local) {							\
-    auto in_t = in[SE._offset+s];					\
-    if (SE._permute) {							\
-      spProj(tmp, in_t);						\
-      permute(chi, tmp, ptype);						\
-    } else {								\
-      spProj(chi, in_t);						\
-    }									\
-  } else {								\
-    chi  = buf[SE._offset+s];						\
-  }									\
-  synchronise();
-#endif
-
-template <class Impl>
-accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-							    SiteHalfSpinor *buf, int Ls, int s,
-							    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-#ifdef GPU_VEC
-  typename SiteHalfSpinor::scalar_object chi;
-  typename SiteHalfSpinor::scalar_object Uchi;
-  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteHalfSpinor tmp;
-  SiteSpinor   result;
-#endif
-  typedef typename SiteSpinor::scalar_type scalar_type;
-  typedef typename SiteSpinor::vector_type vector_type;
-  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
-
-  uint64_t lane_offset= get_my_lane_offset(Nsimd);
-  uint64_t lanes      = get_my_lanes(Nsimd);
-
-  StencilEntry *SE_mem;
-  StencilEntry SE; 
-
-  int ptype;
-  uint64_t ssF = Ls * sU;
-  uint64_t sF  = ssF + s;
-#ifndef __CUDA_ARCH__
-  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
-#else
-  int lane = lane_offset; {
-#endif
-    SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
-    spReconXp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
-    accumReconYp(result, Uchi);
-      
-    SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
-    accumReconZp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
-    accumReconTp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
-    accumReconXm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
-    accumReconYm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
-    accumReconZm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
-    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
-    accumReconTm(result, Uchi);
-
-#ifdef GPU_VEC
-    insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
-  }
-}
-
-template <class Impl>
-accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
-						  SiteHalfSpinor *buf,  int Ls, int s,
-						  int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-#ifdef GPU_VEC
-  typename SiteHalfSpinor::scalar_object chi;
-  typename SiteHalfSpinor::scalar_object Uchi;
-  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteHalfSpinor tmp;
-  SiteSpinor   result;
-#endif
-  typedef typename SiteSpinor::scalar_type scalar_type;
-  typedef typename SiteSpinor::vector_type vector_type;
-  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
-
-  uint64_t lane_offset= get_my_lane_offset(Nsimd);
-  uint64_t lanes      = get_my_lanes(Nsimd);
-
-  //  printf (" sU %d s %d Nsimd %d lanes %ld lane_off %ld\n",sU, s, Nsimd, lanes, lane_offset);
-
-  StencilEntry *SE_mem;
-  StencilEntry SE;
-  int ptype;
-  // Forces some degree of coalesce on the table look ups
-  // Could also use wide load instructions on the data structure
-  uint64_t ssF = Ls * sU;
-  uint64_t sF  = ssF + s;
-
-#ifndef __CUDA_ARCH__
-  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
-#else
-  int lane = lane_offset; {
-#endif
-    SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); 
-    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
-    spReconXm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
-    accumReconYm(result, Uchi);
-      
-    SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
-    accumReconZm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
-    accumReconTm(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
-    accumReconXp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
-    accumReconYp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
-    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
-    accumReconZp(result, Uchi);
-
-    SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
-    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); 
-    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
-    accumReconTp(result, Uchi);
-
-#ifdef GPU_VEC
-    insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
-  }
-
-};
-
-// Template specialise Gparity to empty for now
-#define GPU_EMPTY(A)							\
-  template <>								\
-accelerator_inline void							\
-WilsonKernels<A>::GpuDhopSite(StencilView &st,				\
-			      SiteDoubledGaugeField &U,			\
-			      SiteHalfSpinor *buf, int Ls, int sF,	\
-			      int sU,					\
-			      const FermionFieldView &in,		\
-			      FermionFieldView &out) { assert(0);};	\
-  template <>								\
-  accelerator_inline void							\
-  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,			\
-				   DoubledGaugeFieldView &U,		\
-				   SiteHalfSpinor *buf, int Ls,int sF,	\
-				   int sU,				\
-				   const FermionFieldView &in,		\
-				   FermionFieldView &out) { assert(0);};
-
-GPU_EMPTY(GparityWilsonImplF);
-GPU_EMPTY(GparityWilsonImplFH);
-GPU_EMPTY(GparityWilsonImplD);
-GPU_EMPTY(GparityWilsonImplDF);
-
-template <class Impl>
-void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
-				     int Ls, int Nsite, const FermionField &in, FermionField &out,
-				     int interior,int exterior) 
-{
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  //	  uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   s  = cur%Ls;
-	  //	  uint64_t   sF = cur;         
-	  cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        WilsonKernels<Impl>::DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
-  template <class Impl>
-  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
-				     int Ls, int Nsite, const FermionField &in, FermionField &out,
-				     int interior,int exterior) 
-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  = st.View();
-
-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  // uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   s  = cur%Ls;
-	  //uint64_t   sF = cur;         
-	  cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        WilsonKernels<Impl>::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
-  }
-
-
-/*
-GPU_EMPTY(DomainWallVec5dImplF);
-GPU_EMPTY(DomainWallVec5dImplFH);
-GPU_EMPTY(DomainWallVec5dImplD);
-GPU_EMPTY(DomainWallVec5dImplDF);
-GPU_EMPTY(ZDomainWallVec5dImplF);
-GPU_EMPTY(ZDomainWallVec5dImplFH);
-GPU_EMPTY(ZDomainWallVec5dImplD);
-GPU_EMPTY(ZDomainWallVec5dImplDF);
-*/
-
-FermOpTemplateInstantiate(WilsonKernels);
-AdjointFermOpTemplateInstantiate(WilsonKernels);
-TwoIndexFermOpTemplateInstantiate(WilsonKernels);
-
-NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHand.cc
@@ -1,654 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-#define REGISTER
-
-// LOAD_CHIMU: copy all twelve components of the full spinor in[offset]
-// (first index 0..3, second index 0..2) into Chimu_<i><j>.
-// Expects `in` and `offset` to be in scope at the expansion site.
-// The Chimu_* names are #defined further down in this file as aliases of the
-// Chi_* / UChi_* locals, so this load overwrites those variables.
-#define LOAD_CHIMU \
-  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=ref()(0)(0);\
-    Chimu_01=ref()(0)(1);\
-    Chimu_02=ref()(0)(2);\
-    Chimu_10=ref()(1)(0);\
-    Chimu_11=ref()(1)(1);\
-    Chimu_12=ref()(1)(2);\
-    Chimu_20=ref()(2)(0);\
-    Chimu_21=ref()(2)(1);\
-    Chimu_22=ref()(2)(2);\
-    Chimu_30=ref()(3)(0);\
-    Chimu_31=ref()(3)(1);\
-    Chimu_32=ref()(3)(2);}
-
-// LOAD_CHI: copy the six components of the half spinor buf[offset]
-// (first index 0..1, second index 0..2) into Chi_<i><j>.
-// Expects `buf` and `offset` to be in scope at the expansion site.
-#define LOAD_CHI\
-  {const SiteHalfSpinor &ref(buf[offset]);	\
-    Chi_00 = ref()(0)(0);\
-    Chi_01 = ref()(0)(1);\
-    Chi_02 = ref()(0)(2);\
-    Chi_10 = ref()(1)(0);\
-    Chi_11 = ref()(1)(1);\
-    Chi_12 = ref()(1)(2);}
-
-// MULT_2SPIN(A): multiply both rows of the half spinor Chi by the 3x3 link
-// U[sU](A), i.e.  UChi_<s><c> = sum_j ref()(c,j) * Chi_<s><j>  for s = 0,1.
-// Link elements are fetched through Impl::loadLinkElement.
-// To splat or not to splat depends on the implementation
-// Only six link temporaries exist: columns 0 and 1 are loaded first, then
-// U_00/U_10/U_20 are reloaded with column 2 for the final accumulation.
-// Expects `U`, `sU` and the Chi_*/UChi_*/U_* locals to be in scope.
-#define MULT_2SPIN(A)\
-  {auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));	\
-   Impl::loadLinkElement(U_10,ref()(1,0));	\
-   Impl::loadLinkElement(U_20,ref()(2,0));	\
-   Impl::loadLinkElement(U_01,ref()(0,1));	\
-   Impl::loadLinkElement(U_11,ref()(1,1));	\
-   Impl::loadLinkElement(U_21,ref()(2,1));	\
-    UChi_00 = U_00*Chi_00;\
-    UChi_10 = U_00*Chi_10;\
-    UChi_01 = U_10*Chi_00;\
-    UChi_11 = U_10*Chi_10;\
-    UChi_02 = U_20*Chi_00;\
-    UChi_12 = U_20*Chi_10;\
-    UChi_00+= U_01*Chi_01;\
-    UChi_10+= U_01*Chi_11;\
-    UChi_01+= U_11*Chi_01;\
-    UChi_11+= U_11*Chi_11;\
-    UChi_02+= U_21*Chi_01;\
-    UChi_12+= U_21*Chi_11;\
-    Impl::loadLinkElement(U_00,ref()(0,2));	\
-    Impl::loadLinkElement(U_10,ref()(1,2));	\
-    Impl::loadLinkElement(U_20,ref()(2,2));	\
-    UChi_00+= U_00*Chi_02;\
-    UChi_10+= U_00*Chi_12;\
-    UChi_01+= U_10*Chi_02;\
-    UChi_11+= U_10*Chi_12;\
-    UChi_02+= U_20*Chi_02;\
-    UChi_12+= U_20*Chi_12;}
-
-
-// PERMUTE_DIR(dir): apply permute<dir> in place to each of the six Chi_*
-// half-spinor components (token-pasted, so `dir` must be a literal such as 0..3).
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
-
-// "Plus" spin projections: reduce the full spinor Chimu (rows 0..3) to the
-// half spinor Chi (rows 0..1), component by component.
-//   XP: Chi_0 = Chimu_0 + i*Chimu_3 ;  Chi_1 = Chimu_1 + i*Chimu_2
-//   YP: Chi_0 = Chimu_0 -   Chimu_3 ;  Chi_1 = Chimu_1 +   Chimu_2
-//   ZP: Chi_0 = Chimu_0 + i*Chimu_2 ;  Chi_1 = Chimu_1 - i*Chimu_3
-//   TP: Chi_0 = Chimu_0 +   Chimu_2 ;  Chi_1 = Chimu_1 +   Chimu_3
-//      hspin(0)=fspin(0)+timesI(fspin(3));
-//      hspin(1)=fspin(1)+timesI(fspin(2));
-#define XP_PROJ \
-    Chi_00 = Chimu_00+timesI(Chimu_30);\
-    Chi_01 = Chimu_01+timesI(Chimu_31);\
-    Chi_02 = Chimu_02+timesI(Chimu_32);\
-    Chi_10 = Chimu_10+timesI(Chimu_20);\
-    Chi_11 = Chimu_11+timesI(Chimu_21);\
-    Chi_12 = Chimu_12+timesI(Chimu_22);
-
-#define YP_PROJ \
-    Chi_00 = Chimu_00-Chimu_30;\
-    Chi_01 = Chimu_01-Chimu_31;\
-    Chi_02 = Chimu_02-Chimu_32;\
-    Chi_10 = Chimu_10+Chimu_20;\
-    Chi_11 = Chimu_11+Chimu_21;\
-    Chi_12 = Chimu_12+Chimu_22;
-
-#define ZP_PROJ \
-  Chi_00 = Chimu_00+timesI(Chimu_20);		\
-  Chi_01 = Chimu_01+timesI(Chimu_21);		\
-  Chi_02 = Chimu_02+timesI(Chimu_22);		\
-  Chi_10 = Chimu_10-timesI(Chimu_30);		\
-  Chi_11 = Chimu_11-timesI(Chimu_31);		\
-  Chi_12 = Chimu_12-timesI(Chimu_32);
-
-#define TP_PROJ \
-  Chi_00 = Chimu_00+Chimu_20;		\
-  Chi_01 = Chimu_01+Chimu_21;		\
-  Chi_02 = Chimu_02+Chimu_22;		\
-  Chi_10 = Chimu_10+Chimu_30;		\
-  Chi_11 = Chimu_11+Chimu_31;		\
-  Chi_12 = Chimu_12+Chimu_32;
-
-
-// "Minus" spin projections: same row pairings as the *P_PROJ macros with the
-// sign of the second term flipped.
-//   XM: Chi_0 = Chimu_0 - i*Chimu_3 ;  Chi_1 = Chimu_1 - i*Chimu_2
-//   YM: Chi_0 = Chimu_0 +   Chimu_3 ;  Chi_1 = Chimu_1 -   Chimu_2
-//   ZM: Chi_0 = Chimu_0 - i*Chimu_2 ;  Chi_1 = Chimu_1 + i*Chimu_3
-//   TM: Chi_0 = Chimu_0 -   Chimu_2 ;  Chi_1 = Chimu_1 -   Chimu_3
-//      hspin(0)=fspin(0)-timesI(fspin(3));
-//      hspin(1)=fspin(1)-timesI(fspin(2));
-#define XM_PROJ \
-    Chi_00 = Chimu_00-timesI(Chimu_30);\
-    Chi_01 = Chimu_01-timesI(Chimu_31);\
-    Chi_02 = Chimu_02-timesI(Chimu_32);\
-    Chi_10 = Chimu_10-timesI(Chimu_20);\
-    Chi_11 = Chimu_11-timesI(Chimu_21);\
-    Chi_12 = Chimu_12-timesI(Chimu_22);
-
-#define YM_PROJ \
-    Chi_00 = Chimu_00+Chimu_30;\
-    Chi_01 = Chimu_01+Chimu_31;\
-    Chi_02 = Chimu_02+Chimu_32;\
-    Chi_10 = Chimu_10-Chimu_20;\
-    Chi_11 = Chimu_11-Chimu_21;\
-    Chi_12 = Chimu_12-Chimu_22;
-
-#define ZM_PROJ \
-  Chi_00 = Chimu_00-timesI(Chimu_20);		\
-  Chi_01 = Chimu_01-timesI(Chimu_21);		\
-  Chi_02 = Chimu_02-timesI(Chimu_22);		\
-  Chi_10 = Chimu_10+timesI(Chimu_30);		\
-  Chi_11 = Chimu_11+timesI(Chimu_31);		\
-  Chi_12 = Chimu_12+timesI(Chimu_32);
-
-#define TM_PROJ \
-  Chi_00 = Chimu_00-Chimu_20;		\
-  Chi_01 = Chimu_01-Chimu_21;		\
-  Chi_02 = Chimu_02-Chimu_22;		\
-  Chi_10 = Chimu_10-Chimu_30;		\
-  Chi_11 = Chimu_11-Chimu_31;		\
-  Chi_12 = Chimu_12-Chimu_32;
-
-// X-direction reconstruction: expand the link-multiplied half spinor UChi
-// (rows 0..1) back into the four-row result.
-//   XP: rows 0,1 <- UChi_0,UChi_1 ; row 2 <- -i*UChi_1 ; row 3 <- -i*UChi_0
-//   XM: rows 0,1 <- UChi_0,UChi_1 ; row 2 <- +i*UChi_1 ; row 3 <- +i*UChi_0
-// The plain *_RECON forms assign result_* (so they also initialise it);
-// the *_RECON_ACCUM forms add the same quantities to the existing result_*.
-//      fspin(0)=hspin(0);
-//      fspin(1)=hspin(1);
-//      fspin(2)=timesMinusI(hspin(1));
-//      fspin(3)=timesMinusI(hspin(0));
-#define XP_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesMinusI(UChi_10);\
-  result_21 = timesMinusI(UChi_11);\
-  result_22 = timesMinusI(UChi_12);\
-  result_30 = timesMinusI(UChi_00);\
-  result_31 = timesMinusI(UChi_01);\
-  result_32 = timesMinusI(UChi_02);
-
-#define XP_RECON_ACCUM\
-  result_00+=UChi_00;\
-  result_01+=UChi_01;\
-  result_02+=UChi_02;\
-  result_10+=UChi_10;\
-  result_11+=UChi_11;\
-  result_12+=UChi_12;\
-  result_20-=timesI(UChi_10);\
-  result_21-=timesI(UChi_11);\
-  result_22-=timesI(UChi_12);\
-  result_30-=timesI(UChi_00);\
-  result_31-=timesI(UChi_01);\
-  result_32-=timesI(UChi_02);
-
-#define XM_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesI(UChi_10);\
-  result_21 = timesI(UChi_11);\
-  result_22 = timesI(UChi_12);\
-  result_30 = timesI(UChi_00);\
-  result_31 = timesI(UChi_01);\
-  result_32 = timesI(UChi_02);
-
-#define XM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_10);\
-  result_21+= timesI(UChi_11);\
-  result_22+= timesI(UChi_12);\
-  result_30+= timesI(UChi_00);\
-  result_31+= timesI(UChi_01);\
-  result_32+= timesI(UChi_02);
-
-// Y/Z/T reconstruct-and-accumulate macros. Each adds UChi_0,UChi_1 to result
-// rows 0,1 and a direction-specific combination to rows 2,3:
-//   YP: row2 +=   UChi_1 ; row3 -=   UChi_0      YM: row2 -=   UChi_1 ; row3 +=   UChi_0
-//   ZP: row2 -= i*UChi_0 ; row3 += i*UChi_1      ZM: row2 += i*UChi_0 ; row3 -= i*UChi_1
-//   TP: row2 +=   UChi_0 ; row3 +=   UChi_1      TM: row2 -=   UChi_0 ; row3 -=   UChi_1
-// They only accumulate, so result_* must already hold a value when they run.
-#define YP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_10;\
-  result_21+= UChi_11;\
-  result_22+= UChi_12;\
-  result_30-= UChi_00;\
-  result_31-= UChi_01;\
-  result_32-= UChi_02;
-
-#define YM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_10;\
-  result_21-= UChi_11;\
-  result_22-= UChi_12;\
-  result_30+= UChi_00;\
-  result_31+= UChi_01;\
-  result_32+= UChi_02;
-
-#define ZP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= timesI(UChi_00);			\
-  result_21-= timesI(UChi_01);			\
-  result_22-= timesI(UChi_02);			\
-  result_30+= timesI(UChi_10);			\
-  result_31+= timesI(UChi_11);			\
-  result_32+= timesI(UChi_12);
-
-#define ZM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_00);			\
-  result_21+= timesI(UChi_01);			\
-  result_22+= timesI(UChi_02);			\
-  result_30-= timesI(UChi_10);			\
-  result_31-= timesI(UChi_11);			\
-  result_32-= timesI(UChi_12);
-
-#define TP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_00;			\
-  result_21+= UChi_01;			\
-  result_22+= UChi_02;			\
-  result_30+= UChi_10;			\
-  result_31+= UChi_11;			\
-  result_32+= UChi_12;
-
-#define TM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_00;	\
-  result_21-= UChi_01;	\
-  result_22-= UChi_02;	\
-  result_30-= UChi_10;	\
-  result_31-= UChi_11;	\
-  result_32-= UChi_12;
-
-// HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON): process one stencil direction.
-//  1. Fetch the stencil entry for (DIR, ss); GetEntry also receives `ptype`.
-//  2. Local entry: load the full spinor from in[offset], apply PROJ, and
-//     apply PERMUTE_DIR(PERM) when the entry's _permute flag is set.
-//     Non-local entry: load an already-projected half spinor from buf[offset].
-//  3. Multiply by the link for DIR (MULT_2SPIN) and run RECON.
-// Expands to several statements that are not wrapped in a block, so it must be
-// used where a statement sequence is valid. Expects st, ss, SE, offset,
-// local, perm and ptype to be in scope.
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU;					\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI;					\
-  }						\
-  MULT_2SPIN(DIR);				\
-  RECON;					
-
-// HAND_STENCIL_LEG_INT: like HAND_STENCIL_LEG, but a non-local entry is only
-// processed when st.same_node[DIR] is set. If the entry is neither local nor
-// same-node, nothing is loaded and the multiply/RECON step is skipped, so the
-// leg contributes nothing to result_*.
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU;					\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI;					\
-  }						\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-  }
-
-// HAND_STENCIL_LEG_EXT: the complement of HAND_STENCIL_LEG_INT. Only entries
-// that are non-local AND not same-node are processed: the half spinor is read
-// from buf[offset], multiplied by the link and reconstructed, and `nmu` is
-// incremented to record that this site received a contribution.
-// PROJ and PERM are accepted for signature parity but are not used.
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI;					\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-    nmu++;					\
-  }
-
-// HAND_RESULT(ss): write the twelve result_* values to out[ss] with vstream,
-// overwriting whatever out[ss] held before.
-#define HAND_RESULT(ss)				\
-  {						\
-    SiteSpinor & ref (out[ss]);		\
-    vstream(ref()(0)(0),result_00);		\
-    vstream(ref()(0)(1),result_01);		\
-    vstream(ref()(0)(2),result_02);		\
-    vstream(ref()(1)(0),result_10);		\
-    vstream(ref()(1)(1),result_11);		\
-    vstream(ref()(1)(2),result_12);		\
-    vstream(ref()(2)(0),result_20);		\
-    vstream(ref()(2)(1),result_21);		\
-    vstream(ref()(2)(2),result_22);		\
-    vstream(ref()(3)(0),result_30);		\
-    vstream(ref()(3)(1),result_31);		\
-    vstream(ref()(3)(2),result_32);		\
-  }
-
-// HAND_RESULT_EXT(ss): add the twelve result_* values into out[ss], but only
-// when `nmu` is non-zero (i.e. at least one HAND_STENCIL_LEG_EXT leg fired).
-// Sites with nmu == 0 leave out[ss] untouched.
-// Note: expands to a bare `if (...) { ... }` statement.
-#define HAND_RESULT_EXT(ss)			\
-  if (nmu){					\
-    SiteSpinor & ref (out[ss]);		\
-    ref()(0)(0)+=result_00;		\
-    ref()(0)(1)+=result_01;		\
-    ref()(0)(2)+=result_02;		\
-    ref()(1)(0)+=result_10;		\
-    ref()(1)(1)+=result_11;		\
-    ref()(1)(2)+=result_12;		\
-    ref()(2)(0)+=result_20;		\
-    ref()(2)(1)+=result_21;		\
-    ref()(2)(2)+=result_22;		\
-    ref()(3)(0)+=result_30;		\
-    ref()(3)(1)+=result_31;		\
-    ref()(3)(2)+=result_32;		\
-  }
-
-
-// HAND_DECLARATIONS(a): declare the Simd locals used by every macro in this
-// file: 12 result_* accumulators, 6 Chi_* half-spinor components, 6 UChi_*
-// link-multiplied components and 6 U_* link temporaries.
-// The argument `a` is not used. None of the variables are initialised here.
-#define HAND_DECLARATIONS(a)			\
-  Simd result_00;				\
-  Simd result_01;				\
-  Simd result_02;				\
-  Simd result_10;				\
-  Simd result_11;				\
-  Simd result_12;				\
-  Simd result_20;				\
-  Simd result_21;				\
-  Simd result_22;				\
-  Simd result_30;				\
-  Simd result_31;				\
-  Simd result_32;				\
-  Simd Chi_00;					\
-  Simd Chi_01;					\
-  Simd Chi_02;					\
-  Simd Chi_10;					\
-  Simd Chi_11;					\
-  Simd Chi_12;					\
-  Simd UChi_00;					\
-  Simd UChi_01;					\
-  Simd UChi_02;					\
-  Simd UChi_10;					\
-  Simd UChi_11;					\
-  Simd UChi_12;					\
-  Simd U_00;					\
-  Simd U_10;					\
-  Simd U_20;					\
-  Simd U_01;					\
-  Simd U_11;					\
-  Simd U_21;
-
-// ZERO_RESULT: set all twelve result_* accumulators to Zero(). Needed before
-// a sequence of legs that only use the *_RECON_ACCUM forms.
-#define ZERO_RESULT				\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
-
-// The full spinor Chimu has no storage of its own: rows 0,1 alias Chi_* and
-// rows 2,3 alias UChi_*. LOAD_CHIMU therefore overwrites Chi_*/UChi_*, and a
-// projection such as `Chi_00 = Chimu_00 + timesI(Chimu_30)` expands to
-// `Chi_00 = Chi_00 + timesI(UChi_10)`. UChi_* is free at that point because
-// MULT_2SPIN only writes it after the projection has run.
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-NAMESPACE_BEGIN(Grid);
-
-// Hand-unrolled Wilson hopping term for a single site.
-// For each of the eight stencil legs (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm) HAND_STENCIL_LEG
-// obtains a projected half spinor (from `in` for local entries, otherwise from
-// `buf`), multiplies it by the link U[sU](dir) and reconstructs into result_*.
-// Forward legs use the *M projections, backward legs the *P projections.
-// The first leg uses XM_RECON, which assigns result_* and so initialises the
-// accumulators; every later leg accumulates. HAND_RESULT then writes out[ss].
-// The parameter names st, U, buf, ss, sU, in and out are referenced directly
-// by the macros. When GRID_NVCC is defined the body is replaced by assert(0).
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// Permute indices per direction: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-// Daggered counterpart of HandDhopSite: the same eight-leg structure with the
-// projections swapped (forward legs use *P, backward legs use *M).
-// The first leg uses XP_RECON, which assigns result_* and so initialises the
-// accumulators; the remaining legs accumulate and HAND_RESULT writes out[ss].
-// When GRID_NVCC is defined the body is replaced by assert(0).
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-// "Interior" variant of HandDhopSite: only legs whose stencil entry is local
-// or flagged st.same_node[dir] contribute (see HAND_STENCIL_LEG_INT).
-// Because any leg may be skipped, result_* is zeroed first and every leg uses
-// an accumulating reconstruction; HAND_RESULT then overwrites out[ss].
-// When GRID_NVCC is defined the body is replaced by assert(0).
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// Permute indices per direction: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-// "Interior" variant of HandDhopSiteDag: only legs whose stencil entry is
-// local or flagged st.same_node[dir] contribute (see HAND_STENCIL_LEG_INT).
-// result_* is zeroed first, every leg accumulates, and HAND_RESULT overwrites
-// out[ss]. When GRID_NVCC is defined the body is replaced by assert(0).
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
-#else
-  assert(0);
-#endif
-}
-
-// "Exterior" variant of HandDhopSite: only legs whose stencil entry is
-// non-local and not same-node contribute (see HAND_STENCIL_LEG_EXT), reading
-// their half spinor from `buf`. `nmu` counts the legs that fired;
-// HAND_RESULT_EXT adds result_* into out[ss] only when nmu is non-zero, so
-// out[ss] is updated in place rather than overwritten.
-// `local` and `perm` are declared but not used by the EXT leg macro.
-// When GRID_NVCC is defined the body is replaced by assert(0).
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// Permute indices per direction: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
-#else
-  assert(0);
-#endif
-}
-
-// "Exterior" variant of HandDhopSiteDag: only legs whose stencil entry is
-// non-local and not same-node contribute (see HAND_STENCIL_LEG_EXT).
-// `nmu` counts the legs that fired; HAND_RESULT_EXT adds result_* into
-// out[ss] only when nmu is non-zero.
-// `local` and `perm` are declared but not used by the EXT leg macro.
-// When GRID_NVCC is defined the body is replaced by assert(0).
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
-#else
-  assert(0);
-#endif
-}
-
-////////////// Wilson ; uses this implementation /////////////////////
-
-// INSTANTIATE_THEM(A): explicitly instantiate the six hand-unrolled member
-// functions defined in this file (HandDhopSite, HandDhopSiteDag and their
-// Int/Ext variants) for the implementation type A.
-// The signatures here must stay in step with the definitions above.
-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); 
-
-// Implementation types for which the hand-unrolled kernels are instantiated.
-INSTANTIATE_THEM(WilsonImplF);
-INSTANTIATE_THEM(WilsonImplD);
-INSTANTIATE_THEM(ZWilsonImplF);
-INSTANTIATE_THEM(ZWilsonImplD);
-INSTANTIATE_THEM(DomainWallVec5dImplF);
-INSTANTIATE_THEM(DomainWallVec5dImplD);
-INSTANTIATE_THEM(ZDomainWallVec5dImplF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplD);
-INSTANTIATE_THEM(WilsonImplFH);
-INSTANTIATE_THEM(WilsonImplDF);
-INSTANTIATE_THEM(ZWilsonImplFH);
-INSTANTIATE_THEM(ZWilsonImplDF);
-INSTANTIATE_THEM(DomainWallVec5dImplFH);
-INSTANTIATE_THEM(DomainWallVec5dImplDF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
-INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
-INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
@@ -1,943 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-#define REGISTER
-
-// LOAD_CHIMU_BODY(F): copy the twelve components ref(F)(i)(j), i = 0..3,
-// j = 0..2, into Chimu_<i><j>. F selects the outermost index of `ref`
-// (presumably the G-parity flavour - confirm against the SiteSpinor type).
-// Expects a variable named `ref` in scope. The expansion has no trailing
-// semicolon; the caller supplies it.
-#define LOAD_CHIMU_BODY(F)			\
-  Chimu_00=ref(F)(0)(0);			\
-  Chimu_01=ref(F)(0)(1);			\
-  Chimu_02=ref(F)(0)(2);			\
-  Chimu_10=ref(F)(1)(0);			\
-  Chimu_11=ref(F)(1)(1);			\
-  Chimu_12=ref(F)(1)(2);			\
-  Chimu_20=ref(F)(2)(0);			\
-  Chimu_21=ref(F)(2)(1);			\
-  Chimu_22=ref(F)(2)(2);			\
-  Chimu_30=ref(F)(3)(0);			\
-  Chimu_31=ref(F)(3)(1);			\
-  Chimu_32=ref(F)(3)(2)
-
-// LOAD_CHIMU(DIR,F,PERM): bind `ref` to in[offset] and load index F of it.
-// DIR and PERM are accepted but not used by this variant.
-#define LOAD_CHIMU(DIR,F,PERM)						\
-  { const SiteSpinor & ref (in[offset]); LOAD_CHIMU_BODY(F); }
-
-// LOAD_CHI_BODY(F): copy the six half-spinor components ref(F)(i)(j),
-// i = 0..1, j = 0..2, into Chi_<i><j>. Expects `ref` in scope; the expansion
-// has no trailing semicolon.
-#define LOAD_CHI_BODY(F)				\
-    Chi_00 = ref(F)(0)(0);\
-    Chi_01 = ref(F)(0)(1);\
-    Chi_02 = ref(F)(0)(2);\
-    Chi_10 = ref(F)(1)(0);\
-    Chi_11 = ref(F)(1)(1);\
-    Chi_12 = ref(F)(1)(2)
-
-// LOAD_CHI(DIR,F,PERM): bind `ref` to buf[offset] and load index F of it.
-// DIR and PERM are accepted but not used by this variant.
-#define LOAD_CHI(DIR,F,PERM)					\
-  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
-
-
-//G-parity implementations using in-place intrinsic ops
-
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-//0h,1l -> 1l,0h
-//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
-//Pulled fermion through forwards face, GPBC on upper component
-//Need 0= 0l 1h   1= 1l 0h
-//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
-//Pulled fermion through backwards face, GPBC on lower component
-//Need 0= 1l 0h   1= 0l 1h
-
-// DO_TWIST_0L_1H(INTO,S,C,F,PERM,tmp1,tmp2,tmp3):
-//   tmp1 <- permute<PERM>( ref(1)(S)(C) )
-//   (tmp2,tmp3) <- exchange<PERM>( ref(0)(S)(C), tmp1 )
-//   INTO <- tmp2   (tmp3 is computed but discarded)
-// F is accepted but not used. The three tmp arguments are scratch variables
-// that get overwritten. Expands to three bare statements (no enclosing block).
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(1)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-// DO_TWIST_1L_0H: same as DO_TWIST_0L_1H with the roles of ref(0) and ref(1)
-// swapped: ref(0) is permuted and then exchanged with ref(1).
-//0l 0h -> 0h 0l
-//1l 1h, 0h 0l -> 1l 0h, 1h 0l
-#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(0)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-
-
-
-// LOAD_CHI_SETUP(DIR,F): decide how index F must be fetched for leg DIR.
-// Sets the caller-scope variables g, direction, distance, sl, inplace_twist:
-//   * default: g = F, inplace_twist = 0 (load index F unchanged);
-//   * if the stencil entry wrapped around the lattice (SE->_around_the_world)
-//     and a twist is enabled for DIR % 4:
-//       - sl == 1 (st._simd_layout of that direction): g becomes the other
-//         index, (F+1) % 2;
-//       - otherwise inplace_twist = 1 and the caller must build the value
-//         with the DO_TWIST_* macros.
-// Expects SE and st to be in scope.
-#define LOAD_CHI_SETUP(DIR,F)						\
-  g = F;								\
-  direction = st._directions[DIR];				\
-  distance = st._distances[DIR];				\
-  sl = st._simd_layout[direction];			        \
-  inplace_twist = 0;						\
-  if(SE->_around_the_world && st.parameters.twists[DIR % 4]){		\
-    if(sl == 1){							\
-      g = (F+1) % 2;							\
-    }else{								\
-      inplace_twist = 1;						\
-    }									\
-  }  
-
-// LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM): load the full spinor from
-// in[offset] with the G-parity twist applied.
-// After LOAD_CHI_SETUP, either index g is loaded directly, or (inplace_twist)
-// each of the twelve components is built with DO_TWIST_0L_1H / DO_TWIST_1L_0H.
-// Which twist is used depends on F, `distance` and the caller's `perm` flag.
-// The U_* link temporaries serve as scratch space for the twist, alternating
-// between {U_00,U_01,U_10} and {U_11,U_20,U_21}; their previous contents are lost.
-#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
-  { const SiteSpinor &ref(in[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHIMU_BODY(g);						\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      }else{								\
-	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      } \
-    } \
-  }
-
-
-// LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM): half-spinor counterpart of
-// LOAD_CHIMU_GPARITY_INPLACE_TWIST, reading from buf[offset].
-// Either index g is loaded directly, or each of the six Chi components is
-// built with DO_TWIST_0L_1H / DO_TWIST_1L_0H under the same F/distance/perm
-// condition. Both the U_* and UChi_* locals are used as scratch space here and
-// are overwritten.
-#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
-  { const SiteHalfSpinor &ref(buf[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHI_BODY(g);							\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }else{								\
-	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }									\
-    }									\
-  }
-
-
-// The G-parity loaders are currently implemented by the in-place-twist variants.
-#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
-#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
-
-// MULT_2SPIN_BODY: multiply both rows of the half spinor Chi by the 3x3 link
-// bound to `ref`, i.e.  UChi_<s><c> = sum_j ref()(c,j) * Chi_<s><j>, s = 0,1.
-// Columns 0 and 1 of the link are loaded first; U_00/U_10/U_20 are then
-// reloaded with column 2. The expansion has no trailing semicolon.
-// To splat or not to splat depends on the implementation
-#define MULT_2SPIN_BODY \
-  Impl::loadLinkElement(U_00,ref()(0,0));	\
-  Impl::loadLinkElement(U_10,ref()(1,0));	\
-  Impl::loadLinkElement(U_20,ref()(2,0));	\
-  Impl::loadLinkElement(U_01,ref()(0,1));	\
-  Impl::loadLinkElement(U_11,ref()(1,1));	\
-  Impl::loadLinkElement(U_21,ref()(2,1));	\
-  UChi_00 = U_00*Chi_00;			\
-  UChi_10 = U_00*Chi_10;			\
-  UChi_01 = U_10*Chi_00;			\
-  UChi_11 = U_10*Chi_10;			\
-  UChi_02 = U_20*Chi_00;			\
-  UChi_12 = U_20*Chi_10;			\
-  UChi_00+= U_01*Chi_01;			\
-  UChi_10+= U_01*Chi_11;			\
-  UChi_01+= U_11*Chi_01;			\
-  UChi_11+= U_11*Chi_11;			\
-  UChi_02+= U_21*Chi_01;			\
-  UChi_12+= U_21*Chi_11;			\
-  Impl::loadLinkElement(U_00,ref()(0,2));	\
-  Impl::loadLinkElement(U_10,ref()(1,2));	\
-  Impl::loadLinkElement(U_20,ref()(2,2));	\
-  UChi_00+= U_00*Chi_02;			\
-  UChi_10+= U_00*Chi_12;			\
-  UChi_01+= U_10*Chi_02;			\
-  UChi_11+= U_10*Chi_12;			\
-  UChi_02+= U_20*Chi_02;			\
-  UChi_12+= U_20*Chi_12
-
-
-// MULT_2SPIN(A,F): link taken from U[sU](A); F is accepted but not used.
-#define MULT_2SPIN(A,F)					\
-  {auto & ref(U[sU](A)); MULT_2SPIN_BODY; }
-
-// MULT_2SPIN_GPARITY(A,F): link taken from U[sU](F)(A), i.e. indexed by F first.
-#define MULT_2SPIN_GPARITY(A,F)				\
-  {auto & ref(U[sU](F)(A)); MULT_2SPIN_BODY; }
-
-
-// PERMUTE_DIR(dir): apply permute<dir> in place to each of the six Chi_*
-// half-spinor components (token-pasted, so `dir` must be a literal).
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
-
-// "Plus" spin projections: reduce the full spinor Chimu (rows 0..3) to the
-// half spinor Chi (rows 0..1), component by component.
-//   XP: Chi_0 = Chimu_0 + i*Chimu_3 ;  Chi_1 = Chimu_1 + i*Chimu_2
-//   YP: Chi_0 = Chimu_0 -   Chimu_3 ;  Chi_1 = Chimu_1 +   Chimu_2
-//   ZP: Chi_0 = Chimu_0 + i*Chimu_2 ;  Chi_1 = Chimu_1 - i*Chimu_3
-//   TP: Chi_0 = Chimu_0 +   Chimu_2 ;  Chi_1 = Chimu_1 +   Chimu_3
-//      hspin(0)=fspin(0)+timesI(fspin(3));
-//      hspin(1)=fspin(1)+timesI(fspin(2));
-#define XP_PROJ \
-    Chi_00 = Chimu_00+timesI(Chimu_30);\
-    Chi_01 = Chimu_01+timesI(Chimu_31);\
-    Chi_02 = Chimu_02+timesI(Chimu_32);\
-    Chi_10 = Chimu_10+timesI(Chimu_20);\
-    Chi_11 = Chimu_11+timesI(Chimu_21);\
-    Chi_12 = Chimu_12+timesI(Chimu_22);
-
-#define YP_PROJ \
-    Chi_00 = Chimu_00-Chimu_30;\
-    Chi_01 = Chimu_01-Chimu_31;\
-    Chi_02 = Chimu_02-Chimu_32;\
-    Chi_10 = Chimu_10+Chimu_20;\
-    Chi_11 = Chimu_11+Chimu_21;\
-    Chi_12 = Chimu_12+Chimu_22;
-
-#define ZP_PROJ \
-  Chi_00 = Chimu_00+timesI(Chimu_20);		\
-  Chi_01 = Chimu_01+timesI(Chimu_21);		\
-  Chi_02 = Chimu_02+timesI(Chimu_22);		\
-  Chi_10 = Chimu_10-timesI(Chimu_30);		\
-  Chi_11 = Chimu_11-timesI(Chimu_31);		\
-  Chi_12 = Chimu_12-timesI(Chimu_32);
-
-#define TP_PROJ \
-  Chi_00 = Chimu_00+Chimu_20;		\
-  Chi_01 = Chimu_01+Chimu_21;		\
-  Chi_02 = Chimu_02+Chimu_22;		\
-  Chi_10 = Chimu_10+Chimu_30;		\
-  Chi_11 = Chimu_11+Chimu_31;		\
-  Chi_12 = Chimu_12+Chimu_32;
-
-
-// "Minus" spin projections: same row pairings as the *P_PROJ macros with the
-// sign of the second term flipped.
-//   XM: Chi_0 = Chimu_0 - i*Chimu_3 ;  Chi_1 = Chimu_1 - i*Chimu_2
-//   YM: Chi_0 = Chimu_0 +   Chimu_3 ;  Chi_1 = Chimu_1 -   Chimu_2
-//   ZM: Chi_0 = Chimu_0 - i*Chimu_2 ;  Chi_1 = Chimu_1 + i*Chimu_3
-//   TM: Chi_0 = Chimu_0 -   Chimu_2 ;  Chi_1 = Chimu_1 -   Chimu_3
-//      hspin(0)=fspin(0)-timesI(fspin(3));
-//      hspin(1)=fspin(1)-timesI(fspin(2));
-#define XM_PROJ \
-    Chi_00 = Chimu_00-timesI(Chimu_30);\
-    Chi_01 = Chimu_01-timesI(Chimu_31);\
-    Chi_02 = Chimu_02-timesI(Chimu_32);\
-    Chi_10 = Chimu_10-timesI(Chimu_20);\
-    Chi_11 = Chimu_11-timesI(Chimu_21);\
-    Chi_12 = Chimu_12-timesI(Chimu_22);
-
-#define YM_PROJ \
-    Chi_00 = Chimu_00+Chimu_30;\
-    Chi_01 = Chimu_01+Chimu_31;\
-    Chi_02 = Chimu_02+Chimu_32;\
-    Chi_10 = Chimu_10-Chimu_20;\
-    Chi_11 = Chimu_11-Chimu_21;\
-    Chi_12 = Chimu_12-Chimu_22;
-
-#define ZM_PROJ \
-  Chi_00 = Chimu_00-timesI(Chimu_20);		\
-  Chi_01 = Chimu_01-timesI(Chimu_21);		\
-  Chi_02 = Chimu_02-timesI(Chimu_22);		\
-  Chi_10 = Chimu_10+timesI(Chimu_30);		\
-  Chi_11 = Chimu_11+timesI(Chimu_31);		\
-  Chi_12 = Chimu_12+timesI(Chimu_32);
-
-#define TM_PROJ \
-  Chi_00 = Chimu_00-Chimu_20;		\
-  Chi_01 = Chimu_01-Chimu_21;		\
-  Chi_02 = Chimu_02-Chimu_22;		\
-  Chi_10 = Chimu_10-Chimu_30;		\
-  Chi_11 = Chimu_11-Chimu_31;		\
-  Chi_12 = Chimu_12-Chimu_32;
-
-// X-direction reconstruction: expand the link-multiplied half spinor UChi
-// (rows 0..1) back into the four-row result.
-//   XP: rows 0,1 <- UChi_0,UChi_1 ; row 2 <- -i*UChi_1 ; row 3 <- -i*UChi_0
-//   XM: rows 0,1 <- UChi_0,UChi_1 ; row 2 <- +i*UChi_1 ; row 3 <- +i*UChi_0
-// The plain *_RECON forms assign result_* (so they also initialise it);
-// the *_RECON_ACCUM forms add the same quantities to the existing result_*.
-//      fspin(0)=hspin(0);
-//      fspin(1)=hspin(1);
-//      fspin(2)=timesMinusI(hspin(1));
-//      fspin(3)=timesMinusI(hspin(0));
-#define XP_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesMinusI(UChi_10);\
-  result_21 = timesMinusI(UChi_11);\
-  result_22 = timesMinusI(UChi_12);\
-  result_30 = timesMinusI(UChi_00);\
-  result_31 = timesMinusI(UChi_01);\
-  result_32 = timesMinusI(UChi_02);
-
-#define XP_RECON_ACCUM\
-  result_00+=UChi_00;\
-  result_01+=UChi_01;\
-  result_02+=UChi_02;\
-  result_10+=UChi_10;\
-  result_11+=UChi_11;\
-  result_12+=UChi_12;\
-  result_20-=timesI(UChi_10);\
-  result_21-=timesI(UChi_11);\
-  result_22-=timesI(UChi_12);\
-  result_30-=timesI(UChi_00);\
-  result_31-=timesI(UChi_01);\
-  result_32-=timesI(UChi_02);
-
-#define XM_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesI(UChi_10);\
-  result_21 = timesI(UChi_11);\
-  result_22 = timesI(UChi_12);\
-  result_30 = timesI(UChi_00);\
-  result_31 = timesI(UChi_01);\
-  result_32 = timesI(UChi_02);
-
-#define XM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_10);\
-  result_21+= timesI(UChi_11);\
-  result_22+= timesI(UChi_12);\
-  result_30+= timesI(UChi_00);\
-  result_31+= timesI(UChi_01);\
-  result_32+= timesI(UChi_02);
-
-#define YP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_10;\
-  result_21+= UChi_11;\
-  result_22+= UChi_12;\
-  result_30-= UChi_00;\
-  result_31-= UChi_01;\
-  result_32-= UChi_02;
-
-#define YM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_10;\
-  result_21-= UChi_11;\
-  result_22-= UChi_12;\
-  result_30+= UChi_00;\
-  result_31+= UChi_01;\
-  result_32+= UChi_02;
-
-#define ZP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= timesI(UChi_00);			\
-  result_21-= timesI(UChi_01);			\
-  result_22-= timesI(UChi_02);			\
-  result_30+= timesI(UChi_10);			\
-  result_31+= timesI(UChi_11);			\
-  result_32+= timesI(UChi_12);
-
-#define ZM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_00);			\
-  result_21+= timesI(UChi_01);			\
-  result_22+= timesI(UChi_02);			\
-  result_30-= timesI(UChi_10);			\
-  result_31-= timesI(UChi_11);			\
-  result_32-= timesI(UChi_12);
-
-#define TP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_00;			\
-  result_21+= UChi_01;			\
-  result_22+= UChi_02;			\
-  result_30+= UChi_10;			\
-  result_31+= UChi_11;			\
-  result_32+= UChi_12;
-
-#define TM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_00;	\
-  result_21-= UChi_01;	\
-  result_22-= UChi_02;	\
-  result_30-= UChi_10;	\
-  result_31-= UChi_11;	\
-  result_32-= UChi_12;
-
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-  }						\
-  MULT_2SPIN_IMPL(DIR,F);			\
-  RECON;					
-
-
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-  }						\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN_IMPL(DIR,F);			\
-    RECON;					\
-  }
-
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  perm   = SE->_permute;				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-    MULT_2SPIN_IMPL(DIR,F);			\
-    RECON;					\
-    nmu++;					\
-  }
-
-#define HAND_RESULT(ss,F)			\
-  {						\
-    SiteSpinor & ref (out[ss]);		\
-    vstream(ref(F)(0)(0),result_00);		\
-    vstream(ref(F)(0)(1),result_01);		\
-    vstream(ref(F)(0)(2),result_02);		\
-    vstream(ref(F)(1)(0),result_10);		\
-    vstream(ref(F)(1)(1),result_11);		\
-    vstream(ref(F)(1)(2),result_12);		\
-    vstream(ref(F)(2)(0),result_20);		\
-    vstream(ref(F)(2)(1),result_21);		\
-    vstream(ref(F)(2)(2),result_22);		\
-    vstream(ref(F)(3)(0),result_30);		\
-    vstream(ref(F)(3)(1),result_31);		\
-    vstream(ref(F)(3)(2),result_32);		\
-  }
-
-#define HAND_RESULT_EXT(ss,F)			\
-  if (nmu){					\
-    SiteSpinor & ref (out[ss]);		\
-    ref(F)(0)(0)+=result_00;		\
-    ref(F)(0)(1)+=result_01;		\
-    ref(F)(0)(2)+=result_02;		\
-    ref(F)(1)(0)+=result_10;		\
-    ref(F)(1)(1)+=result_11;		\
-    ref(F)(1)(2)+=result_12;		\
-    ref(F)(2)(0)+=result_20;		\
-    ref(F)(2)(1)+=result_21;		\
-    ref(F)(2)(2)+=result_22;		\
-    ref(F)(3)(0)+=result_30;		\
-    ref(F)(3)(1)+=result_31;		\
-    ref(F)(3)(2)+=result_32;		\
-  }
-
-
-#define HAND_DECLARATIONS(a)			\
-  Simd result_00;				\
-  Simd result_01;				\
-  Simd result_02;				\
-  Simd result_10;				\
-  Simd result_11;				\
-  Simd result_12;				\
-  Simd result_20;				\
-  Simd result_21;				\
-  Simd result_22;				\
-  Simd result_30;				\
-  Simd result_31;				\
-  Simd result_32;				\
-  Simd Chi_00;					\
-  Simd Chi_01;					\
-  Simd Chi_02;					\
-  Simd Chi_10;					\
-  Simd Chi_11;					\
-  Simd Chi_12;					\
-  Simd UChi_00;					\
-  Simd UChi_01;					\
-  Simd UChi_02;					\
-  Simd UChi_10;					\
-  Simd UChi_11;					\
-  Simd UChi_12;					\
-  Simd U_00;					\
-  Simd U_10;					\
-  Simd U_20;					\
-  Simd U_01;					\
-  Simd U_11;					\
-  Simd U_21;
-
-#define ZERO_RESULT				\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl> void  accelerator
-WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> accelerator
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
-#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> void accelerator
-WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-
-  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> accelerator
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-
-#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
-  ZERO_RESULT;							\
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_RESULT(ss,F)
-  
-  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-#endif
-}
-
-template<class Impl> void accelerator
-WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset, perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;
-
-#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-
-  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-  perm++;
-#endif
-}
-
-template<class Impl>
-accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-#ifndef GRID_NVCC
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset, perm, ptype;
-  int nmu=0;
-
-#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-
-  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-  perm++;
-#endif
-}
-
-  ////////////////////////////////////////////////
-  // Specialise Gparity to simple implementation
-  ////////////////////////////////////////////////
-#define HAND_SPECIALISE_EMPTY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st,			\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st,	       	\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st,	       	\
-				     			\
-				    DoubledGaugeFieldView &U,		\
-				    SiteHalfSpinor *buf,		\
-				    int sF,int sU,			\
-				    const FermionFieldView &in,		\
-				    FermionFieldView &out){ assert(0); }	\
-
-
-#ifdef GRID_NVCC
-#define HAND_SPECIALISE_GPARITY(IMPL) HAND_SPECIALISE_EMPTY(IMPL)
-#else
-#define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-					    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    StencilEntry *SE;							\
-    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    int nmu=0;								\
-    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    int nmu=0;								\
-    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }
-#endif
-
-HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
-
-  
-////////////// Wilson ; uses this implementation /////////////////////
-
-#define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); 
-
-//INSTANTIATE_THEM(GparityWilsonImplF);
-//INSTANTIATE_THEM(GparityWilsonImplD);
-//INSTANTIATE_THEM(GparityWilsonImplFH);
-//INSTANTIATE_THEM(GparityWilsonImplDF);
-//INSTANTIATE_THEM(DomainWallVec5dImplFH);
-//INSTANTIATE_THEM(DomainWallVec5dImplDF);
-//INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
-//INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonTMFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.cc
@@ -1,97 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * BF sequence
- *
- void bfmbase<Float>::MooeeInv(Fermion_t psi, 
- Fermion_t chi, 
- int dag, int cb)
-
- double m    = this->mass;
- double tm   = this->twistedmass;
- double mtil = 4.0+this->mass;
-
- double sq = mtil*mtil + tm*tm;
-
- double a = mtil/sq;
- double b = -tm /sq;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-
- void bfmbase<Float>::Mooee(Fermion_t psi, 
- Fermion_t chi, 
- int dag,int cb)
- double a = 4.0+this->mass;
- double b = this->twistedmass;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-*/
-
-template<class Impl>
-void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = -this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = -tm /sq;
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = tm /sq;
-  axpibg5x(out,in,a,b);
-}
-
-FermOpTemplateInstantiate(WilsonTMFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dssp.h
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermionssp.h
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermionssp.h
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermion.h
@@ -1,433 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(GaugeField            &_Umu,
-                                                   GridCartesian         &FiveDimGrid,
-                                                   GridRedBlackCartesian &FiveDimRedBlackGrid,
-                                                   GridCartesian         &FourDimGrid,
-                                                   GridRedBlackCartesian &FourDimRedBlackGrid,
-                                                   RealD _mq1, RealD _mq2, RealD _mq3,
-                                                   RealD _shift, int _pm, RealD _M5, const ImplParams &p)
-  // The base class receives every argument unchanged, plus the fixed pair (1.0, 0.0).
-  : AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-                              FourDimGrid, FourDimRedBlackGrid,
-                              _mq1, _mq2, _mq3, _shift, _pm, _M5, 1.0, 0.0, p)
-{
-  // Obtain the approximation data for Ls points, use it to set the coefficients, then release it.
-  const RealD eps = 1.0;
-  Approx::zolotarev_data *approx = Approx::higham(eps,this->Ls);
-  assert(approx->n == this->Ls);
-
-  std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-
-  this->SetCoefficientsTanh(approx, 1.0, 0.0);
-  Approx::zolotarev_free(approx);
-}
-
-/***************************************************************
- * Additional EOFA operators only called outside the inverter.
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  const int last = this->Ls-1;
-
-  Din = Zero();
-
-  // Only sign = +1/-1 combined with dag = 0/1 does any work; any other input leaves Din zero.
-  const bool signOk = (sign == 1) || (sign == -1);
-  const bool dagOk  = (dag == 0) || (dag == 1);
-  if(!(signOk && dagOk)){ return; }
-
-  // The two trailing slice arguments of axpby_ssp:
-  //   sign=+1, dag=0 -> (Ls-1, 0);  sign=+1, dag=1 -> (0, Ls-1);  sign=-1 -> (0, 0) for either dag.
-  int sliceA = 0;
-  int sliceB = 0;
-  if(sign == 1){
-    if(dag == 0){ sliceA = last; }
-    else        { sliceB = last; }
-  }
-
-  axpby_ssp(Din, 0.0, psi, 1.0, psi, sliceA, sliceB);
-}
-
-// For DWF this operator is the identity: the input field is copied to the output unchanged.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  chi = psi;
-}
-
-// For DWF the inverse is also the identity: the input field is copied to the output unchanged.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  chi = psi;
-}
-
-/*****************************************************************************************************/
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  // Statement order matters: chi is produced by DW, updated in place by axpby,
-  // and then handed to M5D as the field that is both read and written.
-  FermionField scratch(psi.Grid());
-
-  this->Meooe5D(psi, scratch);
-  this->DW(scratch, chi, DaggerNo);
-  axpby(chi, 1.0, 1.0, chi, psi);
-  this->M5D(psi, chi);
-
-  // The return value is norm2 of the result field.
-  return norm2(chi);
-}
-
-template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  // Statement order matters: chi is produced by MeooeDag5D, passed to M5Ddag as the
-  // field that is both read and written, and only afterwards updated in place by axpby.
-  FermionField scratch(psi.Grid());
-
-  this->DW(psi, scratch, DaggerYes);
-  this->MeooeDag5D(scratch, chi);
-  this->M5Ddag(psi, chi);
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  // The return value is norm2 of the result field.
-  return norm2(chi);
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  const int   Ls  = this->Ls;
-  const RealD mq1 = this->mq1;
-
-  // Shift-operator coefficients ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} ).
-  // Both stay zero when shift == 0; otherwise pm == 1 sets shiftp and any other pm sets shiftm.
-  Coeff_t shiftp(0.0);
-  Coeff_t shiftm(0.0);
-  if(this->shift != 0.0){
-    const RealD delta = this->shift*(this->mq3-this->mq2);
-    if(this->pm == 1){ shiftp = delta; }
-    else             { shiftm = -delta; }
-  }
-
-  // Three coefficient arrays of length Ls: unit diagonal, -1 off-diagonals,
-  // with the last upper entry and the first lower entry replaced by mq1 plus the shift term.
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = mq1 + shiftm;
-  lower[0]    = mq1 + shiftp;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  // chi is passed both as the phi input and as the output of the six-argument overload.
-  this->M5D(psi, chi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  const int   Ls  = this->Ls;
-  const RealD mq1 = this->mq1;
-
-  // Shift-operator coefficients ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} ).
-  // Both stay zero when shift == 0; otherwise pm == 1 sets shiftp and any other pm sets shiftm.
-  Coeff_t shiftp(0.0);
-  Coeff_t shiftm(0.0);
-  if(this->shift != 0.0){
-    const RealD delta = this->shift*(this->mq3-this->mq2);
-    if(this->pm == 1){ shiftp = delta; }
-    else             { shiftm = -delta; }
-  }
-
-  // Same arrays as M5D(psi, chi) but with the roles of shiftp and shiftm exchanged:
-  // here the last upper entry takes shiftp and the first lower entry takes shiftm.
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1] = mq1 + shiftp;
-  lower[0]    = mq1 + shiftm;
-
-#if(0)
-  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-  for(int i=0; i<diag.size(); ++i){
-    std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-  }
-  for(int i=0; i<upper.size(); ++i){
-    std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-  }
-  for(int i=0; i<lower.size(); ++i){
-    std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-  }
-#endif
-
-  // chi is passed both as the phi input and as the output of the six-argument overload.
-  this->M5Ddag(psi, chi, chi, lower, diag, upper);
-}
-
-// Half-checkerboard operation: applies the six-argument M5D with coefficients built from bee, cee, dm and dp.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  // Both off-diagonals start as -cee; the wrap-around entries are then overwritten.
-  for(int s=0; s<Ls; s++){
-    upper[s] = lower[s] = -this->cee[s];
-  }
-  lower[0]    = this->dp;
-  upper[Ls-1] = this->dm;
-
-  // psi serves as both the psi and the phi input.
-  this->M5D(psi, psi, chi, lower, diag, upper);
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-
-  // Both off-diagonals start as -cee; the wrap-around entries are then overwritten,
-  // with dp and dm exchanged relative to Mooee.
-  for(int s=0; s<Ls; s++){
-    upper[s] = lower[s] = -this->cee[s];
-  }
-  lower[0]    = this->dm;
-  upper[Ls-1] = this->dp;
-
-  // psi serves as both the psi and the phi input.
-  this->M5Ddag(psi, psi, chi, lower, diag, upper);
-}
-
-/****************************************************************************************/
-
-// Zolo: fills the Cayley-form coefficient arrays, the EOFA shift terms dp/dm and the LDU factors, then precomputes the dense inverse matrices. The arguments zolo_hi, gamma, b and c are not read anywhere in this body.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
-{
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD mq1   = this->mq1;
-  RealD mq2   = this->mq2;
-  RealD mq3   = this->mq3;
-  RealD shift = this->shift;
-
-  ////////////////////////////////////////////////////////
-  // Constants for the preconditioned matrix Cayley form
-  ////////////////////////////////////////////////////////
-  this->bs.resize(Ls);
-  this->cs.resize(Ls);
-  this->aee.resize(Ls);
-  this->aeo.resize(Ls); // resized only; no element of aeo is assigned in this function
-  this->bee.resize(Ls);
-  this->beo.resize(Ls);
-  this->cee.resize(Ls);
-  this->ceo.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){ // bee and cee are the same for every s
-    this->bee[i] = 4.0 - this->M5 + 1.0;
-    this->cee[i] = 1.0;
-  }
-
-  for(int i=0; i<Ls; ++i){ // aee copies cee; bs/beo are 1 and cs/ceo are 0
-    this->aee[i] = this->cee[i];
-    this->bs[i] = this->beo[i] = 1.0;
-    this->cs[i] = this->ceo[i] = 0.0;
-  }
-
-  //////////////////////////////////////////
-  // EOFA shift terms
-  //////////////////////////////////////////
-  if(pm == 1){ // pm == +1: the shift term is added to dp only
-    this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-    this->dm = mq1*this->cee[Ls-1];
-  } else if(this->pm == -1) { // pm == -1: the shift term is subtracted from dm only
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-  } else { // any other pm: no shift contribution
-    this->dp = mq1*this->cee[0];
-    this->dm = mq1*this->cee[Ls-1];
-  }
-
-  //////////////////////////////////////////
-  // LDU decomposition of eeoo
-  //////////////////////////////////////////
-  this->dee.resize(Ls+1); // one entry more than the other arrays: dee[Ls-1] and dee[Ls] are both set in the delta_d block below
-  this->lee.resize(Ls);
-  this->leem.resize(Ls);
-  this->uee.resize(Ls);
-  this->ueem.resize(Ls);
-
-  for(int i=0; i<Ls; ++i){
-
-    if(i < Ls-1){
-
-      this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-      this->leem[i] = this->dm/this->bee[i];
-      for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; } // leem[i] = dm/bee[i] * prod_{j<i} aee[j]/bee[j]
-
-      this->dee[i] = this->bee[i];
-
-      this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-      this->ueem[i] = this->dp / this->bee[0];
-      for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; } // ueem[i] = dp/bee[0] * prod_{1<=j<=i} cee[j]/bee[j]
-
-    } else {
-      // last index (i == Ls-1): all four factors are zero
-      this->lee[i]  = 0.0;
-      this->leem[i] = 0.0;
-      this->uee[i]  = 0.0;
-      this->ueem[i] = 0.0;
-
-    }
-  }
-
-  { // delta_d = (1/bee[0]) * prod_{1<=j<Ls-1} cee[j]/bee[j], shared by the last two dee entries
-    Coeff_t delta_d = 1.0 / this->bee[0];
-    for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-    this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-    this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-  }
-
-  int inv = 1; // request inverted matrices from MooeeInternalCompute, first with dag=0 and then with dag=1
-  this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-  this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-}
-
-// Recompute Cayley-form coefficients for a different shift.
-// Stores new_shift, rebuilds the approximation data for Ls points and re-runs
-// SetCoefficientsTanh with the same (1.0, 0.0) arguments the constructor uses.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-
-  Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-  this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-
-  // Release the approximation data once the coefficients are set, exactly as the
-  // constructor does; without this every refresh leaks one zolotarev_data.
-  Approx::zolotarev_free(zdata);
-}
-// Assembles the dense Ls x Ls matrices Pplus/Pminus from bee, cee, dp and dm, optionally inverts them (inv) and takes the adjoint (dag), then packs the result into the SIMD words of Matp/Matm.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-						       Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  int Ls = this->Ls;
-
-  GridBase* grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  for(int s=0; s<Ls; s++){ // both matrices carry bee on the diagonal
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-  }
-
-  for(int s=0; s<Ls-1; s++){ // Pminus: -cee[s] on the first super-diagonal
-    Pminus(s,s+1) = -this->cee[s];
-  }
-
-  for(int s=0; s<Ls-1; s++){ // Pplus: -cee[s+1] on the first sub-diagonal
-    Pplus(s+1,s) = -this->cee[s+1];
-  }
-
-  Pplus (0,Ls-1) = this->dp; // corner entries carry dp and dm
-  Pminus(Ls-1,0) = this->dm;
-
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-
-#if(0)
-  std::cout << GridLogMessage << "Pplus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pplus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-  std::cout << GridLogMessage << "Pminus:" << std::endl;
-  for(int s=0; s<Ls; ++s){
-    for(int ss=0; ss<Ls; ++ss){
-      std::cout << Pminus(s,ss) << "\t";
-    }
-    std::cout << std::endl;
-  }
-#endif
-
-  if(inv) { // nonzero inv: work with the matrix inverses
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  } else { // inv == 0: use the matrices as assembled
-    PplusMat  = Pplus;
-    PminusMat = Pminus;
-  }
-
-  if(dag){ // nonzero dag: replace each matrix by its adjoint, after any inversion
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd = Simd::Nsimd();
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  for(int s2=0; s2<Ls; s2++){ // s2 is the matrix column
-    for(int s1=0; s1<LLs; s1++){ // s1 plus the lane offset l*LLs selects the matrix row
-      int istride = LLs;
-      int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type*) &Vp; // the Simd words are filled element by element through scalar pointers
-      scalar_type *sm = (scalar_type*) &Vm;
-      for(int l=0; l<Nsimd; l++){ // element l takes matrix entry (l*LLs + s1, s2)
-	if(switcheroo<Coeff_t>::iscomplex()) {
-	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
-	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
-	} else {
-	  // if real: the real part of the entry is stored in both the real and imaginary slots
-	  scalar_type tmp;
-	  tmp = PplusMat (l*istride+s1*ostride,s2);
-	  sp[l] = scalar_type(tmp.real(),tmp.real());
-	  tmp = PminusMat(l*istride+s1*ostride,s2);
-	  sm[l] = scalar_type(tmp.real(),tmp.real());
-	}
-      }
-      Matp[LLs*s2+s1] = Vp; // output index is LLs*s2 + s1
-      Matm[LLs*s2+s1] = Vm;
-    }}
-}
-
-FermOpTemplateInstantiate(DomainWallEOFAFermion); // macro defined elsewhere; presumably emits the explicit template instantiations -- confirm
-GparityFermOpTemplateInstantiate(DomainWallEOFAFermion); // macro defined elsewhere; presumably the G-parity counterpart -- confirm
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermioncache.h
@@ -1,255 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-// Pminus forwards
-// Pplus  backwards..
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  // For every block of Ls consecutive sites:
-  //   chi[s] = diag[s]*phi[s] + upper[s]*P_-(psi[s+1]) + lower[s]*P_+(psi[s-1])
-  // with the neighbour index wrapping around inside the block.
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  int Ls = this->Ls;
-  GridBase* grid = psi_i.Grid();
-  auto phi = phi_i.View();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    for(int s=0; s<Ls; s++){
-      // The s==0 case is decided first, so for Ls==1 the upper neighbour index is s+1 rather than 0.
-      const int s_up   = ((s!=0) && (s==(Ls-1))) ? 0 : s+1;
-      const int s_down = (s==0) ? Ls-1 : s-1;
-
-      auto tmp = psi[0];
-
-      spProj5m(tmp, psi[ss+s_up]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-
-      spProj5p(tmp, psi[ss+s_down]);
-      chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  // For every block of Ls consecutive sites:
-  //   chi[s] = diag[s]*phi[s] + upper[s]*P_+(psi[s+1]) + lower[s]*P_-(psi[s-1])
-  // with the neighbour index wrapping around inside the block.
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  int Ls = this->Ls;
-
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  // The counters updated here are the M5D ones (M5Dcalls / M5Dtime).
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-    auto tmp = psi[0];
-    for(int s=0; s<Ls; s++){
-      // The s==0 case is decided first, so for Ls==1 the upper neighbour index is s+1 rather than 0.
-      const int s_up   = ((s!=0) && (s==(Ls-1))) ? 0 : s+1;
-      const int s_down = (s==0) ? Ls-1 : s-1;
-
-      spProj5p(tmp, psi[ss+s_up]);
-      chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-
-      spProj5m(tmp, psi[ss+s_down]);
-      chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-// Applies the inverse of the s-direction block to psi_i and writes chi_i, one block of Ls
-// consecutive sites at a time, using the precomputed factors lee, leem, dee, ueem and uee
-// in four sweeps: (L')^{-1}, L_m^{-1}, U_m^{-1} D^{-1}, U^{-1}.
-// Updates the MooeeInvCalls / MooeeInvTime counters.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi=psi_i.View();
-  auto chi=chi_i.View();
-  int Ls = this->Ls;
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    // Scratch spinors; copying psi[0] only fixes their type, the value is overwritten before use.
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-    }
-
-    // U_m^{-1} D^{-1}
-    // chi[ss+Ls-1] is not written by the loop below, so its P_+ projection is taken once up
-    // front. This also guarantees tmp1 is valid for the final line when the loop body never
-    // runs (Ls == 1).
-    spProj5p(tmp1, chi[ss+Ls-1]);
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
-    }
-    spProj5m(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-// Applies the daggered inverse of the s-direction block to psi_i and writes chi_i, one block
-// of Ls consecutive sites at a time. Uses complex-conjugated copies of the precomputed factors
-// uee, dee, lee, ueem and leem in four sweeps: (U')^{-dagger}, U_m^{-dagger},
-// L_m^{-dagger} D^{-dagger}, L^{-dagger}.
-// Updates the MooeeInvCalls / MooeeInvTime counters.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  int Ls = this->Ls;
-
-  // Input and output must live on the same checkerboard (the previous check compared psi with itself).
-  assert(psi.Checkerboard() == chi.Checkerboard());
-
-  Vector<Coeff_t> ueec(Ls);
-  Vector<Coeff_t> deec(Ls+1);
-  Vector<Coeff_t> leec(Ls);
-  Vector<Coeff_t> ueemc(Ls);
-  Vector<Coeff_t> leemc(Ls);
-
-  // Conjugate the first Ls entries of every factor array; dee has one extra entry, handled after the loop.
-  for(int s=0; s<ueec.size(); s++){
-    ueec[s]  = conjugate(this->uee[s]);
-    deec[s]  = conjugate(this->dee[s]);
-    leec[s]  = conjugate(this->lee[s]);
-    ueemc[s] = conjugate(this->ueem[s]);
-    leemc[s] = conjugate(this->leem[s]);
-  }
-  deec[Ls] = conjugate(this->dee[Ls]);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
-
-    // Scratch spinors; copying psi[0] only fixes their type, the value is overwritten before use.
-    auto tmp1 = psi[0];
-    auto tmp2 = psi[0];
-
-    // Apply (U^{\prime})^{-dagger}
-    chi[ss] = psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    // chi[ss+Ls-1] is not written by the loop below, so its P_- projection is taken once up
-    // front. This also guarantees tmp1 is valid for the final line when the loop body never
-    // runs (Ls == 1).
-    spProj5m(tmp1, chi[ss+Ls-1]);
-    for(int s=0; s<Ls-1; s++){
-      chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-    }
-    spProj5p(tmp2, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp1, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-// Emitted only when DOMAIN_WALL_EOFA_DPERP_CACHE is defined. INSTANTIATE_DPERP_DWF_EOFA is defined elsewhere; presumably it explicitly instantiates the routines above for the named Impl -- confirm.
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-// Same six Impl families again, with the FH / DF suffixed variants.
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionvec.h
@@ -1,613 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-/*
- * Dense matrix versions of routines
- */
-// Daggered inverse: forwards to MooeeInternal with the dagger and inverse flags both set.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  const auto dagFlag = DaggerYes;
-  const auto invFlag = InverseYes;
-  this->MooeeInternal(psi, chi, dagFlag, invFlag);
-}
-
-// Plain inverse: forwards to MooeeInternal with the inverse flag set and no dagger.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  const auto dagFlag = DaggerNo;
-  const auto invFlag = InverseYes;
-  this->MooeeInternal(psi, chi, dagFlag, invFlag);
-}
-// Hand-unrolled M5D for the case where the s index is spread across SIMD elements: per output word, components (0),(1) get d*phi + l*(neighbour v-1) and components (2),(3) get d*phi + u*(neighbour v+1).
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0]; // reduced extent of dimension 0; Ls/LLs == nsimd is asserted below
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0;o<LLs;o++){ // outer
-    for(int i=0;i<nsimd;i++){ //inner
-      int s  = o + i*LLs; // index into the scalar coefficient arrays
-      int ss = o*nsimd + i; // element i of packed word o
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3); // the unrolled code below addresses exactly three values (0..2) of the last index
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-    // Disabled variant of the same loop, written with spProj/spRecon and rotate.
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-    // Active variant: every component is loaded, rotated, combined and stored individually.
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]); // prefetch the word LLs positions ahead of the current one
-
-      int vp = (v==LLs-1) ? 0     : v+1; // neighbour above, wrapping to 0
-      int vm = (v==0)     ? LLs-1 : v-1; // neighbour below, wrapping to LLs-1
-
-      Simd hp_00 = psi[ss+vp]()(2)(0); // hp_*: components (2) and (3) of the upper neighbour
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      Simd hm_00 = psi[ss+vm]()(0)(0); // hm_*: components (0) and (1) of the lower neighbour
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      if(vp <= v){ // true only when vp wrapped around: rotate the upper-neighbour words by tRotate<2>
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){ // true only when vm wrapped around: rotate the lower-neighbour words by tRotate<2*Nsimd-2>
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00); // write all twelve results to chi[ss+v]
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0];
-  int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd> > u(LLs);
-  Vector<iSinglet<Simd> > l(LLs);
-  Vector<iSinglet<Simd> > d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs;
-      int ss = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0    ) ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-						   int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{ // For one value of `site`: chi[s1+LLs*site] is accumulated over every (s2, simd lane) of psi[s2+LLs*site]; Matp multiplies spin rows 0-1, Matm spin rows 2-3.
-  GridBase* grid = psi_i.Grid(); // not referenced again in this function
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  { // portable path, built when AVX512 is not defined
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs; // combined (slice, lane) index; selects entry LLs*s+s1 of Matp/Matm below
-	  int lex = s2 + LLs*site; // psi entry for slice s2 of this site
-
-	  if( s2==0 && l==0 ){ // first term for this s1: clear both accumulators
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-
-	  for(int sp=0; sp<2;  sp++){ // lane l of spin rows 0,1 -> BcastP
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){ // lane l of spin rows 2,3 -> BcastM
-	    for(int co=0; co<Nc; co++){
-	      vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-	    }}
-
-	  for(int sp=0; sp<2;  sp++){
-	    for(int co=0; co<Nc; co++){
-	      SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-	      SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-	    }}
-	}}
-
-      { // store: SiteChiP -> spin rows 0,1 and SiteChiM -> spin rows 2,3 of chi[s1+LLs*site]
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-
-  }
-#else
-  { // AVX512 path: the twelve accumulators and twelve broadcast values are pinned to the zmm registers named below
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>); // byte step applied to a0/a1 per simd lane (LLs*LLs matrix entries)
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-	  if((s2+l)==0) { // very first (s2,l) term uses the VMULMEM form; every later term uses VMADDMEM
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            );
-	  }
-	  a0 = a0 + incr; // next lane: Matp entry LLs*(s2+(l+1)*LLs)+s1, the same indexing as the portable path
-	  a1 = a1 + incr;
-                        a2 = a2 + sizeof(typename Simd::scalar_type); // next scalar element inside psi[lex]
-	}
-      }
-
-      { // VSTORE of Chi_00..Chi_32 at offsets 0..11 relative to &chi[s1+LLs*site]
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-	// NOTE(review): the accumulating asm statements above list no zmm clobbers and are not volatile; the Chi_* values must survive between separate asm statements - confirm the compiler cannot reuse these registers.
-      }
-    }
-  }
-
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-#endif
-};
-
-// Z-mobius (complex coefficient) counterpart of MooeeInternalAsm.
-// No EOFA implementation exists for it: a call prints the error message to
-// std::cout and ends the process through exit(-1). No argument is read.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-                                                    int LLs, int site,
-                                                    Vector<iSinglet<Simd> >& Matp,
-                                                    Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA"
-            << std::endl;
-  exit(-1);
-};
-
-// Applies the fifth-dimension matrices to psi one site at a time, writing chi.
-//   psi : input field; its checkerboard is copied to chi
-//   chi : output field
-//   dag, inv : choose the matrix pair handed to the per-site kernel
-//        inv &&  dag -> this->MatpInvDag / this->MatmInvDag
-//        inv && !dag -> this->MatpInv    / this->MatmInv
-//       !inv         -> built locally by MooeeInternalCompute(dag, inv, ...)
-// The per-site kernel is MooeeInternalZAsm when switcheroo<Coeff_t>::iscomplex()
-// is true and MooeeInternalAsm otherwise. MooeeInvCalls is incremented and the
-// elapsed usecond() time of the site loop is added to MooeeInvTime.
-template<class Impl>
-void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{
-  chi.Checkerboard() = psi.Checkerboard();
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-
-  // Local storage, only filled on the !inv path.
-  Vector<iSinglet<Simd> > Matp;
-  Vector<iSinglet<Simd> > Matm;
-
-  // Exactly one branch below runs, so both pointers are always set before use;
-  // they start as nullptr so that a future edit cannot leave them indeterminate.
-  Vector<iSinglet<Simd> > *_Matp = nullptr;
-  Vector<iSinglet<Simd> > *_Matm = nullptr;
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm);
-    _Matp = &Matp;
-    _Matm = &Matm;
-  } else if(dag){
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  } else {
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    // Range and body are passed as two macro arguments, matching the other
-    // thread_loop call sites in this file.
-    thread_loop((auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
-// Instantiations for the Vec5d Impl types; INSTANTIATE_DPERP_DWF_EOFA is a macro defined outside this chunk.
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-// Explicit instantiations of MooeeInternal for the same eight Impl types.
-template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermion.h
@@ -1,497 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Constructs a Mobius EOFA operator.
-// Every argument is forwarded unchanged to AbstractEOFAFermion. The body then
-//   1. obtains approximation data from Approx::higham(eps = 1, Ls) and asserts
-//      that its order n equals Ls,
-//   2. logs b, c and Ls, calls SetCoefficientsTanh(zdata, _b, _c), logs the
-//      EOFA parameters, and frees the approximation data,
-//   3. prepares the five shift coefficient vectors: via
-//      SetCoefficientsPrecondShiftOps() for a non-zero _shift, otherwise by
-//      resizing each of them to Ls entries with fill value 0.
-template<class Impl>
-MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-    GaugeField            &_Umu,
-    GridCartesian         &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian         &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mq1, RealD _mq2, RealD _mq3,
-    RealD _shift, int _pm, RealD _M5,
-    RealD _b, RealD _c, const ImplParams &p)
-  : AbstractEOFAFermion<Impl>(_Umu,
-                              FiveDimGrid, FiveDimRedBlackGrid,
-                              FourDimGrid, FourDimRedBlackGrid,
-                              _mq1, _mq2, _mq3,
-                              _shift, _pm, _M5, _b, _c, p)
-{
-  int Ls = this->Ls;
-
-  RealD eps = 1.0;
-  Approx::zolotarev_data *zdata = Approx::higham(eps, Ls);
-  assert(zdata->n == Ls);
-
-  std::cout << GridLogMessage
-            << "MobiusEOFAFermion (b=" << _b << ",c=" << _c
-            << ") with Ls=" << Ls << std::endl;
-
-  this->SetCoefficientsTanh(zdata, _b, _c);
-
-  std::cout << GridLogMessage
-            << "EOFA parameters: (mq1=" << _mq1
-            << ",mq2=" << _mq2
-            << ",mq3=" << _mq3
-            << ",shift=" << _shift
-            << ",pm=" << _pm << ")" << std::endl;
-
-  // The approximation data is not needed once the coefficients are set.
-  Approx::zolotarev_free(zdata);
-
-  if(_shift == 0.0){
-    // No shift term: size the coefficient vectors with fill value zero.
-    Mooee_shift.resize(Ls, 0.0);
-    MooeeInv_shift_lc.resize(Ls, 0.0);
-    MooeeInv_shift_norm.resize(Ls, 0.0);
-    MooeeInvDag_shift_lc.resize(Ls, 0.0);
-    MooeeInvDag_shift_norm.resize(Ls, 0.0);
-    return;
-  }
-
-  SetCoefficientsPrecondShiftOps();
-}
-
-/****************************************************************
- * Additional EOFA operators only called outside the inverter.  
- * Since speed is not essential, simple axpby-style
- * implementations should be fine.
- ***************************************************************/
-// Omega_{+}, Omega_{-} and their daggers, selected by (sign, dag).
-//   psi  : input field
-//   Din  : output field; always set to Zero() first
-//   sign : +1 or -1 picks the weight
-//            +1 : w(j) = 2 (1-alpha)^(Ls-j-1) / (1+alpha)^(Ls-j)
-//            -1 : w(j) = 2 (1-alpha)^j        / (1+alpha)^(j+1)
-//   dag  : 0 -> axpby_ssp(Din, 0, psi, w(j), psi, j, 0) for every j
-//          1 -> axpby_ssp(Din, 1, Din, w(j), psi, 0, j) for every j
-// Any other (sign, dag) combination leaves Din equal to Zero().
-// NOTE(review): axpby_ssp is defined elsewhere; presumably
-// z[s] = a*x[s] + b*y[sp] - confirm against its definition.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-{
-  const int   Ls    = this->Ls;
-  const RealD alpha = this->alpha;
-
-  Din = Zero();
-
-  const bool plus  = (sign ==  1);
-  const bool minus = (sign == -1);
-  if(!(plus || minus))        { return; }
-  if((dag != 0) && (dag != 1)){ return; }
-
-  for(int j=0; j<Ls; ++j){
-    const RealD w = plus
-      ? 2.0*std::pow(1.0-alpha,Ls-j-1)/std::pow(1.0+alpha,Ls-j)
-      : 2.0*std::pow(1.0-alpha,j)/std::pow(1.0+alpha,j+1);
-
-    if(dag == 0){
-      axpby_ssp(Din, 0.0, psi, w, psi, j, 0);
-    } else {
-      axpby_ssp(Din, 1.0, Din, w, psi, 0, j);
-    }
-  }
-}
-
-// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
-// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
-//
-// With b = (1+alpha)/2 and c = (1-alpha)/2, every slice s gets two calls:
-//   axpby_ssp_pminus(chi, b, psi, cm, psi, s, sm)
-//   axpby_ssp_pplus (chi, 1, chi, cp, psi, s, sp)
-// where (cm, sm) = (-c, s+1) and (cp, sp) = (-c, s-1), except
-//   s == 0             : (cp, sp) = (mq1*c, Ls-1)
-//   s == Ls-1 (s != 0) : (cm, sm) = (mq1*c, 0)
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-{
-  const int   Ls  = this->Ls;
-  const RealD mq1 = this->mq1;
-  const RealD b   = 0.5 * ( 1.0 + this->alpha );
-  const RealD c   = 0.5 * ( 1.0 - this->alpha );
-
-  for(int s=0; s<Ls; ++s){
-    // s == 0 takes precedence over s == Ls-1, as it matters when Ls == 1.
-    const bool first = (s == 0);
-    const bool last  = !first && (s == (Ls-1));
-
-    const RealD cm = last  ? mq1*c : -c;
-    const int   sm = last  ? 0     : s+1;
-    const RealD cp = first ? mq1*c : -c;
-    const int   sp = first ? Ls-1  : s-1;
-
-    axpby_ssp_pminus(chi, b,   psi, cm, psi, s, sm);
-    axpby_ssp_pplus (chi, 1.0, chi, cp, psi, s, sp);
-  }
-}
-
-// Intended (per its name) to apply the inverse of Dtilde:
-//   chi[s] = sum_{sp} ( DtInv_p(s,sp) P+ + DtInv_m(s,sp) P- ) psi[sp]
-// with c = alpha/2, d = 1/2, m = mq1 and N = (c+d)^Ls + m (c-d)^Ls.
-//   psi : input field
-//   chi : output field, overwritten with the result
-//
-// Fixes over the previous version:
-//   * the result is copied into chi (it used to stay in a local scratch field,
-//     so chi was never written);
-//   * the first term of each row no longer reads the uninitialised scratch
-//     field (0.0 * garbage can be NaN);
-//   * the P- term is always accumulated on top of the P+ term; at sp == 0 it
-//     used a = 0 on the same target, unlike the accumulate pattern in Dtilde.
-// NOTE(review): assumes axpby_ssp_pplus/pminus(z,a,x,b,y,s,sp) compute
-// z[s] = a*x[s] + b*P(+/-)*y[sp], as their use in Dtilde suggests - confirm.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-{
-  int Ls = this->Ls;
-  RealD m = this->mq1;
-  RealD c = 0.5 * this->alpha;
-  RealD d = 0.5;
-
-  RealD DtInv_p(0.0), DtInv_m(0.0);
-  RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
-
-  // Accumulate in a scratch field so the result stays correct even when the
-  // caller passes the same field for psi and chi.
-  FermionField tmp(this->FermionGrid());
-
-  for(int s=0; s<Ls; ++s){
-    for(int sp=0; sp<Ls; ++sp){
-
-      DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-      DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-      DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-      DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-
-      if(sp == 0){
-        // First term of row s: a = 0 discards the old contents of tmp[s];
-        // psi is passed as x so that only initialised data is read.
-        axpby_ssp_pplus (tmp, 0.0, psi, DtInv_p, psi, s, sp);
-      } else {
-        axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
-      }
-      // The P- contribution is always added to what the P+ call just stored.
-      axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
-
-    }}
-
-  chi = tmp;
-}
-
-/*****************************************************************************************************/
-
-// Applies the operator to psi and returns norm2 of the result.
-//   psi : input field
-//   chi : output field
-// Call sequence: Meooe5D(psi) into a scratch field, DW of that into chi,
-// axpby(chi, 1, 1, chi, psi), then the two-argument M5D(psi, chi).
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-{
-  FermionField scratch(psi.Grid());
-
-  this->Meooe5D(psi, scratch);
-  this->DW(scratch, chi, DaggerNo);
-
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  this->M5D(psi, chi);
-
-  const RealD nrm = norm2(chi);
-  return nrm;
-}
-
-// Dagger counterpart of M; returns norm2 of the result.
-//   psi : input field
-//   chi : output field
-// Call sequence: DW(psi, DaggerYes) into a scratch field, MeooeDag5D of that
-// into chi, the two-argument M5Ddag(psi, chi), then axpby(chi, 1, 1, chi, psi).
-template<class Impl>
-RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-{
-  FermionField scratch(psi.Grid());
-
-  this->DW(psi, scratch, DaggerYes);
-  this->MeooeDag5D(scratch, chi);
-
-  this->M5Ddag(psi, chi);
-
-  axpby(chi, 1.0, 1.0, chi, psi);
-
-  const RealD nrm = norm2(chi);
-  return nrm;
-}
-
-/********************************************************************
- * Performance critical fermion operators called inside the inverter
- ********************************************************************/
-
-// Two-argument M5D: builds the coefficient vectors and forwards to the
-// coefficient-taking overload with phi = chi.
-//   diag  : all  1
-//   upper : all -1, except upper[Ls-1] = mq1
-//   lower : all -1, except lower[0]    = mq1
-// shift == 0 calls M5D; any other value calls the fused M5D_shift with
-// Mooee_shift as the extra coefficient vector.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> lower(Ls,-1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> diag (Ls, 1.0);
-  lower[0]    = this->mq1;
-  upper[Ls-1] = this->mq1;
-
-  if(this->shift == 0.0){
-    // no shift term
-    this->M5D(psi, chi, chi, lower, diag, upper);
-    return;
-  }
-
-  // fused M + shift operation
-  this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-}
-
-// Two-argument M5Ddag: builds the coefficient vectors and forwards to the
-// coefficient-taking overload with phi = chi.
-//   diag  : all  1
-//   upper : all -1, except upper[Ls-1] = mq1
-//   lower : all -1, except lower[0]    = mq1
-// shift == 0 calls M5Ddag; any other value calls the fused M5Ddag_shift with
-// Mooee_shift as the extra coefficient vector.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  Vector<Coeff_t> lower(Ls,-1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> diag (Ls, 1.0);
-  lower[0]    = this->mq1;
-  upper[Ls-1] = this->mq1;
-
-  if(this->shift == 0.0){
-    // no shift term
-    this->M5Ddag(psi, chi, chi, lower, diag, upper);
-    return;
-  }
-
-  // fused M + shift operation
-  this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-}
-
-// half checkerboard operations
-//
-// Mooee: forwards to M5D (or M5D_shift when shift != 0) with phi = psi and
-//   diag  = bee
-//   upper = -cee, with upper[Ls-1] additionally multiplied by -mq1
-//   lower = -cee, with lower[0]    additionally multiplied by -mq1
-template<class Impl>
-void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  // coefficients of Mooee
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int j=0; j<Ls; j++){
-    const Coeff_t hop = -this->cee[j];
-    upper[j] = hop;
-    lower[j] = hop;
-  }
-  upper[Ls-1] *= -this->mq1;
-  lower[0]    *= -this->mq1;
-
-  if(this->shift == 0.0){
-    // no shift term
-    this->M5D(psi, psi, chi, lower, diag, upper);
-    return;
-  }
-
-  // fused M + shift operation
-  this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-}
-
-// MooeeDag: forwards to M5Ddag (or M5Ddag_shift when shift != 0) with
-// phi = psi and
-//   diag     = bee
-//   upper[s] = -cee[s+1], except upper[Ls-1] = mq1*cee[0]    (when Ls-1 != 0)
-//   lower[s] = -cee[s-1], except lower[0]    = mq1*cee[Ls-1]
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-{
-  const int Ls = this->Ls;
-
-  // coefficients of MooeeDag
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
-  for(int s=0; s<Ls; s++){
-    // s == 0 is tested before s == Ls-1, so for Ls == 1 only `first` is set.
-    const bool first = (s == 0);
-    const bool last  = !first && (s == (Ls-1));
-
-    if(last){ upper[s] = this->mq1*this->cee[0]; }
-    else    { upper[s] = -this->cee[s+1]; }
-
-    if(first){ lower[s] = this->mq1*this->cee[Ls-1]; }
-    else     { lower[s] = -this->cee[s-1]; }
-  }
-
-  if(this->shift == 0.0){
-    // no shift term
-    this->M5Ddag(psi, psi, chi, lower, diag, upper);
-    return;
-  }
-
-  // fused M + shift operation
-  this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-}
-
-/****************************************************************************************/
-
-// Computes coefficients for applying Cayley preconditioned shift operators
-//  (Mooee + \Delta) --> Mooee_shift
-//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-// For the latter two cases, the operation takes the form
-//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-template<class Impl>
-void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-{ // Fills Mooee_shift, MooeeInv_shift_{lc,norm} and MooeeInvDag_shift_{lc,norm} (Ls entries each) from Ls, pm, alpha, k, mq1, shift, bee and cee.
-  int   Ls    = this->Ls;
-  int   pm    = this->pm;
-  RealD alpha = this->alpha;
-  RealD k     = this->k;
-  RealD mq1   = this->mq1;
-  RealD shift = this->shift;
-
-  // Initialize
-  Mooee_shift.resize(Ls);
-  MooeeInv_shift_lc.resize(Ls);
-  MooeeInv_shift_norm.resize(Ls);
-  MooeeInvDag_shift_lc.resize(Ls);
-  MooeeInvDag_shift_norm.resize(Ls);
-
-  // Construct Mooee_shift
-  int idx(0);
-  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) * // sign taken from pm, times 2*shift*k ...
-    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) ); // ... times ((alpha+1)^Ls + mq1*(alpha-1)^Ls)
-  for(int s=0; s<Ls; ++s){
-    idx = (pm == 1) ? (s) : (Ls-1-s); // pm == 1 stores in order, otherwise in reverse order
-    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-  }
-
-  // Tridiagonal solve for MooeeInvDag_shift_lc
-  {
-    Coeff_t m(0.0);
-    Vector<Coeff_t> d = Mooee_shift; // right-hand side copy, modified in place below
-    Vector<Coeff_t> u(Ls,0.0);
-    Vector<Coeff_t> y(Ls,0.0);
-    Vector<Coeff_t> q(Ls,0.0);
-    if(pm == 1){ u[0] = 1.0; } // u is a unit vector: entry 0 for pm == 1 ...
-    else{ u[Ls-1] = 1.0; } // ... entry Ls-1 otherwise
-
-    // Tridiagonal matrix algorithm + Sherman-Morrison formula
-    //
-    // We solve
-    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-    // where Mooee' is the tridiagonal part of Mooee_{+}, and
-    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-    // so that the outer-product u \otimes v gives the (0,Ls-1)
-    // entry of Mooee_{+}.
-    //
-    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-    // and then construct the solution to the original system
-    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-    if(pm == 1){ // the elimination sweep over d and u only runs for pm == 1
-      for(int s=1; s<Ls; ++s){
-	m = -this->cee[s] / this->bee[s-1];
-	d[s] -= m*d[s-1];
-	u[s] -= m*u[s-1];
-      }
-    }
-    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-    for(int s=Ls-2; s>=0; --s){ // remaining entries, from s = Ls-2 down to 0
-      if(pm == 1){
-	y[s] = d[s] / this->bee[s];
-	q[s] = u[s] / this->bee[s];
-      } else {
-	y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-	q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-      }
-    }
-
-    // Construct MooeeInvDag_shift_lc
-    for(int s=0; s<Ls; ++s){
-      if(pm == 1){
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-	  (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-      } else {
-	MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-	  (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-      }
-    }
-
-    // Compute remaining coefficients
-    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]); // N is reused as the divisor of both *_norm vectors
-    for(int s=0; s<Ls; ++s){
-      // NOTE(review): unqualified pow() below versus std::pow() above - confirm both resolve to the same overloads for Coeff_t.
-      // MooeeInv_shift_lc
-      if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s)      * pow(this->cee[s],Ls-1-s); }
-      else       { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
-
-      // MooeeInv_shift_norm
-      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-	( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
-
-      // MooeeInvDag_shift_norm
-      if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
-     	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-      else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
-	  ( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
-    }
-  }
-}
-
-// Recompute coefficients for a different value of shift constant.
-//   new_shift : stored in this->shift.
-//               != 0 -> the five shift coefficient vectors are rebuilt by
-//                       SetCoefficientsPrecondShiftOps();
-//               == 0 -> each of the five vectors becomes Ls zeros.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-{
-  this->shift = new_shift;
-
-  if(new_shift != 0.0){
-    SetCoefficientsPrecondShiftOps();
-    return;
-  }
-
-  // assign(), not resize(): when the vectors already hold Ls entries from an
-  // earlier non-zero shift, resize(Ls, 0.0) keeps those values and only
-  // zero-fills newly added elements, so the old coefficients would survive.
-  const int Ls = this->Ls;
-  Mooee_shift.assign(Ls, 0.0);
-  MooeeInv_shift_lc.assign(Ls, 0.0);
-  MooeeInv_shift_norm.assign(Ls, 0.0);
-  MooeeInvDag_shift_lc.assign(Ls, 0.0);
-  MooeeInvDag_shift_norm.assign(Ls, 0.0);
-}
-
-// Builds the dense Ls x Ls matrices Pplus and Pminus from bee, cee, mq1 and
-// (when shift != 0) the shift term, then packs them into Matp and Matm.
-//   dag  : non-zero -> both matrices are replaced by their adjoints
-//   inv  : non-zero -> both matrices are replaced by their inverses
-//          (the inverse is taken before the adjoint)
-//   Matp, Matm : outputs, resized to Ls*LLs entries. Entry LLs*s2+s1 carries,
-//          in scalar slot l, the matrix element (l*LLs+s1, s2). When
-//          switcheroo<Coeff_t>::iscomplex() is false the real part is stored
-//          in both components of the scalar.
-// Returns immediately, leaving Matp/Matm untouched, when LLs == Ls.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-                                                   Vector<iSinglet<Simd> >& Matp,
-                                                   Vector<iSinglet<Simd> >& Matm)
-{
-  const int Ls  = this->Ls;
-  const int LLs = this->FermionRedBlackGrid()->_rdimensions[0];
-
-  if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  // Diagonal plus one off-diagonal per matrix.
-  for(int s=0; s<Ls; s++){
-    Pplus(s,s)  = this->bee[s];
-    Pminus(s,s) = this->bee[s];
-    if(s < Ls-1){
-      Pminus(s,s+1) = -this->cee[s];
-      Pplus(s+1,s)  = -this->cee[s+1];
-    }
-  }
-
-  // Corner entries, written after the loop above.
-  Pplus (0,Ls-1) = this->mq1*this->cee[0];
-  Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-
-  // Shift term: last column of Pplus for pm == 1, first column of Pminus otherwise.
-  if(this->shift != 0.0){
-    const RealD c = 0.5 * this->alpha;
-    const RealD d = 0.5;
-    const RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-    const bool plus = (this->pm == 1);
-    for(int s=0; s<Ls; ++s){
-      if(plus){
-        Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-      } else {
-        Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-      }
-    }
-  }
-
-  Eigen::MatrixXcd PplusMat  = Pplus;
-  Eigen::MatrixXcd PminusMat = Pminus;
-
-  if(inv){
-    PplusMat  = Pplus.inverse();
-    PminusMat = Pminus.inverse();
-  }
-
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-
-  using scalar_type = typename SiteHalfSpinor::scalar_type;
-  const int  Nsimd     = Simd::Nsimd();
-  const bool iscomplex = switcheroo<Coeff_t>::iscomplex();
-
-  Matp.resize(Ls*LLs);
-  Matm.resize(Ls*LLs);
-
-  for(int col=0; col<Ls; col++){
-    for(int s1=0; s1<LLs; s1++){
-      Simd Vp;
-      Simd Vm;
-      scalar_type *lanes_p = (scalar_type*) &Vp;
-      scalar_type *lanes_m = (scalar_type*) &Vm;
-
-      for(int l=0; l<Nsimd; l++){
-        const int row = l*LLs + s1;
-        if(iscomplex){
-          lanes_p[l] = PplusMat (row,col);
-          lanes_m[l] = PminusMat(row,col);
-        } else {
-          // if real
-          scalar_type elem;
-          elem = PplusMat (row,col);
-          lanes_p[l] = scalar_type(elem.real(),elem.real());
-          elem = PminusMat(row,col);
-          lanes_m[l] = scalar_type(elem.real(),elem.real());
-        }
-      }
-
-      Matp[LLs*col+s1] = Vp;
-      Matm[LLs*col+s1] = Vm;
-    }
-  }
-}
-
-FermOpTemplateInstantiate(MobiusEOFAFermion); // macro defined outside this chunk; presumably emits the explicit instantiations of MobiusEOFAFermion - confirm
-GparityFermOpTemplateInstantiate(MobiusEOFAFermion); // same, presumably for the G-parity Impl types - confirm against the macro definition
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermioncache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermioncache.h
@@ -1,445 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    for(int s=0; s<Ls; s++){
-      auto tmp = psi[0];
-      if(s==0){
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5m(tmp, psi[ss+0]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					Vector<Coeff_t> &shift_coeffs)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    for(int s=0; s<Ls; s++){
-      auto tmp = psi[0];
-      if(s==0){
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5m(tmp, psi[ss+0]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5m(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5p(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-      if(this->pm == 1){ spProj5p(tmp, psi[ss+shift_s]); }
-      else{ spProj5m(tmp, psi[ss+shift_s]); }
-      chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    auto tmp = psi[0];
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5p(tmp, psi[ss+0]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					   Vector<Coeff_t> &shift_coeffs)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-    chi[ss+Ls-1] = Zero();
-    auto tmp = psi[0];
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+Ls-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else if(s==(Ls-1)) {
-	spProj5p(tmp, psi[ss+0]);
-	chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      } else {
-	spProj5p(tmp, psi[ss+s+1]);
-	chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-	spProj5m(tmp, psi[ss+s-1]);
-	chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-      }
-      if(this->pm == 1){ spProj5p(tmp, psi[ss+s]); }
-      else{ spProj5m(tmp, psi[ss+s]); }
-      chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
-    }
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    auto tmp = psi[0];
-
-    // Apply (L^{\prime})^{-1}
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
-    }
-    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    auto tmp1        = psi[0];
-    auto tmp2        = psi[0];
-    auto tmp2_spProj = psi[0];
-
-    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
-    chi[ss] = psi[ss]; // chi[0]=psi[0]
-    tmp2 = MooeeInv_shift_lc[0]*psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-      tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
-    }
-    if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else{ spProj5m(tmp2_spProj, tmp2); }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
-    }
-    // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-    spProj5m(tmp1, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-
-    // Apply U^{-1} and add shift term
-    for(int s=Ls-2; s>=0; s--){
-      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-      spProj5m(tmp1, chi[ss+s]);
-      chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionField &chi_i)
-{
-  if(this->shift != 0.0){ MooeeInvDag_shift(psi_i,chi_i); return; }
-
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  int Ls = this->Ls;
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    auto tmp = psi[0];
-
-    // Apply (U^{\prime})^{-dag}
-    chi[ss] = psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
-    }
-
-    // U_m^{-\dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
-    }
-
-    // L_m^{-\dag} D^{-dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
-    }
-    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-    // Apply L^{-dag}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp, chi[ss+s+1]);
-      chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, FermionField &chi_i)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase *grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-  int Ls = this->Ls;
-
-
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
-
-    auto tmp1        = psi[0];
-    auto tmp2        = psi[0];
-    auto tmp2_spProj = psi[0];
-
-    // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
-    chi[ss] = psi[ss];
-    tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi[ss+s-1]);
-      chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
-      tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
-    }
-    if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else{ spProj5m(tmp2_spProj, tmp2); }
-
-    // U_m^{-\dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
-    }
-
-    // L_m^{-\dag} D^{-dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
-    }
-    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-    spProj5p(tmp1, chi[ss+Ls-1]);
-    chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
-
-    // Apply L^{-dag}
-    for(int s=Ls-2; s>=0; s--){
-      chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
-      spProj5p(tmp1, chi[ss+s]);
-      chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
-    }
-  });
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef MOBIUS_EOFA_DPERP_CACHE
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionvec.h
@@ -1,998 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * Dense matrix versions of routines
- */
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-{
-  this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5m(hp, psi[ss+vp]);
-      spProj5p(hm, psi[ss+vm]);
-
-      if (vp <= v){ rotate(hp, hp, 1); }
-      if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = 0.5*hp;
-      hm = 0.5*hm;
-
-      spRecon5m(fp, hp);
-      spRecon5p(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v];
-      chi[ss+v] = chi[ss+v] + u[v]*fp;
-      chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-    }
-
-#else
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-
-#endif
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
-					FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					Vector<Coeff_t>& shift_coeffs)
-{
-#if 0
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-
-  this->M5D(psi, phi, chi, lower, diag, upper);
-
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid  = psi_i.Grid();
-  auto psi        = psi_i.View();
-  auto phi        = phi_i.View();
-  auto chi        = chi_i.View();
-  int Ls          = this->Ls;
-  int LLs         = grid->_rdimensions[0];
-  const int nsimd = Simd::Nsimd();
-
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
-  assert(Nc == 3);
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]);
-
-      int vp = (v == LLs-1) ? 0     : v+1;
-      int vm = (v == 0)     ? LLs-1 : v-1;
-
-      Simd hp_00 = psi[ss+vp]()(2)(0);
-      Simd hp_01 = psi[ss+vp]()(2)(1);
-      Simd hp_02 = psi[ss+vp]()(2)(2);
-      Simd hp_10 = psi[ss+vp]()(3)(0);
-      Simd hp_11 = psi[ss+vp]()(3)(1);
-      Simd hp_12 = psi[ss+vp]()(3)(2);
-
-      Simd hm_00 = psi[ss+vm]()(0)(0);
-      Simd hm_01 = psi[ss+vm]()(0)(1);
-      Simd hm_02 = psi[ss+vm]()(0)(2);
-      Simd hm_10 = psi[ss+vm]()(1)(0);
-      Simd hm_11 = psi[ss+vm]()(1)(1);
-      Simd hm_12 = psi[ss+vm]()(1)(2);
-
-      if(vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-
-      // Can force these to real arithmetic and save 2x.
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-    }
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				     Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
-{ // For each v in a block of LLs sites: chi rows 0-1 = diag*phi + upper*psi[v+1] rows 0-1, chi rows 2-3 = diag*phi + lower*psi[v-1] rows 2-3 (v+1 / v-1 wrap inside the block).
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0]; // block length: the site loop below advances in steps of LLs
-  int nsimd = Simd::Nsimd();
-  // Packed copies of the caller's coefficients, one SIMD word per position v in a block.
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun: the SIMD coefficient words are filled through scalar pointers
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs; // index into the caller-supplied upper/lower/diag vectors
-      int ss = o*nsimd + i; // scalar slot i of packed word o
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond(); // timer: start time subtracted here, end time added after the loop
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-
-#if 0
-    // Disabled variant built from spProj5p/spProj5m, rotate and spRecon5p/spRecon5m; never compiled.
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
-
-    for(int v=0; v<LLs; v++){
-
-      int vp = (v+1)%LLs;
-      int vm = (v+LLs-1)%LLs;
-
-      spProj5p(hp, psi[ss+vp]);
-      spProj5m(hm, psi[ss+vm]);
-
-      if(vp <= v){ rotate(hp, hp, 1); }
-      if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-      hp = hp*0.5;
-      hm = hm*0.5;
-      spRecon5p(fp, hp);
-      spRecon5m(fm, hm);
-
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-    }
-
-#else
-    // Active variant: explicit arithmetic on the 4x3 entries of each site.
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]); // NOTE(review): in the last block ss+v+LLs >= oSites(); confirm indexing the view there is harmless
-
-      int vp = (v == LLs-1) ? 0     : v+1; // next position, wrapping to 0
-      int vm = (v == 0    ) ? LLs-1 : v-1; // previous position, wrapping to LLs-1
-      // hp_*: rows 0,1 of psi at vp.  hm_*: rows 2,3 of psi at vm.
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-      // vp <= v holds only when vp wrapped (v == LLs-1); the disabled variant calls rotate(hp, hp, 1) at this point.
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      // vm >= v holds only when vm wrapped (v == 0); the disabled variant calls rotate(hm, hm, nsimd-1) at this point.
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-      // Rows 0,1 combine d*phi with u*hp; rows 2,3 combine d*phi with l*hm.
-      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-      // Write the twelve results to chi at position v.
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-#endif
-
-  });
-
-  this->M5Dtime += usecond();
-}
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-					   Vector<Coeff_t>& shift_coeffs)
-{ // Same row-wise update as M5Ddag plus a shift term shift_coeffs*hs, added to rows 0-1 when pm == 1 and to rows 2-3 otherwise.
-#if 0
-  auto & psi = psi_i;
-  auto & phi = phi_i;
-  auto & chi = chi_i;
-  this->M5Ddag(psi, phi, chi, lower, diag, upper);
-  // Disabled variant: plain M5Ddag followed by one axpby_ssp_pplus / axpby_ssp_pminus call per s; never compiled.
-  // FIXME: possible gain from vectorizing shift operation as well?
-  Coeff_t one(1.0);
-  int Ls = this->Ls;
-  for(int s=0; s<Ls; s++){
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-  }
-
-#else
-  chi_i.Checkerboard() = psi_i.Checkerboard();
-  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
-  auto phi = phi_i.View();
-  auto chi = chi_i.View();
-  int Ls  = this->Ls;
-  int LLs = grid->_rdimensions[0]; // block length: the site loop below advances in steps of LLs
-  int nsimd = Simd::Nsimd();
-  // Packed copies of the caller's coefficients, one SIMD word per position v in a block.
-  Vector<iSinglet<Simd>> u(LLs);
-  Vector<iSinglet<Simd>> l(LLs);
-  Vector<iSinglet<Simd>> d(LLs);
-  Vector<iSinglet<Simd>> s(LLs); // packed shift_coeffs
-
-  assert(Ls/LLs == nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
-
-
-  // just directly address via type pun: the SIMD coefficient words are filled through scalar pointers
-  typedef typename Simd::scalar_type scalar_type;
-  scalar_type* u_p = (scalar_type*) &u[0];
-  scalar_type* l_p = (scalar_type*) &l[0];
-  scalar_type* d_p = (scalar_type*) &d[0];
-  scalar_type* s_p = (scalar_type*) &s[0];
-
-  for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs; // NOTE(review): this int shadows the Vector `s` above inside the loop body; the writes below go through s_p
-      int ss = o*nsimd + i; // scalar slot i of packed word o
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-      s_p[ss] = shift_coeffs[s];
-    }}
-
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond(); // timer: start time subtracted here, end time added after the loop
-
-  thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
-    // Shift source hs_*: rows 0,1 of the last position in the block when pm == 1, otherwise rows 2,3 of the first position.
-    int vs     = (this->pm == 1) ? LLs-1 : 0;
-    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-    for(int v=0; v<LLs; v++){
-
-      vprefetch(psi[ss+v+LLs]); // NOTE(review): in the last block ss+v+LLs >= oSites(); confirm indexing the view there is harmless
-
-      int vp = (v == LLs-1) ? 0     : v+1; // next position, wrapping to 0
-      int vm = (v == 0    ) ? LLs-1 : v-1; // previous position, wrapping to LLs-1
-      // hp_*: rows 0,1 of psi at vp.  hm_*: rows 2,3 of psi at vm.
-      Simd hp_00 = psi[ss+vp]()(0)(0);
-      Simd hp_01 = psi[ss+vp]()(0)(1);
-      Simd hp_02 = psi[ss+vp]()(0)(2);
-      Simd hp_10 = psi[ss+vp]()(1)(0);
-      Simd hp_11 = psi[ss+vp]()(1)(1);
-      Simd hp_12 = psi[ss+vp]()(1)(2);
-
-      Simd hm_00 = psi[ss+vm]()(2)(0);
-      Simd hm_01 = psi[ss+vm]()(2)(1);
-      Simd hm_02 = psi[ss+vm]()(2)(2);
-      Simd hm_10 = psi[ss+vm]()(3)(0);
-      Simd hm_11 = psi[ss+vm]()(3)(1);
-      Simd hm_12 = psi[ss+vm]()(3)(2);
-      // vp <= v holds only when vp wrapped (v == LLs-1).
-      if (vp <= v){
-	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-      }
-      // With vs == LLs-1 this fires only at v == LLs-1, the final iteration.
-      if(this->pm == 1 && vs <= v){
-	hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-      }
-      // vm >= v holds only when vm wrapped (v == 0).
-      if(vm >= v){
-	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-      }
-      // With vs == 0 this fires only at v == 0. NOTE(review): hs_* live outside the v loop, so the rotated value is reused for every later v; confirm intended.
-      if(this->pm == -1 && vs >= v){
-	hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-	hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-	hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-	hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-	hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-	hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-      }
-      // Rows 0,1: d*phi + u*hp, plus s*hs only when pm == 1.  Rows 2,3: d*phi + l*hm, plus s*hs only when pm != 1.
-      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-	+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-      // Write the twelve results to chi at position v.
-      vstream(chi[ss+v]()(0)(0), p_00);
-      vstream(chi[ss+v]()(0)(1), p_01);
-      vstream(chi[ss+v]()(0)(2), p_02);
-      vstream(chi[ss+v]()(1)(0), p_10);
-      vstream(chi[ss+v]()(1)(1), p_11);
-      vstream(chi[ss+v]()(1)(2), p_12);
-      vstream(chi[ss+v]()(2)(0), p_20);
-      vstream(chi[ss+v]()(2)(1), p_21);
-      vstream(chi[ss+v]()(2)(2), p_22);
-      vstream(chi[ss+v]()(3)(0), p_30);
-      vstream(chi[ss+v]()(3)(1), p_31);
-      vstream(chi[ss+v]()(3)(2), p_32);
-
-    }
-
-  });
-
-  this->M5Dtime += usecond();
-
-#endif
-}
-
-#ifdef AVX512
-#include<simd/Intel512common.h>
-#include<simd/Intel512avx.h>
-#include<simd/Intel512single.h>
-#endif
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
-					       int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{ // For one `site`: each chi[s1 + LLs*site] accumulates Matp (rows 0,1) / Matm (rows 2,3) times broadcast lanes of psi[s2 + LLs*site], over all s2 and lanes.
-  auto psi = psi_i.View();
-  auto chi = chi_i.View();
-#ifndef AVX512
-  { // Generic path: vbroadcast + real_madd.
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-    SiteHalfSpinor SiteChiP;
-    SiteHalfSpinor SiteChiM;
-
-    // Ls*Ls * 2 * 12 * vol flops
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-	for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-	  int s = s2 + l*LLs; // row index into Matp/Matm (used as LLs*s + s1)
-	  int lex = s2 + LLs*site; // input position inside this site's block
-	  // Reset the accumulators at the first term of every s1.
-	  if( s2==0 && l==0 ){
-	    SiteChiP=Zero();
-	    SiteChiM=Zero();
-	  }
-	  // BcastP takes lane l of psi rows 0,1; BcastM takes lane l of psi rows 2,3.
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-	  // Accumulate; presumably real_madd(a,b,c) yields a*b + c -- TODO confirm against its definition.
-	  for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-	}}
-      // Store the finished sums: SiteChiP to chi rows 0,1 and SiteChiM to chi rows 2,3.
-      {
-	int lex = s1 + LLs*site;
-	for(int sp=0; sp<2;  sp++){
-	  for(int co=0; co<Nc; co++){
-	    vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-	  }}
-      }
-    }
-  }
-#else
-  { // AVX512 path: inline asm; the macro bodies come from the Intel512 headers and are not visible here.
-    // pointers
-    //  MASK_REGS;
-#define Chi_00 %%zmm1
-#define Chi_01 %%zmm2
-#define Chi_02 %%zmm3
-#define Chi_10 %%zmm4
-#define Chi_11 %%zmm5
-#define Chi_12 %%zmm6
-#define Chi_20 %%zmm7
-#define Chi_21 %%zmm8
-#define Chi_22 %%zmm9
-#define Chi_30 %%zmm10
-#define Chi_31 %%zmm11
-#define Chi_32 %%zmm12
-  // Chi_* name the twelve accumulator registers zmm1-zmm12; BCAST* name zmm13-zmm24.
-#define BCAST0  %%zmm13
-#define BCAST1  %%zmm14
-#define BCAST2  %%zmm15
-#define BCAST3  %%zmm16
-#define BCAST4  %%zmm17
-#define BCAST5  %%zmm18
-#define BCAST6  %%zmm19
-#define BCAST7  %%zmm20
-#define BCAST8  %%zmm21
-#define BCAST9  %%zmm22
-#define BCAST10 %%zmm23
-#define BCAST11 %%zmm24
-
-    int incr = LLs*LLs*sizeof(iSinglet<Simd>); // byte step between lanes: LLs*(s2 + l*LLs) + s1 grows by LLs*LLs entries per l
-
-    for(int s1=0; s1<LLs; s1++){
-
-      for(int s2=0; s2<LLs; s2++){
-
-	int lex = s2 + LLs*site;
-	uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-	uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-	uint64_t a2 = (uint64_t) &psi[lex];
-
-	for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-	  // First term of each s1 uses VMULMEM (no previous accumulator value is read); later terms use VMADDMEM.
-	  if((s2+l)==0) {
-	    asm(
-		VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-		VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-		VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-		VBCASTCDUP(0,%2,BCAST0)
-		VBCASTCDUP(1,%2,BCAST1)
-		VBCASTCDUP(2,%2,BCAST2)
-		VBCASTCDUP(3,%2,BCAST3)
-		VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-		VMULMEM(0,%1,BCAST8,Chi_22)
-		VMULMEM(0,%1,BCAST9,Chi_30)
-		VMULMEM(0,%1,BCAST10,Chi_31)
-		VMULMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            ); // NOTE(review): no outputs or clobbers are declared although zmm1-zmm24 are named as destinations
-	  } else {
-	    asm(
-		VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-		VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-		VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-		VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-		VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-		VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-		VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-		VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-		VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-		VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-		VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-		VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-		: : "r" (a0), "r" (a1), "r" (a2)                            ); // NOTE(review): the sums must survive in zmm1-zmm12 between separate asm statements; nothing here tells the compiler so
-	  }
-	  // Advance to the next lane: one LLs*LLs stride in each matrix, one scalar in psi.
-	  a0 = a0 + incr;
-	  a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type);
-	}
-      }
-      // Store the twelve accumulators to chi[s1 + LLs*site].
-      {
-	int lexa = s1+LLs*site;
-	asm (
-	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-      }
-    }
-  }
-  // Drop the register-name macros so they do not leak past this function.
-#undef Chi_00
-#undef Chi_01
-#undef Chi_02
-#undef Chi_10
-#undef Chi_11
-#undef Chi_12
-#undef Chi_20
-#undef Chi_21
-#undef Chi_22
-#undef Chi_30
-#undef Chi_31
-#undef Chi_32
-
-#undef BCAST0
-#undef BCAST1
-#undef BCAST2
-#undef BCAST3
-#undef BCAST4
-#undef BCAST5
-#undef BCAST6
-#undef BCAST7
-#undef BCAST8
-#undef BCAST9
-#undef BCAST10
-#undef BCAST11
-
-#endif
-}; // NOTE(review): the ';' after the function body is an empty declaration and can be dropped
-
-// Z-mobius version: not implemented for EOFA. Every call prints an error to std::cout and terminates via exit(-1); the arguments are unused.
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-						int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-{
-  std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-  exit(-1);
-}; // NOTE(review): the ';' after the function body is an empty declaration and can be dropped
-
-template<class Impl>
-void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-{ // Picks the Matp/Matm pair for (dag, inv), then runs the per-site kernel over vol = oSites()/LLs sites.
-  chi.Checkerboard() = psi.Checkerboard();
-
-  int Ls  = this->Ls;
-  int LLs = psi.Grid()->_rdimensions[0];
-  int vol = psi.Grid()->oSites()/LLs;
-  // Matp/Matm are local scratch, filled only when inv == 0; _Matp/_Matm point at whichever pair is used.
-  Vector<iSinglet<Simd>>   Matp;
-  Vector<iSinglet<Simd>>   Matm;
-  Vector<iSinglet<Simd>>* _Matp;
-  Vector<iSinglet<Simd>>* _Matm;
-  // The three cases below cover every (dag, inv) combination, so both pointers are always assigned.
-  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-  if(inv && dag){
-    _Matp = &this->MatpInvDag;
-    _Matm = &this->MatmInvDag;
-  }
-
-  if(inv && (!dag)){
-    _Matp = &this->MatpInv;
-    _Matm = &this->MatmInv;
-  }
-
-  if(!inv){
-    MooeeInternalCompute(dag, inv, Matp, Matm); // non-inverse matrices are computed on every call
-    _Matp = &Matp;
-    _Matm = &Matm;
-  }
-
-  assert(_Matp->size() == Ls*LLs);
-  // NOTE(review): the MooeeInv* counters are updated for every call, including inv == 0.
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
-  if(switcheroo<Coeff_t>::iscomplex()){ // complex coefficients go to MooeeInternalZAsm, which prints an error and exits
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  } else {
-    thread_loop( (auto site=0; site<vol; site++),{
-      MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-    });
-  }
-
-  this->MooeeInvTime += usecond();
-}
-
-#ifdef MOBIUS_EOFA_DPERP_VEC
-// Explicit instantiations for the eight *Vec5dImpl* types; compiled only when MOBIUS_EOFA_DPERP_VEC is defined.
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-// MooeeInternal is instantiated explicitly for the same eight Impl types.
-template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-#endif
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermion.h
@@ -1,242 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-
-    Copyright (C) 2017
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// *NOT* EO.  out = Dhop(in) + Mooee(in); returns norm2(out).
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
-{
-  FermionField temp(out.Grid()); // holds the clover contribution until it is added to out
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerNo);
-
-  // Clover term
-  Mooee(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
-{ // Daggered counterpart of M: out = Dhop(in, DaggerYes) + MooeeDag(in); returns norm2(out).
-  FermionField temp(out.Grid()); // holds the clover contribution until it is added to out
-
-  // Wilson term
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerYes);
-
-  // Clover term
-  MooeeDag(in, temp);
-
-  out += temp;
-  return norm2(out);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-{ // Rebuilds CloverTerm, its site-wise inverse CloverTermInv, and the even/odd (plain and adjoint) copies from the gauge field _Umu.
-  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu.Grid();
-  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
-
-  // Compute the field strength terms mu>nu
-  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
-
-  // Compute the Clover Operator acting on Colour and Spin
-  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r; // the three B terms are scaled by csw_r
-  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t; // the three E terms are scaled by csw_t
-  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
-
-  int lvol = _Umu.Grid()->lSites(); // NOTE(review): stored in an int; confirm lSites() always fits
-  int DimRep = Impl::Dimension;
-  // Scratch matrices of size (Ns*DimRep) x (Ns*DimRep) for the per-site inversion.
-  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-
-  Coordinate lcoor;
-  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
-  // Serial loop over local sites: read the site matrix, invert it with Eigen, write the inverse into CloverTermInv.
-  for (int site = 0; site < lvol; site++)
-  {
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = Zero();
-    //if (csw!=0){
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++){
-	    auto zz =  Qx()(j, k)(a, b);
-            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz); // flatten (j,a),(k,b) into row a + j*DimRep, column b + k*DimRep
-	  }
-    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
-
-    EigenInvCloverOp = EigenCloverOp.inverse(); // no invertibility check is performed here
-    //std::cout << EigenInvCloverOp << std::endl;
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
-    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
-    //  }
-    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
-  }
-
-  // Separate the even and odd parts
-  pickCheckerboard(Even, CloverTermEven, CloverTerm);
-  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-  // The adjoint copies are cached as well; MooeeInternal selects them for dag on a checkerboarded grid.
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
-
-  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
-  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
-{ // Forwards to MooeeInternal with DaggerNo, InverseNo.
-  this->MooeeInternal(in, out, DaggerNo, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
-{ // Forwards to MooeeInternal with DaggerYes, InverseNo.
-  this->MooeeInternal(in, out, DaggerYes, InverseNo);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
-{ // Forwards to MooeeInternal with DaggerNo, InverseYes.
-  this->MooeeInternal(in, out, DaggerNo, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
-{ // Forwards to MooeeInternal with DaggerYes, InverseYes.
-  this->MooeeInternal(in, out, DaggerYes, InverseYes);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
-{ // out = C * in, where C is the clover field (or its inverse when inv), daggered when dag; cached even/odd copies are used on a checkerboarded grid.
-  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); // NOTE(review): runs before the _isCheckerBoarded test, so it also constrains full-grid inputs
-
-  if (dag)
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    { // the cached *Dag* fields already hold the adjoint, so no adj() is applied here
-      if (in.Checkerboard() == Odd)
-      {
-        Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
-      }
-      else
-      {
-        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
-      }
-      out = *Clover * in;
-    }
-    else
-    { // full grid: the adjoint is taken at the point of use
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
-    }
-  }
-  else
-  {
-    if (in.Grid()->_isCheckerBoarded)
-    {
-      // Select the cached field that matches the input's checkerboard.
-      if (in.Checkerboard() == Odd)
-      {
-        //  std::cout << "Calling clover term Odd" << std::endl;
-        Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
-      }
-      else
-      {
-        //  std::cout << "Calling clover term Even" << std::endl;
-        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
-      }
-      out = *Clover * in;
-      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
-    }
-    else
-    { // full grid, no dagger
-      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
-    }
-  }
-
-} // MooeeInternal
-
-
-// Derivative parts: MooDeriv is a stub that always hits assert(0); all arguments are unused.
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
-{
-  assert(0); // NOTE(review): compiled out when NDEBUG is defined, in which case the call silently does nothing
-}
-
-// Derivative parts: MeeDeriv is a stub that always hits assert(0); all arguments are unused.
-template <class Impl>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
-{
-  assert(0); // not implemented yet; NOTE(review): compiled out when NDEBUG is defined, so the call then silently does nothing
-}
-
-FermOpTemplateInstantiate(WilsonCloverFermion); // instantiation macros are defined elsewhere in the library
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);  (disabled: the G-parity variant is not instantiated)
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -386,11 +386,9 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
-#endif
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
@@ -401,111 +399,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
-#ifdef GRID_OMP
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();

+  /////////////////////////////
+  // Start comms (TODO: should intranode and internode gathers be handled separately?)
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  double ctime=0;
-  double ptime=0;
+  DhopCommTime -=usecond();
+  std::vector<std::vector<CommsRequest_t> > requests;
+  st.CommunicateBegin(requests);

-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  auto U_v   = U.View();
-  auto in_v  = in.View();
-  auto out_v = out.View();
-  int Opt = WilsonKernelsStatic::Opt;
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  { 
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid = tid - ncomms;
-      int n = U.Grid()->oSites();
-      int chunk = n / nthreads;
-      int rem = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-	myblock = ttid * chunk + ttid;
-	myn = chunk+1;
-      } else {
-	myblock = ttid*chunk + rem;
-	myn = chunk;
-      }
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
      
-      // do the compute
-      if (dag == DaggerYes) {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
- 	  Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
-	}
-      } else {
-	for (int ss = myblock; ss < myblock+myn; ++ss) {
-	  int sU = ss;
-	  int sF = LLs * sU;
-	  Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
-	}
-      }
-      ptime = usecond() - start;
-    }
-    {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
+  DhopComputeTime-=usecond();
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-  DhopCommTime += ctime;
-  DhopComputeTime+=ptime;
+  DhopComputeTime+=usecond();

-  // First to enter, last to leave timing
-  st.CollateThreads();
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  DhopCommTime   +=usecond();

+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
-    });
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
-    int sz=st.surface_list.size();
-    thread_loop( (int ss = 0; ss < sz; ss++) ,{
-      int sU = st.surface_list[ss];
-      int sF = LLs * sU;
-      Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
-    });
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
-#else 
-  assert(0);
-#endif
 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-					 DoubledGaugeField & U,
-					 const FermionField &in, FermionField &out,int dag)
+						    DoubledGaugeField & U,
+						    const FermionField &in, 
+						    FermionField &out,int dag)
 {
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
@@ -515,24 +472,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
  DhopCommTime+=usecond();
  
  DhopComputeTime-=usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-
-  auto U_v = U.View();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
-    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
-    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
-    //      int sU = ss;
-    //      int sF = LLs * sU;
-    //      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    //    }
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
-    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
-    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
-    //      int sU = ss;
-    //      int sF = LLs * sU;
-    //      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-    //    }
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -375,78 +375,47 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 						      const FermionField &in,
 						      FermionField &out, int dag) {
  assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
+
  Compressor compressor(dag);
  int len =  U.Grid()->oSites();
-  const int LLs =  1;

+  /////////////////////////////
+  // Start comms (TODO: should intranode and internode gathers be handled separately?)
+  /////////////////////////////
+  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
  st.HaloGather(in,compressor);
+  st.CommunicateBegin(requests);
+
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-      auto U_v   = U.View();
-      auto in_v  = in.View();
-      auto out_v = out.View();
-      auto st_v  = st.View();
-      int Opt = WilsonKernelsStatic::Opt;

-      if (dag == DaggerYes) {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-	  Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
-	  //	  Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
-	}
-      } 
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  int Opt = WilsonKernelsStatic::Opt;
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
+  } 

-    } else {
-      st.CommunicateThreaded();
-    }
-  }  //pragma
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  st.CommunicateComplete(requests);
+  st.CommsMerge(compressor);

-  {
-    auto U_v   = U.View();
-    auto in_v  = in.View();
-    auto out_v = out.View();
-    auto st_v  =  st.View();
-    int Opt = WilsonKernelsStatic::Opt;
-    if (dag == DaggerYes) {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    } else {
-      thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
-	Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
-      });
-    }
+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
+  if (dag == DaggerYes) {
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
+  } else {
+    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
-#else
-  assert(0);
-#endif
 };


--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
@@ -73,7 +73,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  return;
 }

-#ifdef GPU_VEC
+#if 1
 #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  if (SE._is_local) {							\
    int mask = Nsimd >> (ptype + 1);					\
@@ -96,7 +96,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
      spProj(chi, in_t);						\
    }									\
  } else {								\
-    chi  = buf[SE._offset+s];						\
+    chi  = buf[SE._offset+s];						\
  }									\
  synchronise();
 #endif
@@ -106,15 +106,9 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
 							    SiteHalfSpinor *buf, int Ls, int s,
 							    int sU, const FermionFieldView &in, FermionFieldView &out)
 {
-#ifdef __CUDA_ARCH__
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor     result;
-#endif

  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
@@ -173,11 +167,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTm(result, Uchi);
-#ifdef GPU_VEC
-  insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
+    insertLane (lane,out[sF],result);
  }
 }

@@ -186,15 +176,10 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
 							 SiteHalfSpinor *buf,  int Ls, int s,
 							 int sU, const FermionFieldView &in, FermionFieldView &out) 
 {
-#ifdef __CUDA_ARCH__
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
-#else 
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor     result;
-#endif
+
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
@@ -255,11 +240,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);

-#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
-#else
-  vstream(out[sF], result);
-#endif
  }
 };

@@ -287,6 +268,25 @@ GPU_EMPTY(GparityWilsonImplFH);
 GPU_EMPTY(GparityWilsonImplD);
 GPU_EMPTY(GparityWilsonImplDF);

+#define KERNEL_CALL(A) \
+      const uint64_t nsimd = Simd::Nsimd(); \
+      const uint64_t    NN = Nsite*Ls*nsimd;\
+      accelerator_loopN( sss, NN, {         \
+	  uint64_t cur  = sss;              \
+	  cur = cur / nsimd;                \
+	  uint64_t   s  = cur%Ls;           \
+	  cur = cur / Ls;                   \
+	  uint64_t   sU = cur;              \
+	  WilsonKernels<Impl>::A(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);\
+      });
+ 
+#define HOST_CALL(A) \
+  accelerator_loopN( ss, Ls*Nsite, {					\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v);	\
+  });
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -297,25 +297,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    auto out_v = out.View();
    auto st_v  =  st.View();

-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-#define KERNEL_CALL(A) \
-      const uint64_t nsimd = Simd::Nsimd(); \
-      const uint64_t    NN = Nsite*Ls*nsimd;\
-      accelerator_loopN( sss, NN, {         \
-	  uint64_t cur  = sss;              \
-	  cur = cur / nsimd;                \
-	  uint64_t   s  = cur%Ls;           \
-	  cur = cur / Ls;                   \
-	  uint64_t   sU = cur; 
-	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-        int sF = Ls * sU;
-        WilsonKernels<Impl>::GenericDhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
-    }
+   if( interior && exterior ) { 
+     if (Opt == WilsonKernelsStatic::OptGpu) {
+       KERNEL_CALL(GpuDhopSite);
+     } else {
+       HOST_CALL(GenericDhopSite);
+     }
+   } else if( interior ) {
+     HOST_CALL(GenericDhopSiteInt);
+   } else if( exterior ) { 
+     HOST_CALL(GenericDhopSiteExt);
+   }
+
  }
  template <class Impl>
  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -327,25 +320,16 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    auto out_v = out.View();
    auto st_v  = st.View();

-    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
-      const uint64_t nsimd = Simd::Nsimd();
-      const uint64_t    NN = Nsite*Ls*nsimd;
-      accelerator_loopN( sss, NN, {
-	  uint64_t cur  = sss;
-	  //	  uint64_t lane = cur % nsimd;
-	  cur = cur / nsimd;
-	  uint64_t   s  = cur%Ls;
-	  //	  uint64_t   sF = cur;         
-	  cur = cur / Ls;
-	  uint64_t   sU = cur;
-	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
-      });
-    } else { 
-      accelerator_loop( ss, U_v, {
-	int sU = ss;
-	int sF = Ls * sU;
-	WilsonKernels<Impl>::GenericDhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
-      });
+    if( interior && exterior ) { 
+      if (Opt == WilsonKernelsStatic::OptGpu) {
+	KERNEL_CALL(GpuDhopSiteDag);
+      } else {
+	HOST_CALL(GenericDhopSiteDag);
+      }
+    } else if( interior ) {
+      HOST_CALL(GenericDhopSiteDagInt);
+    } else if( exterior ) { 
+      HOST_CALL(GenericDhopSiteDagExt);
    }
  }

--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -267,7 +267,6 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  int ptype;

  SE = st.GetEntry(ptype, dir, sF);
-  //  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
  if (gamma == Xp) {						
    if (SE->_is_local ) {					
      int perm= SE->_permute;					
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermion.h
@@ -1,97 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-
-NAMESPACE_BEGIN(Grid);
-
-/*
- * BF sequence
- *
- void bfmbase<Float>::MooeeInv(Fermion_t psi, 
- Fermion_t chi, 
- int dag, int cb)
-
- double m    = this->mass;
- double tm   = this->twistedmass;
- double mtil = 4.0+this->mass;
-
- double sq = mtil*mtil + tm*tm;
-
- double a = mtil/sq;
- double b = -tm /sq;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-
- void bfmbase<Float>::Mooee(Fermion_t psi, 
- Fermion_t chi, 
- int dag,int cb)
- double a = 4.0+this->mass;
- double b = this->twistedmass;
- if(dag) b=-b;
- axpibg5x(chi,psi,a,b);
-*/
-
-template<class Impl>
-void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
-  RealD a = 4.0+this->mass;
-  RealD b = -this->mu;
-  out.Checkerboard() = in.Checkerboard();
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = -tm /sq;
-  axpibg5x(out,in,a,b);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
-  RealD m    = this->mass;
-  RealD tm   = this->mu;
-  RealD mtil = 4.0+m;
-  RealD sq   = mtil*mtil+tm*tm;
-  RealD a    = mtil/sq;
-  RealD b    = tm /sq;
-  axpibg5x(out,in,a,b);
-}
-
-FermOpTemplateInstantiate(WilsonTMFermion);
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/CayleyFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/CayleyFermion5DInstantiation.cc
@@ -0,0 +1,45 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/CayleyFermion5DInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
+
+			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
+			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// FIXME: Break these instantiations out into separate files so that a parallel make can build them faster
+FermOpTemplateInstantiate(CayleyFermion5D);
+GparityFermOpTemplateInstantiate(CayleyFermion5D);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/ContinuedFractionFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ContinuedFractionFermion5DInstantiation.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
+#include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+FermOpTemplateInstantiate(ContinuedFractionFermion5D);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/DomainWallEOFAFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/DomainWallEOFAFermionInstantiation.cc
@@ -0,0 +1,45 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
+
+Copyright (C) 2017
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#include <Grid/Grid_Eigen_Dense.h>
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
+#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+FermOpTemplateInstantiate(DomainWallEOFAFermion);
+GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermion5DInstantiation.cc
@@ -0,0 +1,46 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+#include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h>
+#include <Grid/perfmon/PerfCount.h>
+
+NAMESPACE_BEGIN(Grid);
+  
+// S-direction is INNERMOST and takes no part in the parity.
+const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
+const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+
+FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
+FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
+  
+NAMESPACE_END(Grid);
+
+
+
--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#include <Grid.h>
+#include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
+const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+
+FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/PartialFractionFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/PartialFractionFermion5DInstantiation.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
+#include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+ 
+FermOpTemplateInstantiate(PartialFractionFermion5D);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/StaggeredKernelsInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredKernelsInstantiation.cc
@@ -0,0 +1,43 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/StaggeredKernelsInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h>
+
+
+NAMESPACE_BEGIN(Grid);
+
+int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
+int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
+
+FermOpStaggeredTemplateInstantiate(StaggeredKernels);
+FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+
+    Copyright (C) 2017
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+FermOpTemplateInstantiate(WilsonCloverFermion);         // the *TemplateInstantiate macros are defined outside this file; presumably explicit instantiations - confirm
+AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
+//GparityFermOpTemplateInstantiate(WilsonCloverFermion); // commented out, so the Gparity macro is not applied to WilsonCloverFermion (reason not recorded here)
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+FermOpTemplateInstantiate(WilsonFermion5D);        // macro is defined outside this file; presumably expands to explicit template instantiations - confirm
+GparityFermOpTemplateInstantiate(WilsonFermion5D); // same template, Gparity instantiation macro
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
@@ -0,0 +1,46 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3}); // 8 entries: direction indices 0..3 listed twice
+const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1}); // same length as directions: +1 for the first four entries, -1 for the last four
+int WilsonFermionStatic::HandOptDslash; // no initialiser: static storage duration, so it is zero-initialised
+
+FermOpTemplateInstantiate(WilsonFermion); // the *TemplateInstantiate macros are defined outside this file; presumably explicit instantiations - confirm
+AdjointFermOpTemplateInstantiate(WilsonFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonFermion);
+GparityFermOpTemplateInstantiate(WilsonFermion);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Definitions of the WilsonKernelsStatic members with their initial values. TODO (original note): move these elsewhere.
+int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
+int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
+
+// FIXME: break these instantiations out (presumably into separate translation units) so that make can build them in parallel
+FermOpTemplateInstantiate(WilsonKernels);
+GparityFermOpTemplateInstantiate(WilsonKernels); // Specialisation in Gparity forces instantiation
+AdjointFermOpTemplateInstantiate(WilsonKernels);
+TwoIndexFermOpTemplateInstantiate(WilsonKernels);
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/WilsonTMFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTMFermionInstantiation.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+FermOpTemplateInstantiate(WilsonTMFermion); // macro is defined outside this file; presumably expands to explicit template instantiations - confirm
+
+NAMESPACE_END(Grid);
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -267,11 +267,12 @@ struct getVectorType{
 template<typename T>
 class isSIMDvectorized{
  template<typename U>
-  static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   
-						 typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
+  static typename std::enable_if< 
+    !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   
+                   typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, 
+    char>::type test(void *);

-  template<typename U>
-  static double test(...);
+  template<typename U> static double test(...);
  
 public:
  enum {value = sizeof(test<T>(0)) == sizeof(char) };
--- a/1
+++ b/1
@@ -1 +0,0 @@
-README.md
--- a/407
+++ b/407
@@ -0,0 +1,407 @@
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
+
+**Data parallel C++ mathematical object library.**
+
+License: GPL v2.
+
+Last update June 2017.
+
+_Please do not send pull requests to the `master` branch which is reserved for releases._
+
+
+
+### Description
+This library provides data parallel C++ container classes with internal memory layout
+that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
+are provided, similar to HPF and cmfortran, and user control is given over the mapping of
+array indices to both MPI tasks and SIMD processing elements.
+
+* Identically shaped arrays can then be processed with perfect data parallelisation.
+* Such identically shaped arrays are called conformable arrays.
+
+The transformation is based on the observation that Cartesian array processing involves
+identical processing to be performed on different regions of the Cartesian array.
+
+The library will both geometrically decompose into MPI tasks and across SIMD lanes.
+Local vector loops are parallelised with OpenMP pragmas.
+
+Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
+optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
+for most programmers.
+
+The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
+Presently SSE4 and ARM NEON (128 bits); AVX, AVX2 and QPX (256 bits); and IMCI and AVX512 (512 bits) targets are supported.
+
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+
+MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
+
+
+### Compilers
+
+Intel ICPC v16.0.3 and later
+
+Clang v3.5 and later (need 3.8 and later for OpenMP)
+
+GCC   v4.9.x (recommended)
+
+GCC   v6.3 and later
+
+### Important: 
+
+Some versions of GCC appear to have a bug under high optimisation (-O2, -O3).
+
+The safety of these compiler versions cannot be guaranteed at this time. Follow Issue 100 for details and updates.
+
+GCC   v5.x
+
+GCC   v6.1, v6.2
+
+### Bug report
+
+_To help us track and solve issues with Grid more efficiently, please report problems using the GitHub issue system rather than sending emails to Grid developers._
+
+When you file an issue, please go through the following checklist:
+
+1. Check that the code is pointing to the `HEAD` of `develop` or any commit in `master` which is tagged with a version number. 
+2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output of the `--version` option of your compiler.
+3. Give the exact `configure` command used.
+4. Attach `config.log`.
+5. Attach `grid.config.summary`.
+6. Attach the output of `make V=1`.
+7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
+
+### Required libraries
+Grid requires:
+
+[GMP](https://gmplib.org/), 
+
+[MPFR](http://www.mpfr.org/) 
+
+Bootstrapping Grid downloads the Eigen library, which is used internally for dense matrix (non-QCD) operations.
+
+Grid optionally uses:
+
+[HDF5](https://support.hdfgroup.org/HDF5/)  
+
+[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support. 
+
+[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
+
+LAPACK either generic version or Intel MKL library.
+
+
+### Quick start
+First, start by cloning the repository:
+
+``` bash
+git clone https://github.com/paboyle/Grid.git
+```
+
+Then enter the cloned directory and set up the build system:
+
+``` bash
+cd Grid
+./bootstrap.sh
+```
+
+Now you can execute the `configure` script to generate makefiles (here from a build directory):
+
+``` bash
+mkdir build; cd build
+../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
+```
+
+where `--enable-precision=` sets the default precision,
+`--enable-simd=` sets the SIMD type, `--enable-comms=` sets the communication
+interface, and `<path>` should be replaced by the prefix path where you want to
+install Grid. Other options are detailed in the next section; you can also use
+`configure --help` to display them. As with any other program using GNU autotools, the
+`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
+customise the build.
+
+Finally, you can build, check, and install Grid:
+
+``` bash
+make; make check; make install
+```
+
+To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
+
+``` bash
+make -C tests/<subdir> tests
+```
+If you want to build all the tests at once just use `make tests`.
+
+### Build configuration options
+
+- `--prefix=<path>`: installation prefix for Grid.
+- `--with-gmp=<path>`: look for GMP in the UNIX prefix `<path>`
+- `--with-mpfr=<path>`: look for MPFR in the UNIX prefix `<path>`
+- `--with-fftw=<path>`: look for FFTW in the UNIX prefix `<path>`
+- `--enable-lapack[=<path>]`: enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
+- `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
+- `--enable-numa`: enable NUMA first touch optimisation
+- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
+- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
+- `--enable-precision={single|double}`: set the default precision (default: `double`).
+- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
+- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo`).
+- `--disable-timers`: disable system dependent high-resolution timers.
+- `--enable-chroma`: enable Chroma regression tests.
+- `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
+
+### Possible communication interfaces
+
+The following options can be used with the `--enable-comms=` option to target different communication interfaces:
+
+| `<comm>`       | Description                                                   |
+| -------------- | ------------------------------------------------------------- |
+| `none`         | no communications                                             |
+| `mpi[-auto]`   | MPI communications                                            |
+| `mpi3[-auto]`  | MPI communications using MPI 3 shared memory                  |
+| `shmem`        | Cray SHMEM communications                                     |
+
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
+
+### Possible SIMD types
+
+The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
+
+| `<code>`    | Description                            |
+| ----------- | -------------------------------------- |
+| `GEN`       | generic portable vector code           |
+| `SSE4`      | SSE 4.2 (128 bit)                      |
+| `AVX`       | AVX (256 bit)                          |
+| `AVXFMA`    | AVX (256 bit) + FMA                    |
+| `AVXFMA4`   | AVX (256 bit) + FMA4                   |
+| `AVX2`      | AVX 2 (256 bit)                        |
+| `AVX512`    | AVX 512 bit                            |
+| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     |
+| `QPX`       | IBM QPX (256 bit)                      |
+
+Alternatively, some CPU codenames can be directly used:
+
+| `<code>`    | Description                            |
+| ----------- | -------------------------------------- |
+| `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `SKL`       | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
+| `BGQ`       | Blue Gene/Q                            |
+
+#### Notes:
+- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler is more advanced.
+- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
+- BG/Q performance is currently rather poor. This is being investigated for future versions.
+- The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.
+
+### Build setup for Intel Knights Landing platform
+
+The following configuration is recommended for the Intel Knights Landing platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi-auto  \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi       \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+Knights Landing with Intel Omnipath adapters with two adapters per node 
+presently performs better with use of more than one rank per node, using shared memory 
+for interior communication. This is the mpi3 communications implementation. 
+We recommend four ranks per node for best performance, but optimum is local volume dependent.
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi3-auto \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc 
+```
+
+### Build setup for Intel Haswell Xeon platform
+
+The following configuration is recommended for the Intel Haswell platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3-auto \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+        export I_MPI_PIN=1
+```
+This is the default.
+
+### Build setup for Intel Skylake Xeon platform
+
+The following configuration is recommended for the Intel Skylake platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+``` 
+        export I_MPI_PIN=1
+```
+This is the default. 
+
+#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping: 
+
+mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
+
+TBA
+
+
+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
+So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain.
+There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended with the use of four ranks per socket,
+and 8 threads per rank. 
+
+The following configuration is recommended for the AMD EPYC platform.
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3 \
+             CXX=mpicxx 
+```
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
+
+It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 
+
+Where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
+Performance:
+
+#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping: 
+
+mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+TBA
+
+### Build setup for BlueGene/Q
+
+To be written...
+
+### Build setup for ARM Neon
+
+To be written...
+
+### Build setup for laptops, other compilers, non-cluster builds
+
+Many versions of g++ and clang++ work with Grid; using them merely involves replacing CXX (and MPICXX)
+and omitting the `--enable-mkl` flag. 
+
+Single node builds are enabled with 
+```
+            --enable-comms=none
+```
+
+FFTW support that is not in the default search path may then be enabled with
+```
+    --with-fftw=<installpath>
+```
+
+BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
+
--- a/7
+++ b/7
@@ -3,19 +3,23 @@
 GPU branch code item work list
 -----------------------------

+
+
 1) Common source GPU and CPU generic kernels???
   - coalescedRead, coalescedWrite in expressions.
   - Uniform coding between GPU kernels and CPU kernels attempt
+   - Clean up PRAGMAS

 -- Figure what to do about "multLinkGpu" etc.. in FermionOperatorImpl.
 -- Gparity is the awkward one
 -- Solve non-Gparity first.
+-- Simplify the operator IMPL support

 2) - SIMD dirs in stencil

 3) Merge develop and test HMC

-4)  GPU accelerate EOFA
+4) GPU accelerate EOFA

 5) Accelerate the cshift

@@ -43,7 +47,6 @@ Single GPU simd target (VGPU)

 15) Staggered kernels inline for GPU

-
 -----
 Gianluca's changes
 - Performance impact of construct in aligned allocator???
--- a/include/Grid
+++ b/include/Grid
@@ -1 +0,0 @@
-../lib
--- a/scripts/loop.log
+++ b/scripts/loop.log