Merge pull request #1 from paboyle/master

Sync with Peter
2025-11-01 20:44:33 +00:00 · 2015-08-19 17:27:31 +02:00
parent b0eedfd7ba fdfe194c41
commit dd498f993e
168 changed files with 30055 additions and 2530 deletions
--- a/76
+++ b/76
@@ -1,15 +1,61 @@
 - PseudoFermions
 Done: Cayley, Partial , ContFrac force terms.
 Done:
  - TwoFlavour
  - TwoFlavourEvenOdd        
  - TwoFlavourRatio
  - TwoFlavourRatioEvenOdd
 Done:
  - OneFlavourRationalEvenOdd
  - OneFlavourRationalRatioEvenOdd
  - OneFlavourRationalRatio
 TODO:
 => generalise to non-const EE
 => Test DWF HMC
 => Clean up HMC
   - Fix a threading bug that has been introduced and prevents HMC running hybrid OMP mode
 => Integrators
  - Force Gradient
  - Multi-timescale looks broken and is operating on a single timescale for now.
    Fix/debug/rewrite this 
  - Sign of force term.
  - Prefer "RefreshInternal" or such like to "init" in naming
  - Rename "Ta" as too unclear
 - MacroMagic -> virtual reader class.
 - Link smearing/boundary conds; Policy class based implementation
 - Rectangle gauge actions.
  Iwasaki,
  Symanzik,
  ... etc...
 - Prepare multigrid for HMC.
  Alternate setup schemes.
 - RNG filling from sparser grid, lower dim grid.
 ================================================================
 *** Hacks and bug fixes to clean up and Audits
 ================================================================
 *  Extract/merge/set cleanup ; too many variants; rationalise and call simpler ones
-*  Used #define repetitive sequences to minimise code.
+
 *  Rewrite core tensor arithmetic support to be more systematic
 =  Use #define repetitive sequences to minimise code; decreasing the line count by thousands is possible,
    with a more robust and maintainable implementation.
 *  Ensure we ET as much as possible; move unop functions into ET framework.
   - tests with expression args to all functions
 * FIXME audit
 * const audit
 Insert/Extract
@@ -22,10 +68,10 @@ Insert/Extract
 * Thread scaling tests Xeon, XeonPhi
-** Make the Tensor types and Complex etc... play more nicely.
+* Make the Tensor types and Complex etc... play more nicely.
  - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
-  QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
+    QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken by TensorRemove, but I
-  want to introduce a syntax that does not require this.
+    want to introduce a syntax that does not require this.
  - Reductions that contract indices on a site should always demote the tensor structure.
    norm2(), innerProduct.
@@ -54,13 +100,6 @@ Insert/Extract
   // localMaxAbs
   // Fourier transform equivalent.]
 ================================================================
 *** New Functionality
 ================================================================
 * - BinaryWriter, TextWriter etc...
  - use protocol buffers? replace xmlReader/Writer etc..
  - Binary use htonll, htonl
 * CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
@@ -103,14 +142,15 @@ Algorithms (lots of reuse/port from BFM)
 * Gauge
  - Wilson, symanzik, iwasaki
 * rb4d support for 5th dimension in Mobius.
 * Flavour matrices?
 * Pauli, SU subgroup, etc.. 
 * su3 exponentiation & log etc.. [Jamie's code?]
 * TaProj
 * FFTnD ?
 * Pauli, SU subgroup, etc.. 
 * su3 exponentiation & log etc.. [Jamie's code?]
 * TaProj
 * FFTnD ?
 ======================================================================================================
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -11,15 +11,15 @@ int main (int argc, char ** argv)
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  int Nloop=10;
  int nmu=0;
  for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
@@ -87,15 +87,15 @@ int main (int argc, char ** argv)
      double time = stop-start; // microseconds
-      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
  for(int lat=4;lat<=32;lat+=2){
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
      double time = stop-start;
-      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }  
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -21,7 +21,7 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=8;
@@ -79,9 +79,9 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5  =1.8;
-  DomainWallFermion Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  std::cout << "Calling Dw"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=10;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
@@ -92,12 +92,12 @@ int main (int argc, char ** argv)
  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
  double flops=1344*volume*ncall;
-  std::cout << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  if (1)
@@ -120,11 +120,11 @@ int main (int argc, char ** argv)
    ref = -0.5*ref;
  }
  Dw.Dhop(src,result,1);
-  std::cout << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
@@ -133,24 +133,32 @@ int main (int argc, char ** argv)
  LatticeFermion r_eo  (FGrid);
-  std::cout << "Calling Deo and Doe"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);
  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
-  Dw.Dhop(src,result,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
  setCheckerboard(r_eo,r_o);
  setCheckerboard(r_eo,r_e);
  err = r_eo-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
-  std::cout << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
-  std::cout << "norm diff odd   "<< norm2(src_o)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -17,13 +17,13 @@ int main (int argc, char ** argv)
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  for(int lat=4;lat<=32;lat+=4){
@@ -49,15 +49,15 @@ int main (int argc, char ** argv)
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
    }
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking a*x + y bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  for(int lat=4;lat<=32;lat+=4){
@@ -81,14 +81,14 @@ int main (int argc, char ** argv)
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
    }
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SCALE bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SCALE bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
  for(int lat=4;lat<=32;lat+=4){
@@ -114,15 +114,15 @@ int main (int argc, char ** argv)
      double bytes=2*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*1;// mul
-      std::cout <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
  }
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking READ bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  for(int lat=4;lat<=32;lat+=4){
@@ -147,7 +147,7 @@ int main (int argc, char ** argv)
      double bytes=vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
  }    
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -14,15 +14,15 @@ int main (int argc, char ** argv)
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -43,18 +43,18 @@ int main (int argc, char ** argv)
      double bytes=3.0*vol*Nc*Nc*sizeof(Complex);
      double footprint=2.0*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6.0+8.0+8.0)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
    }
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  z= x*y"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x*y"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -75,17 +75,17 @@ int main (int argc, char ** argv)
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
    }
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -106,17 +106,17 @@ int main (int argc, char ** argv)
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
    }
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  mac(z,x,y)"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mac(z,x,y)"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -137,7 +137,7 @@ int main (int argc, char ** argv)
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(8+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
    }
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -28,10 +28,10 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-  std::cout << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
-  std::cout << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
-  std::cout << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);
@@ -58,8 +58,8 @@ int main (int argc, char ** argv)
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
    if(0) {
-      if (nn==-1) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
+      if (nn==-1) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
-      else       { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
+      else       { U[nn] = cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
    }
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
@@ -87,9 +87,9 @@ int main (int argc, char ** argv)
  }
  ref = -0.5*ref;
  RealD mass=0.1;
-  WilsonFermion Dw(Umu,Grid,RBGrid,mass);
+  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
-  std::cout << "Calling Dw"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=10000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
@@ -98,12 +98,12 @@ int main (int argc, char ** argv)
  double t1=usecond();
  double flops=1344*volume*ncall;
-  std::cout << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  //  for(int ss=0;ss<10;ss++ ){
@@ -112,7 +112,7 @@ int main (int argc, char ** argv)
      for(int j=0;j<Nc;j++){
 	ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);
 	ComplexF * res_p = (ComplexF *)&result._odata[ss]()(i)(j);
-	std::cout << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
+	std::cout<<GridLogMessage << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
      }
    }
  }
@@ -136,11 +136,11 @@ int main (int argc, char ** argv)
  }
  ref = -0.5*ref;
  Dw.Dhop(src,result,1);
-  std::cout << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  Grid_finalize();
 }
--- a/8112
+++ b/8112
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-06-09 15:26:39 neo>
+# Time-stamp: <2015-07-10 17:46:21 neo>
 AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -11,7 +11,7 @@ AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([lib/Grid.h])
-AC_CONFIG_HEADERS([lib/GridConfig.h])
+AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 AC_MSG_NOTICE([
@@ -26,10 +26,9 @@ AC_LANG(C++)
 AC_PROG_CXX
 AC_OPENMP
 AC_PROG_RANLIB
-AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
+#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
 AX_EXT
 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)
@@ -66,7 +65,6 @@ AC_CHECK_LIB([mpfr],[mpfr_init],,
 Please install or provide the correct path to your installation
 Info at: http://www.mpfr.org/)])
 AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\
 	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\
 	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
@@ -106,9 +104,9 @@ case ${ac_SIMD} in
       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
       supported="cross compilation"
     ;;
-     NEONv7)
+     NEONv8)
-       echo Configuring for experimental ARMv7 support 
+       echo Configuring for experimental ARMv8a support 
-       AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
+       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
       supported="cross compilation"
     ;;
     DEBUG)
@@ -155,15 +153,15 @@ AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
-echo
+#echo
-echo Checking doxygen support 
+#echo Checking doxygen support 
-echo :::::::::::::::::::::::::::::::::::::::::::
+#echo :::::::::::::::::::::::::::::::::::::::::::
-AC_PROG_DOXYGEN
+#AC_PROG_DOXYGEN
-if test -n "$DOXYGEN"
+#if test -n "$DOXYGEN"
-then
+#then
-AC_CONFIG_FILES([docs/doxy.cfg])
+#AC_CONFIG_FILES([docs/doxy.cfg])
-fi
+#fi
 echo
 echo Creating configuration files
--- a/gcc-bug-report/broken.cc
+++ b/gcc-bug-report/broken.cc
@@ -29,12 +29,12 @@ public:
 template<int N,class obj,typename std::enable_if<N==obj::NestLevel >::type * = nullptr > auto function(const obj &arg)-> obj
 {
-  std::cout<<"Leaf "<<obj::NestLevel<<std::endl;
+  std::cout<<GridLogMessage<<"Leaf "<<obj::NestLevel<<std::endl;
  return arg;
 }
 template<int N,class obj,typename std::enable_if<N!=obj::NestLevel >::type * = nullptr > auto function(const obj &arg)-> obj
 {
-  std::cout<<"Node "<<obj::NestLevel<<std::endl;
+  std::cout<<GridLogMessage<<"Node "<<obj::NestLevel<<std::endl;
  obj ret;
  ret.internal=function<N>(arg.internal);
  return ret;
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -1,6 +1,13 @@
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H
 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
 #endif
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
 #include <immintrin.h>
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
--- a/lib/GridConfig.h
+++ b/lib/GridConfig.h
@@ -1,5 +1,5 @@
-/* lib/GridConfig.h.  Generated from GridConfig.h.in by configure.  */
+/* lib/Config.h.  Generated from Config.h.in by configure.  */
-/* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
+/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
 /* AVX Intrinsics */
 /* #undef AVX1 */
@@ -34,9 +34,6 @@
 /* Support AVX2 (Advanced Vector Extensions 2) instructions */
 /* #undef HAVE_AVX2 */
 /* define if the compiler supports basic C++11 syntax */
 /* #undef HAVE_CXX11 */
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #define HAVE_DECL_BE64TOH 1
@@ -120,8 +117,8 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1
-/* NEON ARMv7 Experimental support */
+/* NEON ARMv8 Experimental support */
-/* #undef NEONv7 */
+/* #undef NEONv8 */
 /* Name of package */
 #define PACKAGE "grid"
--- a/lib/GridConfig.h.in
+++ b/lib/GridConfig.h.in
@@ -1,4 +1,4 @@
-/* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
+/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
 /* AVX Intrinsics */
 #undef AVX1
@@ -33,9 +33,6 @@
 /* Support AVX2 (Advanced Vector Extensions 2) instructions */
 #undef HAVE_AVX2
 /* define if the compiler supports basic C++11 syntax */
 #undef HAVE_CXX11
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #undef HAVE_DECL_BE64TOH
@@ -119,8 +116,8 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H
-/* NEON ARMv7 Experimental support */
+/* NEON ARMv8 Experimental support */
-#undef NEONv7
+#undef NEONv8
 /* Name of package */
 #undef PACKAGE
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -6,92 +6,48 @@
 //  Copyright (c) 2014 University of Edinburgh. All rights reserved.
 //
 #ifndef GRID_H
 #define GRID_H
 ///////////////////
 // Std C++ dependencies
 ///////////////////
 #include <cassert>
 #include <complex>
 #include <vector>
 #include <iostream>
 #include <iomanip>
 #include <random>
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
 #include <sys/time.h>
 #include <chrono>
-#ifndef MAX
+///////////////////
-#define MAX(x,y) ((x)>(y)?(x):(y))
+// Grid headers
-#define MIN(x,y) ((x)>(y)?(y):(x))
+///////////////////
-#endif
+#include <MacroMagic.h>
-
+#include <Config.h>
-#define strong_inline __attribute__((always_inline)) inline
+#include <Timer.h>
-
+#include <Log.h>
 #include <GridConfig.h>
 ////////////////////////////////////////////////////////////
 // Tunable header includes
 ////////////////////////////////////////////////////////////
 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
 #endif
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
 #include <AlignedAllocator.h>
 #include <Simd.h>
 #include <Threads.h>
-
+#include <Communicator.h> 
-#include <Communicator.h> // subdir aggregate
+#include <Cartesian.h>    
-#include <Cartesian.h> // subdir aggregate
+#include <Tensors.h>      
-#include <Tensors.h>   // subdir aggregate
+#include <Lattice.h>      
-#include <Lattice.h>   // subdir aggregate
+#include <Cshift.h>       
-#include <Cshift.h>    // subdir aggregate
+#include <Stencil.h>      
-#include <Stencil.h>   // subdir aggregate
+#include <Algorithms.h>   
 #include <Algorithms.h>// subdir aggregate
 #include <qcd/QCD.h>
 #include <parallelIO/NerscIO.h>
-namespace Grid {
+#include <Init.h>
  void Grid_init(int *argc,char ***argv);
  void Grid_finalize(void);
  // internal, controled with --handle
  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
  void Grid_debug_handler_init(void);
  void Grid_quiesce_nodes(void);
  void Grid_unquiesce_nodes(void);
  // C++11 time facilities better?
  double usecond(void);
  const std::vector<int> GridDefaultSimd(int dims,int nsimd);
  const std::vector<int> &GridDefaultLatt(void);
  const std::vector<int> &GridDefaultMpi(void);
  const int              &GridThreads(void)  ;
  void                 GridSetThreads(int t) ;
  // Common parsing chores
  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
  bool        GridCmdOptionExists(char** begin, char** end, const std::string& option);
  std::string GridCmdVectorIntToString(const std::vector<int> & vec);
  void GridParseLayout(char **argv,int argc,
 		       std::vector<int> &latt,
 		       std::vector<int> &simd,
 		       std::vector<int> &mpi);
 };
 #endif
--- a/lib/GridInit.cc
+++ b/lib/GridInit.cc
@@ -25,17 +25,19 @@
 namespace Grid {
-  //////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
-  // Convenience functions to access stadard command line arg
+// Convenience functions to access standard command line arg
-  // driven parallelism controls
+// driven parallelism controls
-  //////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
-  static std::vector<int> Grid_default_latt;
+static std::vector<int> Grid_default_latt;
-  static std::vector<int> Grid_default_mpi;
+static std::vector<int> Grid_default_mpi;
 int GridThread::_threads;
  int GridThread::_threads;
-  const std::vector<int> GridDefaultSimd(int dims,int nsimd)
+const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
-  {
+const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
 const std::vector<int> GridDefaultSimd(int dims,int nsimd)
 {
    std::vector<int> layout(dims);
    int nn=nsimd;
    for(int d=dims-1;d>=0;d--){
@@ -48,15 +50,11 @@ namespace Grid {
    }
    assert(nn==1);
    return layout;
-  }
+}
-  const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
+////////////////////////////////////////////////////////////
-  const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
+// Command line parsing assist for stock controls
-
+////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////
  // Command line parsing assist for stock controls
  ////////////////////////////////////////////////////////////
 std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option)
 {
  char ** itr = std::find(begin, end, option);
@@ -70,6 +68,23 @@ bool GridCmdOptionExists(char** begin, char** end, const std::string& option)
 {
  return std::find(begin, end, option) != end;
 }
  // Comma separated list
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec)
 {
   // Split `str` at every ',' and store the pieces, in order, in `vec`.
   // Previous contents of `vec` are discarded. Empty fields are kept, so the
   // result always holds (number of commas + 1) entries; an empty input
   // therefore yields a single empty string.
   const std::string comma(",");
   vec.clear();
   std::string::size_type fieldStart = 0;
   std::string::size_type hit = str.find(comma, fieldStart);
   while (hit != std::string::npos) {
     vec.push_back(str.substr(fieldStart, hit - fieldStart));
     fieldStart = hit + comma.length();
     hit = str.find(comma, fieldStart);
   }
   // Whatever follows the last comma (possibly nothing) is the final field.
   vec.push_back(str.substr(fieldStart));
 }
 void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
 {
@@ -84,6 +99,7 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
  return;
 }
 void GridParseLayout(char **argv,int argc,
 		     std::vector<int> &latt,
 		     std::vector<int> &mpi)
@@ -117,8 +133,9 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
  std::copy(vec.begin(), vec.end(),std::ostream_iterator<int>(oss, " "));
  return oss.str();
 }
-  /////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////
-  /////////////////////////////////////////////////////////
+//
 /////////////////////////////////////////////////////////
 void Grid_init(int *argc,char ***argv)
 {
 #ifdef GRID_COMMS_MPI
@@ -126,15 +143,33 @@ void Grid_init(int *argc,char ***argv)
 #endif
  // Parse command line args.
  GridLogger::StopWatch.Start();
  std::string arg;
  std::vector<std::string> logstreams;
  std::string defaultLog("Error,Warning,Message,Performance");
  GridCmdOptionCSL(defaultLog,logstreams);
  GridLogConfigure(logstreams);
  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
-    std::cout<<"--help : this message"<<std::endl;
+    std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
-    std::cout<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
+    std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
-    std::cout<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
+    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
-    std::cout<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
+    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
-    std::cout<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
+    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
-    std::cout<<"--omp n         : default number of OMP threads"<<std::endl;    
+    std::cout<<GridLogMessage<<"--omp n         : default number of OMP threads"<<std::endl;    
-    std::cout<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
+    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Debug"<<std::endl;    
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
    arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log");
    GridCmdOptionCSL(arg,logstreams);
    GridLogConfigure(logstreams);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -142,8 +177,7 @@ void Grid_init(int *argc,char ***argv)
    Grid_quiesce_nodes();
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-    WilsonFermion::HandOptDslash=1;
+    WilsonFermionStatic::HandOptDslash=1;
    WilsonFermion5D::HandOptDslash=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
@@ -152,38 +186,18 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"Grid Decomposition\n";
-    std::cout<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
+    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
-    std::cout<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
+    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
-    std::cout<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
-    std::cout<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
-    std::cout<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
-    std::cout<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }
 }
  ////////////////////////////////////////////////////////////
  // Verbose limiter on MPI tasks
  ////////////////////////////////////////////////////////////
  // Silence std::cout on every MPI rank except rank 0 by setting badbit on it.
  // The body is compiled only when GRID_COMMS_MPI is defined; otherwise this
  // function does nothing.
  void Grid_quiesce_nodes(void)
  {
 #ifdef GRID_COMMS_MPI
    int me;
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    if ( me ) { 
      std::cout.setstate(std::ios::badbit);
    }
 #endif
  }
  // Clear all error state bits on std::cout (including the badbit that
  // Grid_quiesce_nodes may have set). Only active when GRID_COMMS_MPI is defined.
  void Grid_unquiesce_nodes(void)
  {
 #ifdef GRID_COMMS_MPI
    std::cout.clear();
 #endif
  }
 void Grid_finalize(void)
 {
--- a/lib/Init.h
+++ b/lib/Init.h
@@ -0,0 +1,32 @@
 #ifndef GRID_INIT_H
 #define GRID_INIT_H
 namespace Grid {

  // Library start-up: parses the command line arguments passed by the caller.
  void Grid_init(int *argc,char ***argv);
  // Library shutdown counterpart of Grid_init.
  void Grid_finalize(void);

  // Internal helpers. Grid_debug_handler_init() is invoked by Grid_init when
  // the --debug-signals command line flag is present.
  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
  void Grid_debug_handler_init(void);

  // Mute / restore std::cout on MPI ranks other than rank 0
  // (no-ops unless GRID_COMMS_MPI is defined).
  void Grid_quiesce_nodes(void);
  void Grid_unquiesce_nodes(void);

  // Default decomposition accessors.
  // GridDefaultSimd returns a freshly built vector with `dims` entries;
  // GridDefaultLatt / GridDefaultMpi return references to the defaults held
  // by the library.
  const std::vector<int> GridDefaultSimd(int dims,int nsimd);
  const std::vector<int> &GridDefaultLatt(void);
  const std::vector<int> &GridDefaultMpi(void);
  const int              &GridThreads(void)  ;
  void                    GridSetThreads(int t) ;

  // Common parsing chores
  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
  // True when `option` occurs anywhere in [begin,end).
  bool        GridCmdOptionExists(char** begin, char** end, const std::string& option);
  // Formats the elements of `vec` separated by single spaces.
  std::string GridCmdVectorIntToString(const std::vector<int> & vec);

  // Layout parser as actually defined and called by Grid_init: it fills the
  // lattice and MPI decompositions only.
  void GridParseLayout(char **argv,int argc,
                       std::vector<int> &latt,
                       std::vector<int> &mpi);
  // NOTE(review): older five-argument form, kept so existing declarations-only
  // users still compile. No matching definition is visible alongside the
  // two-vector one above — confirm whether it can be retired.
  void GridParseLayout(char **argv,int argc,
                       std::vector<int> &latt,
                       std::vector<int> &simd,
                       std::vector<int> &mpi);
 }
 #endif
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -0,0 +1,62 @@
 #include <Grid.h>
 namespace Grid {
 // Stopwatch shared by all GridLogger objects; its elapsed time is the
 // time stamp printed in every log prefix.
 GridStopWatch GridLogger::StopWatch;
 // Sink handed back for inactive loggers: an ostream constructed with a null
 // stream buffer, so text streamed into it is not written anywhere.
 std::ostream  GridLogger::devnull(0);
 // The six named log streams. Each is constructed active (first argument 1);
 // GridLogConfigure() switches them on or off by name afterwards.
 GridLogger GridLogError      (1,"Error");
 GridLogger GridLogWarning    (1,"Warning");
 GridLogger GridLogMessage    (1,"Message");
 GridLogger GridLogDebug      (1,"Debug");
 GridLogger GridLogPerformance(1,"Performance");
 GridLogger GridLogIterative  (1,"Iterative");
 void GridLogConfigure(std::vector<std::string> &logstreams)
 {
   // Step 1: switch every stream off so the result depends only on the list.
   GridLogError.Active(0);
   GridLogWarning.Active(0);
   GridLogMessage.Active(0);
   GridLogIterative.Active(0);
   GridLogDebug.Active(0);
   GridLogPerformance.Active(0);

   // Step 2: re-enable each stream whose exact (case-sensitive) name is
   // listed. Names that match no stream are ignored.
   for (const std::string &wanted : logstreams) {
     if      ( wanted == "Error"       ) GridLogError.Active(1);
     else if ( wanted == "Warning"     ) GridLogWarning.Active(1);
     else if ( wanted == "Message"     ) GridLogMessage.Active(1);
     else if ( wanted == "Iterative"   ) GridLogIterative.Active(1);
     else if ( wanted == "Debug"       ) GridLogDebug.Active(1);
     else if ( wanted == "Performance" ) GridLogPerformance.Active(1);
   }
 }
 ////////////////////////////////////////////////////////////
 // Verbose limiter on MPI tasks
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void)
 {
   // Mute std::cout on every MPI rank other than rank 0 by putting the
   // stream into the bad state. Without GRID_COMMS_MPI this is a no-op.
 #ifdef GRID_COMMS_MPI
   int rank;
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   if ( rank != 0 ) std::cout.setstate(std::ios::badbit);
 #endif
 }
 void Grid_unquiesce_nodes(void)
 {
   // Undo Grid_quiesce_nodes(): reset std::cout to the good state so output
   // flows again. Without GRID_COMMS_MPI this is a no-op.
 #ifdef GRID_COMMS_MPI
   std::cout.clear(std::ios::goodbit);
 #endif
 }
 std::ostream& operator<< (std::ostream& stream, const GridTime& time)
 {
   // Print the raw tick count of `time` followed by the suffix " ms",
   // and hand the stream back so further output can be chained.
   return stream << time.count() << " ms";
 }
 }
--- a/lib/Log.h
+++ b/lib/Log.h
@@ -0,0 +1,46 @@
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
 namespace Grid {
 // Dress the output; use std::chrono for time stamping via the StopWatch class
 std::ostream& operator<< (std::ostream& stream, const GridTime& time);
 class GridLogger { 
  int active;
  std::string name;
 public:
  static GridStopWatch StopWatch;
  static std::ostream devnull;
  GridLogger(int on, std::string nm): active(on), name(nm) { 
  };
  void Active(int on) {active = on;};
  friend std::ostream& operator<< (std::ostream& stream, const GridLogger& log){
    if ( log.active ) {
      StopWatch.Stop();
      GridTime now = StopWatch.Elapsed();
      StopWatch.Start();
      stream << "Grid : "<<log.name << " : " << now << " : ";
      return stream;
    } else { 
      return devnull;
    }
  }
 };
 // Enables exactly those log streams whose names appear in `logstreams`
 // and disables the others (defined in Log.cc).
 void GridLogConfigure(std::vector<std::string> &logstreams);
 // The predefined log streams (defined in Log.cc). Typical use:
 //   std::cout << GridLogMessage << "text" << std::endl;
 extern GridLogger GridLogError;
 extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 }
 #endif
--- a/lib/MacroMagic.h
+++ b/lib/MacroMagic.h
@@ -0,0 +1,83 @@
 #ifndef GRID_MACRO_MAGIC_H
 #define GRID_MACRO_MAGIC_H
 // Force-inline qualifier: the GCC-style always_inline attribute plus `inline`.
 #define strong_inline __attribute__((always_inline)) inline
 // Function-like MAX/MIN, provided only when MAX is not already defined
 // (MIN is not checked separately). Each argument appears twice in the
 // expansion, so an argument with side effects is evaluated more than once.
 #ifndef MAX
 #define MAX(x,y) ((x)>(y)?(x):(y))
 #define MIN(x,y) ((x)>(y)?(y):(x))
 #endif
 // ---------------------------------------------------------------------------
 // Preprocessor machinery used to apply a macro to a variadic argument list.
 // NOTE(review): several helper names begin with an underscore followed by an
 // upper-case letter, a spelling the C++ standard reserves for the
 // implementation - consider renaming.
 // ---------------------------------------------------------------------------
 // Select the first / second argument of a variadic list.
 #define GRID_MACRO_FIRST(a, ...) a
 #define GRID_MACRO_SECOND(a, b, ...) b
 // Expands to nothing; used by the DEFER macros below.
 #define GRID_MACRO_EMPTY()
 // GRID_MACRO_EVAL forces its argument through many extra preprocessor
 // rescans: each level below wraps the argument twice in the next smaller
 // level, ending in GRID_MACRO_EVAL1 which just re-emits its arguments.
 #define GRID_MACRO_EVAL(...)     GRID_MACRO_EVAL1024(__VA_ARGS__)
 #define GRID_MACRO_EVAL1024(...) GRID_MACRO_EVAL512(GRID_MACRO_EVAL512(__VA_ARGS__))
 #define GRID_MACRO_EVAL512(...)  GRID_MACRO_EVAL256(GRID_MACRO_EVAL256(__VA_ARGS__))
 #define GRID_MACRO_EVAL256(...)  GRID_MACRO_EVAL128(GRID_MACRO_EVAL128(__VA_ARGS__))
 #define GRID_MACRO_EVAL128(...)  GRID_MACRO_EVAL64(GRID_MACRO_EVAL64(__VA_ARGS__))
 #define GRID_MACRO_EVAL64(...)   GRID_MACRO_EVAL32(GRID_MACRO_EVAL32(__VA_ARGS__))
 #define GRID_MACRO_EVAL32(...)   GRID_MACRO_EVAL16(GRID_MACRO_EVAL16(__VA_ARGS__))
 #define GRID_MACRO_EVAL16(...)   GRID_MACRO_EVAL8(GRID_MACRO_EVAL8(__VA_ARGS__))
 #define GRID_MACRO_EVAL8(...)    GRID_MACRO_EVAL4(GRID_MACRO_EVAL4(__VA_ARGS__))
 #define GRID_MACRO_EVAL4(...)    GRID_MACRO_EVAL2(GRID_MACRO_EVAL2(__VA_ARGS__))
 #define GRID_MACRO_EVAL2(...)    GRID_MACRO_EVAL1(GRID_MACRO_EVAL1(__VA_ARGS__))
 #define GRID_MACRO_EVAL1(...) __VA_ARGS__
 // GRID_MACRO_DEFERn(m): emit m followed by placeholder tokens that expand to
 // nothing, so that m is not invoked as a function-like macro during the
 // current scan but only on a later rescan (n controls how many are needed).
 #define GRID_MACRO_DEFER1(m) m GRID_MACRO_EMPTY()
 #define GRID_MACRO_DEFER2(m) m GRID_MACRO_EMPTY GRID_MACRO_EMPTY()()
 #define GRID_MACRO_DEFER3(m) m GRID_MACRO_EMPTY GRID_MACRO_EMPTY GRID_MACRO_EMPTY()()()
 #define GRID_MACRO_DEFER4(m) m GRID_MACRO_EMPTY GRID_MACRO_EMPTY GRID_MACRO_EMPTY GRID_MACRO_EMPTY()()()()
 // Probe trick: GRID_MACRO_IS_PROBE yields the second argument of its list
 // with 0 appended as a fallback, and GRID_MACRO_PROBE() expands to "~, 1".
 // So a probe gives 1 and any single token gives 0.
 #define GRID_MACRO_IS_PROBE(...) GRID_MACRO_SECOND(__VA_ARGS__, 0)
 #define GRID_MACRO_PROBE() ~, 1
 // Token pasting helper.
 #define GRID_MACRO_CAT(a,b) a ## b
 // Logical NOT: only the token 0 pastes to a name that is defined as a probe,
 // so NOT(0) is 1 and NOT(anything else) is 0.
 #define GRID_MACRO_NOT(x) GRID_MACRO_IS_PROBE(GRID_MACRO_CAT(_GRID_MACRO_NOT_, x))
 #define _GRID_MACRO_NOT_0 GRID_MACRO_PROBE()
 // Normalise to 0 / 1 by negating twice.
 #define GRID_MACRO_BOOL(x) GRID_MACRO_NOT(GRID_MACRO_NOT(x))
 // Usage: GRID_MACRO_IF_ELSE(cond)(text if true)(text if false).
 // The condition is normalised to 0/1 and pasted to choose _GRID_MACRO_IF_1
 // or _GRID_MACRO_IF_0; the chosen macro keeps or drops the first parenthesised
 // group and its *_ELSE partner does the opposite with the second.
 #define GRID_MACRO_IF_ELSE(condition) _GRID_MACRO_IF_ELSE(GRID_MACRO_BOOL(condition))
 #define _GRID_MACRO_IF_ELSE(condition) GRID_MACRO_CAT(_GRID_MACRO_IF_, condition)
 #define _GRID_MACRO_IF_1(...) __VA_ARGS__ _GRID_MACRO_IF_1_ELSE
 #define _GRID_MACRO_IF_0(...)             _GRID_MACRO_IF_0_ELSE
 #define _GRID_MACRO_IF_1_ELSE(...)
 #define _GRID_MACRO_IF_0_ELSE(...) __VA_ARGS__
 // 0 when the argument list is empty (the end marker is then invoked and
 // yields 0), 1 otherwise.
 #define GRID_MACRO_HAS_ARGS(...) GRID_MACRO_BOOL(GRID_MACRO_FIRST(_GRID_MACRO_END_OF_ARGUMENTS_ __VA_ARGS__)())
 #define _GRID_MACRO_END_OF_ARGUMENTS_() 0
 // GRID_MACRO_MAP(m, a1, b1, a2, b2, ...): emit m(a1,b1), then - while
 // arguments remain - re-enter itself indirectly through _GRID_MACRO_MAP on
 // the rest. The re-entry is deferred, so the whole call must be wrapped in
 // GRID_MACRO_EVAL(...) for the recursion to be carried out.
 #define GRID_MACRO_MAP(m, first, second, ...)   \
  m(first,second)                           \
  GRID_MACRO_IF_ELSE(GRID_MACRO_HAS_ARGS(__VA_ARGS__))(				       \
 				 GRID_MACRO_DEFER4(_GRID_MACRO_MAP)()(m, __VA_ARGS__)   \
 				     )(                                 \
 				       /* Do nothing, just terminate */ \
 									)
 #define _GRID_MACRO_MAP() GRID_MACRO_MAP
 // Per-pair actions fed to GRID_MACRO_MAP, where A is a type and B a member name:
 //  - GRID_MACRO_MEMBER declares the data member "A B;"
 //  - GRID_MACRO_OS_WRITE_MEMBER streams "<type> <name> = <value> ; " and a
 //    newline to a stream called `os`, reading the value from an object called `obj`.
 #define GRID_MACRO_MEMBER(A,B)        A B;
 #define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
 // GRID_DECL_CLASS_MEMBERS(cname, type1, name1, type2, name2, ...)
 // Intended for use inside the body of class `cname`. It expands to
 //  1. one data member declaration per (type, name) pair, and
 //  2. a friend operator<< that prints "class cname {", one line per member,
 //     and a closing "}".
 #define GRID_DECL_CLASS_MEMBERS(cname,...)		\
  \
  \
  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
  \
  \
  friend std::ostream & operator << (std::ostream &os, const cname &obj ) {	\
    os<<"class "<<#cname<<" {"<<std::endl;\
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
      os<<"}";								\
    return os;\
  };  
 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
-HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_exp.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_logical.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_index.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_determinant.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/hmc/HMC.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/ActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h 
./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Old/Tensor_poke.h ./Old/Tensor_peek.h ./Threads.h ./Grid.h ./algorithms/Preconditioner.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/AdefGeneric.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./MacroMagic.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/NerscIO.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h 
./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h 
./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h
-CCFILES=./qcd/hmc/integrators/Integrator.cc ./qcd/hmc/HMC.cc ./qcd/utils/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/spin/Dirac.cc ./GridInit.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/hmc/integrators/Integrator.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -41,6 +41,12 @@
 namespace Grid {
  // One stencil table entry. CartesianStencil keeps one of these per
  // (point, osite) pair in _entries and hands out pointers via GetEntry().
  // NOTE(review): the field meanings below are inferred from the field names
  // and from the separate _offsets/_is_local/_permute tables this struct
  // replaces - confirm against the stencil implementation.
  struct StencilEntry { 
    int _offset;            // presumably where the neighbour's data is found
    int _is_local;          // presumably non-zero when no communication is needed
    int _permute;           // presumably non-zero when a permute must be applied
    int _around_the_world;  // presumably set when the shift wraps the lattice - TODO confirm
  };
  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
  public:
@@ -58,9 +64,9 @@ namespace Grid {
      std::vector<int>                  _permute_type;
      // npoints x Osites() of these
-      std::vector<std::vector<int>    > _offsets;
+      std::vector<std::vector<StencilEntry> > _entries;
-      std::vector<std::vector<int>    > _is_local;
+
-      std::vector<std::vector<int> >    _permute;
+      inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
      int _unified_buffer_size;
      int _request_count;
@@ -77,8 +83,8 @@ namespace Grid {
      // Can this be avoided with simpler coding of comms?
      void Local     (int point, int dimension,int shift,int cbmask);
      void Comms     (int point, int dimension,int shift,int cbmask);
-      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
+      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
-      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
+      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
      // Could allow a functional munging of the halo to another type during the comms.
      // this could implement the 16bit/32bit/64bit compression.
--- a/lib/Timer.h
+++ b/lib/Timer.h
@@ -0,0 +1,52 @@
 #ifndef GRID_TIME_H
 #define GRID_TIME_H
 #include <sys/time.h>
 #include <ctime>
 #include <chrono>
 namespace Grid {

 // Wall-clock time helper; only declared here.
 // (Original author's open question: would the C++11 time facilities be better?)
 double usecond(void);

 // Clock, time-point and duration types used for timing and log time stamps.
 // GridTime has millisecond resolution.
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
 typedef  std::chrono::milliseconds          GridTime;

 // Accumulating stopwatch.
 //
 //   Start()   - begin an interval; the watch must be stopped.
 //   Stop()    - end the interval and add it to the total; the watch must be running.
 //   Reset()   - stop the watch and zero the total.
 //   Elapsed() - total of all completed intervals in whole milliseconds;
 //               the watch must be stopped.
 //
 // Violating a precondition trips an assert.
 class GridStopWatch {
 private:
   bool running;
   GridTimePoint start;
   // Kept at the clock's native resolution. Rounding every interval down to
   // milliseconds before adding it would discard up to 1 ms per Stop(); a
   // caller that stops and restarts the watch frequently (as the logger does
   // for every message) would then see the total lag behind or stand still.
   GridClock::duration accumulator;
 public:
   GridStopWatch () {
     Reset();
   }
   void     Start(void) {
     assert(running == false);
     start = GridClock::now();
     running = true;
   }
   void     Stop(void)  {
     assert(running == true);
     accumulator += GridClock::now() - start;
     running = false;
   }
   void     Reset(void){
     running = false;
     start = GridClock::now();
     accumulator = GridClock::duration::zero();
   }
   // Truncates once, on read-out, so the error never exceeds one millisecond.
   GridTime Elapsed(void) const {
     assert(running == false);
     return std::chrono::duration_cast<GridTime>(accumulator);
   }
 };
 }
 #endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -32,12 +32,12 @@ namespace Grid {
      displacements[2*_d]=0;
      //// report back
-      std::cout<<"directions    :";
+      std::cout<<GridLogMessage<<"directions    :";
      for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
      std::cout <<std::endl;
-      std::cout<<"displacements :";
+      std::cout<<GridLogMessage<<"displacements :";
      for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
-      std::cout <<std::endl;
+      std::cout<<std::endl;
    }
    /*
@@ -100,9 +100,9 @@ namespace Grid {
 	  eProj._odata[ss](i)=CComplex(1.0);
 	}
 	eProj=eProj - iProj;
-	std::cout<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
+	std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
      }
-      std::cout <<"CheckOrthog done"<<std::endl;
+      std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
    }
    void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
      blockProject(CoarseVec,FineVec,subspace);
@@ -113,27 +113,27 @@ namespace Grid {
    void CreateSubspaceRandom(GridParallelRNG &RNG){
      for(int i=0;i<nbasis;i++){
 	random(RNG,subspace[i]);
-	std::cout<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
+	std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
      }
      Orthogonalise();
    }
-    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop) {
+    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
      RealD scale;
-      ConjugateGradient<FineField> CG(1.0e-4,10000);
+      ConjugateGradient<FineField> CG(1.0e-2,10000);
      FineField noise(FineGrid);
      FineField Mn(FineGrid);
-      for(int b=0;b<nbasis;b++){
+      for(int b=0;b<nn;b++){
 	gaussian(RNG,noise);
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;
-	hermop.Op(noise,Mn); std::cout << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-	for(int i=0;i<2;i++){
+	for(int i=0;i<1;i++){
 	  CG(hermop,noise,subspace[b]);
@@ -143,8 +143,9 @@ namespace Grid {
 	}
-	hermop.Op(noise,Mn); std::cout << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
+	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-	subspace[b] = noise;
+	subspace[b]   = noise;
      }
      Orthogonalise();
@@ -188,24 +189,22 @@ namespace Grid {
      SimpleCompressor<siteVector> compressor;
      Stencil.HaloExchange(in,comm_buf,compressor);
-      //PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
      for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
 	siteVector nbr;
-	int offset,local,perm,ptype;
+	int ptype;
-
+	StencilEntry *SE;
 	for(int point=0;point<geom.npoint;point++){
-	  offset = Stencil._offsets [point][ss];
+
-	  local  = Stencil._is_local[point][ss];
+	  SE=Stencil.GetEntry(ptype,point,ss);
 	  perm   = Stencil._permute [point][ss];
 	  ptype  = Stencil._permute_type[point];
-	  if(local&&perm) { 
+	  if(SE->_is_local&&SE->_permute) { 
-	    permute(nbr,in._odata[offset],ptype);
+	    permute(nbr,in._odata[SE->_offset],ptype);
-	  } else if(local) { 
+	  } else if(SE->_is_local) { 
-	    nbr = in._odata[offset];
+	    nbr = in._odata[SE->_offset];
 	  } else {
-	    nbr = comm_buf[offset];
+	    nbr = comm_buf[SE->_offset];
 	  }
 	  res = res + A[point]._odata[ss]*nbr;
 	}
@@ -251,10 +250,6 @@ namespace Grid {
      // Orthogonalise the subblocks over the basis
      blockOrthogonalise(InnerProd,Subspace.subspace);
      //Subspace.Orthogonalise();
      //      Subspace.CheckOrthogonal();
      //Subspace.Orthogonalise();
      //      Subspace.CheckOrthogonal();
      // Compute the matrix elements of linop between this orthonormal
      // set of vectors.
@@ -305,6 +300,7 @@ namespace Grid {
 	  Subspace.ProjectToSubspace(oProj,oblock);
 	  //	  blockProject(iProj,iblock,Subspace.subspace);
 	  //	  blockProject(oProj,oblock,Subspace.subspace);
 PARALLEL_FOR_LOOP
 	  for(int ss=0;ss<Grid()->oSites();ss++){
 	    for(int j=0;j<nbasis;j++){
 	      if( disp!= 0 ) {
@@ -320,12 +316,12 @@ namespace Grid {
      ///////////////////////////
      // test code worth preserving in if block
      ///////////////////////////
-      std::cout<< " Computed matrix elements "<< self_stencil <<std::endl;
+      std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
      for(int p=0;p<geom.npoint;p++){
-	std::cout<< "A["<<p<<"]" << std::endl;
+	std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
-	std::cout<< A[p] << std::endl;
+	std::cout<<GridLogMessage<< A[p] << std::endl;
      }
-      std::cout<< " picking by block0 "<< self_stencil <<std::endl;
+      std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
      phi=Subspace.subspace[0];
      std::vector<int> bc(FineGrid->_ndimension,0);
@@ -333,9 +329,9 @@ namespace Grid {
      blockPick(Grid(),phi,tmp,bc);      // Pick out a block
      linop.Op(tmp,Mphi);                // Apply big dop
      blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
-      std::cout<< " Computed matrix elements from block zero only "<<std::endl;
+      std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
-      std::cout<< iProj <<std::endl;
+      std::cout<<GridLogMessage<< iProj <<std::endl;
-      std::cout<<"Computed Coarse Operator"<<std::endl;
+      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
      //      ForceHermitian();
      AssertHermitian();
@@ -344,9 +340,9 @@ namespace Grid {
    void ForceDiagonal(void) {
-      std::cout<<"**************************************************"<<std::endl;
+      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
-      std::cout<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
+      std::cout<<GridLogMessage<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
-      std::cout<<"**************************************************"<<std::endl;
+      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
      for(int p=0;p<8;p++){
 	A[p]=zero;
      }
@@ -386,13 +382,13 @@ namespace Grid {
 	Diff = AA - adj(AAc);
-	std::cout<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
+	std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
-	std::cout<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
+	std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
      }
      Diff = A[8] - adj(A[8]);
-      std::cout<<"Norm diff local "<< norm2(Diff)<<std::endl;
+      std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
-      std::cout<<"Norm local "<< norm2(A[8])<<std::endl;
+      std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
    }
  };
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -71,6 +71,47 @@ namespace Grid {
      }
    };
    ////////////////////////////////////////////////////////////////////
    // Construct herm op and shift it for mgrid smoother
    ////////////////////////////////////////////////////////////////////
    template<class Matrix,class Field>
    class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
      Matrix &_Mat;
      RealD _shift;
    public:
    ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
      // Support for coarsening to a multigrid
      void OpDiag (const Field &in, Field &out) {
 	_Mat.Mdiag(in,out);
 	assert(0);
      }
      void OpDir  (const Field &in, Field &out,int dir,int disp) {
 	_Mat.Mdir(in,out,dir,disp);
 	assert(0);
      }
      void Op     (const Field &in, Field &out){
 	_Mat.M(in,out);
 	assert(0);
      }
      void AdjOp     (const Field &in, Field &out){
 	_Mat.Mdag(in,out);
 	assert(0);
      }
      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
 	_Mat.MdagM(in,out,n1,n2);
 	out = out + _shift*in;
 	ComplexD dot;	
 	dot= innerProduct(in,out);
 	n1=real(dot);
 	n2=norm2(out);
      }
      void HermOp(const Field &in, Field &out){
 	RealD n1,n2;
 	HermOpAndNorm(in,out,n1,n2);
      }
    };
    ////////////////////////////////////////////////////////////////////
    // Wrap an already herm matrix
    ////////////////////////////////////////////////////////////////////
@@ -147,6 +188,7 @@ namespace Grid {
    };
    template<class Matrix,class Field>
      class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
    protected:
      Matrix &_Mat;
    public:
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
@@ -173,6 +215,7 @@ namespace Grid {
    };
    template<class Matrix,class Field>
      class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
    protected:
      Matrix &_Mat;
    public:
      SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
@@ -199,6 +242,7 @@ namespace Grid {
      }
    };
    /////////////////////////////////////////////////////////////
    // Base classes for functions of operators
    /////////////////////////////////////////////////////////////
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -50,6 +50,17 @@ namespace Grid {
      return;
    }
    // Convenience for plotting the approximation
    void   PlotApprox(std::ostream &out) {
      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
      for(double x=lo;x<hi;x+=(hi-lo)/50.0){
 	out <<x<<"\t"<<approx(x)<<std::endl;
      }
    };
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
    //
    Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
      lo=_lo;
      hi=_hi;
@@ -68,7 +79,34 @@ namespace Grid {
 	Coeffs[j] = s * 2.0/order;
      }
    };
    void JacksonSmooth(void){
      double M=order;
      double alpha = M_PI/(M+2);
      double lmax = std::cos(alpha);
      double sumUsq =0;
      std::vector<double> U(M);
      std::vector<double> a(M);
      std::vector<double> g(M);
      for(int n=0;n<=M;n++){
 	U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
 	sumUsq += U[n]*U[n];
      }      
      sumUsq = std::sqrt(sumUsq);
      for(int i=1;i<=M;i++){
 	a[i] = U[i]/sumUsq;
      }
      g[0] = 1.0;
      for(int m=1;m<=M;m++){
 	g[m] = 0;
 	for(int i=0;i<=M-m;i++){
 	  g[m]+= a[i]*a[m+i];
 	}
      }
      for(int m=1;m<=M;m++){
 	Coeffs[m]*=g[m];
      }
    }
    double approx(double x) // Convenience for plotting the approximation
    {
      double Tn;
@@ -95,46 +133,39 @@ namespace Grid {
      return sum;
    };
-    // Convenience for plotting the approximation
+    // Implement the required interface
    void   PlotApprox(std::ostream &out) {
      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
      for(double x=lo;x<hi;x+=(hi-lo)/50.0){
 	out <<x<<"\t"<<approx(x)<<std::endl;
      }
    };
    // Implement the required interface; could require Lattice base class
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
-      Field T0 = in;
+      GridBase *grid=in._grid;
-      Field T1 = T0; // Field T1(T0._grid); more efficient but hardwires Lattice class
+
-      Field T2 = T1;
+      int vol=grid->gSites();
      Field T0(grid); T0 = in;  
      Field T1(grid); 
      Field T2(grid);
      Field y(grid);
      // use a pointer trick to eliminate copies
      Field *Tnm = &T0;
      Field *Tn  = &T1;
      Field *Tnp = &T2;
-      Field y   = in;
+
-  
+      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
      // Tn=T1 = (xscale M + mscale)in
      double xscale = 2.0/(hi-lo);
      double mscale = -(hi+lo)/(hi-lo);
-
+      Linop.HermOp(T0,y);
      // Tn=T1 = (xscale M + mscale)in
      Linop.Op(T0,y);
      T1=y*xscale+in*mscale;
      // sum = .5 c[0] T0 + c[1] T1
      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
      for(int n=2;n<order;n++){
-	Linop.Op(*Tn,y);
+	Linop.HermOp(*Tn,y);
 	y=xscale*y+mscale*(*Tn);
 	*Tnp=2.0*y-(*Tnm);
-	
+
 	out=out+Coeffs[n]* (*Tnp);
 	// Cycle pointers to avoid copies
--- a/lib/algorithms/approx/MultiShiftFunction.h
+++ b/lib/algorithms/approx/MultiShiftFunction.h
@@ -1,6 +1,8 @@
 #ifndef MULTI_SHIFT_FUNCTION
 #define MULTI_SHIFT_FUNCTION
 namespace Grid {
 class MultiShiftFunction {
 public:
  int order;
@@ -9,20 +11,29 @@ public:
  std::vector<RealD> tolerances;
  RealD norm;
  RealD lo,hi;
  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
-  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse) :
+
-      order(remez.getDegree()),
+  void Init(AlgRemez & remez,double tol,bool inverse) 
      tolerances(remez.getDegree(),tol),
      poles(remez.getDegree()),
      residues(remez.getDegree())
  {
    order=remez.getDegree();
    tolerances.resize(remez.getDegree(),tol);
    poles.resize(remez.getDegree());
    residues.resize(remez.getDegree());
    remez.getBounds(lo,hi);
    if ( inverse ) remez.getIPFE (&residues[0],&poles[0],&norm);
-    else remez.getPFE (&residues[0],&poles[0],&norm);
+    else           remez.getPFE (&residues[0],&poles[0],&norm);
  }
  // Allow deferred initialisation
  MultiShiftFunction(void){};
  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse)
  {
    Init(remez,tol,inverse);
  }
 };
 }
 #endif
--- a/lib/algorithms/approx/Remez.cc
+++ b/lib/algorithms/approx/Remez.cc
@@ -757,3 +757,4 @@ void AlgRemez::csv(std::ostream & os)
  }
  return;
 }
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -15,7 +15,9 @@
 #ifndef INCLUDED_ALG_REMEZ_H
 #define INCLUDED_ALG_REMEZ_H
-#include <algorithms/approx/bigfloat_double.h>
+#include <stddef.h>
 #include <algorithms/approx/bigfloat.h>
 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
 #define SUM_MAX 10 // Maximum number of terms in exponential
@@ -28,6 +30,7 @@
  remez.getIPFE(res,pole,&norm);
  remez.csv(ostream &os);
 */
 class AlgRemez
 {
 private:
--- a/lib/algorithms/iterative/AdefGeneric.h
+++ b/lib/algorithms/iterative/AdefGeneric.h
@@ -149,7 +149,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
      }
      RealD rrn=sqrt(rn/ssq);
-      std::cout<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
+      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
      // Stopping condition
      if ( rn <= rsq ) { 
@@ -161,8 +161,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
 	RealD srcnorm = sqrt(norm2(src));
 	RealD tmpnorm = sqrt(norm2(tmp));
 	RealD true_residual = tmpnorm/srcnorm;
-	std::cout<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
-	std::cout<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
 	return k;
      }
    }
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -13,9 +13,7 @@ namespace Grid {
 public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
    int verbose;
    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
      verbose=1;
    };
@@ -42,14 +40,12 @@ public:
      cp =a;
      ssq=norm2(src);
-      if ( verbose ) {
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
 	std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
      }
      RealD rsq =  Tolerance* Tolerance*ssq;
@@ -58,7 +54,7 @@ public:
 	return;
      }
-      if(verbose) std::cout << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
+      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
      int k;
      for (k=1;k<=MaxIterations;k++){
@@ -80,7 +76,7 @@ public:
 	psi= a*p+psi;
 	p  = p*b+r;
-	if (verbose) std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	// Stopping condition
 	if ( cp <= rsq ) { 
@@ -94,14 +90,14 @@ public:
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;
-	  std::cout<<"ConjugateGradient: Converged on iteration " <<k
+	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 		   <<" computed residual "<<sqrt(cp/ssq)
 		   <<" true residual     "<<true_residual
 		   <<" target "<<Tolerance<<std::endl;
 	  return;
 	}
      }
-      std::cout<<"ConjugateGradient did NOT converge"<<std::endl;
+      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
      assert(0);
    }
  };
--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -27,10 +27,14 @@ public:
 // Convenience overload for callers that only want psi: allocates one scratch
 // field per shift (shifts.order of them) on the source's grid and forwards to
 // the four-argument overload.
 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
 {
  std::vector<Field> perShift(shifts.order,src._grid);
  (*this)(Linop,src,perShift,psi);
 }
 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
 {
  int nshift = shifts.order;
  (*this)(Linop,src,results);
@@ -91,7 +95,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  cp = norm2(src);
  for(int s=0;s<nshift;s++){
    rsq[s] = cp * mresidual[s] * mresidual[s];
-    std::cout<<"ConjugateGradientMultiShift: shift "<<s
+    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
 	     <<" target resid "<<rsq[s]<<std::endl;
    ps[s] = src;
  }
@@ -109,7 +113,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  // p and mmp is equal to d after this since
  // the d computation is tricky
  //  qq = real(innerProduct(p,mmp));
-  //  std::cout << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
+  //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
  b = -cp /d;
@@ -214,7 +218,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	if(css<rsq[s]){
 	  if ( ! converged[s] )
-	    std::cout<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	      converged[s]=1;
 	} else {
 	  all_converged=0;
@@ -225,8 +229,8 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    if ( all_converged ){
-      std::cout<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
+      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
-      std::cout<< "CGMultiShift: Checking solutions"<<std::endl;
+      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
      // Check answers 
      for(int s=0; s < nshift; s++) { 
@@ -235,13 +239,13 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	axpy(r,-alpha[s],src,tmp);
 	RealD rn = norm2(r);
 	RealD cn = norm2(src);
-	std::cout<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
      return;
    }
  }
  // ugly hack
-  std::cout<<"CG multi shift did not converge"<<std::endl;
+  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
  assert(0);
 }
--- a/lib/algorithms/iterative/ConjugateResidual.h
+++ b/lib/algorithms/iterative/ConjugateResidual.h
@@ -41,7 +41,7 @@ namespace Grid {
      ssq=norm2(src);
      rsq=Tolerance*Tolerance*ssq;
-      if (verbose) std::cout<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+      if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
      for(int k=1;k<MaxIterations;k++){
@@ -60,13 +60,13 @@ namespace Grid {
 	axpy(p,b,p,r);
 	pAAp=axpy_norm(Ap,b,Ap,Ar);
-	if(verbose) std::cout<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	if(cp<rsq) {
 	  Linop.HermOp(psi,Ap);
 	  axpy(r,-1.0,src,Ap);
 	  RealD true_resid = norm2(r)/ssq;
-	  std::cout<<"ConjugateResidual: Converged on iteration " <<k
+	  std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "<<sqrt(true_resid)
 	           << " target "       <<Tolerance <<std::endl;
@@ -75,7 +75,7 @@ namespace Grid {
      }
-      std::cout<<"ConjugateResidual did NOT converge"<<std::endl;
+      std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
      assert(0);
    }
  };
--- a/lib/algorithms/iterative/PrecConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecConjugateResidual.h
@@ -0,0 +1,92 @@
 #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
 #define GRID_PREC_CONJUGATE_RESIDUAL_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Base classes for iterative processes based on operators
    // single input vec, single output vec.
    /////////////////////////////////////////////////////////////
  // Iterative solver functor: operator() repeatedly applies the supplied
  // Preconditioner and Linop.HermOpAndNorm, updating psi until the tracked
  // residual norm cp drops below Tolerance^2 * norm2(src).
  // NOTE(review): by its name this is a preconditioned conjugate-residual
  // iteration -- confirm the update formulas against the intended algorithm.
  template<class Field> 
    class PrecConjugateResidual : public OperatorFunction<Field> {
  public:                                                
    RealD   Tolerance;      // relative tolerance; squared and scaled by norm2(src) to form the target
    Integer MaxIterations;  // upper bound on the number of loop iterations
    int verbose;            // non-zero: log the residual on every iteration
    LinearFunction<Field> &Preconditioner; // held by reference; must outlive this object
    // Stores the parameters and switches per-iteration logging on.
    PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit),      Preconditioner(Prec)
    { 
      verbose=1;
    };
    // Runs the iteration for right-hand side src, writing the result to psi.
    // Any incoming value of psi is discarded (psi is reset to zero).
    // Returns on convergence; if MaxIterations is exhausted it logs a message
    // and trips assert(0).
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
      // c and d are declared but not used in this function.
      RealD a, b, c, d;
      RealD cp, ssq,rsq;
      RealD rAr, rAAr, rArp;
      RealD pAp, pAAp;
      GridBase *grid = src._grid;
      Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid);
      // Start from a zero guess, so the initial residual is the source itself.
      psi=zero;
      r  = src;
      // Initial search direction is the preconditioned residual.
      Preconditioner(r,p);
      Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
      Ar=Ap;
      rAr=pAp;
      rAAr=pAAp;
      cp =norm2(r);
      ssq=norm2(src);
      // Convergence target on the squared residual norm.
      rsq=Tolerance*Tolerance*ssq;
      if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
      for(int k=0;k<MaxIterations;k++){
	// z = Preconditioner applied to Ap; the step length a = rAr / Re<Ap,z>.
	Preconditioner(Ap,z);
	RealD rq= real(innerProduct(Ap,z)); 
	a = rAr/rq;
  	axpy(psi,a,p,psi);
  // cp is taken from the value returned by axpy_norm for the updated r.
  cp = axpy_norm(r,-a,z,r);
	// Keep the previous rAr so that b is the ratio new/old.
	rArp=rAr;
	Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
	b   =rAr/rArp;
	axpy(p,b,p,r);
	pAAp=axpy_norm(Ap,b,Ap,Ar);
	if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
	if(cp<rsq) {
	  // Converged: recompute the residual from psi and src for the report.
	  // NOTE(review): presumably axpy(r,-1.0,src,Ap) forms r = Ap - src -- confirm.
	  Linop.HermOp(psi,Ap);
	  axpy(r,-1.0,src,Ap);
	  RealD true_resid = norm2(r)/ssq;
	  std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
		   << " computed residual "<<sqrt(cp/ssq)
	           << " true residual "<<sqrt(true_resid)
	           << " target "       <<Tolerance <<std::endl;
	  return;
	}
      }
      // Fell out of the loop without meeting the target.
      std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
      assert(0);
    }
  };
 }
 #endif
--- a/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -45,13 +45,13 @@ namespace Grid {
 	cp=GCRnStep(Linop,src,psi,rsq);
-	if ( verbose ) std::cout<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+	if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
 	if(cp<rsq) {
 	  Linop.HermOp(psi,r);
 	  axpy(r,-1.0,src,r);
 	  RealD tr = norm2(r);
-	  std::cout<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
+	  std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "    <<sqrt(tr/ssq)
 	           << " target "           <<Tolerance <<std::endl;
@@ -59,7 +59,7 @@ namespace Grid {
 	}
      }
-      std::cout<<"Variable Preconditioned GCR did not converge"<<std::endl;
+      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
      assert(0);
    }
    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
@@ -96,21 +96,21 @@ namespace Grid {
      /////////////////////
      Preconditioner(r,z);
-      std::cout<< " Preconditioner in " << norm2(r)<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl; 
-      std::cout<< " Preconditioner out " << norm2(z)<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl; 
      Linop.HermOp(z,tmp); 
-      std::cout<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
      ttmp=tmp;
      tmp=tmp-r;
-      std::cout<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
      /*
-      std::cout<<r<<std::endl;
+      std::cout<<GridLogMessage<<r<<std::endl;
-      std::cout<<z<<std::endl;
+      std::cout<<GridLogMessage<<z<<std::endl;
-      std::cout<<ttmp<<std::endl;
+      std::cout<<GridLogMessage<<ttmp<<std::endl;
-      std::cout<<tmp<<std::endl;
+      std::cout<<GridLogMessage<<tmp<<std::endl;
      */
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
@@ -137,7 +137,7 @@ namespace Grid {
 	cp = axpy_norm(r,-a,q[peri_k],r);  
-	std::cout<< " VPCG_step resid" <<sqrt(cp/rsq)<<std::endl; 
+	std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl; 
 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
 	}
@@ -148,7 +148,7 @@ namespace Grid {
 	Linop.HermOp(z,tmp);
        tmp=tmp-r;
-	std::cout<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+	std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 	q[peri_kp]=Az;
 	p[peri_kp]=z;
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -89,7 +89,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
-      std::cout << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      ///////////////////////////////////////////////////
@@ -108,7 +108,7 @@ namespace Grid {
      RealD ns = norm2(in);
      RealD nr = norm2(resid);
-      std::cout << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
+      std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }     
  };
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -8,7 +8,7 @@ class SimpleCompressor {
 public:
  void Point(int) {};
-  vobj operator() (const vobj &arg) {
+  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
    return arg;
  }
 };
@@ -24,7 +24,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask = 0x3;
  }
-
+  
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
@@ -36,7 +36,7 @@ PARALLEL_NESTED_LOOP2
      int bo = n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb &cbmask ) {
-	buffer[bo+b]=compress(rhs._odata[so+o+b]);
+	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      }
    }
  }
@@ -69,7 +69,7 @@ PARALLEL_NESTED_LOOP2
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
      if ( ocb & cbmask ) {
 	cobj temp; 
-	temp =compress(rhs._odata[so+o+b]);
+	temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	extract<cobj>(temp,pointers,offset);
      }
    }
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -132,18 +132,18 @@ inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
    assert(cb==lat.checkerboard);
  } 
  cb=lat.checkerboard;
-  //  std::cout<<"Lattice leaf cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
 template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
 inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
 {
-  //  std::cout<<"Non lattice leaf cb"<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
 inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
 {
  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
-  //  std::cout<<"Unary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2>
@@ -151,7 +151,7 @@ inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &ex
 {
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
  CBFromExpression(cb,std::get<1>(expr.second));
-  //  std::cout<<"Binary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
 inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
@@ -159,7 +159,7 @@ inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
  CBFromExpression(cb,std::get<1>(expr.second));
  CBFromExpression(cb,std::get<2>(expr.second));
-  //  std::cout<<"Trinary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }
 ////////////////////////////////////////////
@@ -370,7 +370,7 @@ using namespace Grid;
   tmp.func(eval(0,v1),eval(0,v2));
   auto var = v1+v2;
-   std::cout<<typeid(var).name()<<std::endl;
+   std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
   v3=v1+v2;
   v3=v1+v2+v1*v2;
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -60,6 +60,11 @@ public:
    GridBase *_grid;
    int checkerboard;
    std::vector<vobj,alignedAllocator<vobj> > _odata;
    // Index-based accessors: to pthread need a computable loop where loop induction is not required.
    // begin() is always 0 and end() is _odata.size() converted to int, so
    // [begin(),end()) is the index range of _odata; operator[] returns a
    // reference to element i of _odata with no bounds check.
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
 public:
    typedef typename vobj::scalar_type scalar_type;
@@ -221,7 +226,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      std::cout<<"Lattice operator ="<<std::endl;
+      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
--- a/lib/lattice/Lattice_peekpoke.h
+++ b/lib/lattice/Lattice_peekpoke.h
@@ -14,6 +14,7 @@ namespace Grid {
       auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
      ret.checkerboard=lhs.checkerboard;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
@@ -24,6 +25,7 @@ PARALLEL_FOR_LOOP
       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
      ret.checkerboard=lhs.checkerboard;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -125,7 +125,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  assert(grid!=NULL);
  // FIXME
-  std::cout<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
+  std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -5,6 +5,37 @@
 namespace Grid {
  //////////////////////////////////////////////////////////////
  // Allow the RNG state to be less dense than the fine grid
  //////////////////////////////////////////////////////////////
  // Checks that the grid `fine` can be filled from RNG state laid out on the grid `coarse`,
  // and returns the product over the coarse dimensions of (fine _rdimensions / coarse _rdimensions).
  // All compatibility requirements are enforced with assert().
  inline int RNGfillable(GridBase *coarse,GridBase *fine)
  {
    const int coarse_nd = coarse->_ndimension;

    // The fine grid may carry extra leading dimensions; these must be neither
    // SIMD-split nor spread over processors (keeps the RNG state local to the node).
    const int extra_nd = fine->_ndimension - coarse->_ndimension;
    assert(extra_nd >= 0);
    int lead = 0;
    while ( lead < extra_nd ) {
      assert(fine->_simd_layout[lead]==1);
      assert(fine->_processors[lead]==1);
      ++lead;
    }

    // Remaining dimensions: identical processor and SIMD layout, and the fine
    // _rdimensions must be an exact multiple of the coarse ones.
    int multiplicity = 1;
    for(int cd=0; cd<coarse_nd; ++cd){
      const int fd = extra_nd + cd; // matching dimension of the fine grid
      assert(coarse->_processors[cd]  == fine->_processors[fd]);
      assert(coarse->_simd_layout[cd] == fine->_simd_layout[fd]);

      const auto fine_r   = fine->_rdimensions[fd];
      const auto coarse_r = coarse->_rdimensions[cd];
      assert((fine_r / coarse_r)* coarse_r==fine_r); 

      multiplicity = multiplicity * fine_r / coarse_r; 
    }
    return multiplicity;
  }
  // Wrap seed_seq to give common interface with random_device
  class fixedSeed {
  public:
@@ -226,26 +257,32 @@ namespace Grid {
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
-      conformable(_grid,l._grid);
+      int multiplicity = RNGfillable(_grid,l._grid);
      int     Nsimd =_grid->Nsimd();
      int     osites=_grid->oSites();
      int words=sizeof(scalar_object)/sizeof(scalar_type);
      std::vector<scalar_object> buf(Nsimd);
      for(int ss=0;ss<osites;ss++){
 	for(int si=0;si<Nsimd;si++){
-	  int gdx = generator_idx(ss,si); // index of generator state
+PARALLEL_FOR_LOOP
-	  scalar_type *pointer = (scalar_type *)&buf[si];
+      for(int ss=0;ss<osites;ss++){
-	  for(int idx=0;idx<words;idx++){
+
-	    fillScalar(pointer[idx],dist,_generators[gdx]);
+	std::vector<scalar_object> buf(Nsimd);
 	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
 	  int sm=multiplicity*ss+m;      // Maps the generator site to the fine site
 	  for(int si=0;si<Nsimd;si++){
 	    int gdx = generator_idx(ss,si); // index of generator state
 	    scalar_type *pointer = (scalar_type *)&buf[si];
 	    for(int idx=0;idx<words;idx++){
 	      fillScalar(pointer[idx],dist,_generators[gdx]);
 	    }
 	  }
 	  // merge into SIMD lanes
 	  merge(l._odata[sm],buf);
 	}
 	// merge into SIMD lanes
 	merge(l._odata[ss],buf);
      }
    };
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -17,13 +17,14 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  }
 }
  ////////////////////////////////////////////////////////////////////////////////////////////
  // remove and insert a half checkerboard
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
    int ssh=0;
-PARALLEL_FOR_LOOP
+    //PARALLEL_FOR_LOOP
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -40,7 +41,7 @@ PARALLEL_FOR_LOOP
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
    int ssh=0;
-PARALLEL_FOR_LOOP
+    //PARALLEL_FOR_LOOP
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -158,6 +159,7 @@ template<class vobj,class CComplex>
  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<coarse->oSites();ss++){
    CoarseInner._odata[ss] = coarse_inner._odata[ss];
  }
@@ -297,5 +299,42 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 // Tile `coarse` periodically across `fine`.
 // For every global site of the fine grid, the fine coordinate is reduced modulo the
 // coarse grid's _gdimensions in each direction; the coarse site at that coordinate is
 // read with peekSite and written to the fine site with pokeSite.
 // Both grids must have the same number of dimensions (asserted), and subdivides(cg,fg)
 // is invoked first to validate the pair of grids.
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
  GridBase *cg = coarse._grid;
  GridBase *fg =   fine._grid;
  int nd = cg->_ndimension;
  subdivides(cg,fg); 
  assert(cg->_ndimension==fg->_ndimension);
  std::vector<int> fcoor(nd);
  std::vector<int> ccoor(nd);
  // Serial loop over all global fine sites
  for(int g=0;g<fg->gSites();g++){
    fg->GlobalIndexToGlobalCoor(g,fcoor);
    // Wrap the fine coordinate back into the coarse extent
    for(int d=0;d<nd;d++){
      ccoor[d] = fcoor[d]%cg->_gdimensions[d];
    }
    sobj tmp;
    peekSite(tmp,coarse,ccoor);
    pokeSite(tmp,fine,fcoor);
  }
 }
 }
 #endif
--- a/lib/lattice/Lattice_where.h
+++ b/lib/lattice/Lattice_where.h
@@ -22,7 +22,6 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  typedef typename iobj::vector_type mask_type;
  const int Nsimd = grid->Nsimd();
  const int words = sizeof(vobj)/sizeof(vector_type);
  std::vector<Integer> mask(Nsimd);
  std::vector<scalar_object> truevals (Nsimd);
--- a/lib/pugixml/README.md
+++ b/lib/pugixml/README.md
@@ -0,0 +1,44 @@
 pugixml [![Build Status](https://travis-ci.org/zeux/pugixml.svg?branch=master)](https://travis-ci.org/zeux/pugixml) [![Build status](https://ci.appveyor.com/api/projects/status/9hdks1doqvq8pwe7/branch/master?svg=true)](https://ci.appveyor.com/project/zeux/pugixml)
 =======
 pugixml is a C++ XML processing library, which consists of a DOM-like interface with rich traversal/modification
 capabilities, an extremely fast XML parser which constructs the DOM tree from an XML file/buffer, and an XPath 1.0
 implementation for complex data-driven tree queries. Full Unicode support is also available, with Unicode interface
 variants and conversions between different Unicode encodings (which happen automatically during parsing/saving).
 pugixml is used by a lot of projects, both open-source and proprietary, for performance and easy-to-use interface.
 ## Documentation
 Documentation for the current release of pugixml is available on-line as two separate documents:
 * [Quick-start guide](http://pugixml.org/docs/quickstart.html), that aims to provide enough information to start using the library;
 * [Complete reference manual](http://pugixml.org/docs/manual.html), that describes all features of the library in detail.
 You’re advised to start with the quick-start guide; however, many important library features are either not described in it at all or only mentioned briefly; if you require more information you should read the complete manual.
 ## License
 This library is available to anybody free of charge, under the terms of MIT License:
 Copyright (c) 2006-2015 Arseny Kapoulkine
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
 files (the "Software"), to deal in the Software without
 restriction, including without limitation the rights to use,
 copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the
 Software is furnished to do so, subject to the following
 conditions:
 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
--- a/lib/pugixml/pugiconfig.hpp
+++ b/lib/pugixml/pugiconfig.hpp
@@ -0,0 +1,71 @@
 /**
 * pugixml parser - version 1.6
 * --------------------------------------------------------
 * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at http://pugixml.org/
 *
 * This library is distributed under the MIT License. See notice at the end
 * of this file.
 *
 * This work is based on the pugxml parser, which is:
 * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
 */
 #ifndef HEADER_PUGICONFIG_HPP
 #define HEADER_PUGICONFIG_HPP
 // Uncomment this to enable wchar_t mode
 // #define PUGIXML_WCHAR_MODE
 // Uncomment this to disable XPath
 // #define PUGIXML_NO_XPATH
 // Uncomment this to disable STL
 // #define PUGIXML_NO_STL
 // Uncomment this to disable exceptions
 // #define PUGIXML_NO_EXCEPTIONS
 // Set this to control attributes for public classes/functions, i.e.:
 // #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
 // #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
 // #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
 // In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
 // Tune these constants to adjust memory-related behavior
 // #define PUGIXML_MEMORY_PAGE_SIZE 32768
 // #define PUGIXML_MEMORY_OUTPUT_STACK 10240
 // #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
 // Uncomment this to switch to header-only version
 // #define PUGIXML_HEADER_ONLY
 // Uncomment this to enable long long support
 // #define PUGIXML_HAS_LONG_LONG
 #endif
 /**
 * Copyright (c) 2006-2015 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
--- a/lib/pugixml/pugixml.cpp
+++ b/lib/pugixml/pugixml.cpp
--- a/lib/pugixml/pugixml.hpp
+++ b/lib/pugixml/pugixml.hpp
--- a/lib/pugixml/readme.txt
+++ b/lib/pugixml/readme.txt
@@ -0,0 +1,52 @@
 pugixml 1.6 - an XML processing library
 Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 Report bugs and download new versions at http://pugixml.org/
 This is the distribution of pugixml, which is a C++ XML processing library,
 which consists of a DOM-like interface with rich traversal/modification
 capabilities, an extremely fast XML parser which constructs the DOM tree from
 an XML file/buffer, and an XPath 1.0 implementation for complex data-driven
 tree queries. Full Unicode support is also available, with Unicode interface
 variants and conversions between different Unicode encodings (which happen
 automatically during parsing/saving).
 The distribution contains the following folders:
 	contrib/ - various contributions to pugixml
 	docs/ - documentation
 		docs/samples - pugixml usage examples
 		docs/quickstart.html - quick start guide
 		docs/manual.html - complete manual
 	scripts/ - project files for IDE/build systems
 	src/ - header and source files
 	readme.txt - this file.
 This library is distributed under the MIT License:
 Copyright (c) 2006-2015 Arseny Kapoulkine
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
 files (the "Software"), to deal in the Software without
 restriction, including without limitation the rights to use,
 copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the
 Software is furnished to do so, subject to the following
 conditions:
 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -19,6 +19,7 @@ namespace QCD {
    static const int Nd=4;
    static const int Nhs=2; // half spinor
    static const int Nds=8; // double stored gauge field
    static const int Ngp=2; // gparity index range
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
@@ -27,7 +28,16 @@ namespace QCD {
    static const int ColourIndex = 2;
    static const int SpinIndex   = 1;
    static const int LorentzIndex= 0;
-    
+
    // Useful trait: is this a spin index?
    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
    const int SpinorIndex = 2;
    template<typename T> struct isSpinor {
      static const bool value = (SpinorIndex==T::TensorLevel);
    };
    template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
    template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
    // ChrisK very keen to add extra space for Gparity doubling.
    //
@@ -49,6 +59,10 @@ namespace QCD {
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
    // Spin matrix
    typedef iSpinMatrix<Complex  >          SpinMatrix;
    typedef iSpinMatrix<ComplexF >          SpinMatrixF;
--- a/lib/qcd/action/ActionBase.h
+++ b/lib/qcd/action/ActionBase.h
@@ -7,14 +7,30 @@ template<class GaugeField>
 class Action { 
 public:
-  virtual void  init(const GaugeField &U, GridParallelRNG& pRNG) = 0;
+  virtual void  init (const GaugeField &U, GridParallelRNG& pRNG) = 0;  // 
-  virtual RealD S(const GaugeField &U)                           = 0;  // evaluate the action
+  virtual RealD S    (const GaugeField &U)                        = 0;  // evaluate the action
-  virtual void  deriv(const GaugeField &U,GaugeField & dSdU )    = 0;  // evaluate the action derivative
+  virtual void  deriv(const GaugeField &U,GaugeField & dSdU )     = 0;  // evaluate the action derivative
-  //virtual void  refresh(const GaugeField & ) {}                ; 
+  virtual void  refresh(const GaugeField & ) {};                        // Default to no-op for actions with no internal fields
  // Boundary conditions?
  // Heatbath?
  virtual ~Action() {};
 };
 // Could derive PseudoFermion action with a PF field, FermionField, and a Grid; implement refresh
 // Action base holding a pseudofermion field Phi together with references to the
 // parallel RNG and the grid it lives on. refresh() redraws Phi via gaussian(Phi,pRNG).
 // init/S/deriv are not implemented here, so the class remains abstract.
 template<class GaugeField, class FermionField>
 class PseudoFermionAction : public Action<GaugeField> {
 public:
  FermionField Phi;
  GridParallelRNG &pRNG;
  GridBase &Grid;
  // Members are initialised in declaration order (Phi, pRNG, Grid) regardless of how the
  // initialiser list is written, so the list is spelled in that same order.
  // Phi is built from the constructor argument _Grid, not from the Grid member.
  PseudoFermionAction(GridBase &_Grid,GridParallelRNG &_pRNG) : Phi(&_Grid), pRNG(_pRNG), Grid(_Grid) {
  }
  // The gauge field argument is not used by this default implementation.
  virtual void refresh(const GaugeField &gauge) {
    gaussian(Phi,pRNG);
  }
 };
 }}
 #endif
--- a/lib/qcd/action/ActionParams.h
+++ b/lib/qcd/action/ActionParams.h
@@ -0,0 +1,29 @@
 #ifndef GRID_QCD_ACTION_PARAMS_H
 #define GRID_QCD_ACTION_PARAMS_H
 namespace Grid {
 namespace QCD {
    // These can move into a params header and be given MacroMagic serialisation
    struct GparityWilsonImplParams {
      std::vector<int> twists; 
    };
    struct WilsonImplParams { };
    struct OneFlavourRationalParams { 
      RealD  lo;
      RealD  hi;
      int MaxIter;   // Vector?
      RealD tolerance; // Vector? 
      int    degree=10;
      int precision=64;
      OneFlavourRationalParams (RealD _lo,RealD _hi,int _maxit,RealD tol=1.0e-8,int _degree = 10,int _precision=64) :
        lo(_lo), hi(_hi), MaxIter(_maxit), tolerance(tol), degree(_degree), precision(_precision)
      {};
    };
 }}
 #endif
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -6,19 +6,15 @@
 // are separating the concept of the operator from that of action.
 //
 // The FermAction contains methods to create 
 //
 // * Linear operators             (Hermitian and non-hermitian)  .. my LinearOperator
 // * System solvers               (Hermitian and non-hermitian)  .. my OperatorFunction
 // * MultiShift System solvers    (Hermitian and non-hermitian)  .. my OperatorFunction
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
 #include <qcd/action/ActionBase.h>
-
+#include <qcd/action/ActionParams.h>
 #include <qcd/action/fermion/FermionOperator.h>
 ////////////////////////////////////////////
 // Gauge Actions
@@ -30,53 +26,146 @@
 // Utility functions
 ////////////////////////////////////////////
 #include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
 #include <qcd/action/fermion/FermionOperatorImpl.h>
 #include <qcd/action/fermion/FermionOperator.h>
 #include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Explicit template instantiation is still required in the .cc files
 //
 // - CayleyFermion5D.cc
 // - PartialFractionFermion5D.cc
 // - WilsonFermion5D.cc
 // - WilsonKernelsHand.cc
 // - ContinuedFractionFermion5D.cc
 // - WilsonFermion.cc
 // - WilsonKernels.cc
 //
 // The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
 // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 #define FermOpTemplateInstantiate(A) \
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		\
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;
 ////////////////////////////////////////////
-// 4D formulations
+// Fermion operators / actions
 ////////////////////////////////////////////
-#include <qcd/action/fermion/WilsonFermion.h>
+
 #include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
 #include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 //#include <qcd/action/fermion/CloverFermion.h>
-////////////////////////////////////////////
+#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 // 5D formulations...
 ////////////////////////////////////////////
 #include <qcd/action/fermion/WilsonFermion5D.h> // used by all 5d overlap types
 //////////
 // Cayley
 //////////
 #include <qcd/action/fermion/CayleyFermion5D.h>
 #include <qcd/action/fermion/DomainWallFermion.h>
 #include <qcd/action/fermion/DomainWallFermion.h>
 #include <qcd/action/fermion/MobiusFermion.h>
 #include <qcd/action/fermion/ScaledShamirFermion.h>
 #include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
 #include <qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <qcd/action/fermion/ShamirZolotarevFermion.h>
 #include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
-//////////////////////
+#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
-// Continued fraction
+#include <qcd/action/fermion/OverlapWilsonContFracTanhFermion.h>
-//////////////////////
+#include <qcd/action/fermion/OverlapWilsonContFracZolotarevFermion.h>
 #include <qcd/action/fermion/ContinuedFractionFermion5D.h>
 #include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
-//////////////////////
+#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
 // Partial fraction
 //////////////////////
 #include <qcd/action/fermion/PartialFractionFermion5D.h>
 #include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
 // are added, (e.g. extension for gparity, half precision project in comms etc..)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Cayley 5d
 namespace Grid {
  namespace QCD {
 typedef WilsonFermion<WilsonImplR> WilsonFermionR;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
 typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
 typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
 typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
 typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
 typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
 typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
 // Continued fraction
 typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
 // Partial fraction
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
 // Gparity cases; partial list until tested
 typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
 typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
 #include <qcd/action/fermion/g5HermitianLinop.h>
 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
 #include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
 #include <qcd/action/pseudofermion/TwoFlavour.h>
 #include <qcd/action/pseudofermion/TwoFlavourRatio.h>
 #include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
 #include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
 //IroIro inserted general "Nf" param; could also be done,
 //but not clear why unless into large Nf BSM studies
 //Even there, don't want the explicit (2) on power denominator
 //if even number of flavours, so further generalised interface
 //would be required but easy.
 #include <qcd/action/pseudofermion/OneFlavourRational.h>
 #include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -2,27 +2,27 @@
 namespace Grid {
 namespace QCD {
- CayleyFermion5D::CayleyFermion5D(LatticeGaugeField &_Umu,
+ template<class Impl>
-				  GridCartesian         &FiveDimGrid,
+ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
-				  GridRedBlackCartesian &FiveDimRedBlackGrid,
+					GridCartesian         &FiveDimGrid,
-				  GridCartesian         &FourDimGrid,
+					GridRedBlackCartesian &FiveDimRedBlackGrid,
-				  GridRedBlackCartesian &FourDimRedBlackGrid,
+					GridCartesian         &FourDimGrid,
-				  RealD _mass,RealD _M5) :
+					GridRedBlackCartesian &FourDimRedBlackGrid,
-   WilsonFermion5D(_Umu,
+					RealD _mass,RealD _M5,const ImplParams &p) :
   WilsonFermion5D<Impl>(_Umu,
 		   FiveDimGrid,
 		   FiveDimRedBlackGrid,
 		   FourDimGrid,
-		   FourDimRedBlackGrid,_M5),
+ 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
 {
 }
-  // override multiply
+ template<class Impl>
-  RealD CayleyFermion5D::M    (const LatticeFermion &psi, LatticeFermion &chi)
+  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
  {
    LatticeFermion Din(psi._grid);
    // Assemble Din
    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	Din = bs psi[s] + cs[s] psi[s+1}
@@ -37,11 +37,57 @@ namespace QCD {
 	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
      }
    }
  }
 // Fills every s-slice of Din from psi, two calls per slice:
 //   axpby_ssp_pplus  writes slice s from bs[s]*psi[s] and a coefficient times psi[s+1],
 //   axpby_ssp_pminus then adds a coefficient times psi[s-1] to slice s.
 // At the two ends the missing neighbour is replaced by the opposite end slice
 // with an extra factor of -mass.
 // NOTE(review): the pplus/pminus helpers presumably apply chiral projectors — confirm.
 template<class Impl>
  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
  {
    const int Nls  = this->Ls;
    const int sTop = Nls-1; // index of the last slice

    for(int s=0; s<Nls; ++s){

      // First slice: lower neighbour wraps round to the last slice
      if ( s==0 ) {
        axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
        axpby_ssp_pminus(Din,1.0,Din,-mass*cs[sTop],psi,s,sTop);
        continue;
      }

      // Last slice: upper neighbour wraps round to slice 0
      if ( s==sTop ) {
        axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
        axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
        continue;
      }

      // Interior slice: plain nearest neighbours
      axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
      axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
    }
  }
-    DW(Din,chi,DaggerNo);
+  // override multiply
 template<class Impl>
  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    FermionField Din(psi._grid);
    // Assemble Din
    /*
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	Din = bs psi[s] + cs[s] psi[s+1}
 	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
 	//      Din+= -mass*cs[s] psi[s+1}
 	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
      } else if ( s==(Ls-1)) { 
 	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
 	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
      } else {
 	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
 	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
      }
    }
    */
    Meooe5D(psi,Din);
    this->DW(Din,chi,DaggerNo);
    // ((b D_W + D_w hop terms +1) on s-diag
    axpby(chi,1.0,1.0,chi,psi); 
    // Call Mooee??
    for(int s=0;s<Ls;s++){
      if ( s==0 ){
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
@@ -57,20 +103,26 @@ namespace QCD {
    return norm2(chi);
  }
-  RealD CayleyFermion5D::Mdag (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  {
    // Under adjoint
    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
    //D2- P+     D2+            P-D1-^dag D2+dag
-    LatticeFermion Din(psi._grid);
+    FermionField Din(psi._grid);
    // Apply Dw
-    DW(psi,Din,DaggerYes); 
+    this->DW(psi,Din,DaggerYes); 
    Meooe5D(Din,chi);
    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      // Collect the terms in DW
      //	Chi = bs Din[s] + cs[s] Din[s+1}
      //    Chi+= -mass*cs[s] psi[s+1}
      /*
      if ( s==0 ) {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
@@ -81,6 +133,10 @@ namespace QCD {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
      }
      */
      // FIXME just call MooeeDag??
      // Collect the terms indept of DW
      if ( s==0 ){
 	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
@@ -99,10 +155,17 @@ namespace QCD {
  }
  // half checkerboard operations
-  void CayleyFermion5D::Meooe       (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
  {
-    LatticeFermion tmp(psi._grid);
+    int Ls=this->Ls;
    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
    Meooe5D(psi,tmp); 
 #if 0
    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	tmp = bs psi[s] + cs[s] psi[s+1}
@@ -117,24 +180,33 @@ namespace QCD {
 	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
      }
    }
    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
 #endif
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
-      DhopEO(tmp,chi,DaggerNo);
+      this->DhopEO(tmp,chi,DaggerNo);
    } else {
-      DhopOE(tmp,chi,DaggerNo);
+      this->DhopOE(tmp,chi,DaggerNo);
    }
  }
-  void CayleyFermion5D::MeooeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+  template<class Impl>
  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
  {
-    LatticeFermion tmp(psi._grid);
+    FermionField tmp(psi._grid);
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
-      DhopEO(psi,tmp,DaggerYes);
+      this->DhopEO(psi,tmp,DaggerYes);
    } else {
-      DhopOE(psi,tmp,DaggerYes);
+      this->DhopOE(psi,tmp,DaggerYes);
    }
    Meooe5D(tmp,chi); 
 #if 0
    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
    // Assemble the 5d matrix
    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
@@ -147,10 +219,15 @@ namespace QCD {
 	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
      }
    }
    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
 #endif
  }
-  void CayleyFermion5D::Mooee       (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    for (int s=0;s<Ls;s++){
      if ( s==0 ) {
 	axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
@@ -165,8 +242,10 @@ namespace QCD {
    }
  }
-  void  CayleyFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+ template<class Impl>
-    LatticeFermion tmp(psi._grid);
+  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
    int Ls=this->Ls;
    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
@@ -183,11 +262,13 @@ namespace QCD {
      }
    }
    // Apply 4d dslash fragment
-    DhopDir(tmp,chi,dir,disp);
+    this->DhopDir(tmp,chi,dir,disp);
  }
-  void CayleyFermion5D::MooeeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    for (int s=0;s<Ls;s++){
      // Assemble the 5d matrix
      if ( s==0 ) {
@@ -203,8 +284,10 @@ namespace QCD {
    }
  }
-  void CayleyFermion5D::MooeeInv    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    // Apply (L^{\prime})^{-1}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
@@ -227,8 +310,10 @@ namespace QCD {
    }
  }
-  void CayleyFermion5D::MooeeInvDag (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    // Apply (U^{\prime})^{-dagger}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
@@ -249,16 +334,66 @@ namespace QCD {
      axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
    }
  }
  // force terms; five routines; default to Dhop on diagonal
  // Force term for the full (unpreconditioned) operator; mat is handed on to DhopDeriv.
  //   dag == DaggerNo : U d/du [D_w D5] V      = U d/du DW D5 V   -> Meooe5D is applied to V
  //   otherwise       : U d/du [D_w D5]^dag V                      -> Meooe5D is applied to U
  //                     (the adjoint on U is implicit in the DhopDeriv call)
  template<class Impl>
  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    FermionField Din(V._grid);

    if ( dag != DaggerNo ) {
      // Daggered case: the 5d factor goes onto the left-hand vector
      Meooe5D(U,Din);
      this->DhopDeriv(mat,Din,V,dag);
      return;
    }

    // Undaggered case: the 5d factor goes onto the right-hand vector
    Meooe5D(V,Din);
    this->DhopDeriv(mat,U,Din,dag);
  }
 // Force term for the odd-even hopping block; mat is handed on to DhopDerivOE.
 //   dag == DaggerNo : U d/du [D_w D5] V      = U d/du DW D5 V   -> Meooe5D is applied to V
 //   otherwise       : U d/du [D_w D5]^dag V                      -> Meooe5D is applied to U
 //                     (the adjoint on U is implicit in the DhopDerivOE call)
 template<class Impl>
  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    FermionField Din(V._grid);

    if ( dag != DaggerNo ) {
      // Daggered case: the 5d factor goes onto the left-hand vector
      Meooe5D(U,Din);
      this->DhopDerivOE(mat,Din,V,dag);
      return;
    }

    // Undaggered case: the 5d factor goes onto the right-hand vector
    Meooe5D(V,Din);
    this->DhopDerivOE(mat,U,Din,dag);
  }
 // Force term for the even-odd hopping block; mat is handed on to DhopDerivEO.
 //   dag == DaggerNo : U d/du [D_w D5] V      = U d/du DW D5 V   -> Meooe5D is applied to V
 //   otherwise       : U d/du [D_w D5]^dag V                      -> Meooe5D is applied to U
 //                     (the adjoint on U is implicit in the DhopDerivEO call)
 template<class Impl>
  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    FermionField Din(V._grid);

    if ( dag != DaggerNo ) {
      // Daggered case: the 5d factor goes onto the left-hand vector
      Meooe5D(U,Din);
      this->DhopDerivEO(mat,Din,V,dag);
      return;
    }

    // Undaggered case: the 5d factor goes onto the right-hand vector
    Meooe5D(V,Din);
    this->DhopDerivEO(mat,U,Din,dag);
  }
  // Tanh
-  void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
+ template<class Impl>
  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
    SetCoefficientsZolotarev(1.0,zdata,b,c);
  }
  //Zolo
-  void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+ template<class Impl>
  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
    int Ls=this->Ls;
    ///////////////////////////////////////////////////////////
    // The Cayley coeffs (unprec)
@@ -308,8 +443,8 @@ namespace QCD {
    ceo.resize(Ls);
    for(int i=0;i<Ls;i++){
-      bee[i]=as[i]*(bs[i]*(4.0-M5) +1.0);
+      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
-      cee[i]=as[i]*(1.0-cs[i]*(4.0-M5));
+      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
      beo[i]=as[i]*bs[i];
      ceo[i]=-as[i]*cs[i];
    }
@@ -362,6 +497,8 @@ namespace QCD {
    }
  }
  FermOpTemplateInstantiate(CayleyFermion5D);
 }}
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -5,25 +5,36 @@ namespace Grid {
  namespace QCD {
-    class CayleyFermion5D : public WilsonFermion5D
+    template<class Impl>
    class CayleyFermion5D : public WilsonFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);
      // half checkerboard operations
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
      virtual void   Instantiatable(void)=0;
      // force terms; five routines; default to Dhop on diagonal
      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
      void   Meooe5D       (const FermionField &in, FermionField &out);
      void   MeooeDag5D    (const FermionField &in, FermionField &out);
      //    protected:
      RealD mass;
@@ -48,12 +59,12 @@ namespace Grid {
      std::vector<RealD> dee;    
      // Constructors
-      CayleyFermion5D(LatticeGaugeField &_Umu,
+      CayleyFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
-		      RealD _mass,RealD _M5);
+		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -3,20 +3,22 @@
 namespace Grid {
  namespace QCD {
-    void ContinuedFractionFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
+    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
    {
      SetCoefficientsZolotarev(1.0/scale,zdata);
    }
-    void ContinuedFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
+    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
    {
      // How to check Ls matches??
-      //      std::cout << Ls << " Ls"<<std::endl;
+      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-      //      std::cout << zdata->n  << " - n"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-      //      std::cout << zdata->da << " -da "<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-      //      std::cout << zdata->db << " -db"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-      //      std::cout << zdata->dn << " -dn"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-      //      std::cout << zdata->dd << " -dd"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
-
+      int Ls = this->Ls;
      assert(zdata->db==Ls);// Beta has Ls coeffs
      R=(1+this->mass)/(1-this->mass);
@@ -39,7 +41,7 @@ namespace Grid {
      ZoloHiInv =1.0/zolo_hi;
-      dw_diag = (4.0-M5)*ZoloHiInv;
+      dw_diag = (4.0-this->M5)*ZoloHiInv;
      See.resize(Ls);
      Aee.resize(Ls);
@@ -55,17 +57,20 @@ namespace Grid {
 	See[s] = Aee[s] - 1.0/See[s-1];
      }
      for(int s=0;s<Ls;s++){
-	std::cout <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
+	std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
      }
    }
-    RealD  ContinuedFractionFermion5D::M           (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
    {
-      LatticeFermion D(psi._grid);
+      int Ls = this->Ls;
-      DW(psi,D,DaggerNo); 
+      FermionField D(psi._grid);
      this->DW(psi,D,DaggerNo); 
      int sign=1;
      for(int s=0;s<Ls;s++){
@@ -83,15 +88,20 @@ namespace Grid {
      }
      return norm2(chi);
    }
-    RealD  ContinuedFractionFermion5D::Mdag        (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
    {
      // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
      // The rest of matrix is symmetric.
      // Can ignore "dag"
      return M(psi,chi);
    }
-    void  ContinuedFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+    template<class Impl>
-      DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+    void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
      int Ls = this->Ls;
      this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
      int sign=1;
      for(int s=0;s<Ls;s++){
 	if ( s==(Ls-1) ){
@@ -102,13 +112,16 @@ namespace Grid {
 	sign=-sign; 
      }
    }
-    void   ContinuedFractionFermion5D::Meooe       (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
    {
      int Ls = this->Ls;
      // Apply 4d dslash
      if ( psi.checkerboard == Odd ) {
-	DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
+	this->DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
      } else {
-	DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
+	this->DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
      }
      int sign=1;
@@ -121,12 +134,16 @@ namespace Grid {
 	sign=-sign; 
      }
    }
-    void   ContinuedFractionFermion5D::MeooeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
    {
-      Meooe(psi,chi);
+      this->Meooe(psi,chi);
    }
-    void   ContinuedFractionFermion5D::Mooee       (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
    {
      int Ls = this->Ls;
      int sign=1;
      for(int s=0;s<Ls;s++){
 	if ( s==0 ) {
@@ -144,12 +161,16 @@ namespace Grid {
      }
    }
-    void   ContinuedFractionFermion5D::MooeeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
    {
-      Mooee(psi,chi);
+      this->Mooee(psi,chi);
    }
-    void   ContinuedFractionFermion5D::MooeeInv    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
    {
      int Ls = this->Ls;
      // Apply Linv
      axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); 
      for(int s=1;s<Ls;s++){
@@ -165,27 +186,88 @@ namespace Grid {
 	axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
      }
    }
-    void   ContinuedFractionFermion5D::MooeeInvDag (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
    void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
    {
-      MooeeInv(psi,chi);
+      this->MooeeInv(psi,chi);
    }
  // force terms; five routines; default to Dhop on diagonal
    // Force term for the full operator.
    // Builds D slice by slice from U with ag5xpby_ssp, using coefficient
    //   cc[s]*Beta[s]*sign*ZoloHiInv   for s <  Ls-1 (sign alternates +1,-1,... starting at +1)
    //   Beta[s]*ZoloHiInv              for s == Ls-1
    // and then calls DhopDeriv(mat,D,V,DaggerNo).
    // NOTE: the dag argument is not consulted; DaggerNo is always passed on.
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
    {
      const int Ls = this->Ls;
      FermionField D(V._grid);

      int sign=1;
      for(int s=0;s<Ls;s++,sign=-sign){
        const bool lastSlice = ( s==(Ls-1) );
        const auto coeff = lastSlice ? Beta[s]*ZoloHiInv
                                     : cc[s]*Beta[s]*sign*ZoloHiInv;
        ag5xpby_ssp(D,coeff,U,0.0,U,s,s);
      }

      this->DhopDeriv(mat,D,V,DaggerNo);
    }
    // Force term for the odd-even hopping block.
    // Builds D slice by slice from U with ag5xpby_ssp, using coefficient
    //   cc[s]*Beta[s]*sign*ZoloHiInv   for s <  Ls-1 (sign alternates +1,-1,... starting at +1)
    //   Beta[s]*ZoloHiInv              for s == Ls-1
    // and then calls DhopDerivOE(mat,D,V,DaggerNo).
    // NOTE: the dag argument is not consulted; DaggerNo is always passed on.
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
    {
      const int Ls = this->Ls;
      FermionField D(V._grid);

      int sign=1;
      for(int s=0;s<Ls;s++,sign=-sign){
        const bool lastSlice = ( s==(Ls-1) );
        const auto coeff = lastSlice ? Beta[s]*ZoloHiInv
                                     : cc[s]*Beta[s]*sign*ZoloHiInv;
        ag5xpby_ssp(D,coeff,U,0.0,U,s,s);
      }

      this->DhopDerivOE(mat,D,V,DaggerNo);
    }
  // Force term for the even-odd hopping block.
  // Builds D slice by slice from U with ag5xpby_ssp, using coefficient
  //   cc[s]*Beta[s]*sign*ZoloHiInv   for s <  Ls-1 (sign alternates +1,-1,... starting at +1)
  //   Beta[s]*ZoloHiInv              for s == Ls-1
  // and then calls DhopDerivEO(mat,D,V,DaggerNo).
  // NOTE: the dag argument is not consulted; DaggerNo is always passed on.
  template<class Impl>
  void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    const int Ls = this->Ls;
    FermionField D(V._grid);

    int sign=1;
    for(int s=0;s<Ls;s++,sign=-sign){
      const bool lastSlice = ( s==(Ls-1) );
      const auto coeff = lastSlice ? Beta[s]*ZoloHiInv
                                   : cc[s]*Beta[s]*sign*ZoloHiInv;
      ag5xpby_ssp(D,coeff,U,0.0,U,s,s);
    }

    this->DhopDerivEO(mat,D,V,DaggerNo);
  }
    // Constructors
-    ContinuedFractionFermion5D::ContinuedFractionFermion5D(
+    template<class Impl>
-							   LatticeGaugeField &_Umu,
+    ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
 							   GaugeField &_Umu,
 							   GridCartesian         &FiveDimGrid,
 							   GridRedBlackCartesian &FiveDimRedBlackGrid,
 							   GridCartesian         &FourDimGrid,
 							   GridRedBlackCartesian &FourDimRedBlackGrid,
-							   RealD _mass,RealD M5) :
+							   RealD _mass,RealD M5,const ImplParams &p) :
-      WilsonFermion5D(_Umu,
+      WilsonFermion5D<Impl>(_Umu,
-		      FiveDimGrid, FiveDimRedBlackGrid,
+			    FiveDimGrid, FiveDimRedBlackGrid,
-		      FourDimGrid, FourDimRedBlackGrid,M5),
+			    FourDimGrid, FourDimRedBlackGrid,M5,p),
      mass(_mass)
    {
      int Ls = this->Ls;
      assert((Ls&0x1)==1); // Odd Ls required
    }
    FermOpTemplateInstantiate(ContinuedFractionFermion5D);
  }
 }
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -5,35 +5,43 @@ namespace Grid {
  namespace QCD {
-    class ContinuedFractionFermion5D : public WilsonFermion5D
+    template<class Impl>
    class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);
      // half checkerboard operations
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
      // force terms; five routines; default to Dhop on diagonal
      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      //      virtual void   Instantiatable(void)=0;
      virtual void   Instantiatable(void) =0;
      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
      // Constructors
-      ContinuedFractionFermion5D(LatticeGaugeField &_Umu,
+      ContinuedFractionFermion5D(GaugeField &_Umu,
 				 GridCartesian         &FiveDimGrid,
 				 GridRedBlackCartesian &FiveDimRedBlackGrid,
 				 GridCartesian         &FourDimGrid,
 				 GridRedBlackCartesian &FourDimRedBlackGrid,
-				 RealD _mass,RealD M5);
+				 RealD _mass,RealD M5,const ImplParams &p= ImplParams());
    protected:
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@@ -7,24 +7,27 @@ namespace Grid {
  namespace QCD {
-    class DomainWallFermion : public CayleyFermion5D
+    template<class Impl>
    class DomainWallFermion : public CayleyFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      virtual void   Instantiatable(void) {};
      // Constructors
-      DomainWallFermion(LatticeGaugeField &_Umu,
+      DomainWallFermion(GaugeField &_Umu,
 			GridCartesian         &FiveDimGrid,
 			GridRedBlackCartesian &FiveDimRedBlackGrid,
 			GridCartesian         &FourDimGrid,
 			GridRedBlackCartesian &FourDimRedBlackGrid,
-			RealD _mass,RealD _M5) : 
+			RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) : 
-      CayleyFermion5D(_Umu,
+      CayleyFermion5D<Impl>(_Umu,
-		      FiveDimGrid,
+			    FiveDimGrid,
-		      FiveDimRedBlackGrid,
+			    FiveDimRedBlackGrid,
-		      FourDimGrid,
+			    FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+			    FourDimRedBlackGrid,_mass,_M5,p)
      {
 	RealD eps = 1.0;
@@ -32,9 +35,9 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
-	std::cout << "DomainWallFermion with Ls="<<Ls<<std::endl;
+	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsTanh(zdata,1.0,0.0);
+	this->SetCoefficientsTanh(zdata,1.0,0.0);
 	Approx::zolotarev_free(zdata);
      }
--- a/lib/qcd/action/fermion/FermionOperator.h
+++ b/lib/qcd/action/fermion/FermionOperator.h
@@ -5,16 +5,20 @@ namespace Grid {
  namespace QCD {
-    //////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////
-    // Four component fermions
+    // Allow to select  between gauge representation rank bc's, flavours etc.
-    // Should type template the vector and gauge types
+    // and single/double precision.
-    // Think about multiple representations
+    ////////////////////////////////////////////////////////////////
-    //////////////////////////////////////////////////////////////////////////////
+    
-    template<class FermionField,class GaugeField>
+    template<class Impl>
-    class FermionOperator : public CheckerBoardedSparseMatrixBase<FermionField>
+    class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
    {
    public:
      INHERIT_IMPL_TYPES(Impl);
      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
@@ -28,6 +32,8 @@ namespace Grid {
      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
      // half checkerboard operations
      virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
      virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
@@ -39,13 +45,31 @@ namespace Grid {
      virtual void Dhop  (const FermionField &in, FermionField &out,int dag)=0;
      virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
      virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
      virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D
-      virtual void  Mdiag(const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
+      // force terms; five routines; default to Dhop on diagonal
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDeriv(mat,U,V,dag);};
-      virtual void  DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivOE(mat,U,V,dag);};
      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivEO(mat,U,V,dag);};
      virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;}; // Clover can override these
      virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;};
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
      virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
      virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
      ///////////////////////////////////////////////
      // Updates gauge field during HMC
      ///////////////////////////////////////////////
      virtual void ImportGauge(const GaugeField & _U)=0;
    };
  }
 }
 #endif
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -0,0 +1,356 @@
 #ifndef  GRID_QCD_FERMION_OPERATOR_IMPL_H
 #define  GRID_QCD_FERMION_OPERATOR_IMPL_H
 namespace Grid {
  namespace QCD {
    //////////////////////////////////////////////
    // Template parameter class constructs that package
    // and externally control Fermion implementations
    // along orthogonal directions
    //
    // Ultimately need Impl to always define types where XXX is opaque
    //
    //    typedef typename XXX               Simd;
    //    typedef typename XXX     GaugeLinkField;	
    //    typedef typename XXX         GaugeField;
    //    typedef typename XXX      GaugeActField;
    //    typedef typename XXX       FermionField;
    //    typedef typename XXX  DoubledGaugeField;
    //    typedef typename XXX         SiteSpinor;
    //    typedef typename XXX     SiteHalfSpinor;	
    //    typedef typename XXX         Compressor;	
    //
    // and Methods:
    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
    //
    //
    // To acquire the typedefs from "Base" (either a base class or template param) use:
    //
    // INHERIT_GIMPL_TYPES(Base)
    // INHERIT_FIMPL_TYPES(Base)
    // INHERIT_IMPL_TYPES(Base)
    //
    // The Fermion operators will do the following:
    //
    // struct MyOpParams { 
    //   RealD mass;
    // };
    //
    //
    // template<class Impl>
    // class MyOp : public Impl { 
    // public:
    //
    //    INHERIT_IMPL_TYPES(Impl);
    //
    //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
    //    {
    //
    //    };
    //    
    //  }
    //////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////
    // Implementation dependent gauge types
    ////////////////////////////////////////////////////////////////////////
 // Pulls both the gauge-side and the fermion-side typedefs of `Base` into the
 // current scope by expanding INHERIT_GIMPL_TYPES and then INHERIT_FIMPL_TYPES.
 // Those two macros are defined further down; that is fine because expansion
 // only happens where INHERIT_IMPL_TYPES is used.
 #define INHERIT_IMPL_TYPES(Base) \
    INHERIT_GIMPL_TYPES(Base)\
    INHERIT_FIMPL_TYPES(Base)
 // Re-exports the gauge-side typedefs (Simd, GaugeLinkField, GaugeField) of
 // `GImpl` under the same names. Uses `typename`, so GImpl may be a dependent type.
 #define INHERIT_GIMPL_TYPES(GImpl) \
    typedef typename GImpl::Simd                           Simd;\
    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
    typedef typename GImpl::GaugeField               GaugeField;	
    // Composition with smeared link, bc's etc.. probably need multiple inheritance
    // Variable precision "S" and variable Nc
    // Gauge-side implementation policy. `S` is the SIMD scalar type and
    // `Nrepresentation` the dimension of the matrix carried by each link.
    // The class only publishes types; it holds no data.
    template<class S,int Nrepresentation=Nc>
    class ImplGauge { 
    public:

      using Simd = S;

      // Per-site tensor layouts: one matrix per site, and Nd matrices per site.
      template<typename vtype> using iImplGaugeLink  = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
      template<typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd  >;

      // Site objects instantiated on the chosen SIMD type.
      using SiteGaugeLink  = iImplGaugeLink <Simd>;
      using SiteGaugeField = iImplGaugeField<Simd>;

      // Lattice-wide fields built from the site objects.
      using GaugeLinkField = Lattice<SiteGaugeLink>;  // bit ugly naming; polarised gauge field, lorentz... all ugly
      using GaugeField     = Lattice<SiteGaugeField>;
    };
    ////////////////////////////////////////////////////////////////////////
    // Implementation dependent fermion types
    ////////////////////////////////////////////////////////////////////////
 // Re-exports the fermion-side typedefs of `Impl` (field, doubled gauge field,
 // site spinors, compressor and parameter struct) under the same names.
 #define INHERIT_FIMPL_TYPES(Impl)\
    typedef typename Impl::FermionField           FermionField;		\
    typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
    typedef typename Impl::SiteSpinor               SiteSpinor;		\
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
    typedef typename Impl::Compressor               Compressor;		\
    typedef typename Impl::ImplParams ImplParams;
    ///////
    // Single flavour four spinors with colour index
    ///////
    // Fermion implementation policy for the single-flavour case.
    // Inherits the gauge types from ImplGauge, adds the spinor / half-spinor /
    // doubled-gauge site and lattice types, and supplies the small set of
    // operations (multLink, DoubleStore, InsertForce4D/5D) that this
    // implementation provides for use elsewhere.
    template<class S,int Nrepresentation=Nc>
    class WilsonImpl :  public ImplGauge<S,Nrepresentation> { 
    public:
      typedef ImplGauge<S,Nrepresentation> Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
      // Site layouts: Ns (full) or Nhs (half) spin components, each a vector of
      // length Nrepresentation; the doubled gauge field holds Nds link matrices per site.
      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
      typedef iImplSpinor    <Simd>           SiteSpinor;
      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
      typedef Lattice<SiteSpinor>                 FermionField;
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
      typedef WilsonImplParams ImplParams;
      // Copy of the parameters handed to the constructor (default-constructed if none given).
      ImplParams Params;
      WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
      // phi = U(mu) * chi on one site. SE and St are accepted to match the
      // common multLink signature but are not used by this implementation.
      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
        mult(&phi(),&U(mu),&chi());
      }
      // Fills the doubled gauge field Uds from Umu. For each mu in [0,Nd):
      //   component mu   <- the mu-th Lorentz component of Umu
      //   component mu+4 <- adj() of that component after Cshift(U,mu,-1)
      // Both Uds and Umu must live on GaugeGrid (checked with conformable).
      // NOTE(review): the offset 4 is hard-coded; presumably it is meant to equal Nd - confirm.
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
      {
        conformable(Uds._grid,GaugeGrid);
        conformable(Umu._grid,GaugeGrid);
        GaugeLinkField U(GaugeGrid);
        for(int mu=0;mu<Nd;mu++){
  	  U = PeekIndex<LorentzIndex>(Umu,mu);
 	  PokeIndex<LorentzIndex>(Uds,U,mu);
 	  U = adj(Cshift(U,mu,-1));
 	  PokeIndex<LorentzIndex>(Uds,U,mu+4);
 	}
      }
      // Writes the spin-traced outer product of Btilde and A into Lorentz
      // component mu of mat. Other components of mat are left as they were.
      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 	GaugeLinkField link(mat._grid);
 	link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
 	PokeIndex<LorentzIndex>(mat,link,mu);
      }   
      // Five-dimensional variant: for every outer site of mat's grid, sums the
      // spin-traced outer product over the Ls slices of the fermion fields and
      // writes the result into Lorentz component mu of mat.
      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
 	// Ls is read from the first full dimension of the fermion grid.
 	int Ls=Btilde._grid->_fdimensions[0];
 	GaugeLinkField tmp(mat._grid);
 	tmp = zero;
 PARALLEL_FOR_LOOP
 	for(int sss=0;sss<tmp._grid->oSites();sss++){
 	  int sU=sss;
 	  for(int s=0;s<Ls;s++){
 	    // Fermion site index: s runs fastest within each gauge outer site sU.
 	    int sF = s+Ls*sU;
 	    tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
 	  }
 	}
 	PokeIndex<LorentzIndex>(mat,tmp,mu);
      }
    };
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
    // Fermion implementation policy with an extra outer index of length Ngp on
    // spinors, half spinors and the doubled gauge field (the "flavour" index in
    // the comments below). In directions flagged by Params.twists the two
    // flavour components are exchanged when a hop wraps around the lattice.
    // Unlike WilsonImpl, Nrepresentation has no default here.
    template<class S,int Nrepresentation>
    class GparityWilsonImpl : public ImplGauge<S,Nrepresentation> { 
    public:
      typedef ImplGauge<S,Nrepresentation> Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
      template<typename vtype> using iImplSpinor             = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp >;
      template<typename vtype> using iImplHalfSpinor         = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp >;
      template<typename vtype> using iImplDoubledGaugeField  = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >, Ngp >;
      typedef iImplSpinor    <Simd>           SiteSpinor;
      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
      typedef Lattice<SiteSpinor>                 FermionField;
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
      typedef GparityWilsonImplParams ImplParams;
      // Copy of the constructor argument; Params.twists[] is read per direction below.
      ImplParams Params;
      GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
      // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
      //
      // phi(f) = U(f)(mu) * chi'(f) for f = 0,1, where chi' is chi with its two
      // flavour components swapped when the stencil entry wraps around the
      // lattice in a twisted direction, and chi itself otherwise.
      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
 	typedef SiteHalfSpinor vobj;
 	typedef typename SiteHalfSpinor::scalar_object sobj;
 	vobj vtmp;
 	sobj stmp;
 	GridBase *grid = St._grid;
 	const int Nsimd = grid->Nsimd();
 	// Geometry of stencil leg mu. NOTE(review): ptype is read but never used below.
 	int direction    = St._directions[mu];
 	int distance     = St._distances[mu];
 	int ptype        = St._permute_type[mu]; 
 	int sl           = St._grid->_simd_layout[direction];
 	// Fixme X.Y.Z.T hardcode in stencil
 	// mmu is the index used to look up the twist flag for this leg.
 	int mmu          = mu % Nd;
 	// assert our assumptions
 	assert((distance==1)||(distance==-1)); // nearest neighbour stencil hard code
 	assert((sl==1)||(sl==2));
 	std::vector<int> icoor;
 	if ( SE->_around_the_world && Params.twists[mmu] ) {
 	  if ( sl == 2 ) {
 	    // Two SIMD lanes along this direction: only some lanes need the
 	    // flavour swap, so unpack to scalar objects, fix them up lane by
 	    // lane, and repack.
 	    std::vector<sobj> vals(Nsimd);
 	    extract(chi,vals);
 	    for(int s=0;s<Nsimd;s++){
 	      grid->iCoorFromIindex(icoor,s);
 	      assert((icoor[direction]==0)||(icoor[direction]==1));
 	      // Which inner coordinate gets swapped depends on the hop direction:
 	      // coordinate 1 for distance +1, coordinate 0 for distance -1.
 	      int permute_lane;
 	      if ( distance == 1) {
 		permute_lane = icoor[direction]?1:0;
 	      } else {
 		permute_lane = icoor[direction]?0:1;
 	      }
 	      if ( permute_lane ) { 
 		stmp(0) = vals[s](1);
 		stmp(1) = vals[s](0);
 		vals[s] = stmp;
 	      }
 	    }
 	    merge(vtmp,vals);
 	  } else { 
 	    // Single lane along this direction: swap the two flavours wholesale.
 	    vtmp(0) = chi(1);
 	    vtmp(1) = chi(0);
 	  }
 	  mult(&phi(0),&U(0)(mu),&vtmp(0));
 	  mult(&phi(1),&U(1)(mu),&vtmp(1));
 	} else { 
 	  mult(&phi(0),&U(0)(mu),&chi(0));
 	  mult(&phi(1),&U(1)(mu),&chi(1));
 	}
      }
      // Fills the flavour-doubled, direction-doubled gauge field Uds from Umu.
      // For each mu in [0,Nd):
      //   Uds(0)(mu)   = U
      //   Uds(1)(mu)   = conjugate(U), negated where coor==GlobalDimensions()[mu]-1 if mu is twisted
      //   Uds(0)(mu+4) = adj(Cshift(U,mu,-1)),     replaced by the shifted Uconj where coor==0 if mu is twisted
      //   Uds(1)(mu+4) = adj(Cshift(Uconj,mu,-1)), replaced by the shifted U     where coor==0 if mu is twisted
      // Both Uds and Umu must live on GaugeGrid (checked with conformable).
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
      {
 	conformable(Uds._grid,GaugeGrid);
 	conformable(Umu._grid,GaugeGrid);
 	GaugeLinkField Utmp(GaugeGrid);
 	GaugeLinkField U(GaugeGrid);
 	GaugeLinkField Uconj(GaugeGrid);
 	Lattice<iScalar<vInteger> > coor(GaugeGrid);
 	for(int mu=0;mu<Nd;mu++){
 	  LatticeCoordinate(coor,mu);
 	  U     = PeekIndex<LorentzIndex>(Umu,mu);
 	  Uconj = conjugate(U);
 	  // This phase could come from a simple bc 1,1,-1,1 ..
 	  int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
 	  if ( Params.twists[mu] ) { 
 	    Uconj = where(coor==neglink,-Uconj,Uconj);
 	  }
 	  // Forward links for both flavours.
 PARALLEL_FOR_LOOP
 	  for(auto ss=U.begin();ss<U.end();ss++){
 	    Uds[ss](0)(mu) = U[ss]();
 	    Uds[ss](1)(mu) = Uconj[ss]();
 	  }
 	  U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
 	  Uconj = adj(Cshift(Uconj,mu,-1));
 	  // Backward link, flavour 0: take the other flavour's link on the coor==0 slice.
 	  Utmp = U;
 	  if ( Params.twists[mu] ) { 
 	    Utmp = where(coor==0,Uconj,Utmp);
 	  }
 PARALLEL_FOR_LOOP
 	  for(auto ss=U.begin();ss<U.end();ss++){
 	    Uds[ss](0)(mu+4) = Utmp[ss]();
 	  }
 	  // Backward link, flavour 1: mirror image of the case above.
 	  Utmp = Uconj;
 	  if ( Params.twists[mu] ) { 
 	    Utmp = where(coor==0,U,Utmp);
 	  }
 PARALLEL_FOR_LOOP
 	  for(auto ss=U.begin();ss<U.end();ss++){
 	    Uds[ss](1)(mu+4) = Utmp[ss]();
 	  }
 	}
      }
      // Not implemented: trips assert(0). If assert is compiled out (NDEBUG)
      // the call returns without touching mat.
      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 	assert(0);
 	// Fixme
 	return;
      }
      // Not implemented: trips assert(0). If assert is compiled out (NDEBUG)
      // the call returns without touching mat.
      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 	assert(0);
 	// Fixme
 	return;
      }
    };
    // Convenience instantiations of both implementations at Nc, one per SIMD
    // complex type: suffix R -> vComplex, F -> vComplexF, D -> vComplexD.
    typedef WilsonImpl<vComplex ,Nc> WilsonImplR; // Real.. whichever prec
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
  }
 }
 #endif
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@@ -7,35 +7,38 @@ namespace Grid {
  namespace QCD {
-    class MobiusFermion : public CayleyFermion5D
+    template<class Impl>
    class MobiusFermion : public CayleyFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      virtual void   Instantiatable(void) {};
      // Constructors
-      MobiusFermion(LatticeGaugeField &_Umu,
+      MobiusFermion(GaugeField &_Umu,
 		    GridCartesian         &FiveDimGrid,
 		    GridRedBlackCartesian &FiveDimRedBlackGrid,
 		    GridCartesian         &FourDimGrid,
 		    GridRedBlackCartesian &FourDimRedBlackGrid,
 		    RealD _mass,RealD _M5,
-		    RealD b, RealD c) : 
+		    RealD b, RealD c,const ImplParams &p= ImplParams()) : 
-      CayleyFermion5D(_Umu,
+      CayleyFermion5D<Impl>(_Umu,
-		      FiveDimGrid,
+			    FiveDimGrid,
-		      FiveDimRedBlackGrid,
+			    FiveDimRedBlackGrid,
-		      FourDimGrid,
+			    FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+			    FourDimRedBlackGrid,_mass,_M5,p)
      {
 	RealD eps = 1.0;
-	std::cout << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<Ls<<" Tanh approx"<<std::endl;
+	std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsTanh(zdata,b,c);
+	this->SetCoefficientsTanh(zdata,b,c);
 	Approx::zolotarev_free(zdata);
--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -7,26 +7,29 @@ namespace Grid {
  namespace QCD {
-    class MobiusZolotarevFermion : public CayleyFermion5D
+    template<class Impl>
    class MobiusZolotarevFermion : public CayleyFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      virtual void   Instantiatable(void) {};
      // Constructors
-       MobiusZolotarevFermion(LatticeGaugeField &_Umu,
+       MobiusZolotarevFermion(GaugeField &_Umu,
 			      GridCartesian         &FiveDimGrid,
 			      GridRedBlackCartesian &FiveDimRedBlackGrid,
 			      GridCartesian         &FourDimGrid,
 			      GridRedBlackCartesian &FourDimRedBlackGrid,
 			      RealD _mass,RealD _M5,
 			      RealD b, RealD c,
-			      RealD lo, RealD hi) : 
+			      RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
-      CayleyFermion5D(_Umu,
+      CayleyFermion5D<Impl>(_Umu,
-		      FiveDimGrid,
+			    FiveDimGrid,
-		      FiveDimRedBlackGrid,
+			    FiveDimRedBlackGrid,
-		      FourDimGrid,
+			    FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+			    FourDimRedBlackGrid,_mass,_M5,p)
      {
 	RealD eps = lo/hi;
@@ -34,10 +37,10 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0);
 	assert(zdata->n==this->Ls);
-	std::cout << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
+	std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsZolotarev(hi,zdata,b,c);
+	this->SetCoefficientsZolotarev(hi,zdata,b,c);
 	Approx::zolotarev_free(zdata);
      }
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -7,25 +7,28 @@ namespace Grid {
  namespace QCD {
-    class OverlapWilsonCayleyTanhFermion : public MobiusFermion
+    template<class Impl>
    class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      // Constructors
-    OverlapWilsonCayleyTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
 				   GridCartesian         &FiveDimGrid,
 				   GridRedBlackCartesian &FiveDimRedBlackGrid,
 				   GridCartesian         &FourDimGrid,
 				   GridRedBlackCartesian &FourDimRedBlackGrid,
 				   RealD _mass,RealD _M5,
-				   RealD scale) :
+				   RealD scale,const ImplParams &p= ImplParams()) :
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      MobiusFermion(_Umu,
+      MobiusFermion<Impl>(_Umu,
-		    FiveDimGrid,
+			  FiveDimGrid,
-		    FiveDimRedBlackGrid,
+			  FiveDimRedBlackGrid,
-		    FourDimGrid,
+			  FourDimGrid,
-		    FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale)
+			  FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale,p)
 	{
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@@ -7,25 +7,28 @@ namespace Grid {
  namespace QCD {
-    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion
+    template<class Impl>
    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      // Constructors
-    OverlapWilsonCayleyZolotarevFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
 					GridCartesian         &FiveDimGrid,
 					GridRedBlackCartesian &FiveDimRedBlackGrid,
 					GridCartesian         &FourDimGrid,
 					GridRedBlackCartesian &FourDimRedBlackGrid,
 					RealD _mass,RealD _M5,
-					RealD lo, RealD hi) : 
+					RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      // b+c=1.0, b-c = 0 <=> b =c = 1/2
-      MobiusZolotarevFermion(_Umu,
+      MobiusZolotarevFermion<Impl>(_Umu,
-			     FiveDimGrid,
+				   FiveDimGrid,
-			     FiveDimRedBlackGrid,
+				   FiveDimRedBlackGrid,
-			     FourDimGrid,
+				   FourDimGrid,
-			     FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi)
+				   FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi,p)
      {}
--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -7,31 +7,34 @@ namespace Grid {
  namespace QCD {
-    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D
+    template<class Impl>
    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonContFracTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
 				     GridCartesian         &FiveDimGrid,
 				     GridRedBlackCartesian &FiveDimRedBlackGrid,
 				     GridCartesian         &FourDimGrid,
 				     GridRedBlackCartesian &FourDimRedBlackGrid,
 				     RealD _mass,RealD _M5,
-				     RealD scale) :
+				     RealD scale,const ImplParams &p= ImplParams()) :
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D(_Umu,
+      ContinuedFractionFermion5D<Impl>(_Umu,
-				 FiveDimGrid,
+				       FiveDimGrid,
-				 FiveDimRedBlackGrid,
+				       FiveDimRedBlackGrid,
-				 FourDimGrid,
+				       FourDimGrid,
-				 FourDimRedBlackGrid,_mass,_M5)
+				       FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls-1;// Even rational order
+	  int nrational=this->Ls-1;// Even rational order
 	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  SetCoefficientsTanh(zdata,scale);
+	  this->SetCoefficientsTanh(zdata,scale);
 	  Approx::zolotarev_free(zdata);
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -7,34 +7,36 @@ namespace Grid {
  namespace QCD {
-    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D
+    template<class Impl>
    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonContFracZolotarevFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
 					  GridCartesian         &FiveDimGrid,
 					  GridRedBlackCartesian &FiveDimRedBlackGrid,
 					  GridCartesian         &FourDimGrid,
 					  GridRedBlackCartesian &FourDimRedBlackGrid,
 					  RealD _mass,RealD _M5,
-					  RealD lo,RealD hi):
+					  RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D(_Umu,
+      ContinuedFractionFermion5D<Impl>(_Umu,
-				 FiveDimGrid,
+				       FiveDimGrid,
-				 FiveDimRedBlackGrid,
+				       FiveDimRedBlackGrid,
-				 FourDimGrid,
+				       FourDimGrid,
-				 FourDimRedBlackGrid,_mass,_M5)
+				       FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls;// Odd rational order
+	  int nrational=this->Ls;// Odd rational order
 	  RealD eps = lo/hi;
 	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  SetCoefficientsZolotarev(hi,zdata);
+	  this->SetCoefficientsZolotarev(hi,zdata);
 	  Approx::zolotarev_free(zdata);
 	}
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -7,31 +7,34 @@ namespace Grid {
  namespace QCD {
-    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D
+    template<class Impl>
    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonPartialFractionTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
 					    GridCartesian         &FiveDimGrid,
 					    GridRedBlackCartesian &FiveDimRedBlackGrid,
 					    GridCartesian         &FourDimGrid,
 					    GridRedBlackCartesian &FourDimRedBlackGrid,
 					    RealD _mass,RealD _M5,
-					    RealD scale) :
+					    RealD scale,const ImplParams &p= ImplParams()) :
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D(_Umu,
+      PartialFractionFermion5D<Impl>(_Umu,
-			       FiveDimGrid,
+				     FiveDimGrid,
-			       FiveDimRedBlackGrid,
+				     FiveDimRedBlackGrid,
-			       FourDimGrid,
+				     FourDimGrid,
-			       FourDimRedBlackGrid,_mass,_M5)
+				     FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls-1;// Even rational order
+	  int nrational=this->Ls-1;// Even rational order
 	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  SetCoefficientsTanh(zdata,scale);
+	  this->SetCoefficientsTanh(zdata,scale);
 	  Approx::zolotarev_free(zdata);
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -7,34 +7,36 @@ namespace Grid {
  namespace QCD {
-    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D
+    template<class Impl>
    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonPartialFractionZolotarevFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
-					  GridCartesian         &FiveDimGrid,
+						 GridCartesian         &FiveDimGrid,
-					  GridRedBlackCartesian &FiveDimRedBlackGrid,
+						 GridRedBlackCartesian &FiveDimRedBlackGrid,
-					  GridCartesian         &FourDimGrid,
+						 GridCartesian         &FourDimGrid,
-					  GridRedBlackCartesian &FourDimRedBlackGrid,
+						 GridRedBlackCartesian &FourDimRedBlackGrid,
-					  RealD _mass,RealD _M5,
+						 RealD _mass,RealD _M5,
-					  RealD lo,RealD hi):
+						 RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D(_Umu,
+      PartialFractionFermion5D<Impl>(_Umu,
-			       FiveDimGrid,
+				     FiveDimGrid,
-			       FiveDimRedBlackGrid,
+				     FiveDimRedBlackGrid,
-			       FourDimGrid,
+				     FourDimGrid,
-			       FourDimRedBlackGrid,_mass,_M5)
+				     FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls;// Odd rational order
+	  int nrational=this->Ls;// Odd rational order
 	  RealD eps = lo/hi;
 	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  SetCoefficientsZolotarev(hi,zdata);
+	  this->SetCoefficientsZolotarev(hi,zdata);
 	  Approx::zolotarev_free(zdata);
 	}
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -2,12 +2,15 @@
 namespace Grid {
  namespace QCD {
-    void  PartialFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+
    template<class Impl>
    void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
      // this does both dag and undag but is trivial; make a common helper routine
      int sign = 1;
      int Ls = this->Ls;
-      DhopDir(psi,chi,dir,disp);
+      this->DhopDir(psi,chi,dir,disp);
      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -18,15 +21,16 @@ namespace Grid {
      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
    }
-    void   PartialFractionFermion5D::Meooe_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
    void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
    {
-      // this does both dag and undag but is trivial; make a common helper routing
+      int Ls = this->Ls;
      int sign = dag ? (-1) : 1;
      if ( psi.checkerboard == Odd ) {
-	DhopEO(psi,chi,DaggerNo);
+	this->DhopEO(psi,chi,DaggerNo);
      } else {
-	DhopOE(psi,chi,DaggerNo);
+	this->DhopOE(psi,chi,DaggerNo);
      }
      int nblock=(Ls-1)/2;
@@ -38,10 +42,12 @@ namespace Grid {
      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
    }
-    void   PartialFractionFermion5D::Mooee_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
    void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
    {
      // again dag and undag are trivially related
      int sign = dag ? (-1) : 1;
      int Ls = this->Ls;
      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -69,11 +75,13 @@ namespace Grid {
      }
    }
-    void   PartialFractionFermion5D::MooeeInv_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
    void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
    {
      int sign = dag ? (-1) : 1;
      int Ls = this->Ls;
-      LatticeFermion tmp(psi._grid);
+      FermionField tmp(psi._grid);
      ///////////////////////////////////////////////////////////////////////////////////////
      //Linv
@@ -129,10 +137,12 @@ namespace Grid {
      axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
    }
-    void   PartialFractionFermion5D::M_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
    void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
    {
-      LatticeFermion D(psi._grid);
+      FermionField D(psi._grid);
      int Ls = this->Ls;
      int sign = dag ? (-1) : 1;
      // For partial frac Hw case (b5=c5=1) chroma quirkily computes
@@ -186,7 +196,7 @@ namespace Grid {
      //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
      //
-      DW(psi,D,DaggerNo); 
+      this->DW(psi,D,DaggerNo); 
      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -217,61 +227,127 @@ namespace Grid {
    }
-    RealD  PartialFractionFermion5D::M    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
    {
      M_internal(in,out,DaggerNo);
      return norm2(out);
    }
-    RealD  PartialFractionFermion5D::Mdag (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
    {
      M_internal(in,out,DaggerYes);
      return norm2(out);
    }
-    void PartialFractionFermion5D::Meooe       (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
    {
      Meooe_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MeooeDag    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
    {
      Meooe_internal(in,out,DaggerYes);
    }
-    void PartialFractionFermion5D::Mooee       (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
    {
      Mooee_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MooeeDag    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
    {
      Mooee_internal(in,out,DaggerYes);
    }
-    void PartialFractionFermion5D::MooeeInv    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
    {
      MooeeInv_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MooeeInvDag (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
    void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
    {
      MooeeInv_internal(in,out,DaggerYes);
    }
-    void  PartialFractionFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
+
  // force terms; five routines; default to Dhop on diagonal
    // Force term for the unpreconditioned operator.
    // Builds a scratch field from U one s-slice at a time with ag5xpby_ssp
    // (coefficient -scale on even slices, +scale on odd slices, and
    // p[(Ls-1)/2]*scale/amax on the last slice) and passes it with V to DhopDeriv.
    // The `dag` argument is not consulted: DaggerNo is always forwarded.
    template<class Impl>
    void PartialFractionFermion5D<Impl>::MDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
    {
      int Ls     = this->Ls;
      int npairs = (Ls-1)/2;   // number of (even,odd) slice pairs before the final slice
      int last   = Ls-1;

      FermionField scaled(V._grid);

      // Walk the slice pairs directly: s is the even member, s+1 the odd one.
      for(int s=0;s<2*npairs;s+=2){
        ag5xpby_ssp(scaled,-scale,U,0.0,U,s  ,s  );
        ag5xpby_ssp(scaled, scale,U,0.0,U,s+1,s+1);
      }
      ag5xpby_ssp(scaled,p[npairs]*scale/amax,U,0.0,U,last,last);

      this->DhopDeriv(mat,scaled,V,DaggerNo);
    }
    // Force term for the odd-even block.
    // Builds a scratch field from U one s-slice at a time with ag5xpby_ssp
    // (coefficient -scale on even slices, +scale on odd slices, and
    // p[(Ls-1)/2]*scale/amax on the last slice) and passes it with V to DhopDerivOE.
    // The `dag` argument is not consulted: DaggerNo is always forwarded.
    template<class Impl>
    void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
    {
      int Ls     = this->Ls;
      int npairs = (Ls-1)/2;   // number of (even,odd) slice pairs before the final slice
      int last   = Ls-1;

      FermionField scaled(V._grid);

      // Walk the slice pairs directly: s is the even member, s+1 the odd one.
      for(int s=0;s<2*npairs;s+=2){
        ag5xpby_ssp(scaled,-scale,U,0.0,U,s  ,s  );
        ag5xpby_ssp(scaled, scale,U,0.0,U,s+1,s+1);
      }
      ag5xpby_ssp(scaled,p[npairs]*scale/amax,U,0.0,U,last,last);

      this->DhopDerivOE(mat,scaled,V,DaggerNo);
    }
    // Force term for the even-odd block.
    // Builds a scratch field from U one s-slice at a time with ag5xpby_ssp
    // (coefficient -scale on even slices, +scale on odd slices, and
    // p[(Ls-1)/2]*scale/amax on the last slice) and passes it with V to DhopDerivEO.
    // The `dag` argument is not consulted: DaggerNo is always forwarded.
    template<class Impl>
    void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
    {
      int Ls     = this->Ls;
      int npairs = (Ls-1)/2;   // number of (even,odd) slice pairs before the final slice
      int last   = Ls-1;

      FermionField scaled(V._grid);

      // Walk the slice pairs directly: s is the even member, s+1 the odd one.
      for(int s=0;s<2*npairs;s+=2){
        ag5xpby_ssp(scaled,-scale,U,0.0,U,s  ,s  );
        ag5xpby_ssp(scaled, scale,U,0.0,U,s+1,s+1);
      }
      ag5xpby_ssp(scaled,p[npairs]*scale/amax,U,0.0,U,last,last);

      this->DhopDerivEO(mat,scaled,V,DaggerNo);
    }
    // Tanh flavour of the coefficient setup: forwards to SetCoefficientsZolotarev,
    // passing 1.0/scale as its first (zolo_hi) argument and zdata unchanged.
    // NOTE(review): the parameter `scale` hides any other `scale` visible in this
    // scope (other member functions of this class use an unqualified `scale`), so
    // inside this function the name refers to the argument only.
    template<class Impl>
    void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
      SetCoefficientsZolotarev(1.0/scale,zdata);
    }
-    void  PartialFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
+    template<class Impl>
    void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
      // check on degree matching
-      //      std::cout << Ls << " Ls"<<std::endl;
+      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-      //      std::cout << zdata->n  << " - n"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-      //      std::cout << zdata->da << " -da "<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-      //      std::cout << zdata->db << " -db"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-      //      std::cout << zdata->dn << " -dn"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-      //      std::cout << zdata->dd << " -dd"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
      int Ls = this->Ls;
      assert(Ls == (2*zdata->da -1) );
      // Part frac
      //      RealD R;
      R=(1+mass)/(1-mass);
-      dw_diag = (4.0-M5);
+      dw_diag = (4.0-this->M5);
      //      std::vector<RealD> p; 
      //      std::vector<RealD> q;
@@ -291,18 +367,22 @@ namespace Grid {
    }
      // Constructors
-    PartialFractionFermion5D::PartialFractionFermion5D(LatticeGaugeField &_Umu,
+    template<class Impl>
-						       GridCartesian         &FiveDimGrid,
+    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
-						       GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FiveDimGrid,
-						       GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
-						       GridRedBlackCartesian &FourDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
-						       RealD _mass,RealD M5) :
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
-      WilsonFermion5D(_Umu,
+							     RealD _mass,RealD M5,
-		      FiveDimGrid, FiveDimRedBlackGrid,
+							     const ImplParams &p) :
-		      FourDimGrid, FourDimRedBlackGrid,M5),
+      WilsonFermion5D<Impl>(_Umu,
 			    FiveDimGrid, FiveDimRedBlackGrid,
 			    FourDimGrid, FourDimRedBlackGrid,M5,p),
      mass(_mass)
    {
      int Ls = this->Ls;
      assert((Ls&0x1)==1); // Odd Ls required
      int nrational=Ls-1;
@@ -321,6 +401,8 @@ namespace Grid {
    }
    FermOpTemplateInstantiate(PartialFractionFermion5D);
 }
 }
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.h
@@ -5,41 +5,48 @@ namespace Grid {
  namespace QCD {
-    class PartialFractionFermion5D : public WilsonFermion5D
+    template<class Impl>
    class PartialFractionFermion5D : public WilsonFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
      const int part_frac_chroma_convention=1;
-      void   Meooe_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
-      void   Mooee_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
-      void   MooeeInv_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
-      void   M_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void   M_internal(const FermionField &in, FermionField &out,int dag);
      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);
      // half checkerboard operations
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
      // force terms; five routines; default to Dhop on diagonal
      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void   Instantiatable(void) =0; // ensure no make-eee
      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
      // Constructors
-      PartialFractionFermion5D(LatticeGaugeField &_Umu,
+      PartialFractionFermion5D(GaugeField &_Umu,
-				    GridCartesian         &FiveDimGrid,
+			       GridCartesian         &FiveDimGrid,
-				    GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
-				    GridCartesian         &FourDimGrid,
+			       GridCartesian         &FourDimGrid,
-				    GridRedBlackCartesian &FourDimRedBlackGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
-				    RealD _mass,RealD M5);
+			       RealD _mass,RealD M5,const ImplParams &p= ImplParams());
    protected:
--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@@ -7,12 +7,14 @@ namespace Grid {
  namespace QCD {
-    class ScaledShamirFermion : public MobiusFermion
+    template<class Impl>
    class ScaledShamirFermion : public MobiusFermion<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
      // Constructors
-    ScaledShamirFermion(LatticeGaugeField &_Umu,
+    ScaledShamirFermion(GaugeField &_Umu,
 			GridCartesian         &FiveDimGrid,
 			GridRedBlackCartesian &FiveDimRedBlackGrid,
 			GridCartesian         &FourDimGrid,
@@ -21,7 +23,7 @@ namespace Grid {
 			RealD scale) :
      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
-      MobiusFermion(_Umu,
+      MobiusFermion<Impl>(_Umu,
 		    FiveDimGrid,
 		    FiveDimRedBlackGrid,
 		    FourDimGrid,
--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@@ -7,27 +7,29 @@ namespace Grid {
  namespace QCD {
-    class ShamirZolotarevFermion : public MobiusZolotarevFermion
+    template<class Impl>
    class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
      // Constructors
-    ShamirZolotarevFermion(LatticeGaugeField &_Umu,
+    ShamirZolotarevFermion(GaugeField &_Umu,
 			   GridCartesian         &FiveDimGrid,
 			   GridRedBlackCartesian &FiveDimRedBlackGrid,
 			   GridCartesian         &FourDimGrid,
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD _M5,
-			   RealD lo, RealD hi) : 
+			   RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      // b+c = 1; b-c = 1 => b=1, c=0
-      MobiusZolotarevFermion(_Umu,
+      MobiusZolotarevFermion<Impl>(_Umu,
-			     FiveDimGrid,
+				   FiveDimGrid,
-			     FiveDimRedBlackGrid,
+				   FiveDimRedBlackGrid,
-			     FourDimGrid,
+				   FourDimGrid,
-			     FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi)
+				   FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi,p)
      {}
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -4,6 +4,7 @@
 namespace Grid {
 namespace QCD {
  template<class SiteHalfSpinor,class SiteSpinor>
  class WilsonCompressor {
  public:
    int mu;
@@ -18,9 +19,13 @@ namespace QCD {
      mu=p;
    };
-    vHalfSpinColourVector operator () (const vSpinColourVector &in)
+    virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
      return spinproject(in);
    }
    SiteHalfSpinor spinproject(const SiteSpinor &in)
    {
-      vHalfSpinColourVector ret;
+      SiteHalfSpinor ret;
      int mudag=mu;
      if (dag) {
 	mudag=(mu+Nd)%(2*Nd);
@@ -57,5 +62,7 @@ namespace QCD {
      return ret;
    }
  };
 }} // namespace close
 #endif
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -3,179 +3,291 @@
 namespace Grid {
 namespace QCD {
-const std::vector<int> WilsonFermion::directions   ({0,1,2,3, 0, 1, 2, 3});
+  const std::vector<int> WilsonFermionStatic::directions   ({0,1,2,3, 0, 1, 2, 3});
-const std::vector<int> WilsonFermion::displacements({1,1,1,1,-1,-1,-1,-1});
+  const std::vector<int> WilsonFermionStatic::displacements({1,1,1,1,-1,-1,-1,-1});
  int WilsonFermionStatic::HandOptDslash;
-int WilsonFermion::HandOptDslash;
+  /////////////////////////////////
  // Constructor and gauge import
  /////////////////////////////////
-WilsonFermion::WilsonFermion(LatticeGaugeField &_Umu,
+  template<class Impl>
-			     GridCartesian         &Fgrid,
+  WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu,
-			     GridRedBlackCartesian &Hgrid, 
+				     GridCartesian         &Fgrid,
-			     RealD _mass) :
+				     GridRedBlackCartesian &Hgrid, 
-  _grid(&Fgrid),
+				     RealD _mass,const ImplParams &p) :
-  _cbgrid(&Hgrid),
+        Kernels(p),
-  Stencil    (&Fgrid,npoint,Even,directions,displacements),
+        _grid(&Fgrid),
-  StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
+	_cbgrid(&Hgrid),
-  StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
+	Stencil    (&Fgrid,npoint,Even,directions,displacements),
-  mass(_mass),
+	StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
-  Umu(&Fgrid),
+	StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
-  UmuEven(&Hgrid),
+	mass(_mass),
-  UmuOdd (&Hgrid)
+	Umu(&Fgrid),
-{
+	UmuEven(&Hgrid),
-  // Allocate the required comms buffer
+	UmuOdd (&Hgrid) 
-  comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
+  {
-  DoubleStore(Umu,_Umu);
+    // Allocate the required comms buffer
-  pickCheckerboard(Even,UmuEven,Umu);
+    comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
-  pickCheckerboard(Odd ,UmuOdd,Umu);
+    ImportGauge(_Umu);
 }
 void WilsonFermion::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
 {
  conformable(Uds._grid,GaugeGrid());
  conformable(Umu._grid,GaugeGrid());
  LatticeColourMatrix U(GaugeGrid());
  for(int mu=0;mu<Nd;mu++){
    U = PeekIndex<LorentzIndex>(Umu,mu);
    PokeIndex<LorentzIndex>(Uds,U,mu);
    U = adj(Cshift(U,mu,-1));
    PokeIndex<LorentzIndex>(Uds,U,mu+4);
  }
 }
-RealD WilsonFermion::M(const LatticeFermion &in, LatticeFermion &out)
+  template<class Impl>
-{
+  void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-  out.checkerboard=in.checkerboard;
+  {
-  Dhop(in,out,DaggerNo);
+    Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
-  return axpy_norm(out,4+mass,in,out);
+    pickCheckerboard(Even,UmuEven,Umu);
-}
+    pickCheckerboard(Odd ,UmuOdd,Umu);
 RealD WilsonFermion::Mdag(const LatticeFermion &in, LatticeFermion &out)
 {
  out.checkerboard=in.checkerboard;
  Dhop(in,out,DaggerYes);
  return axpy_norm(out,4+mass,in,out);
 }
 void WilsonFermion::Meooe(const LatticeFermion &in, LatticeFermion &out)
 {
  if ( in.checkerboard == Odd ) {
    DhopEO(in,out,DaggerNo);
  } else {
    DhopOE(in,out,DaggerNo);
  }
 }
 void WilsonFermion::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 {
  if ( in.checkerboard == Odd ) {
    DhopEO(in,out,DaggerYes);
  } else {
    DhopOE(in,out,DaggerYes);
  }
 }
 void WilsonFermion::Mooee(const LatticeFermion &in, LatticeFermion &out)
 {
  out.checkerboard = in.checkerboard;
  out = (4.0+mass)*in;
  return ;
 }
 void WilsonFermion::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
 {
  out.checkerboard = in.checkerboard;
  Mooee(in,out);
 }
 void WilsonFermion::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 {
  out.checkerboard = in.checkerboard;
  out = (1.0/(4.0+mass))*in;
  return ;
 }
 void WilsonFermion::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 {
  out.checkerboard = in.checkerboard;
  MooeeInv(in,out);
 }
 void WilsonFermion::Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp)
 {
  DhopDir(in,out,dir,disp);
 }
 void WilsonFermion::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp){
  WilsonCompressor compressor(DaggerNo);
  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
-  assert( (disp==1)||(disp==-1) );
+  /////////////////////////////
-
+  // Implement the interface
-  int skip = (disp==1) ? 0 : 1;
+  /////////////////////////////
-
+      
-  int dirdisp = dir+skip*4;
+  template<class Impl>
-
+  RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) 
-PARALLEL_FOR_LOOP
+  {
-  for(int sss=0;sss<in._grid->oSites();sss++){
+    out.checkerboard=in.checkerboard;
-    DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp);
+    Dhop(in,out,DaggerNo);
    return axpy_norm(out,4+mass,in,out);
  }
-};
+  template<class Impl>
  RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
  {
    // Daggered operator: run the hopping term with DaggerYes into `out`, then
    // combine it with (4+mass)*in through axpy_norm and return that call's result.
    const RealD diag = 4+mass;

    out.checkerboard = in.checkerboard;
    this->Dhop(in,out,DaggerYes);

    return axpy_norm(out,diag,in,out);
  }
-void WilsonFermion::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U,
+  template<class Impl>
-				const LatticeFermion &in, LatticeFermion &out,int dag)
+  void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
-{
+  {
-  assert((dag==DaggerNo) ||(dag==DaggerYes));
+    if ( in.checkerboard == Odd ) {
-  WilsonCompressor compressor(dag);
+      DhopEO(in,out,DaggerNo);
-  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+    } else {
-
+      DhopOE(in,out,DaggerNo);
-  if ( dag == DaggerYes ) {
+    }
-    if( HandOptDslash ) {
+  }
-PARALLEL_FOR_LOOP
+  template<class Impl>
-      for(int sss=0;sss<in._grid->oSites();sss++){
+  void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
-        DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
+  {
-      }
+    if ( in.checkerboard == Odd ) {
-    } else { 
+      DhopEO(in,out,DaggerYes);
-PARALLEL_FOR_LOOP
+    } else {
-      for(int sss=0;sss<in._grid->oSites();sss++){
+      DhopOE(in,out,DaggerYes);
        DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
      }
    }
  } else {
    if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
      }
    }
  }
 }
 void WilsonFermion::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag)
 {
  conformable(in._grid,_cbgrid);    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
-  assert(in.checkerboard==Even);
+  template<class Impl>
-  out.checkerboard = Odd;
+  void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
    out.checkerboard = in.checkerboard;
    out = (4.0+mass)*in;
  }
  // Daggered diagonal term: copies the checkerboard label from `in` to `out`
  // and then simply delegates to Mooee.
  template<class Impl>
  void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
  {
    out.checkerboard = in.checkerboard;
    this->Mooee(in,out);
  }
  // Inverse of the diagonal term: out = in / (4 + mass).
  // `out` takes the same checkerboard label as `in`.
  template<class Impl>
  void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
  {
    out.checkerboard = in.checkerboard;

    // Multiply by the reciprocal of the (4+mass) diagonal coefficient.
    out = (1.0/(4.0+mass))*in;
  }
  // Daggered inverse diagonal term: copies the checkerboard label from `in`
  // to `out` and then delegates to MooeeInv.
  template<class Impl>
  void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
  {
    out.checkerboard = in.checkerboard;
    this->MooeeInv(in,out);
  }
  ///////////////////////////////////
  // Internal
  ///////////////////////////////////
-  DhopInternal(StencilEven,UmuOdd,in,out,dag);
+  template<class Impl>
-}
+  void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
-void WilsonFermion::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag)
+					  DoubledGaugeField & U,
-{
+					  GaugeField &mat,
-  conformable(in._grid,_cbgrid);    // verifies half grid
+					  const FermionField &A,
-  conformable(in._grid,out._grid); // drops the cb check
+					  const FermionField &B,int dag) {
    assert((dag==DaggerNo) ||(dag==DaggerYes));
    Compressor compressor(dag);
    FermionField Btilde(B._grid);
    FermionField Atilde(B._grid);
    Atilde = A;
-  assert(in.checkerboard==Odd);
+    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
-  out.checkerboard = Even;
+    
    for(int mu=0;mu<Nd;mu++){
      ////////////////////////////////////////////////////////////////////////
      // Flip gamma (1+g)<->(1-g) if dag
      ////////////////////////////////////////////////////////////////////////
      int gamma = mu;
      if ( dag ) gamma+= Nd;
      ////////////////////////
      // Call the single hop
      ////////////////////////
 PARALLEL_FOR_LOOP
 	for(int sss=0;sss<B._grid->oSites();sss++){
 	  Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma);
 	}
      //////////////////////////////////////////////////
      // spin trace outer product
      //////////////////////////////////////////////////
      Impl::InsertForce4D(mat,Btilde,Atilde,mu);
-  DhopInternal(StencilOdd,UmuEven,in,out,dag);
+    }
-}
+  }
-void WilsonFermion::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+  
-{
+  template<class Impl>
-  conformable(in._grid,_grid); // verifies full grid
+  void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  conformable(in._grid,out._grid);
+  {
    conformable(U._grid,_grid);  
    conformable(U._grid,V._grid);
    conformable(U._grid,mat._grid);
    mat.checkerboard = U.checkerboard;
    DerivInternal(Stencil,Umu,mat,U,V,dag);
  }
  // Hopping-term derivative on a single checkerboard, producing an Odd result.
  //
  //   mat : output gauge-type field; its checkerboard is set to Odd
  //   U   : must live on the checkerboarded grid (_cbgrid) and be Odd
  //   V   : must be conformable with U and be Even
  //   dag : forwarded unchanged to DerivInternal
  //
  // Work is delegated to DerivInternal with StencilEven and UmuOdd.
  template<class Impl>
  void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat,
                                        const FermionField &U,
                                        const FermionField &V,
                                        int dag)
  {
    // Grid consistency: U on the half grid, V and mat on the same grid as U.
    conformable(U._grid,_cbgrid);
    conformable(U._grid,V._grid);
    conformable(U._grid,mat._grid);

    // Parity requirements on the two fermion fields.
    assert(V.checkerboard==Even);
    assert(U.checkerboard==Odd);

    mat.checkerboard = Odd;

    this->DerivInternal(StencilEven,UmuOdd,mat,U,V,dag);
  }
  // Hopping-term derivative on a single checkerboard, producing an Even result.
  //
  //   mat : output gauge-type field; its checkerboard is set to Even
  //   U   : must live on the checkerboarded grid (_cbgrid) and be Even
  //   V   : must be conformable with U and be Odd
  //   dag : forwarded unchanged to DerivInternal
  //
  // Work is delegated to DerivInternal with StencilOdd and UmuEven.
  template<class Impl>
  void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat,
                                        const FermionField &U,
                                        const FermionField &V,
                                        int dag)
  {
    // Grid consistency: U on the half grid, V and mat on the same grid as U.
    conformable(U._grid,_cbgrid);
    conformable(U._grid,V._grid);
    conformable(U._grid,mat._grid);

    // Parity requirements on the two fermion fields.
    assert(V.checkerboard==Odd);
    assert(U.checkerboard==Even);

    mat.checkerboard = Even;

    this->DerivInternal(StencilOdd,UmuEven,mat,U,V,dag);
  }
-  out.checkerboard = in.checkerboard;
+  template<class Impl>
  void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
  {
    // Hopping term on the full (non-checkerboarded) grid: `in` must live on
    // _grid and `out` on the same grid as `in`.  `out` takes over the
    // checkerboard label of `in`; the work is done by DhopInternal using the
    // full Stencil and the full gauge field copy Umu.
    conformable(in._grid,_grid);
    conformable(in._grid,out._grid);

    out.checkerboard = in.checkerboard;

    this->DhopInternal(Stencil,Umu,in,out,dag);
  }
  // Hopping term taking an Even input field to an Odd output field.
  // `in` must live on the checkerboarded grid (_cbgrid) and `out` on the same
  // grid as `in`; the work is done by DhopInternal with StencilEven and UmuOdd.
  template<class Impl>
  void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
  {
    conformable(in._grid,_cbgrid);     // input is on the half grid
    conformable(in._grid,out._grid);   // grids only; checkerboards are handled below

    assert(in.checkerboard==Even);
    out.checkerboard = Odd;

    this->DhopInternal(StencilEven,UmuOdd,in,out,dag);
  }
  // Hopping term taking an Odd input field to an Even output field.
  // `in` must live on the checkerboarded grid (_cbgrid) and `out` on the same
  // grid as `in`; the work is done by DhopInternal with StencilOdd and UmuEven.
  template<class Impl>
  void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
  {
    conformable(in._grid,_cbgrid);     // input is on the half grid
    conformable(in._grid,out._grid);   // grids only; checkerboards are handled below

    assert(in.checkerboard==Odd);
    out.checkerboard = Even;

    this->DhopInternal(StencilOdd,UmuEven,in,out,dag);
  }
  // Single direction/displacement piece of the operator: a thin forwarder
  // that passes all four arguments straight through to DhopDir.
  template<class Impl>
  void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out,int dir,int disp)
  {
    this->DhopDir(in,out,dir,disp);
  }
  // Hopping term restricted to one direction and displacement.
  //
  // (dir,disp) is folded into a single index: `dir` itself when disp==1, and
  // dir+4 for any other value of disp.  That index is passed to DhopDirDisp
  // as both the dirdisp and the gamma argument, always with DaggerNo.
  template<class Impl>
  void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir,int disp)
  {
    const int dirdisp = (disp==1) ? dir : dir+4;

    this->DhopDirDisp(in,out,dirdisp,dirdisp,DaggerNo);
  };
  // Applies the single-hop kernel DiracOptDhopDir at every outer site of the
  // input grid, using the full Stencil and the full gauge field copy Umu.
  //
  //   dirdisp, gamma : forwarded unchanged to the kernel
  //   dag            : only used to construct the Compressor for the halo exchange
  template<class Impl>
  void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
                                        int dirdisp,int gamma,int dag)
  {
    // Exchange halo data into comm_buf before any site is processed.
    Compressor compressor(dag);
    Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);

 PARALLEL_FOR_LOOP
    for(int site=0;site<in._grid->oSites();site++){
      // The same site index is passed twice to the kernel.
      Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,site,site,in,out,dirdisp,gamma);
    }
  };
-  DhopInternal(Stencil,Umu,in,out,dag);
+
-}
+  template<class Impl>
  void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,
                                         DoubledGaugeField & U,
                                         const FermionField &in,
                                         FermionField &out,
                                         int dag)
  {
    // Shared driver for Dhop / DhopOE / DhopEO.
    //
    // After a halo exchange on stencil `st`, one of four site kernels is run
    // over every outer site of in's grid.  The kernel is chosen by
    //   - dag           : plain or "Dag" kernel
    //   - HandOptDslash : "Hand" kernel or the generic one
    // The selection is made outside the site loops, so each loop body is a
    // single kernel call with the same site index passed twice.
    assert((dag==DaggerNo) ||(dag==DaggerYes));

    Compressor compressor(dag);
    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);

    if ( dag != DaggerYes ) {

      // Non-daggered kernels
      if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
        for(int site=0;site<in._grid->oSites();site++){
          Kernels::DiracOptHandDhopSite(st,U,comm_buf,site,site,in,out);
        }
      } else {
 PARALLEL_FOR_LOOP
        for(int site=0;site<in._grid->oSites();site++){
          Kernels::DiracOptDhopSite(st,U,comm_buf,site,site,in,out);
        }
      }

    } else {

      // Daggered kernels
      if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
        for(int site=0;site<in._grid->oSites();site++){
          Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,site,site,in,out);
        }
      } else {
 PARALLEL_FOR_LOOP
        for(int site=0;site<in._grid->oSites();site++){
          Kernels::DiracOptDhopSiteDag(st,U,comm_buf,site,site,in,out);
        }
      }

    }
  };
  FermOpTemplateInstantiate(WilsonFermion);
 }}
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -5,9 +5,21 @@ namespace Grid {
  namespace QCD {
-    class WilsonFermion : public FermionOperator<LatticeFermion,LatticeGaugeField>
+    class WilsonFermionStatic {
    public:
      static int HandOptDslash; // these are a temporary hack
      static int MortonOrder;
      static const std::vector<int> directions   ;
      static const std::vector<int> displacements;
      static const int npoint=8;
    };
    template<class Impl>
    class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
    {
    public:
    INHERIT_IMPL_TYPES(Impl);
    typedef WilsonKernels<Impl> Kernels;
      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
@@ -17,75 +29,102 @@ namespace Grid {
      GridBase *FermionGrid(void)            { return _grid;}
      GridBase *FermionRedBlackGrid(void)    { return _cbgrid;}
-      // override multiply
+      //////////////////////////////////////////////////////////////////
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
+      // override multiply; cut number routines if pass dagger argument
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      // and also make interface more uniformly consistent
      //////////////////////////////////////////////////////////////////
      RealD M(const FermionField &in, FermionField &out);
      RealD Mdag(const FermionField &in, FermionField &out);
-      // half checkerboard operaions
+      /////////////////////////////////////////////////////////
-      void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
+      // half checkerboard operations
-      void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
+      // could remain virtual so we  can derive Clover from Wilson base
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out); // remain virtual so we 
+      /////////////////////////////////////////////////////////
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out); // can derive Clover
+      void Meooe(const FermionField &in, FermionField &out) ;
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out); // from Wilson base
+      void MeooeDag(const FermionField &in, FermionField &out) ;
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      void Mooee(const FermionField &in, FermionField &out) ;
      void MooeeDag(const FermionField &in, FermionField &out) ;
      void MooeeInv(const FermionField &in, FermionField &out) ;
      void MooeeInvDag(const FermionField &in, FermionField &out) ;
      ////////////////////////
      // Derivative interface
      ////////////////////////
      // Interface calls an internal routine
      void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      ///////////////////////////////////////////////////////////////
      // non-hermitian hopping term; half cb or both
-      void Dhop  (const LatticeFermion &in, LatticeFermion &out,int dag);
+      ///////////////////////////////////////////////////////////////
-      void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void Dhop(const FermionField &in, FermionField &out,int dag) ;
-      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopOE(const FermionField &in, FermionField &out,int dag) ;
      void DhopEO(const FermionField &in, FermionField &out,int dag) ;
-      // Multigrid assistance
+      ///////////////////////////////////////////////////////////////
-      void   Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      // Multigrid assistance; force term uses too
-      void DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      ///////////////////////////////////////////////////////////////
      void Mdir (const FermionField &in, FermionField &out,int dir,int disp) ;
      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
      void DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) ;
      ///////////////////////////////////////////////////////////////
      // Extra methods added by derived
      ///////////////////////////////////////////////////////////////
-      void DhopInternal(CartesianStencil & st,
+      void DerivInternal(CartesianStencil & st,
-			LatticeDoubledGaugeField &U,
+			 DoubledGaugeField & U,
-			const LatticeFermion &in, 
+			 GaugeField &mat,
-			LatticeFermion &out,
+			 const FermionField &A,
-			int dag);
+			 const FermionField &B,
 			 int dag);
      void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
 			const FermionField &in, FermionField &out,int dag) ;
      // Constructor
-      WilsonFermion(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid,RealD _mass);
+      WilsonFermion(GaugeField &_Umu,
 		    GridCartesian         &Fgrid,
 		    GridRedBlackCartesian &Hgrid, 
 		    RealD _mass,
 		    const ImplParams &p= ImplParams()
 		    ) ;
-      // DoubleStore
+      // DoubleStore impl dependent
-      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
+      void ImportGauge(const GaugeField &_Umu);
      ///////////////////////////////////////////////////////////////
      // Data members required to support the functionality
      ///////////////////////////////////////////////////////////////
      static int HandOptDslash; // these are a temporary hack
      static int MortonOrder;
-    protected:
+      //    protected:
    public:
      RealD                        mass;
      GridBase                     *    _grid; 
      GridBase                     *  _cbgrid;
      static const int npoint=8;
      static const std::vector<int> directions   ;
      static const std::vector<int> displacements;
      //Defines the stencils for even and odd
      CartesianStencil Stencil; 
      CartesianStencil StencilEven; 
      CartesianStencil StencilOdd; 
      // Copy of the gauge field , with even and odd subsets
-      LatticeDoubledGaugeField Umu;
+      DoubledGaugeField Umu;
-      LatticeDoubledGaugeField UmuEven;
+      DoubledGaugeField UmuEven;
-      LatticeDoubledGaugeField UmuOdd;
+      DoubledGaugeField UmuOdd;
      // Comms buffer
-      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;
+      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    };
    // Convenience aliases for the two implementation policies WilsonImplF and WilsonImplD.
    using WilsonFermionF = WilsonFermion<WilsonImplF>;
    using WilsonFermionD = WilsonFermion<WilsonImplD>;
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -3,19 +3,20 @@
 namespace Grid {
 namespace QCD {
-  // S-direction is INNERMOST and takes no part in the parity.
+// S-direction is INNERMOST and takes no part in the parity.
-  const std::vector<int> WilsonFermion5D::directions   ({1,2,3,4, 1, 2, 3, 4});
+const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
-  const std::vector<int> WilsonFermion5D::displacements({1,1,1,1,-1,-1,-1,-1});
+const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
-
+int WilsonFermion5DStatic::HandOptDslash;
  int WilsonFermion5D::HandOptDslash;
  // 5d lattice for DWF.
-  WilsonFermion5D::WilsonFermion5D(LatticeGaugeField &_Umu,
+template<class Impl>
-					   GridCartesian         &FiveDimGrid,
+WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
-					   GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridCartesian         &FiveDimGrid,
-					   GridCartesian         &FourDimGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
-					   GridRedBlackCartesian &FourDimRedBlackGrid,
+				       GridCartesian         &FourDimGrid,
-					   RealD _M5) :
+				       GridRedBlackCartesian &FourDimRedBlackGrid,
 				       RealD _M5,const ImplParams &p) :
  Kernels(p),
  _FiveDimGrid(&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid(&FourDimGrid),
@@ -65,33 +66,26 @@ namespace QCD {
  // Allocate the required comms buffer
  comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
-  
+
-  DoubleStore(Umu,_Umu);
+  ImportGauge(_Umu);
 }  
 // Imports a gauge field into the operator's internal copies.
 //
 // Impl::DoubleStore fills Umu from _Umu on the gauge grid; UmuEven and UmuOdd
 // are then picked out of Umu as its Even and Odd checkerboards.
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
   Impl::DoubleStore(this->GaugeGrid(), this->Umu, _Umu);

   pickCheckerboard(Even, this->UmuEven, this->Umu);
   pickCheckerboard(Odd , this->UmuOdd , this->Umu);
 }
-void WilsonFermion5D::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
+template<class Impl>
-{
+void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
  assert(GaugeGrid()->_ndimension==4);
  conformable(Uds._grid,GaugeGrid());
  conformable(Umu._grid,GaugeGrid());
  LatticeColourMatrix U(GaugeGrid());
  for(int mu=0;mu<Nd;mu++){
    U = PeekIndex<LorentzIndex>(Umu,mu);
    PokeIndex<LorentzIndex>(Uds,U,mu);
    U = adj(Cshift(U,mu,-1));
    PokeIndex<LorentzIndex>(Uds,U,mu+4);
  }
 }
 void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir5,int disp)
 {
  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
                    // we drop off the innermost fifth dimension
  //  assert( (disp==1)||(disp==-1) );
  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
-  WilsonCompressor compressor(DaggerNo);
+  Compressor compressor(DaggerNo);
-  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+  Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
  int skip = (disp==1) ? 0 : 1;
@@ -100,25 +94,133 @@ void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int
  assert(dirdisp<=7);
  assert(dirdisp>=0);
-//PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
-      DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp);
+      Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,dirdisp);
    }
  }
 };
-void WilsonFermion5D::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
+template<class Impl>
-				   LatticeDoubledGaugeField & U,
+void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
-			   const LatticeFermion &in, LatticeFermion &out,int dag)
+					  DoubledGaugeField & U,
 					  GaugeField &mat,
 					  const FermionField &A,
 					  const FermionField &B,
 					  int dag)
 {
  // Shared worker for DhopDeriv / DhopDerivEO / DhopDerivOE.
  // For each direction mu it applies the single-direction hop to B (result in Btilde)
  // and hands (Btilde, A, mu) to Impl::InsertForce5D, which writes into mat.
  //   st  : stencil used for the halo exchange and neighbour lookup
  //   U   : doubled gauge field matching st
  //   dag : must be DaggerNo or DaggerYes (asserted below)
  assert((dag==DaggerNo) ||(dag==DaggerYes));
  conformable(st._grid,A._grid);
  conformable(st._grid,B._grid);
  Compressor compressor(dag);
  FermionField Btilde(B._grid);
  FermionField Atilde(B._grid);
  // One halo exchange of B serves all Nd directions handled in the loop below.
  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
  Atilde=A;
  for(int mu=0;mu<Nd;mu++){
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma if dag
    // (gamma selects the spin projector inside DiracOptDhopDir; mu stays the stencil leg)
    // NOTE(review): "if ( dag )" relies on DaggerNo being zero -- confirm.
    ////////////////////////////////////////////////////////////////////////
    int gamma = mu;
    if ( dag ) gamma+= Nd;
    ////////////////////////
    // Call the single hop
    // Outer loop over the sites of U's grid, inner loop over the Ls slices;
    // the fermion site index is sF = s + Ls*sU.
    ////////////////////////
 PARALLEL_FOR_LOOP
    for(int sss=0;sss<U._grid->oSites();sss++){
      for(int s=0;s<Ls;s++){
 	int sU=sss;
 	int sF = s+Ls*sU;
 	assert ( sF< B._grid->oSites());
 	assert ( sU< U._grid->oSites());
 	Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
    ////////////////////////////
    // spin trace outer product
    // NOTE(review): nothing is done here inside the site loops; presumably the outer
    // product is formed by Impl::InsertForce5D after the loops -- confirm.
    ////////////////////////////
      }
    }
    Impl::InsertForce5D(mat,Btilde,Atilde,mu);
  }
 }
 // Hopping-term derivative on the full (un-checkerboarded) lattice.
 // Checks that A lives on the fermion grid, that A and B share a grid and that mat lives
 // on the gauge grid, copies A's checkerboard tag onto mat, then delegates to
 // DerivInternal with the full-lattice stencil and doubled gauge field.
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
                                       const FermionField &A,
                                       const FermionField &B,
                                       int dag)
 {
  // Grid layout checks
  conformable(A._grid,this->FermionGrid());
  conformable(A._grid,B._grid);
  conformable(this->GaugeGrid(),mat._grid);

  // Output carries the same checkerboard tag as A
  mat.checkerboard = A.checkerboard;

  this->DerivInternal(this->Stencil,this->Umu,mat,A,B,dag);
 }
 // Checkerboarded hopping-term derivative: A must be Even, B must be Odd.
 // All fields are checked against the red-black grids; mat is tagged Even and the work
 // is delegated to DerivInternal using StencilOdd together with UmuEven.
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
                                         const FermionField &A,
                                         const FermionField &B,
                                         int dag)
 {
  // Half-grid layout checks
  conformable(A._grid,this->FermionRedBlackGrid());
  conformable(this->GaugeRedBlackGrid(),mat._grid);
  conformable(A._grid,B._grid);

  // Parity contract of the two inputs
  assert(B.checkerboard==Odd);
  assert(A.checkerboard==Even);

  mat.checkerboard = Even;

  this->DerivInternal(this->StencilOdd,this->UmuEven,mat,A,B,dag);
 }
 // Checkerboarded hopping-term derivative: A must be Odd, B must be Even.
 // All fields are checked against the red-black grids; mat is tagged Odd and the work
 // is delegated to DerivInternal using StencilEven together with UmuOdd.
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
                                         const FermionField &A,
                                         const FermionField &B,
                                         int dag)
 {
  // Half-grid layout checks
  conformable(A._grid,this->FermionRedBlackGrid());
  conformable(this->GaugeRedBlackGrid(),mat._grid);
  conformable(A._grid,B._grid);

  // Parity contract of the two inputs
  assert(B.checkerboard==Even);
  assert(A.checkerboard==Odd);

  mat.checkerboard = Odd;

  this->DerivInternal(this->StencilEven,this->UmuOdd,mat,A,B,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  WilsonCompressor compressor(dag);
+  Compressor compressor(dag);
-  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Note loop ordering and data layout.
@@ -126,13 +228,13 @@ void WilsonFermion5D::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
  if ( dag == DaggerYes ) {
-    if( HandOptDslash ) {
+    if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  int sU=ss;
 	  int sF = s+Ls*sU;
-	  DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
+	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
@@ -143,20 +245,20 @@ PARALLEL_FOR_LOOP
 	  for(sd=0;sd<Ls;sd++){
 	    int sU=ss;
 	    int sF = sd+Ls*sU;
-	    DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
+	    Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
-    if( HandOptDslash ) {
+    if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  //	  int sU=lo.Reorder(ss);
 	  int sU=ss;
 	  int sF = s+Ls*sU;
-	  DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
+	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }
@@ -167,13 +269,14 @@ PARALLEL_FOR_LOOP
 	  //	  int sU=lo.Reorder(ss);
 	  int sU=ss;
 	  int sF = s+Ls*sU; 
-	  DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
+	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
 }
-void WilsonFermion5D::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@@ -183,7 +286,8 @@ void WilsonFermion5D::DhopOE(const LatticeFermion &in, LatticeFermion &out,int d
  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 }
-void WilsonFermion5D::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@@ -193,7 +297,8 @@ void WilsonFermion5D::DhopEO(const LatticeFermion &in, LatticeFermion &out,int d
  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 }
-void WilsonFermion5D::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  conformable(in._grid,FermionGrid()); // verifies full grid
  conformable(in._grid,out._grid);
@@ -202,12 +307,16 @@ void WilsonFermion5D::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag
  DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 }
-void WilsonFermion5D::DW(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl>
 // Hopping term plus a site-diagonal term: Dhop(in) is written to out, then axpy
 // combines it with in using the coefficient (4.0-M5).
 // NOTE(review): presumably axpy(r,a,x,y) computes r = a*x + y -- confirm.
 void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
 {
  const double diag = 4.0-this->M5; // coefficient of the site-diagonal term

  out.checkerboard = in.checkerboard;
  this->Dhop(in,out,dag); // -0.5 is included
  axpy(out,diag,in,out);
 }
 FermOpTemplateInstantiate(WilsonFermion5D);
 }}
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -14,22 +14,24 @@ namespace Grid {
    // i.e. even even contains fifth dim hopping term.
    //
    // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
    ////////////////////////////
    //ContFrac:
    //  Ls always odd. Rational poly deg is either Ls or Ls-1
    //PartFrac 
    //  Ls always odd. Rational poly deg is either Ls or Ls-1
    //
    //Cayley: Ls always even, Rational poly deg is Ls
    // 
    // Just set nrational as Ls. Forget about Ls-1 cases.
    //
    // Require odd Ls for cont and part frac
    ////////////////////////////
    ////////////////////////////////////////////////////////////////////////////////
-    class WilsonFermion5D : public FermionOperator<LatticeFermion,LatticeGaugeField>
+
    // Non-template holder for the static data of WilsonFermion5D<Impl>, which lists this
    // class as a base so that every Impl instantiation refers to the same statics.
    class WilsonFermion5DStatic { 
    public:
      // S-direction is INNERMOST and takes no part in the parity.
      // Run-time switch, tested in DhopInternal, that selects the DiracOptHand* kernels.
      static int HandOptDslash; // these are a temporary hack
      // Stencil geometry; DhopDir's comments describe "directions" as the ordering
      // passed to the stencil.
      static const std::vector<int> directions;
      static const std::vector<int> displacements;
      // NOTE(review): presumably the number of stencil legs (4 dimensions x 2 signs);
      // it is a per-object const member here, not a static -- confirm that is intended.
      const int npoint = 8;
    };
    template<class Impl>
    class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
     typedef WilsonKernels<Impl> Kernels;
      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////
@@ -39,54 +41,65 @@ namespace Grid {
      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
      // full checkerboard operations; leave unimplemented as abstract for now
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out){assert(0); return 0.0;};
+      virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out){assert(0); return 0.0;};
+      virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
      // half checkerboard operations; leave unimplemented as abstract for now
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out){assert(0);};
+      virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out){assert(0);};
+      virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out){assert(0);};
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out){assert(0);};
+
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out){assert(0);};
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out){assert(0);};
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
      // These can be overridden by fancy 5d chiral action
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
      // Implement the non-hermitian hopping term; half cb or both
      // Implement s-diagonal DW
-      void DW    (const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DW    (const FermionField &in, FermionField &out,int dag);
-      void Dhop  (const LatticeFermion &in, LatticeFermion &out,int dag);
+      void Dhop  (const FermionField &in, FermionField &out,int dag);
-      void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopOE(const FermionField &in, FermionField &out,int dag);
-      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopEO(const FermionField &in, FermionField &out,int dag);
      // add a DhopComm
      // -- suboptimal interface will presently trigger multiple comms.
-      void DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
      ///////////////////////////////////////////////////////////////
      // New methods added 
      ///////////////////////////////////////////////////////////////
      void DerivInternal(CartesianStencil & st,
 			 DoubledGaugeField & U,
 			 GaugeField &mat,
 			 const FermionField &A,
 			 const FermionField &B,
 			 int dag);
      void DhopInternal(CartesianStencil & st,
 			LebesgueOrder &lo,
-			LatticeDoubledGaugeField &U,
+			DoubledGaugeField &U,
-			const LatticeFermion &in, 
+			const FermionField &in, 
-			LatticeFermion &out,
+			FermionField &out,
 			int dag);
      // Constructors
-      WilsonFermion5D(LatticeGaugeField &_Umu,
+      WilsonFermion5D(GaugeField &_Umu,
-			  GridCartesian         &FiveDimGrid,
+		      GridCartesian         &FiveDimGrid,
-			  GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
-			  GridCartesian         &FourDimGrid,
+		      GridCartesian         &FourDimGrid,
-			  GridRedBlackCartesian &FourDimRedBlackGrid,
+		      GridRedBlackCartesian &FourDimRedBlackGrid,
-			  double _M5);
+		      double _M5,const ImplParams &p= ImplParams());
      // Import the gauge field: double-store the links and split them by checkerboard
-      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
+      void ImportGauge(const GaugeField &_Umu);
      ///////////////////////////////////////////////////////////////
      // Data members required to support the functionality
      ///////////////////////////////////////////////////////////////
      static int HandOptDslash; // these are a temporary hack
    protected:
      // Add these to the support from Wilson
@@ -95,10 +108,6 @@ namespace Grid {
      GridBase *_FiveDimGrid;
      GridBase *_FiveDimRedBlackGrid;
      static const int npoint=8;
      static const std::vector<int> directions   ;
      static const std::vector<int> displacements;
      double                        M5;
      int Ls;
@@ -108,15 +117,15 @@ namespace Grid {
      CartesianStencil StencilOdd; 
      // Copy of the gauge field , with even and odd subsets
-      LatticeDoubledGaugeField Umu;
+      DoubledGaugeField Umu;
-      LatticeDoubledGaugeField UmuEven;
+      DoubledGaugeField UmuEven;
-      LatticeDoubledGaugeField UmuOdd;
+      DoubledGaugeField UmuOdd;
      LebesgueOrder Lebesgue;
      LebesgueOrder LebesgueEvenOdd;
      // Comms buffer
-      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;
+      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    };
  }
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -1,429 +1,374 @@
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
-void DiracOptDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+template<class Impl> 
-			std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
-			int sF,int sU,const LatticeFermion &in, LatticeFermion &out)
+						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,const FermionField &in, FermionField &out)
 {
-    vHalfSpinColourVector  tmp;    
+  SiteHalfSpinor  tmp;    
-    vHalfSpinColourVector  chi;    
+  SiteHalfSpinor  chi;    
-    vSpinColourVector result;
+  SiteHalfSpinor Uchi;
-    vHalfSpinColourVector Uchi;
+  SiteSpinor result;
-    int offset,local,perm, ptype;
+  StencilEntry *SE;
  int ptype;
-
+  // Xp
-    // Xp
+  SE=st.GetEntry(ptype,Xp,sF);
-    int ss = sF;
+  if ( SE->_is_local && SE->_permute ) {
-    offset = st._offsets [Xp][ss];
+    spProjXp(tmp,in._odata[SE->_offset]);
-    local  = st._is_local[Xp][ss];
+    permute(chi,tmp,ptype);
-    perm   = st._permute[Xp][ss];
+  } else if ( SE->_is_local ) {
-
+    spProjXp(chi,in._odata[SE->_offset]);
-    ptype  = st._permute_type[Xp];
+  } else { 
-    if ( local && perm ) {
+    chi=buf[SE->_offset];
-      spProjXp(tmp,in._odata[offset]);
+  }
-      permute(chi,tmp,ptype);
+  Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
-    } else if ( local ) {
+  spReconXp(result,Uchi);
      spProjXp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Xp),&chi());
    spReconXp(result,Uchi);
    // Yp
    offset = st._offsets [Yp][ss];
    local  = st._is_local[Yp][ss];
    perm   = st._permute[Yp][ss];
    ptype  = st._permute_type[Yp];
    if ( local && perm ) {
      spProjYp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjYp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Yp),&chi());
    accumReconYp(result,Uchi);
    // Zp
    offset = st._offsets [Zp][ss];
    local  = st._is_local[Zp][ss];
    perm   = st._permute[Zp][ss];
    ptype  = st._permute_type[Zp];
    if ( local && perm ) {
      spProjZp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Zp),&chi());
    accumReconZp(result,Uchi);
    // Tp
    offset = st._offsets [Tp][ss];
    local  = st._is_local[Tp][ss];
    perm   = st._permute[Tp][ss];
    ptype  = st._permute_type[Tp];
    if ( local && perm ) {
      spProjTp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Tp),&chi());
    accumReconTp(result,Uchi);
    // Xm
    offset = st._offsets [Xm][ss];
    local  = st._is_local[Xm][ss];
    perm   = st._permute[Xm][ss];
    ptype  = st._permute_type[Xm];
    if ( local && perm ) {
      spProjXm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjXm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Xm),&chi());
    accumReconXm(result,Uchi);
    // Ym
    offset = st._offsets [Ym][ss];
    local  = st._is_local[Ym][ss];
    perm   = st._permute[Ym][ss];
    ptype  = st._permute_type[Ym];
    if ( local && perm ) {
      spProjYm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjYm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Ym),&chi());
    accumReconYm(result,Uchi);
    // Zm
    offset = st._offsets [Zm][ss];
    local  = st._is_local[Zm][ss];
    perm   = st._permute[Zm][ss];
    ptype  = st._permute_type[Zm];
    if ( local && perm ) {
      spProjZm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Zm),&chi());
    accumReconZm(result,Uchi);
    // Tm
    offset = st._offsets [Tm][ss];
    local  = st._is_local[Tm][ss];
    perm   = st._permute[Tm][ss];
    ptype  = st._permute_type[Tm];
    if ( local && perm ) {
      spProjTm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Tm),&chi());
    accumReconTm(result,Uchi);
    vstream(out._odata[ss],result*(-0.5));
 }
 void DiracOptDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 			   std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 			   int sF,int sU,const LatticeFermion &in, LatticeFermion &out)
 {
    vHalfSpinColourVector  tmp;    
    vHalfSpinColourVector  chi;    
    vSpinColourVector result;
    vHalfSpinColourVector Uchi;
    int offset,local,perm, ptype;
    // Xp
    int ss=sF;
    offset = st._offsets [Xm][ss];
    local  = st._is_local[Xm][ss];
    perm   = st._permute[Xm][ss];
    ptype  = st._permute_type[Xm];
    if ( local && perm ) {
      spProjXp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjXp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Xm),&chi());
    spReconXp(result,Uchi);
    // Yp
    offset = st._offsets [Ym][ss];
    local  = st._is_local[Ym][ss];
    perm   = st._permute[Ym][ss];
    ptype  = st._permute_type[Ym];
    if ( local && perm ) {
      spProjYp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjYp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Ym),&chi());
    accumReconYp(result,Uchi);
    // Zp
    offset = st._offsets [Zm][ss];
    local  = st._is_local[Zm][ss];
    perm   = st._permute[Zm][ss];
    ptype  = st._permute_type[Zm];
    if ( local && perm ) {
      spProjZp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Zm),&chi());
    accumReconZp(result,Uchi);
    // Tp
    offset = st._offsets [Tm][ss];
    local  = st._is_local[Tm][ss];
    perm   = st._permute[Tm][ss];
    ptype  = st._permute_type[Tm];
    if ( local && perm ) {
      spProjTp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTp(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Tm),&chi());
    accumReconTp(result,Uchi);
    // Xm
    offset = st._offsets [Xp][ss];
    local  = st._is_local[Xp][ss];
    perm   = st._permute[Xp][ss];
    ptype  = st._permute_type[Xp];
    if ( local && perm ) 
    {
      spProjXm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjXm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Xp),&chi());
    accumReconXm(result,Uchi);
    // Ym
    offset = st._offsets [Yp][ss];
    local  = st._is_local[Yp][ss];
    perm   = st._permute[Yp][ss];
    ptype  = st._permute_type[Yp];
    if ( local && perm ) {
      spProjYm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjYm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Yp),&chi());
    accumReconYm(result,Uchi);
    // Zm
    offset = st._offsets [Zp][ss];
    local  = st._is_local[Zp][ss];
    perm   = st._permute[Zp][ss];
    ptype  = st._permute_type[Zp];
    if ( local && perm ) {
      spProjZm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjZm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Zp),&chi());
    accumReconZm(result,Uchi);
    // Tm
    offset = st._offsets [Tp][ss];
    local  = st._is_local[Tp][ss];
    perm   = st._permute[Tp][ss];
    ptype  = st._permute_type[Tp];
    if ( local && perm ) {
      spProjTm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
      spProjTm(chi,in._odata[offset]);
    } else { 
      chi=buf[offset];
    }
    mult(&Uchi(),&U._odata[sU](Tp),&chi());
    accumReconTm(result,Uchi);
    vstream(out._odata[ss],result*(-0.5));
 }
 void DiracOptDhopDir(CartesianStencil &st,LatticeDoubledGaugeField &U,
 			std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 		       int sF,int sU,const LatticeFermion &in, LatticeFermion &out,int dirdisp)
 {
    vHalfSpinColourVector  tmp;    
    vHalfSpinColourVector  chi;    
    vSpinColourVector result;
    vHalfSpinColourVector Uchi;
    int offset,local,perm, ptype;
    int ss=sF;
-    offset = st._offsets [dirdisp][ss];
+  // Yp
-    local  = st._is_local[dirdisp][ss];
+  SE=st.GetEntry(ptype,Yp,sF);
-    perm   = st._permute[dirdisp][ss];
+  if ( SE->_is_local && SE->_permute ) {
-    ptype  = st._permute_type[dirdisp];
+    spProjYp(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjYp(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
  accumReconYp(result,Uchi);
-    // Xp
+  // Zp
-    if(dirdisp==Xp){
+  SE=st.GetEntry(ptype,Zp,sF);
-      if ( local && perm ) {
+  if (  SE->_is_local && SE->_permute ) {
-	spProjXp(tmp,in._odata[offset]);
+    spProjZp(tmp,in._odata[SE->_offset]);
-	permute(chi,tmp,ptype);
+    permute(chi,tmp,ptype);
-      } else if ( local ) {
+  } else if ( SE->_is_local ) {
-	spProjXp(chi,in._odata[offset]);
+    spProjZp(chi,in._odata[SE->_offset]);
-      } else { 
+  } else { 
-	chi=buf[offset];
+    chi=buf[SE->_offset];
-      }
+  }
-      mult(&Uchi(),&U._odata[sU](Xp),&chi());
+  Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
-      spReconXp(result,Uchi);
+  accumReconZp(result,Uchi);
    }
-    // Yp
+  // Tp
-    if ( dirdisp==Yp ){
+  SE=st.GetEntry(ptype,Tp,sF);
-      if ( local && perm ) {
+  if (  SE->_is_local && SE->_permute ) {
-	spProjYp(tmp,in._odata[offset]);
+    spProjTp(tmp,in._odata[SE->_offset]);
-	permute(chi,tmp,ptype);
+    permute(chi,tmp,ptype);
-      } else if ( local ) {
+  } else if ( SE->_is_local ) {
-	spProjYp(chi,in._odata[offset]);
+    spProjTp(chi,in._odata[SE->_offset]);
-      } else { 
+  } else { 
-	chi=buf[offset];
+    chi=buf[SE->_offset];
-      }
+  }
-      mult(&Uchi(),&U._odata[sU](Yp),&chi());
+  Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
-      spReconYp(result,Uchi);
+  accumReconTp(result,Uchi);
    }
-    // Zp
+  // Xm
-    if ( dirdisp ==Zp ){
+  SE=st.GetEntry(ptype,Xm,sF);
-      if ( local && perm ) {
+  if (  SE->_is_local && SE->_permute ) {
-	spProjZp(tmp,in._odata[offset]);
+    spProjXm(tmp,in._odata[SE->_offset]);
-	permute(chi,tmp,ptype);
+    permute(chi,tmp,ptype);
-      } else if ( local ) {
+  } else if ( SE->_is_local ) {
-	spProjZp(chi,in._odata[offset]);
+    spProjXm(chi,in._odata[SE->_offset]);
-      } else { 
+  } else { 
-	chi=buf[offset];
+    chi=buf[SE->_offset];
-      }
+  }
-      mult(&Uchi(),&U._odata[sU](Zp),&chi());
+  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
-      spReconZp(result,Uchi);
+  accumReconXm(result,Uchi);
-    }
+  
  // Ym
  SE=st.GetEntry(ptype,Ym,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjYm(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjYm(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
  accumReconYm(result,Uchi);
  // Zm
  SE=st.GetEntry(ptype,Zm,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjZm(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjZm(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
  accumReconZm(result,Uchi);
-    // Tp
+  // Tm
-    if ( dirdisp ==Tp ){
+  SE=st.GetEntry(ptype,Tm,sF);
-      if ( local && perm ) {
+  if (  SE->_is_local && SE->_permute ) {
-	spProjTp(tmp,in._odata[offset]);
+    spProjTm(tmp,in._odata[SE->_offset]);
-	permute(chi,tmp,ptype);
+    permute(chi,tmp,ptype);
-      } else if ( local ) {
+  } else if ( SE->_is_local ) {
-	spProjTp(chi,in._odata[offset]);
+    spProjTm(chi,in._odata[SE->_offset]);
-      } else { 
+  } else { 
-	chi=buf[offset];
+    chi=buf[SE->_offset];
-      }
+  }
-      mult(&Uchi(),&U._odata[sU](Tp),&chi());
+  Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
-      spReconTp(result,Uchi);
+  accumReconTm(result,Uchi);
    }
-    // Xm
+  vstream(out._odata[sF],result*(-0.5));
-    if ( dirdisp==Xm ){
+};
      if ( local && perm ) {
 	spProjXm(tmp,in._odata[offset]);
 	permute(chi,tmp,ptype);
      } else if ( local ) {
 	spProjXm(chi,in._odata[offset]);
      } else { 
 	chi=buf[offset];
      }
      mult(&Uchi(),&U._odata[sU](Xm),&chi());
      spReconXm(result,Uchi);
    }
-    // Ym
+template<class Impl> 
-    if ( dirdisp == Ym ){
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
-      if ( local && perm ) {
+					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-	spProjYm(tmp,in._odata[offset]);
+					      int sF,int sU,const FermionField &in, FermionField &out)
-	permute(chi,tmp,ptype);
+{
-      } else if ( local ) {
+  SiteHalfSpinor  tmp;    
-	spProjYm(chi,in._odata[offset]);
+  SiteHalfSpinor  chi;    
-      } else { 
+  SiteSpinor result;
-	chi=buf[offset];
+  SiteHalfSpinor Uchi;
-      }
+  StencilEntry *SE;
-      mult(&Uchi(),&U._odata[sU](Ym),&chi());
+  int ptype;
      spReconYm(result,Uchi);
    }
-    // Zm
+  // Xp
-    if ( dirdisp == Zm ){
+  SE=st.GetEntry(ptype,Xm,sF);
-      if ( local && perm ) {
+  if (  SE->_is_local && SE->_permute ) {
-	spProjZm(tmp,in._odata[offset]);
+    spProjXp(tmp,in._odata[SE->_offset]);
-	permute(chi,tmp,ptype);
+    permute(chi,tmp,ptype);
-      } else if ( local ) {
+  } else if ( SE->_is_local ) {
-	spProjZm(chi,in._odata[offset]);
+    spProjXp(chi,in._odata[SE->_offset]);
-      } else { 
+  } else { 
-	chi=buf[offset];
+    chi=buf[SE->_offset];
-      }
+  }
-      mult(&Uchi(),&U._odata[sU](Zm),&chi());
+  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
-      spReconZm(result,Uchi);
+  spReconXp(result,Uchi);
    }
-    // Tm
+  // Yp
-    if ( dirdisp==Tm ) {
+  SE=st.GetEntry(ptype,Ym,sF);
-      if ( local && perm ) {
+  if (  SE->_is_local && SE->_permute ) {
-	spProjTm(tmp,in._odata[offset]);
+    spProjYp(tmp,in._odata[SE->_offset]);
-	permute(chi,tmp,ptype);
+    permute(chi,tmp,ptype);
-      } else if ( local ) {
+  } else if ( SE->_is_local ) {
-	spProjTm(chi,in._odata[offset]);
+    spProjYp(chi,in._odata[SE->_offset]);
-      } else { 
+  } else { 
-	chi=buf[offset];
+    chi=buf[SE->_offset];
-      }
+  }
-      mult(&Uchi(),&U._odata[sU](Tm),&chi());
+  Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
-      spReconTm(result,Uchi);
+  accumReconYp(result,Uchi);
-    }
+  
  // Zp
  SE=st.GetEntry(ptype,Zm,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjZp(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjZp(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
  accumReconZp(result,Uchi);
  // Tp
  SE=st.GetEntry(ptype,Tm,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjTp(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjTp(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
  accumReconTp(result,Uchi);
  // Xm
  SE=st.GetEntry(ptype,Xp,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjXm(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjXm(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
  accumReconXm(result,Uchi);
-    vstream(out._odata[ss],result*(-0.5));
+  // Ym
  SE=st.GetEntry(ptype,Yp,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjYm(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjYm(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
  accumReconYm(result,Uchi);
  // Zm
  SE=st.GetEntry(ptype,Zp,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjZm(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjZm(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
  accumReconZm(result,Uchi);
  // Tm
  SE=st.GetEntry(ptype,Tp,sF);
  if (  SE->_is_local && SE->_permute ) {
    spProjTm(tmp,in._odata[SE->_offset]);
    permute(chi,tmp,ptype);
  } else if ( SE->_is_local ) {
    spProjTm(chi,in._odata[SE->_offset]);
  } else { 
    chi=buf[SE->_offset];
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
  accumReconTm(result,Uchi);
  vstream(out._odata[sF],result*(-0.5));
 }
 // Apply ONE leg of the Wilson hopping stencil at a single site.
 //
 //   st    : stencil supplying the neighbour entry for (dir, sF)
 //   U     : doubled gauge field, read at gauge site sU
 //   buf   : comms buffer, read when the neighbour entry is not local
 //   sF,sU : fermion / gauge site indices; the result is stored to out._odata[sF]
 //   in    : source field;  out : destination field
 //   dir   : stencil leg used for the neighbour lookup and for the link multiply
 //   gamma : selects the spin projection / reconstruction pair (Xp..Tm).  It is separate
 //           from dir so that a caller may pair a leg with a different projector
 //           (DerivInternal passes gamma = mu + Nd when dag is set).
 //
 // The stored value is result*(-0.5).
 //
 // gamma must be one of the eight direction codes.  The cases are mutually exclusive, so
 // they are tested as an else-if chain; any other value used to fall through every test
 // and stream a never-assigned `result` into `out`.  It now trips an assert and, in
 // builds where asserts are disabled, returns without touching `out`.
 template<class Impl> 
 void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
 					  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					  int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
 {
  SiteHalfSpinor  tmp;    
  SiteHalfSpinor  chi;    
  SiteSpinor   result;
  SiteHalfSpinor Uchi;
  StencilEntry *SE;
  int ptype;

  // The neighbour entry depends only on the stencil leg, not on the projector.
  SE=st.GetEntry(ptype,dir,sF);

  // Every case below follows the same three steps:
  //   1. obtain the half spinor chi: project the local neighbour (permuting if the entry
  //      asks for it), or read it from the comms buffer when the neighbour is not local
  //      (NOTE(review): buffer entries are presumably already projected by the
  //      compressor used in the halo exchange -- confirm);
  //   2. multiply by the gauge link for `dir`;
  //   3. reconstruct the full spinor into `result`.

  // Xp
  if ( gamma==Xp ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjXp(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconXp(result,Uchi);
  }
  // Yp
  else if ( gamma==Yp ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjYp(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconYp(result,Uchi);
  }
  // Zp
  else if ( gamma==Zp ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjZp(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconZp(result,Uchi);
  }
  // Tp
  else if ( gamma==Tp ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjTp(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconTp(result,Uchi);
  }
  // Xm
  else if ( gamma==Xm ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjXm(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconXm(result,Uchi);
  }
  // Ym
  else if ( gamma==Ym ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjYm(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconYm(result,Uchi);
  }
  // Zm
  else if ( gamma==Zm ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjZm(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconZm(result,Uchi);
  }
  // Tm
  else if ( gamma==Tm ) {
    if (  SE->_is_local && SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else if ( SE->_is_local ) {
      spProjTm(chi,in._odata[SE->_offset]);
    } else { 
      chi=buf[SE->_offset];
    }
    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconTm(result,Uchi);
  }
  // Not a direction code: `result` was never assigned, so do not stream it out.
  else {
    assert(0);
    return;
  }

  vstream(out._odata[sF],result*(-0.5));
 }
  FermOpTemplateInstantiate(WilsonKernels);
 }}
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -6,44 +6,44 @@ namespace Grid {
  namespace QCD {
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Helper classes that implement Wilson stencil for a single site.
+    // Helper routines that implement Wilson stencil for a single site.
    // Common to both the WilsonFermion and WilsonFermion5D
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Generic version works for any Nc and with extra flavour indices
    //    namespace DiracOpt {
-      // These ones will need to be package intelligently. WilsonType base class
+    template<class Impl> class WilsonKernels : public FermionOperator<Impl> { 
-      // for use by DWF etc..
+    public:
-      void DiracOptDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+
-			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+     INHERIT_IMPL_TYPES(Impl);
-			    int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
+     typedef FermionOperator<Impl> Base;
-      void DiracOptDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+     
-			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+    public:
-			       int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
+     void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
-      void DiracOptDhopDir(CartesianStencil &st,LatticeDoubledGaugeField &U,
+			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			   std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+			   int sF,int sU,const FermionField &in, FermionField &out);
 			   int sF,int sU,const LatticeFermion &in, LatticeFermion &out,int dirdisp);
-      //  };
+     void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in,FermionField &out);
-      // Hand unrolled for Nc=3, one flavour
+     void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
-      //    namespace DiracOptHand {
+			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-      // These ones will need to be package intelligently. WilsonType base class
+			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
      // for use by DWF etc..
-      void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+     void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
-				std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-				int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
+			       int sF,int sU,const FermionField &in, FermionField &out){
-      void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+       DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-				   std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+     }
 				   int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
-      //    };
+     void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
-  
+				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				  int sF,int sU,const FermionField &in, FermionField &out){
       DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
     }
-    void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+     WilsonKernels(const ImplParams &p= ImplParams()) : Base(p) {};
-				 std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+
-				 int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
+    };
  }
 }
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -280,48 +280,50 @@
 namespace Grid {
 namespace QCD {
-void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
+#if 0
-			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+template<class Simd>
-			    int sF,int sU,const LatticeFermion &in, LatticeFermion &out)
+void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int sF,int sU,const FermionField &in, FermionField &out)
 {
-  REGISTER vComplex result_00; // 12 regs on knc
+  REGISTER Simd result_00; // 12 regs on knc
-  REGISTER vComplex result_01;
+  REGISTER Simd result_01;
-  REGISTER vComplex result_02;
+  REGISTER Simd result_02;
-  REGISTER vComplex result_10;
+  REGISTER Simd result_10;
-  REGISTER vComplex result_11;
+  REGISTER Simd result_11;
-  REGISTER vComplex result_12;
+  REGISTER Simd result_12;
-  REGISTER vComplex result_20;
+  REGISTER Simd result_20;
-  REGISTER vComplex result_21;
+  REGISTER Simd result_21;
-  REGISTER vComplex result_22;
+  REGISTER Simd result_22;
-  REGISTER vComplex result_30;
+  REGISTER Simd result_30;
-  REGISTER vComplex result_31;
+  REGISTER Simd result_31;
-  REGISTER vComplex result_32; // 20 left
+  REGISTER Simd result_32; // 20 left
-  REGISTER vComplex Chi_00;    // two spinor; 6 regs
+  REGISTER Simd Chi_00;    // two spinor; 6 regs
-  REGISTER vComplex Chi_01;
+  REGISTER Simd Chi_01;
-  REGISTER vComplex Chi_02;
+  REGISTER Simd Chi_02;
-  REGISTER vComplex Chi_10;
+  REGISTER Simd Chi_10;
-  REGISTER vComplex Chi_11;
+  REGISTER Simd Chi_11;
-  REGISTER vComplex Chi_12;   // 14 left
+  REGISTER Simd Chi_12;   // 14 left
-  REGISTER vComplex UChi_00;  // two spinor; 6 regs
+  REGISTER Simd UChi_00;  // two spinor; 6 regs
-  REGISTER vComplex UChi_01;
+  REGISTER Simd UChi_01;
-  REGISTER vComplex UChi_02;
+  REGISTER Simd UChi_02;
-  REGISTER vComplex UChi_10;
+  REGISTER Simd UChi_10;
-  REGISTER vComplex UChi_11;
+  REGISTER Simd UChi_11;
-  REGISTER vComplex UChi_12;  // 8 left
+  REGISTER Simd UChi_12;  // 8 left
-  REGISTER vComplex U_00;  // two rows of U matrix
+  REGISTER Simd U_00;  // two rows of U matrix
-  REGISTER vComplex U_10;
+  REGISTER Simd U_10;
-  REGISTER vComplex U_20;  
+  REGISTER Simd U_20;  
-  REGISTER vComplex U_01;
+  REGISTER Simd U_01;
-  REGISTER vComplex U_11;
+  REGISTER Simd U_11;
-  REGISTER vComplex U_21;  // 2 reg left.
+  REGISTER Simd U_21;  // 2 reg left.
 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -360,11 +362,6 @@ void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
    MULT_2SPIN(Xp);
  }
  XP_RECON;
  //  std::cout << "XP_RECON"<<std::endl;
  //  std::cout << result_00 <<" "<<result_01 <<" "<<result_02 <<std::endl;
  //  std::cout << result_10 <<" "<<result_11 <<" "<<result_12 <<std::endl;
  //  std::cout << result_20 <<" "<<result_21 <<" "<<result_22 <<std::endl;
  //  std::cout << result_30 <<" "<<result_31 <<" "<<result_32 <<std::endl;
  // Yp
  offset = st._offsets [Yp][ss];
@@ -446,12 +443,6 @@ void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
    MULT_2SPIN(Xm);
  }
  XM_RECON_ACCUM;
  //  std::cout << "XM_RECON_ACCUM"<<std::endl;
  //  std::cout << result_00 <<" "<<result_01 <<" "<<result_02 <<std::endl;
  //  std::cout << result_10 <<" "<<result_11 <<" "<<result_12 <<std::endl;
  //  std::cout << result_20 <<" "<<result_21 <<" "<<result_22 <<std::endl;
  //  std::cout << result_30 <<" "<<result_31 <<" "<<result_32 <<std::endl;
  // Ym
  offset = st._offsets [Ym][ss];
@@ -530,48 +521,49 @@ void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
  }
 }
-void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
+template<class Simd>
-			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
-			       int ss,int sU,const LatticeFermion &in, LatticeFermion &out)
+							      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int sU,const FermionField &in, FermionField &out)
 {
-  REGISTER vComplex result_00; // 12 regs on knc
+  REGISTER Simd result_00; // 12 regs on knc
-  REGISTER vComplex result_01;
+  REGISTER Simd result_01;
-  REGISTER vComplex result_02;
+  REGISTER Simd result_02;
-  REGISTER vComplex result_10;
+  REGISTER Simd result_10;
-  REGISTER vComplex result_11;
+  REGISTER Simd result_11;
-  REGISTER vComplex result_12;
+  REGISTER Simd result_12;
-  REGISTER vComplex result_20;
+  REGISTER Simd result_20;
-  REGISTER vComplex result_21;
+  REGISTER Simd result_21;
-  REGISTER vComplex result_22;
+  REGISTER Simd result_22;
-  REGISTER vComplex result_30;
+  REGISTER Simd result_30;
-  REGISTER vComplex result_31;
+  REGISTER Simd result_31;
-  REGISTER vComplex result_32; // 20 left
+  REGISTER Simd result_32; // 20 left
-  REGISTER vComplex Chi_00;    // two spinor; 6 regs
+  REGISTER Simd Chi_00;    // two spinor; 6 regs
-  REGISTER vComplex Chi_01;
+  REGISTER Simd Chi_01;
-  REGISTER vComplex Chi_02;
+  REGISTER Simd Chi_02;
-  REGISTER vComplex Chi_10;
+  REGISTER Simd Chi_10;
-  REGISTER vComplex Chi_11;
+  REGISTER Simd Chi_11;
-  REGISTER vComplex Chi_12;   // 14 left
+  REGISTER Simd Chi_12;   // 14 left
-  REGISTER vComplex UChi_00;  // two spinor; 6 regs
+  REGISTER Simd UChi_00;  // two spinor; 6 regs
-  REGISTER vComplex UChi_01;
+  REGISTER Simd UChi_01;
-  REGISTER vComplex UChi_02;
+  REGISTER Simd UChi_02;
-  REGISTER vComplex UChi_10;
+  REGISTER Simd UChi_10;
-  REGISTER vComplex UChi_11;
+  REGISTER Simd UChi_11;
-  REGISTER vComplex UChi_12;  // 8 left
+  REGISTER Simd UChi_12;  // 8 left
-  REGISTER vComplex U_00;  // two rows of U matrix
+  REGISTER Simd U_00;  // two rows of U matrix
-  REGISTER vComplex U_10;
+  REGISTER Simd U_10;
-  REGISTER vComplex U_20;  
+  REGISTER Simd U_20;  
-  REGISTER vComplex U_01;
+  REGISTER Simd U_01;
-  REGISTER vComplex U_11;
+  REGISTER Simd U_11;
-  REGISTER vComplex U_21;  // 2 reg left.
+  REGISTER Simd U_21;  // 2 reg left.
 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -752,7 +744,7 @@ void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
  TP_RECON_ACCUM;
  {
-    vSpinColourVector & ref (out._odata[ss]);
+    SiteSpinor & ref (out._odata[ss]);
    vstream(ref()(0)(0),result_00*(-0.5));
    vstream(ref()(0)(1),result_01*(-0.5));
    vstream(ref()(0)(2),result_02*(-0.5));
@@ -767,4 +759,5 @@ void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
    vstream(ref()(3)(2),result_32*(-0.5));
  }
 }
 #endif
 }}
--- a/lib/qcd/action/fermion/g5HermitianLinop.h
+++ b/lib/qcd/action/fermion/g5HermitianLinop.h
@@ -1,7 +1,9 @@
 #ifndef G5_HERMITIAN_LINOP
 #define G5_HERMITIAN_LINOP
 namespace Grid {
  namespace QCD {
 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
--- a/lib/qcd/action/gauge/WilsonGaugeAction.h
+++ b/lib/qcd/action/gauge/WilsonGaugeAction.h
@@ -7,7 +7,7 @@ namespace Grid{
    ////////////////////////////////////////////////////////////////////////
    // Wilson Gauge Action .. should I template the Nc etc..
    ////////////////////////////////////////////////////////////////////////
-    template<class GaugeField,class MatrixField>
+    template<class GaugeField, class MatrixField>
      class WilsonGaugeAction : public Action<GaugeField> {
    private:
      RealD beta;
@@ -18,12 +18,13 @@ namespace Grid{
      virtual RealD S(const GaugeField &U) {
 	RealD plaq = WilsonLoops<MatrixField,GaugeField>::avgPlaquette(U);
-	std::cout << "Plaq : "<<plaq << "\n";
+	std::cout<<GridLogMessage << "Plaq : "<<plaq << "\n";
-	double vol = U._grid->gSites();
+	RealD vol = U._grid->gSites();
-	return beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
+	RealD action=beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
 	std::cout << GridLogMessage << "WilsonGauge action "<<action<<std::endl;
 	return action;
      };
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	//not optimal implementation FIXME
 	//extend Ta to include Lorentz indexes
 	RealD factor = 0.5*beta/RealD(Nc);
--- a/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+++ b/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
@@ -0,0 +1,112 @@
 #ifndef QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
 #define QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
 namespace Grid{
  namespace QCD{
    // Base even odd HMC on the normal Mee based schur decomposition.
    //
    //     M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
    //         (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
    //
    // Determinant is det of middle factor
    // This assumes Mee is indept of U.
    //
    template<class Impl>
    class SchurDifferentiableOperator :  public SchurDiagMooeeOperator<FermionOperator<Impl>,typename Impl::FermionField> 
    {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef FermionOperator<Impl> Matrix;
      // Hands the wrapped fermion matrix straight to the Schur (diag Mooee) base class.
      SchurDifferentiableOperator (Matrix &Mat) : SchurDiagMooeeOperator<Matrix,FermionField>(Mat) {};
      ////////////////////////////////////////////////////////////////////////////
      // MpcDeriv : fill Force from the pair of fields (U,V).
      //
      //  U, V  : must live on the red-black fermion grid and both carry the Odd
      //          checkerboard (checked with conformable/assert below).
      //  Force : output; its two checkerboard halves are written with
      //          setCheckerboard and the whole field is negated at the end.
      //
      //  Odd  half : MoeDeriv(U , MooeeInv Meooe V      , DaggerNo)
      //  Even half : MeoDeriv(MooeeInvDag MeooeDag U , V, DaggerNo)
      ////////////////////////////////////////////////////////////////////////////
      void MpcDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
        GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
        GridBase *ucbgrid = this->_Mat.GaugeRedBlackGrid();
        FermionField tmp1(fcbgrid);
        FermionField tmp2(fcbgrid);
        conformable(fcbgrid,U._grid);
        conformable(fcbgrid,V._grid);
        // Only the Odd checkerboard is supported for both inputs
        assert(U.checkerboard==Odd);
        assert(V.checkerboard==U.checkerboard);
        GaugeField ForceO(ucbgrid);
        GaugeField ForceE(ucbgrid);
        //  X^dag Der_oe MeeInv Meo Y
        // Use Mooee as nontrivial but gauge field indept
        this->_Mat.Meooe   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
        this->_Mat.MooeeInv(tmp1,tmp2);   // even->even 
        this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerNo);
        //  Accumulate X^dag M_oe MeeInv Der_eo Y
        this->_Mat.MeooeDag   (U,tmp1);    // odd->even (U is asserted Odd) -- implicit -0.5 factor to be applied
        this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even 
        this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerNo);
        assert(ForceE.checkerboard==Even);
        assert(ForceO.checkerboard==Odd);
        // Assemble the full-lattice force from its two halves, then flip the sign
        setCheckerboard(Force,ForceE); 
        setCheckerboard(Force,ForceO);
        Force=-Force;
      }
      ////////////////////////////////////////////////////////////////////////////
      // MpcDagDeriv : daggered counterpart of MpcDeriv.
      //
      //  U, V  : same requirements as MpcDeriv (red-black fermion grid, Odd).
      //  Force : output, assembled from both checkerboards and negated.
      //
      //  Odd  half : MoeDeriv(U , MooeeInvDag MeooeDag V , DaggerYes)
      //  Even half : MeoDeriv(MooeeInv Meooe U , V       , DaggerYes)
      ////////////////////////////////////////////////////////////////////////////
      void MpcDagDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
        GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
        GridBase *ucbgrid = this->_Mat.GaugeRedBlackGrid();
        FermionField tmp1(fcbgrid);
        FermionField tmp2(fcbgrid);
        conformable(fcbgrid,U._grid);
        conformable(fcbgrid,V._grid);
        // Only the Odd checkerboard is supported for both inputs.
        // The second check used to compare V with itself, so U was never verified.
        assert(V.checkerboard==Odd);
        assert(U.checkerboard==V.checkerboard);
        GaugeField ForceO(ucbgrid);
        GaugeField ForceE(ucbgrid);
        //  X^dag Der_oe MeeInv Meo Y
        // Use Mooee as nontrivial but gauge field indept
        this->_Mat.MeooeDag   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
        this->_Mat.MooeeInvDag(tmp1,tmp2);   // even->even 
        this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
        //  Accumulate X^dag M_oe MeeInv Der_eo Y
        this->_Mat.Meooe   (U,tmp1);    // odd->even (U is asserted Odd) -- implicit -0.5 factor to be applied
        this->_Mat.MooeeInv(tmp1,tmp2); // even->even 
        this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
        assert(ForceE.checkerboard==Even);
        assert(ForceO.checkerboard==Odd);
        // Assemble the full-lattice force from its two halves, then flip the sign
        setCheckerboard(Force,ForceE); 
        setCheckerboard(Force,ForceO);
        Force=-Force;
      }
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@@ -0,0 +1,185 @@
 #ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
 #define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
 namespace Grid{
  namespace QCD{
    ///////////////////////////////////////
    // One flavour rational
    ///////////////////////////////////////
    // S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
    //
    // Here, M is some operator 
    // N and D makeup the rat. poly 
    //
    template<class Impl>
    class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      // Private copy of the constructor argument; lo, hi, precision, degree,
      // tolerance and MaxIter are the fields read in this class.
      Params param;
      // Multishift forms of the Remez approximations, filled in by the constructor.
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
      // The fermion operator this action is built on (held by reference).
      FermionOperator<Impl> & FermOp;
      // "Nroots" is deliberately not used (IroIro does use it): historically it
      // did not work well for us and hasenbusch works better.
      FermionField PhiEven; // even-checkerboard pseudofermion; init() sets it to zero
      FermionField PhiOdd;  // odd-checkerboard pseudofermion for the current trajectory
    public:
      // Stores the operator reference and a copy of the parameters, sizes the
      // pseudofermion fields on the red-black fermion grid, and generates the
      // x^(1/2) and x^(1/4) approximations.
      // The initialisers are listed in member declaration order, which is the
      // order C++ runs them in regardless of how they are written.
      OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl>  &Op, 
                                                   Params & p ) : 
        param(p),
        FermOp(Op), 
        PhiEven(Op.FermionRedBlackGrid()), 
        PhiOdd (Op.FermionRedBlackGrid())
      {
        AlgRemez remez(param.lo,param.hi,param.precision);
        // Exponent 1/2 : the same Remez result feeds both multishift functions;
        // only the final flag differs (presumably it selects the inverse power -- confirm)
        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
        remez.generateApprox(param.degree,1,2);
        PowerHalf.Init   (remez,param.tolerance,false);
        PowerNegHalf.Init(remez,param.tolerance,true);
        // Exponent 1/4 : same pattern
        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
        remez.generateApprox(param.degree,1,4);
        PowerQuarter.Init   (remez,param.tolerance,false);
        PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      //////////////////////////////////////////////////////
      // Draw a new pseudofermion field for gauge field U.
      //
      //   P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
      //          = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
      //   so  Phi = (MpcdagMpc)^{1/4} eta   with   P(eta) = e^{- eta^dag eta}
      //
      //   e^{x^2/2 sig^2} => sig^2 = 0.5, hence eta is rescaled by 1/sqrt(2).
      //////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
        RealD scale = std::sqrt(0.5);
        FermionField eta    (FermOp.FermionGrid());
        FermionField etaOdd (FermOp.FermionRedBlackGrid());
        FermionField etaEven(FermOp.FermionRedBlackGrid());
        // Gaussian noise on the full grid, rescaled, then split by checkerboard
        gaussian(pRNG,eta);
        eta=eta*scale;
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);
        FermOp.ImportGauge(U);
        // Multishift CG applying the x^(1/4) approximation to the odd noise
        SchurDifferentiableOperator<Impl> SchurOp(FermOp);
        ConjugateGradientMultiShift<FermionField> QuarterSolver(param.MaxIter,PowerQuarter);
        QuarterSolver(SchurOp,etaOdd,PhiOdd);
        //////////////////////////////////////////////////////
        // FIXME : Clover term not yet..
        //////////////////////////////////////////////////////
        assert(FermOp.ConstEE() == 1);
        PhiEven = zero;
      };
      //////////////////////////////////////////////////////
      // S = phi^dag (Mdag M)^-1/2 phi
      // Computed as the norm2 of the x^(-1/4) approximation applied to PhiOdd.
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
        FermOp.ImportGauge(U);
        SchurDifferentiableOperator<Impl> SchurOp(FermOp);
        ConjugateGradientMultiShift<FermionField> NegQuarterSolver(param.MaxIter,PowerNegQuarter);
        FermionField PhiNegQuarter(FermOp.FermionRedBlackGrid());
        NegQuarterSolver(SchurOp,PhiOdd,PhiNegQuarter);
        RealD action = norm2(PhiNegQuarter);
        std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
        return action;
      };
      //////////////////////////////////////////////////////
      // Force term.
      //
      //   dS_f/dU = chi^dag   d[N/D]  chi
      //
      // with N/D written as the partial fraction expansion
      //
      //           a0 + \sum_k ak/(M^dagM + bk)
      //
      // so that d[N/D] is
      //
      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
      //
      // Building blocks:
      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
      //
      // giving
      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
      //        S    = innerprodReal(Phi,Mf Phi);
      //
      // The sum is accumulated into dSdU and passed through Ta() at the end.
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
        const int n_poles = PowerNegHalf.poles.size();
        std::vector<FermionField> PoleSolutions(n_poles,FermOp.FermionRedBlackGrid());
        FermionField Xk(FermOp.FermionRedBlackGrid());
        FermionField Yk(FermOp.FermionRedBlackGrid());
        GaugeField   ForceTerm(FermOp.GaugeGrid());
        FermOp.ImportGauge(U);
        SchurDifferentiableOperator<Impl> SchurOp(FermOp);
        ConjugateGradientMultiShift<FermionField> NegHalfSolver(param.MaxIter,PowerNegHalf);
        // One solution vector per pole of the x^(-1/2) approximation
        NegHalfSolver(SchurOp,PhiOdd,PoleSolutions);
        dSdU = zero;
        for(int pole=0;pole<n_poles;pole++){
          RealD residue = PowerNegHalf.residues[pole];
          Xk = PoleSolutions[pole];
          SchurOp.Mpc(Xk,Yk);
          SchurOp.MpcDeriv   (ForceTerm , Yk, Xk );
          dSdU = dSdU + residue*ForceTerm;
          SchurOp.MpcDagDeriv(ForceTerm , Xk, Yk );
          dSdU = dSdU + residue*ForceTerm;
        }
        dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -0,0 +1,240 @@
 #ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_RATIO_H
 #define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_RATIO_H
 namespace Grid{
  namespace QCD{
    ///////////////////////////////////////
    // One flavour rational
    ///////////////////////////////////////
    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
    //
    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
    template<class Impl>
    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      // Private copy of the constructor argument; lo, hi, precision, degree,
      // tolerance and MaxIter are the fields read in this class.
      Params param;
      // Multishift forms of the Remez approximations, filled in by the constructor.
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
      FermionOperator<Impl> & NumOp; // numerator operator   ("V" in the comments below)
      FermionOperator<Impl> & DenOp; // denominator operator ("M" in the comments below)
      FermionField PhiEven; // even-checkerboard pseudofermion; init() sets it to zero
      FermionField PhiOdd;  // odd-checkerboard pseudofermion for the current trajectory
    public:
      // Stores both operator references and a copy of the parameters, sizes the
      // pseudofermion fields on the numerator's red-black fermion grid, and
      // generates the x^(1/2) and x^(1/4) approximations.
      // The initialisers are listed in member declaration order, which is the
      // order C++ runs them in regardless of how they are written.
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
                                                        FermionOperator<Impl>  &_DenOp, 
                                                        Params & p
                                                        ) : 
        param(p),
        NumOp(_NumOp), 
        DenOp(_DenOp), 
        PhiEven(_NumOp.FermionRedBlackGrid()),
        PhiOdd (_NumOp.FermionRedBlackGrid())
      {
        AlgRemez remez(param.lo,param.hi,param.precision);
        // Exponent 1/2 : the same Remez result feeds both multishift functions;
        // only the final flag differs (presumably it selects the inverse power -- confirm)
        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
        remez.generateApprox(param.degree,1,2);
        PowerHalf.Init   (remez,param.tolerance,false);
        PowerNegHalf.Init(remez,param.tolerance,true);
        // Exponent 1/4 : same pattern
        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
        remez.generateApprox(param.degree,1,4);
        PowerQuarter.Init   (remez,param.tolerance,false);
        PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      //////////////////////////////////////////////////////
      // Draw a new pseudofermion field for gauge field U.
      //
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
      //
      // P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
      //        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
      //
      // Phi =  (VdagV)^-1/4 (MdagM)^{1/4} eta     with   P(eta) = e^{- eta^dag eta}
      //
      // e^{x^2/2 sig^2} => sig^2 = 0.5, hence eta is rescaled by 1/sqrt(2).
      //////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
        RealD scale = std::sqrt(0.5);
        FermionField eta    (NumOp.FermionGrid());
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField MQuarterEta(NumOp.FermionRedBlackGrid());
        // Gaussian noise on the full grid, rescaled, then split by checkerboard
        gaussian(pRNG,eta);
        eta=eta*scale;
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);
        NumOp.ImportGauge(U);
        DenOp.ImportGauge(U);
        // MdagM^1/4 eta
        SchurDifferentiableOperator<Impl> DenSchur(DenOp);
        ConjugateGradientMultiShift<FermionField> DenQuarterSolver(param.MaxIter,PowerQuarter);
        DenQuarterSolver(DenSchur,etaOdd,MQuarterEta);
        // VdagV^-1/4 MdagM^1/4 eta
        SchurDifferentiableOperator<Impl> NumSchur(NumOp);
        ConjugateGradientMultiShift<FermionField> NumNegQuarterSolver(param.MaxIter,PowerNegQuarter);
        NumNegQuarterSolver(NumSchur,MQuarterEta,PhiOdd);
        assert(NumOp.ConstEE() == 1);
        assert(DenOp.ConstEE() == 1);
        PhiEven = zero;
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
      // Computed as norm2( MdagM^-1/4 VdagV^1/4 PhiOdd ).
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
        NumOp.ImportGauge(U);
        DenOp.ImportGauge(U);
        FermionField VQuarterPhi (NumOp.FermionRedBlackGrid());
        FermionField MNegQuarterVQuarterPhi(NumOp.FermionRedBlackGrid());
        // VdagV^1/4 Phi
        SchurDifferentiableOperator<Impl> NumSchur(NumOp);
        ConjugateGradientMultiShift<FermionField> NumQuarterSolver(param.MaxIter,PowerQuarter);
        NumQuarterSolver(NumSchur,PhiOdd,VQuarterPhi);
        // MdagM^-1/4 VdagV^1/4 Phi
        SchurDifferentiableOperator<Impl> DenSchur(DenOp);
        ConjugateGradientMultiShift<FermionField> DenNegQuarterSolver(param.MaxIter,PowerNegQuarter);
        DenNegQuarterSolver(DenSchur,VQuarterPhi,MNegQuarterVQuarterPhi);
        //  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
        RealD action = norm2(MNegQuarterVQuarterPhi);
        return action;
      };
      //////////////////////////////////////////////////////
      // Force term.
      //
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
      //
      // M is some 5D operator and V is the Pauli-Villars field.
      // N and D make up the rational polynomial of the M term; P and Q make up
      // the rational polynomial of the V term.
      //
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi
      //         +  chi^dag   P/Q   N/D d[P/Q] chi
      //
      // P/Q as a partial fraction expansion:
      //
      //           a0 + \sum_k ak/(V^dagV + bk)
      //
      // so d[P/Q] is
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1}
      //
      // and similarly for N/D.
      //
      // Building blocks:
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi
      //
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      //
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
        const int n_den_poles = PowerNegHalf.poles.size();
        const int n_num_poles = PowerQuarter.poles.size();
        std::vector<FermionField> VPhi_k  (n_num_poles,NumOp.FermionRedBlackGrid()); // MpvPhi_k
        std::vector<FermionField> VMVPhi_k(n_num_poles,NumOp.FermionRedBlackGrid()); // MpvMfMpvPhi_k
        std::vector<FermionField> MVPhi_k (n_den_poles,NumOp.FermionRedBlackGrid()); // MfMpvPhi_k
        FermionField VPhi  (NumOp.FermionRedBlackGrid()); // MpvPhi
        FermionField MVPhi (NumOp.FermionRedBlackGrid()); // MfMpvPhi
        FermionField VMVPhi(NumOp.FermionRedBlackGrid()); // MpvMfMpvPhi
        FermionField Work  (NumOp.FermionRedBlackGrid());
        GaugeField   ForceTerm(NumOp.GaugeGrid());
        NumOp.ImportGauge(U);
        DenOp.ImportGauge(U);
        SchurDifferentiableOperator<Impl> NumSchur(NumOp);
        SchurDifferentiableOperator<Impl> DenSchur(DenOp);
        ConjugateGradientMultiShift<FermionField> NumQuarterSolver(param.MaxIter,PowerQuarter);
        ConjugateGradientMultiShift<FermionField> DenNegHalfSolver(param.MaxIter,PowerNegHalf);
        // Three chained solves; each yields the per-pole vectors and the summed result
        NumQuarterSolver(NumSchur,PhiOdd,VPhi_k,VPhi);
        DenNegHalfSolver(DenSchur,VPhi,MVPhi_k,MVPhi);
        NumQuarterSolver(NumSchur,MVPhi,VMVPhi_k,VMVPhi);
        dSdU = zero;
        // With these building blocks
        //
        //       dS/dU =
        //                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
        //             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
        //                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
        // Term (1): denominator operator, residues of the x^(-1/2) approximation
        for(int pole=0;pole<n_den_poles;pole++){
          RealD residue = PowerNegHalf.residues[pole];
          DenSchur.Mpc(MVPhi_k[pole],Work);
          DenSchur.MpcDagDeriv(ForceTerm , MVPhi_k[pole], Work );
          dSdU = dSdU + residue*ForceTerm;
          DenSchur.MpcDeriv(ForceTerm , Work, MVPhi_k[pole] );
          dSdU = dSdU + residue*ForceTerm;
        }
        // Terms (2) and (3): numerator operator, residues of the x^(1/4) approximation
        for(int pole=0;pole<n_num_poles;pole++){
          RealD residue = PowerQuarter.residues[pole];
          NumSchur.Mpc(VPhi_k[pole],Work);
          NumSchur.MpcDagDeriv(ForceTerm,VMVPhi_k[pole],Work);
          dSdU = dSdU + residue*ForceTerm;
          NumSchur.MpcDeriv   (ForceTerm,Work,VMVPhi_k[pole]);
          dSdU = dSdU + residue*ForceTerm;
          NumSchur.Mpc(VMVPhi_k[pole],Work);                // V as we take Ydag 
          NumSchur.MpcDeriv   (ForceTerm,Work, VPhi_k[pole]);
          dSdU = dSdU + residue*ForceTerm;
          NumSchur.MpcDagDeriv(ForceTerm,VPhi_k[pole], Work);
          dSdU = dSdU + residue*ForceTerm;
        }
        dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRational.h
@@ -0,0 +1,170 @@
 #ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_H
 #define QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_H
 namespace Grid{
  namespace QCD{
    ///////////////////////////////////////
    // One flavour rational
    ///////////////////////////////////////
    // S_f = chi^dag *  N(M^dag*M)/D(M^dag*M) * chi
    //
    // Here, M is some operator 
    // N and D makeup the rat. poly 
    //
    template<class Impl>
    class OneFlavourRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      // Private copy of the constructor argument; lo, hi, precision, degree,
      // tolerance and MaxIter are the fields read in this class.
      Params param;
      // Multishift forms of the Remez approximations, filled in by the constructor.
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
      // The fermion operator this action is built on (held by reference).
      FermionOperator<Impl> & FermOp;
      // "Nroots" is deliberately not used (IroIro does use it): historically it
      // did not work well for us and hasenbusch works better.
      FermionField Phi; // pseudofermion field for the current trajectory (full fermion grid)
    public:
      // Stores the operator reference and a copy of the parameters, sizes Phi on
      // the full fermion grid, and generates the x^(1/2) and x^(1/4) approximations.
      // The initialisers are listed in member declaration order, which is the
      // order C++ runs them in regardless of how they are written.
      OneFlavourRationalPseudoFermionAction(FermionOperator<Impl>  &Op, 
                                            Params & p
                                            ) : 
        param(p),
        FermOp(Op),
        Phi(Op.FermionGrid())
      {
        AlgRemez remez(param.lo,param.hi,param.precision);
        // Exponent 1/2 : the same Remez result feeds both multishift functions;
        // only the final flag differs (presumably it selects the inverse power -- confirm)
        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
        remez.generateApprox(param.degree,1,2);
        PowerHalf.Init   (remez,param.tolerance,false);
        PowerNegHalf.Init(remez,param.tolerance,true);
        // Exponent 1/4 : same pattern
        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
        remez.generateApprox(param.degree,1,4);
        PowerQuarter.Init   (remez,param.tolerance,false);
        PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      //////////////////////////////////////////////////////
      // Draw a new pseudofermion field for gauge field U.
      //
      //   P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
      //          = e^{- phi^dag (MdagM)^-1/4 (MdagM)^-1/4 phi}
      //   so  Phi = (MdagM)^{1/4} eta   with   P(eta) = e^{- eta^dag eta}
      //
      //   e^{x^2/2 sig^2} => sig^2 = 0.5, so the width is 1/sqrt(2); here the
      //   rescaling is applied to Phi after the solve.
      //////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
        RealD scale = std::sqrt(0.5);
        FermionField eta(FermOp.FermionGrid());
        gaussian(pRNG,eta);
        FermOp.ImportGauge(U);
        // Multishift CG applying the x^(1/4) approximation to the noise
        MdagMLinearOperator<FermionOperator<Impl> ,FermionField> NormalOp(FermOp);
        ConjugateGradientMultiShift<FermionField> QuarterSolver(param.MaxIter,PowerQuarter);
        QuarterSolver(NormalOp,eta,Phi);
        Phi=Phi*scale;
      };
      //////////////////////////////////////////////////////
      // S = phi^dag (Mdag M)^-1/2 phi
      // Computed as the norm2 of the x^(-1/4) approximation applied to Phi.
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
        FermOp.ImportGauge(U);
        MdagMLinearOperator<FermionOperator<Impl> ,FermionField> NormalOp(FermOp);
        ConjugateGradientMultiShift<FermionField> NegQuarterSolver(param.MaxIter,PowerNegQuarter);
        FermionField PhiNegQuarter(FermOp.FermionGrid());
        NegQuarterSolver(NormalOp,Phi,PhiNegQuarter);
        RealD action = norm2(PhiNegQuarter);
        std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
        return action;
      };
      //////////////////////////////////////////////////////
      // Force term.
      //
      //   dS_f/dU = chi^dag   d[N/D]  chi
      //
      // with N/D written as the partial fraction expansion
      //
      //           a0 + \sum_k ak/(M^dagM + bk)
      //
      // so that d[N/D] is
      //
      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
      //
      // Building blocks:
      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
      //
      // giving
      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
      //        S    = innerprodReal(Phi,Mf Phi);
      //
      // The sum is accumulated into dSdU and passed through Ta() at the end.
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
        const int n_poles = PowerNegHalf.poles.size();
        std::vector<FermionField> PoleSolutions(n_poles,FermOp.FermionGrid());
        FermionField Xk(FermOp.FermionGrid());
        FermionField Yk(FermOp.FermionGrid());
        GaugeField   ForceTerm(FermOp.GaugeGrid());
        FermOp.ImportGauge(U);
        MdagMLinearOperator<FermionOperator<Impl> ,FermionField> NormalOp(FermOp);
        ConjugateGradientMultiShift<FermionField> NegHalfSolver(param.MaxIter,PowerNegHalf);
        // One solution vector per pole of the x^(-1/2) approximation
        NegHalfSolver(NormalOp,Phi,PoleSolutions);
        dSdU = zero;
        for(int pole=0;pole<n_poles;pole++){
          RealD residue = PowerNegHalf.residues[pole];
          Xk = PoleSolutions[pole];
          FermOp.M(Xk,Yk);
          FermOp.MDeriv(ForceTerm , Yk, Xk,DaggerNo );
          dSdU = dSdU + residue*ForceTerm;
          FermOp.MDeriv(ForceTerm , Xk, Yk,DaggerYes);
          dSdU = dSdU + residue*ForceTerm;
        }
        dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@@ -0,0 +1,226 @@
 #ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_RATIO_H
 #define QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_RATIO_H
 namespace Grid{
  namespace QCD{
    ///////////////////////////////////////
    // One flavour rational
    ///////////////////////////////////////
    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
    //
    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
    template<class Impl>
    class OneFlavourRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      // Remez / solver settings. The members read by this class are lo, hi, precision,
      // degree, tolerance and MaxIter.
      Params param;
      // Multi-shift forms of the rational approximations generated in the constructor.
      MultiShiftFunction PowerHalf   ;    // x^(+1/2); initialised here but not used by the methods of this class
      MultiShiftFunction PowerNegHalf;    // x^(-1/2); used by deriv()
      MultiShiftFunction PowerQuarter;    // x^(+1/4); used by init(), S() and deriv()
      MultiShiftFunction PowerNegQuarter; // x^(-1/4); used by init() and S()
    private:
      FermionOperator<Impl> & NumOp;// the basic operator (V in the formulae below)
      FermionOperator<Impl> & DenOp;// the basic operator (M in the formulae below)
      FermionField Phi; // the pseudo fermion field for this trajectory
    public:
      //////////////////////////////////////////////////////
      // Store the two operators and the parameters, then build the four rational
      // approximations with AlgRemez.
      //
      // The initialiser list is written in member declaration order (param first),
      // which is the order C++ actually initialises members in.
      //////////////////////////////////////////////////////
      OneFlavourRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					    FermionOperator<Impl>  &_DenOp, 
 					    Params & p
 					    ) : param(p), NumOp(_NumOp), DenOp(_DenOp), Phi(_NumOp.FermionGrid())
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
 	// MdagM^(+- 1/2)
 	// NOTE(review): the last Init argument presumably selects the inverse power -- confirm.
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
 	// MdagM^(+- 1/4)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
 	remez.generateApprox(param.degree,1,4);
 	PowerQuarter.Init(remez,param.tolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      //////////////////////////////////////////////////////
      // Draw a new pseudofermion field Phi for the gauge field U using pRNG.
      //////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
 	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
 	//
 	// Phi =  (VdagV)^-1/4 (MdagM)^{1/4} eta 
 	//
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
 	// 
 	// So eta should be of width sig = 1/sqrt(2).
 	RealD scale = std::sqrt(0.5);
 	FermionField tmp(NumOp.FermionGrid());
 	FermionField eta(NumOp.FermionGrid());
 	gaussian(pRNG,eta);
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	// MdagM^1/4 eta
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
 	msCG_M(MdagM,eta,tmp);
 	// VdagV^-1/4 MdagM^1/4 eta
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
 	msCG_V(VdagV,tmp,Phi);
 	// The width factor is applied to the result rather than to eta.
 	Phi=Phi*scale;
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Evaluated as | MdagM^-1/4 VdagV^1/4 Phi |^2 using two multi-shift solves.
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	FermionField X(NumOp.FermionGrid());
 	FermionField Y(NumOp.FermionGrid());
 	// VdagV^1/4 Phi
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	msCG_V(VdagV,Phi,X);
 	// MdagM^-1/4 VdagV^1/4 Phi
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
 	msCG_M(MdagM,X,Y);
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
 	RealD action = norm2(Y);
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D make up the rational polynomial of the M term and P and Q make up the
      // rational polynomial of the V term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	// Pole counts of the two expansions used here: x^(-1/2) for M, x^(1/4) for V.
 	const int n_f  = PowerNegHalf.poles.size();
 	const int n_pv = PowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f,NumOp.FermionGrid());
 	FermionField      MpvPhi(NumOp.FermionGrid());
 	FermionField    MfMpvPhi(NumOp.FermionGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionGrid());
 	FermionField           Y(NumOp.FermionGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	// Three chained multi-shift solves; each call fills the per-pole vector and the
 	// single field passed as its last argument, which feeds the next solve.
 	msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	RealD ak;
 	dSdU = zero;
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)
 	for(int k=0;k<n_f;k++){
 	  ak = PowerNegHalf.residues[k];
 	  DenOp.M(MfMpvPhi_k[k],Y);
 	  DenOp.MDeriv(tmp , MfMpvPhi_k[k], Y,DaggerYes );  dSdU=dSdU+ak*tmp;
 	  DenOp.MDeriv(tmp , Y, MfMpvPhi_k[k], DaggerNo );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	// Both V terms share the residues of the x^(1/4) expansion and are summed in one loop.
 	for(int k=0;k<n_pv;k++){
 	  ak = PowerQuarter.residues[k];
 	  NumOp.M(MpvPhi_k[k],Y);
 	  NumOp.MDeriv(tmp,MpvMfMpvPhi_k[k],Y,DaggerYes); dSdU=dSdU+ak*tmp;
 	  NumOp.MDeriv(tmp,Y,MpvMfMpvPhi_k[k],DaggerNo);  dSdU=dSdU+ak*tmp;     
 	  NumOp.M(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  NumOp.MDeriv(tmp,Y, MpvPhi_k[k], DaggerNo); dSdU=dSdU+ak*tmp;
 	  NumOp.MDeriv(tmp,MpvPhi_k[k], Y,DaggerYes); dSdU=dSdU+ak*tmp;
 	}
 	dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/TwoFlavour.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavour.h
@@ -0,0 +1,121 @@
 #ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_H
 #define QCD_PSEUDOFERMION_TWO_FLAVOUR_H
 namespace Grid{
  namespace QCD{
    ////////////////////////////////////////////////////////////////////////
    // Two flavour pseudofermion action for any dop
    ////////////////////////////////////////////////////////////////////////
    template<class Impl>
    class TwoFlavourPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
    private:
      FermionOperator<Impl> & FermOp;                    // operator M entering S = phi^dag (Mdag M)^-1 phi
      OperatorFunction<FermionField> &DerivativeSolver;  // solver invoked by deriv()
      OperatorFunction<FermionField> &ActionSolver;      // solver invoked by S()
      FermionField Phi;                                  // pseudofermion field, overwritten by init()
    public:
      /////////////////////////////////////////////////
      // Keep references to the operator and the two solvers;
      // Phi is allocated on the operator's fermion grid.
      /////////////////////////////////////////////////
    TwoFlavourPseudoFermionAction(FermionOperator<Impl>  &Op, 
 				  OperatorFunction<FermionField> & DS,
 				  OperatorFunction<FermionField> & AS
 				  ) : FermOp(Op), DerivativeSolver(DS), ActionSolver(AS), Phi(Op.FermionGrid()) {
      };
      //////////////////////////////////////////////////////////////////////////////////////
      // Pseudofermion refresh: Phi = Mdag eta / sqrt(2) with eta gaussian noise.
      // The gauge field is imported as given; BC's and smearing are assumed already applied.
      //////////////////////////////////////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
 	// Target distribution  P(phi) = e^{- phi^dag (MdagM)^-1 phi}
 	// is reached from      P(eta) = e^{- eta^dag eta}   via   Phi = Mdag eta.
 	//
 	// Matching e^{x^2/2 sig^2} gives sig^2 = 0.5, i.e. a width of 1/sqrt(2),
 	// hence the multiplication by 0.707....
 	//
 	// Chroma has this scale factor: two_flavor_monomial_w.h
 	// IroIro: does not use this scale. It is absorbed by a change of vars
 	//         in the Phi integral, and thus is only an irrelevant prefactor for the partition function.
 	//
 	FermionField noise(FermOp.FermionGrid());
 	gaussian(pRNG,noise);
 	FermOp.ImportGauge(U);
 	FermOp.Mdag(noise,Phi);
 	RealD width = std::sqrt(0.5);
 	Phi=Phi*width;
      };
      //////////////////////////////////////////////////////
      // S = phi^dag (Mdag M)^-1 phi
      // computed as the norm2 of  M (Mdag M)^-1 phi
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	FermOp.ImportGauge(U);
 	FermionField sol (FermOp.FermionGrid());
 	FermionField Msol(FermOp.FermionGrid());
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> HermOp(FermOp);
 	// The solution field is zeroed before it is handed to the solver.
 	sol=zero;
 	ActionSolver(HermOp,Phi,sol);
 	HermOp.Op(sol,Msol);
 	RealD pfAction = norm2(Msol);
 	std::cout << GridLogMessage << "Pseudofermion action "<<pfAction<<std::endl;
 	return pfAction;
      };
      //////////////////////////////////////////////////////
      // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
      //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag dM (Mdag)^-1 phi 
      //
      //       = - Ydag dM X  - Xdag dMdag Y
      //
      // with X = (Mdag M)^-1 phi (sol below) and Y = M X (Msol below).
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	FermOp.ImportGauge(U);
 	FermionField sol (FermOp.FermionGrid());
 	FermionField Msol(FermOp.FermionGrid());
 	GaugeField   piece(FermOp.GaugeGrid());
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> HermOp(FermOp);
 	sol=zero;
 	DerivativeSolver(HermOp,Phi,sol);
 	HermOp.Op(sol,Msol);
 	// Our conventions really make this UdSdU; We do not differentiate wrt Udag here.
 	// So must take dSdU - adj(dSdU) and left multiply by mom to get dS/dt.
 	FermOp.MDeriv(piece,Msol,sol,DaggerNo);
 	dSdU=piece;
 	FermOp.MDeriv(piece,sol,Msol,DaggerYes);
 	dSdU=dSdU+piece;
 	dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@@ -0,0 +1,157 @@
 #ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_H
 #define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_H
 namespace Grid{
  namespace QCD{
    ////////////////////////////////////////////////////////////////////////
    // Two flavour pseudofermion action for any EO prec dop
    ////////////////////////////////////////////////////////////////////////
    template<class Impl>
    class TwoFlavourEvenOddPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
    private:
      FermionOperator<Impl> & FermOp;// the basic operator
      OperatorFunction<FermionField> &DerivativeSolver; // solver called from deriv()
      OperatorFunction<FermionField> &ActionSolver;     // solver called from S()
      FermionField PhiOdd;   // the pseudo fermion field for this trajectory (odd checkerboard)
      FermionField PhiEven;  // the pseudo fermion field for this trajectory (even checkerboard)
    public:
      /////////////////////////////////////////////////
      // Pass in required objects.
      // Both pseudofermion fields live on the operator's red-black grid.
      // The initialiser list follows member declaration order (PhiOdd before PhiEven),
      // which is the order C++ initialises them in.
      /////////////////////////////////////////////////
      TwoFlavourEvenOddPseudoFermionAction(FermionOperator<Impl>  &Op, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & AS
 					   ) : 
 	FermOp(Op), 
 	DerivativeSolver(DS), 
 	ActionSolver(AS), 
 	PhiOdd(Op.FermionRedBlackGrid()),
 	PhiEven(Op.FermionRedBlackGrid())
 		  {};
      //////////////////////////////////////////////////////////////////////////////////////
      // Push the gauge field in to the dops. Assume any BC's and smearing already applied
      //
      // Draws gaussian noise on the full grid, splits it by checkerboard and sets
      //   PhiOdd  = MpcDag   eta_odd  / sqrt(2)
      //   PhiEven = MooeeDag eta_even / sqrt(2)
      //////////////////////////////////////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
 	// P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
 	// Phi = MpcDag eta 
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// e^{x^2/2 sig^2} => sig^2 = 0.5.
 	RealD scale = std::sqrt(0.5);
 	FermionField eta    (FermOp.FermionGrid());
 	FermionField etaOdd (FermOp.FermionRedBlackGrid());
 	FermionField etaEven(FermOp.FermionRedBlackGrid());
 	gaussian(pRNG,eta);
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	FermOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> PCop(FermOp);
 	PCop.MpcDag(etaOdd,PhiOdd);
 	FermOp.MooeeDag(etaEven,PhiEven);
 	PhiOdd =PhiOdd*scale;
 	PhiEven=PhiEven*scale;
      };
      //////////////////////////////////////////////////////
      // S = phi^dag (Mdag M)^-1 phi  (odd)
      //   + phi^dag (Mdag M)^-1 phi  (even)
      //
      // The odd part is norm2 of PCop.Op applied to the solver result;
      // the even part is norm2 of MooeeInvDag PhiEven.
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	FermOp.ImportGauge(U);
 	FermionField X(FermOp.FermionRedBlackGrid());
 	FermionField Y(FermOp.FermionRedBlackGrid());
 	SchurDifferentiableOperator<Impl> PCop(FermOp);
 	X=zero;
 	ActionSolver(PCop,PhiOdd,X);
 	PCop.Op(X,Y);
 	RealD action = norm2(Y);
 	// The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
 	// Only really clover term that creates this.
 	FermOp.MooeeInvDag(PhiEven,Y);
 	action = action + norm2(Y);
 	std::cout << GridLogMessage << "Pseudofermion EO action "<<action<<std::endl;
 	return action;
      };
      //////////////////////////////////////////////////////
      //
      // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
      //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag dM (Mdag)^-1 phi 
      //
      //       = - Ydag dM X  - Xdag dMdag Y
      //
      // Only the odd-odd (Schur) operator contributes here; the assert below rejects
      // operators whose ConstEE() is not 1.
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	FermOp.ImportGauge(U);
 	FermionField X(FermOp.FermionRedBlackGrid());
 	FermionField Y(FermOp.FermionRedBlackGrid());
 	GaugeField tmp(FermOp.GaugeGrid());
 	SchurDifferentiableOperator<Impl> Mpc(FermOp);
 	// Our conventions really make this UdSdU; We do not differentiate wrt Udag here.
 	// So must take dSdU - adj(dSdU) and left multiply by mom to get dS/dt.
 	X=zero;
 	DerivativeSolver(Mpc,PhiOdd,X);
 	Mpc.Mpc(X,Y);
 	Mpc.MpcDeriv(tmp , Y, X );    dSdU=tmp;
 	Mpc.MpcDagDeriv(tmp , X, Y);  dSdU=dSdU+tmp;
 	// Treat the EE case. (MdagM)^-1 = Minv Minvdag
 	// Deriv defaults to zero, so the even-even force is not implemented; kept for reference:
 	//      FermOp.MooeeInvDag(PhiOdd,Y);
 	//      FermOp.MooeeInv(Y,X);
 	//      FermOp.MeeDeriv(tmp , Y, X,DaggerNo );  dSdU=tmp;
 	//      FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
 	// NOTE(review): the sketch reads PhiOdd and overwrites dSdU; presumably it should
 	// read PhiEven and accumulate -- confirm before enabling.
 	assert(FermOp.ConstEE() == 1);
 	dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -0,0 +1,168 @@
 #ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
 #define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
 namespace Grid{
  namespace QCD{
    ///////////////////////////////////////
    // Two flavour ratio
    ///////////////////////////////////////
    template<class Impl>
    class TwoFlavourEvenOddRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
    private:
      FermionOperator<Impl> & NumOp;// the basic operator (V in the formulae below)
      FermionOperator<Impl> & DenOp;// the basic operator (M in the formulae below)
      OperatorFunction<FermionField> &DerivativeSolver; // solver called from deriv()
      OperatorFunction<FermionField> &ActionSolver;     // solver called from init() and S()
      FermionField PhiOdd;   // the pseudo fermion field for this trajectory (odd checkerboard)
      FermionField PhiEven;  // the pseudo fermion field for this trajectory (even checkerboard)
    public:
      //////////////////////////////////////////////////////
      // Store the operators and solvers and check that numerator and denominator
      // operators are defined on conformable grids.
      // The initialiser list follows member declaration order (PhiOdd before PhiEven),
      // which is the order C++ initialises them in.
      //////////////////////////////////////////////////////
      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 						FermionOperator<Impl>  &_DenOp, 
 						OperatorFunction<FermionField> & DS,
 						OperatorFunction<FermionField> & AS) :
      NumOp(_NumOp), 
      DenOp(_DenOp), 
      DerivativeSolver(DS), 
      ActionSolver(AS),
      PhiOdd(_NumOp.FermionRedBlackGrid()),
      PhiEven(_NumOp.FermionRedBlackGrid()) 
 	{
 	  conformable(_NumOp.FermionGrid(), _DenOp.FermionGrid());
 	  conformable(_NumOp.FermionRedBlackGrid(), _DenOp.FermionRedBlackGrid());
 	  conformable(_NumOp.GaugeGrid(), _DenOp.GaugeGrid());
 	  conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
 	};
      //////////////////////////////////////////////////////
      // Draw new pseudofermion fields PhiOdd / PhiEven for the gauge field U using pRNG.
      //////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
 	// P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
 	//
 	// NumOp == V
 	// DenOp == M
 	//
 	// Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
 	//
 	// P(eta_o) = e^{- eta_o^dag eta_o}
 	//
 	// e^{x^2/2 sig^2} => sig^2 = 0.5.
 	// 
 	RealD scale = std::sqrt(0.5);
 	FermionField eta    (NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField tmp    (NumOp.FermionRedBlackGrid());
 	gaussian(pRNG,eta);
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> Mpc(DenOp);
 	SchurDifferentiableOperator<Impl> Vpc(NumOp);
 	// Odd det factors
 	Mpc.MpcDag(etaOdd,PhiOdd);
 	// Zero the solver's solution field first, exactly as S() and deriv() do with X;
 	// otherwise an uninitialised field is handed to the solver
 	// (presumably used as its starting guess).
 	tmp=zero;
 	ActionSolver(Vpc,PhiOdd,tmp);
 	Vpc.Mpc(tmp,PhiOdd);            
 	// Even det factors
 	DenOp.MooeeDag(etaEven,tmp);
 	NumOp.MooeeInvDag(tmp,PhiEven);
 	PhiOdd =PhiOdd*scale;
 	PhiEven=PhiEven*scale;
      };
      //////////////////////////////////////////////////////
      // S = phi^dag V (Mdag M)^-1 Vdag phi
      //
      // Odd part:  norm2 of  Mpc (MpcdagMpc)^-1 Vpcdag PhiOdd
      // Even part: norm2 of  DenOp.MooeeInvDag NumOp.MooeeDag PhiEven
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> Mpc(DenOp);
 	SchurDifferentiableOperator<Impl> Vpc(NumOp);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	X=zero;
 	Vpc.MpcDag(PhiOdd,Y);           // Y= Vdag phi
 	ActionSolver(Mpc,Y,X);          // X= (MdagM)^-1 Vdag phi
 	Mpc.Mpc(X,Y);                   // Y=  Mdag^-1 Vdag phi
 	RealD action = norm2(Y);
 	// The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
 	// Only really clover term that creates this. Leave the EE portion as a future to do to make most
 	// rapid progress on DWF for now.
 	//
 	NumOp.MooeeDag(PhiEven,X);
 	DenOp.MooeeInvDag(X,Y);
 	action = action + norm2(Y);
 	return action;
      };
      //////////////////////////////////////////////////////
      // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
      //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
      //       + phi^dag V (Mdag M)^-1 dV^dag  phi
      //
      // Only the odd-odd (Schur) operators contribute; the asserts below reject
      // operators whose ConstEE() is not 1.
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> Mpc(DenOp);
 	SchurDifferentiableOperator<Impl> Vpc(NumOp);
 	FermionField  X(NumOp.FermionRedBlackGrid());
 	FermionField  Y(NumOp.FermionRedBlackGrid());
 	GaugeField   force(NumOp.GaugeGrid());	
 	X=zero;
 	//Y=Vdag phi
 	//X = (Mdag M)^-1 V^dag phi
 	//Y = (Mdag)^-1 V^dag  phi
 	Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi
 	DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi
 	Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi
 	// phi^dag V (Mdag M)^-1 dV^dag  phi
 	Vpc.MpcDagDeriv(force , X, PhiOdd );  dSdU=force;
 	// phi^dag dV (Mdag M)^-1 V^dag  phi
 	Vpc.MpcDeriv(force , PhiOdd, X );  dSdU=dSdU+force;
 	//    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
 	//    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
 	Mpc.MpcDeriv(force,Y,X);   dSdU=dSdU-force;
 	Mpc.MpcDagDeriv(force,X,Y);  dSdU=dSdU-force;
 	// FIXME No force contribution from EvenEven assumed here
 	// Needs a fix for clover.
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	// Note the overall minus sign applied together with Ta().
 	dSdU = -Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
@@ -0,0 +1,134 @@
 #ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
 #define QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
 namespace Grid{
  namespace QCD{
    ///////////////////////////////////////
    // Two flavour ratio
    ///////////////////////////////////////
    template<class Impl>
    class TwoFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
    private:
      FermionOperator<Impl> & NumOp;// the basic operator (V in the formulae below)
      FermionOperator<Impl> & DenOp;// the basic operator (M in the formulae below)
      OperatorFunction<FermionField> &DerivativeSolver; // solver called from deriv()
      OperatorFunction<FermionField> &ActionSolver;     // solver called from init() and S()
      FermionField Phi; // the pseudo fermion field for this trajectory
    public:
      //////////////////////////////////////////////////////
      // Store the operators and solvers; Phi is allocated on the numerator
      // operator's fermion grid.
      //////////////////////////////////////////////////////
      TwoFlavourRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					 FermionOperator<Impl>  &_DenOp, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & AS
 					 ) : NumOp(_NumOp), DenOp(_DenOp), DerivativeSolver(DS), ActionSolver(AS), Phi(_NumOp.FermionGrid()) {};
      //////////////////////////////////////////////////////
      // Draw a new pseudofermion field Phi for the gauge field U using pRNG.
      //////////////////////////////////////////////////////
      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
 	// P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
 	//
 	// NumOp == V
 	// DenOp == M
 	//
 	// Take phi = Vdag^{-1} Mdag eta  ; eta = Mdag^{-1} Vdag Phi
 	//
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// e^{x^2/2 sig^2} => sig^2 = 0.5.
 	// 
 	// So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
 	//
 	RealD scale = std::sqrt(0.5);
 	FermionField eta(NumOp.FermionGrid());
 	FermionField tmp(NumOp.FermionGrid());
 	gaussian(pRNG,eta);
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	// Note: this hard codes normal equations type solvers; alternate implementation needed for 
 	// non-herm style solvers.
 	// The normal operator here is built from NumOp, i.e. it is Vdag V.
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagVOp(NumOp);
 	DenOp.Mdag(eta,Phi);            // Mdag eta
 	// Zero the solver's solution field first, exactly as S() and deriv() do with X;
 	// otherwise an uninitialised field is handed to the solver
 	// (presumably used as its starting guess).
 	tmp=zero;
 	ActionSolver(VdagVOp,Phi,tmp);  // (VdagV)^-1 Mdag eta = V^-1 Vdag^-1 Mdag eta
 	NumOp.M(tmp,Phi);               // Vdag^-1 Mdag eta
 	Phi=Phi*scale;
      };
      //////////////////////////////////////////////////////
      // S = phi^dag V (Mdag M)^-1 Vdag phi
      // computed as the norm2 of  M (Mdag M)^-1 Vdag phi
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	FermionField X(NumOp.FermionGrid());
 	FermionField Y(NumOp.FermionGrid());
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
 	X=zero;
 	NumOp.Mdag(Phi,Y);              // Y= Vdag phi
 	ActionSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
 	DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
 	RealD action = norm2(Y);
 	return action;
      };
      //////////////////////////////////////////////////////
      // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
      //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
      //       + phi^dag V (Mdag M)^-1 dV^dag  phi
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
 	FermionField  X(NumOp.FermionGrid());
 	FermionField  Y(NumOp.FermionGrid());
 	GaugeField   force(NumOp.GaugeGrid());	
 	X=zero;
 	//Y=Vdag phi
 	//X = (Mdag M)^-1 V^dag phi
 	//Y = (Mdag)^-1 V^dag  phi
 	NumOp.Mdag(Phi,Y);              // Y= Vdag phi
 	DerivativeSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
 	DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
 	// phi^dag V (Mdag M)^-1 dV^dag  phi
 	NumOp.MDeriv(force , X, Phi, DaggerYes );  dSdU=force;
 	// phi^dag dV (Mdag M)^-1 V^dag  phi
 	NumOp.MDeriv(force , Phi, X ,DaggerNo  );  dSdU=dSdU+force;
 	//    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
 	//    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
 	DenOp.MDeriv(force,Y,X,DaggerNo);   dSdU=dSdU-force;
 	DenOp.MDeriv(force,X,Y,DaggerYes);  dSdU=dSdU-force;
 	// Note the overall minus sign applied together with Ta().
 	dSdU = - Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/hmc/HMC.cc
+++ b/lib/qcd/hmc/HMC.cc
@@ -7,8 +7,8 @@ namespace Grid{
 	// FIXME fill this constructor  now just default values
 	////////////////////////////// Default values
-	Nsweeps             = 100;
+	Nsweeps             = 200;
-	TotalSweeps         = 20;
+	TotalSweeps         = 220;
 	ThermalizationSteps = 20;
 	StartingConfig      = 0;
 	SaveInterval        = 1;
@@ -17,8 +17,5 @@ namespace Grid{
      }
  }
 }
--- a/lib/qcd/hmc/HMC.h
+++ b/lib/qcd/hmc/HMC.h
@@ -1,15 +1,16 @@
 //--------------------------------------------------------------------
 /*! @file HMC.h
- * @brief Declaration of classes for Hybrid Monte Carlo update
+ * @brief Classes for Hybrid Monte Carlo update
 *
 * @author Guido Cossu
 * Time-stamp: <2015-07-30 16:58:26 neo>
 */
 //--------------------------------------------------------------------
 #ifndef HMC_INCLUDED
 #define HMC_INCLUDED
 #include <string>
-#include <memory>
+
 namespace Grid{
  namespace QCD{
@@ -20,84 +21,98 @@ namespace Grid{
      Integer ThermalizationSteps;
      Integer StartingConfig;
      Integer SaveInterval; //Setting to 0 does not save configurations
-      std::string Filename_prefix; // To save configurations
+      std::string Filename_prefix; // To save configurations and rng seed
      HMCparameters();
    };
    template <class Algorithm> 
    class HybridMonteCarlo{
      const HMCparameters Params;
-      GridSerialRNG sRNG;
+
      GridSerialRNG sRNG; // Fixme: need a RNG management strategy.
      Integrator<Algorithm>& MD;
-      
+
      /////////////////////////////////////////////////////////
      // Metropolis step
      /////////////////////////////////////////////////////////
      bool metropolis_test(const RealD DeltaH){
 	RealD rn_test;
 	RealD prob = std::exp(-DeltaH);
 	random(sRNG,rn_test);
-	std::cout<< "--------------------------------------------\n";
+	std::cout<<GridLogMessage<< "--------------------------------------------\n";
-	std::cout<< "dH = "<<DeltaH << "  Random = "<< rn_test 
+	std::cout<<GridLogMessage<< "dH = "<<DeltaH << "  Random = "<< rn_test <<"\n";
-		 << "\nAcc. Probability = " << ((prob<1.0)? prob: 1.0)<< "   ";
+	std::cout<<GridLogMessage<< "Acc. Probability = " << ((prob<1.0)? prob: 1.0)<< "   ";
 	if((prob >1.0) || (rn_test <= prob)){       // accepted
-	  std::cout <<"-- ACCEPTED\n";
+	  std::cout<<GridLogMessage <<"-- ACCEPTED\n";
 	  return true;
 	} else {                               // rejected
-	  std::cout <<"-- REJECTED\n";
+	  std::cout<<GridLogMessage <<"-- REJECTED\n";
 	  return false;
 	}
      }
-      RealD evolve_step(LatticeLorentzColourMatrix& U){
+      /////////////////////////////////////////////////////////
-
+      // Evolution
      /////////////////////////////////////////////////////////
      RealD evolve_step(LatticeGaugeField& U){
 	MD.init(U); // set U and initialize P and phi's 
 	RealD H0 = MD.S(U); // initial state action  
-	std::cout<<"Total H before = "<< H0 << "\n";
+	std::cout<<GridLogMessage<<"Total H before = "<< H0 << "\n";
-      
+
 	MD.integrate(U);
 	RealD H1 = MD.S(U); // updated state action            
-	std::cout<<"Total H after = "<< H1 << "\n";
+	std::cout<<GridLogMessage<<"Total H after = "<< H1 << "\n";
 	return (H1-H0);
      }
    public:
    HybridMonteCarlo(HMCparameters Pms, 
 		     Integrator<Algorithm>& MolDyn):
      Params(Pms),MD(MolDyn){
 	//FIXME
-	// initialize RNGs also with seed
+      /////////////////////////////////////////
      // Constructor
      /////////////////////////////////////////
      HybridMonteCarlo(HMCparameters Pms,  Integrator<Algorithm>& MolDyn): Params(Pms),MD(MolDyn) {
 	//FIXME...  initialize RNGs also with seed ; RNG management strategy
 	sRNG.SeedRandomDevice();
      }
      ~HybridMonteCarlo(){};
-
+      void evolve(LatticeGaugeField& Uin){
      void evolve(LatticeLorentzColourMatrix& Uin){
 	Real DeltaH;
 	// Thermalizations
 	for(int iter=1; iter <= Params.ThermalizationSteps; ++iter){
-	  std::cout << "-- # Thermalization step = "<< iter <<  "\n";
+	  std::cout<<GridLogMessage << "-- # Thermalization step = "<< iter <<  "\n";
 	  DeltaH = evolve_step(Uin);
-	  std::cout<< " dH = "<< DeltaH << "\n";
+	  std::cout<<GridLogMessage<< "dH = "<< DeltaH << "\n";
 	}
 	// Actual updates (evolve a copy Ucopy then copy back eventually)
-	LatticeLorentzColourMatrix Ucopy(Uin._grid);
+	LatticeGaugeField Ucopy(Uin._grid);
 	for(int iter=Params.StartingConfig; 
 	    iter < Params.Nsweeps+Params.StartingConfig; ++iter){
-	  std::cout << "-- # Sweep = "<< iter <<  "\n";
+	  std::cout<<GridLogMessage << "-- # Sweep = "<< iter <<  "\n";
 	  Ucopy = Uin;
 	  DeltaH = evolve_step(Ucopy);
 	  if(metropolis_test(DeltaH)) Uin = Ucopy;
-	  //need sync?
+
 	}
      }
    };
--- a/lib/qcd/hmc/integrators/Integrator.cc
+++ b/lib/qcd/hmc/integrators/Integrator.cc
@@ -18,7 +18,7 @@ namespace Grid{
      Pmu = zero;
      for(int mu=0;mu<Nd;mu++){
 	SU3::GaussianLieAlgebraMatrix(pRNG, Pmu);
-	pokeLorentz(P, Pmu, mu);
+	PokeIndex<LorentzIndex>(P, Pmu, mu);
      }
    }
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -1,8 +1,9 @@
 //--------------------------------------------------------------------
 /*! @file Integrator.h
- * @brief Declaration of classes for the Molecular Dynamics integrator
+ * @brief Classes for the Molecular Dynamics integrator
 *
 * @author Guido Cossu
 * Time-stamp: <2015-07-30 16:21:29 neo>
 */
 //--------------------------------------------------------------------
@@ -16,8 +17,16 @@ class Observer;
 namespace Grid{
  namespace QCD{
-    typedef Action<LatticeLorentzColourMatrix>*  ActPtr; // now force the same colours as the rest of the code
+    typedef Action<LatticeGaugeField>*  ActPtr; // now force the same colours as the rest of the code
-    typedef std::vector<ActPtr> ActionLevel;
+    struct ActionLevel{
      int multiplier;
    public:
      std::vector<ActPtr> actions;
      explicit ActionLevel(int mul = 1):multiplier(mul){assert (mul > 0);};
      void push_back(ActPtr ptr){
 	actions.push_back(ptr);
      }
    };
    typedef std::vector<ActionLevel> ActionSet;
    typedef std::vector<Observer*> ObserverList;
@@ -35,8 +44,8 @@ namespace Grid{
    namespace MDutils{
-      void generate_momenta(LatticeLorentzColourMatrix&,GridParallelRNG&);
+      void generate_momenta(LatticeGaugeField&,GridParallelRNG&);
-      void generate_momenta_su3(LatticeLorentzColourMatrix&,GridParallelRNG&);
+      void generate_momenta_su3(LatticeGaugeField&,GridParallelRNG&);
    }
    /*! @brief Class for Molecular Dynamics management */   
@@ -45,8 +54,7 @@ namespace Grid{
    private:
      IntegratorParameters Params;
      const ActionSet as;
-      const std::vector<int> Nrel; //relative step size per level
+      std::unique_ptr<LatticeGaugeField> P;
-      std::unique_ptr<LatticeLorentzColourMatrix> P;
      GridParallelRNG pRNG;
      //ObserverList observers; // not yet
@@ -55,59 +63,53 @@ namespace Grid{
      void register_observers();
      void notify_observers();
-      void update_P(LatticeLorentzColourMatrix&U, int level,double ep){
+      void update_P(LatticeGaugeField&U, int level,double ep){
-	for(int a=0; a<as[level].size(); ++a){
+	for(int a=0; a<as[level].actions.size(); ++a){
-	  LatticeLorentzColourMatrix force(U._grid);
+	  LatticeGaugeField force(U._grid);
-	  as[level].at(a)->deriv(U,force);
+	  as[level].actions.at(a)->deriv(U,force);
 	  *P -= force*ep;
 	}
      }
-
+      void update_U(LatticeGaugeField&U, double ep){
-      void update_U(LatticeLorentzColourMatrix&U, double ep){
 	//rewrite exponential to deal automatically  with the lorentz index?
 	LatticeColourMatrix Umu(U._grid);
 	LatticeColourMatrix Pmu(U._grid);
 	for (int mu = 0; mu < Nd; mu++){
-	  Umu=peekLorentz(U, mu);
+	  Umu=PeekIndex<LorentzIndex>(U, mu);
-	  Pmu=peekLorentz(*P, mu);
+	  Pmu=PeekIndex<LorentzIndex>(*P, mu);
 	  Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
-	  pokeLorentz(U, Umu, mu);
+	  PokeIndex<LorentzIndex>(U, Umu, mu);
 	}
      }
-      friend void IntegratorAlgorithm::step (LatticeLorentzColourMatrix& U, 
+      friend void IntegratorAlgorithm::step (LatticeGaugeField& U, 
 					     int level, std::vector<int>& clock,
 					     Integrator<IntegratorAlgorithm>* Integ);
    public:
    Integrator(GridBase* grid, IntegratorParameters Par,
-		 ActionSet& Aset, std::vector<int> Nrel_):
+		 ActionSet& Aset):
-      Params(Par),as(Aset),Nrel(Nrel_),P(new LatticeLorentzColourMatrix(grid)),pRNG(grid){
+      Params(Par),as(Aset),P(new LatticeGaugeField(grid)),pRNG(grid){
-	assert(as.size() == Nrel.size());
 	pRNG.SeedRandomDevice();
      };
      ~Integrator(){}
      //Initialization of momenta and actions
-      void init(LatticeLorentzColourMatrix& U){
+      void init(LatticeGaugeField& U){
-	std::cout<< "Integrator init\n";
+	std::cout<<GridLogMessage<< "Integrator init\n";
 	MDutils::generate_momenta(*P,pRNG);
 	for(int level=0; level< as.size(); ++level){
-	  for(int actionID=0; actionID<as.at(level).size(); ++actionID){
+	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
-	    as[level].at(actionID)->init(U, pRNG);
+	    as[level].actions.at(actionID)->init(U, pRNG);
 	  }
 	}
      }
      // Calculate action
-      RealD S(LatticeLorentzColourMatrix& U){
+      RealD S(LatticeGaugeField& U){
 	LatticeComplex Hloc(U._grid);
 	Hloc = zero;
 	// Momenta
@@ -119,17 +121,19 @@ namespace Grid{
 	RealD H = Hsum.real();
-	std::cout << "H_p = "<< H << "\n";
+	std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";
 	// Actions
 	for(int level=0; level<as.size(); ++level)
-	  for(int actionID=0; actionID<as.at(level).size(); ++actionID)
+	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID)
-	    H += as[level].at(actionID)->S(U);
+	    H += as[level].actions.at(actionID)->S(U);
+	std::cout<<GridLogMessage << "Total action H = "<< H << "\n";
 	return H;
      }
-      void integrate(LatticeLorentzColourMatrix& U){
+      void integrate(LatticeGaugeField& U){
 	std::vector<int> clock;
 	clock.resize(as.size(),0);
 	for(int step=0; step< Params.MDsteps; ++step)   // MD step
--- a/lib/qcd/hmc/integrators/Integrator_algorithm.h
+++ b/lib/qcd/hmc/integrators/Integrator_algorithm.h
@@ -27,41 +27,38 @@ namespace Grid{
 	int fl = Integ->as.size() -1;
 	double eps = Integ->Params.stepsize;
-	for(int l=0; l<=level; ++l) eps/= 2.0*Integ->Nrel[l];
+	for(int l=0; l<=level; ++l) eps/= 2.0*Integ->as[l].multiplier;
-	int fin = Integ->Nrel[0];
+	int fin = Integ->as[0].multiplier;
-	for(int l=1; l<=level; ++l) fin*= 2.0*Integ->Nrel[l];
+	for(int l=1; l<=level; ++l) fin*= 2.0*Integ->as[l].multiplier;
 	fin = 3*Integ->Params.MDsteps*fin -1;
-	
+	for(int e=0; e<Integ->as[level].multiplier; ++e){
-	for(int e=0; e<Integ->Nrel[level]; ++e){
 	  if(clock[level] == 0){    // initial half step 
 	    Integ->update_P(U,level,lambda*eps);
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"P "<< clock[level] <<std::endl;
+	    std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
 	  }
 	  if(level == fl){          // lowest level 
 	    Integ->update_U(U,0.5*eps);
-	    
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    std::cout<<GridLogMessage<<"U "<< (clock[level]+1) <<std::endl;
-	    std::cout<<"U "<< (clock[level]+1) <<std::endl;
 	  }else{                 // recursive function call 
 	    step(U,level+1,clock, Integ);
 	  }
 	  Integ->update_P(U,level,(1.0-2.0*lambda)*eps);
 	  ++clock[level];
-	  for(int l=0; l<level;++l) std::cout<<"   ";
+	  for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	  std::cout<<"P "<< (clock[level]) <<std::endl;
+	  std::cout<<GridLogMessage<<"P "<< (clock[level]) <<std::endl;
 	  if(level == fl){          // lowest level 
 	    Integ->update_U(U,0.5*eps);
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"U "<< (clock[level]+1) <<std::endl;
+	    std::cout<<GridLogMessage<<"U "<< (clock[level]+1) <<std::endl;
 	  }else{                 // recursive function call 
 	    step(U,level+1,clock, Integ);
 	  }    
@@ -71,19 +68,17 @@ namespace Grid{
 	    Integ->update_P(U,level,lambda*eps);
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"P "<< clock[level] <<std::endl;
+	    std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
 	  }else{                  // bulk step
 	    Integ->update_P(U,level,lambda*2.0*eps);
 	    clock[level]+=2;
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"P "<< clock[level] <<std::endl;
+	    std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
 	  }
 	}
-	
+		
      }
    };
@@ -93,6 +88,7 @@ namespace Grid{
      void step (LatticeLorentzColourMatrix& U, 
 		 int level, std::vector<int>& clock,
 		 Integrator<LeapFrog>* Integ){
 	// level  : current level
 	// fl     : final level
 	// eps    : current step size
@@ -101,45 +97,43 @@ namespace Grid{
 	double eps = Integ->Params.stepsize;
 	// Get current level step size
-	for(int l=0; l<=level; ++l) eps/= Integ->Nrel[l];
+	for(int l=0; l<=level; ++l) eps/= Integ->as[l].multiplier;
 	int fin = 1;
-	for(int l=0; l<=level; ++l) fin*= Integ->Nrel[l];
+	for(int l=0; l<=level; ++l) fin*= Integ->as[l].multiplier;
 	fin = 2*Integ->Params.MDsteps*fin - 1;
-	for(int e=0; e<Integ->Nrel[level]; ++e){
+	for(int e=0; e<Integ->as[level].multiplier; ++e){
 	  if(clock[level] == 0){    // initial half step
 	    Integ->update_P(U, level,eps/2.0);
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"P "<< 0.5*clock[level] <<std::endl;
+	    std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
 	  }
 	  if(level == fl){          // lowest level
 	    Integ->update_U(U, eps);
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"U "<< 0.5*(clock[level]+1) <<std::endl;
+	    std::cout<<GridLogMessage<<"U "<< 0.5*(clock[level]+1) <<std::endl;
 	  }else{                 // recursive function call
 	    step(U, level+1,clock, Integ);
 	  }
 	  if(clock[level] == fin){  // final half step
 	    Integ->update_P(U, level,eps/2.0);
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"P "<< 0.5*clock[level] <<std::endl;
+	    std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
 	  }else{                  // bulk step
 	    Integ->update_P(U, level,eps);
 	    clock[level]+=2;
-	    for(int l=0; l<level;++l) std::cout<<"   ";
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
-	    std::cout<<"P "<< 0.5*clock[level] <<std::endl;
+	    std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
 	  }
 	}
      }
    };
--- a/lib/qcd/spin/Dirac.h
+++ b/lib/qcd/spin/Dirac.h
@@ -4,7 +4,6 @@ namespace Grid{
 namespace QCD {
  const int SpinorIndex = 2;
  class Gamma {
@@ -344,14 +343,19 @@ namespace QCD {
      typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type 
 	{
 	  iVector<vtype,N> ret;
-	  ret._internal=G*arg._internal;
+	  for(int i=0;i<N;i++){
+	    ret._internal[i]=G*arg._internal[i];
+	  }
 	  return ret;
 	}
    template<class vtype,int N> inline auto operator * ( const Gamma &G,const iMatrix<vtype,N> &arg) ->
      typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type 
 	{
 	  iMatrix<vtype,N> ret;
-	  ret._internal=G*arg._internal;
+	  for(int i=0;i<N;i++){
+	  for(int j=0;j<N;j++){
+	    ret._internal[i][j]=G*arg._internal[i][j];
+	  }}
 	  return ret;
 	}
@@ -369,14 +373,19 @@ namespace QCD {
      typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type 
 	{
 	  iVector<vtype,N> ret;
-	  ret._internal=arg._internal*G;
+	  for(int i=0;i<N;i++){
+	    ret._internal[i]=arg._internal[i]*G;
+	  }
 	  return ret;
 	}
    template<class vtype,int N> inline auto operator * (const iMatrix<vtype,N> &arg, const Gamma &G) ->
      typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type 
 	{
 	  iMatrix<vtype,N> ret;
-	  ret._internal=arg._internal*G;
+	  for(int i=0;i<N;i++){
+	  for(int j=0;j<N;j++){
+	    ret._internal[i][j]=arg._internal[i][j]*G;
+	  }}
 	  return ret;
 	}
--- a/lib/qcd/spin/TwoSpinor.h
+++ b/lib/qcd/spin/TwoSpinor.h
--- a/lib/qcd/utils/LinalgUtils.h
+++ b/lib/qcd/utils/LinalgUtils.h
@@ -32,10 +32,11 @@ void ag5xpby_ssp(Lattice<vobj> &z,RealD a,const Lattice<vobj> &x,RealD b,const L
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
-    multGamma5(tmp(),a*x._odata[ss+s]());
+    tmp = G5*x._odata[ss+s]*a;
    tmp = tmp + b*y._odata[ss+sp];
    vstream(z._odata[ss+s],tmp);
  }
@@ -49,10 +50,11 @@ void axpbg5y_ssp(Lattice<vobj> &z,RealD a,const Lattice<vobj> &x,RealD b,const L
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
-    multGamma5(tmp(),b*y._odata[ss+sp]());
+    tmp = G5*y._odata[ss+sp]*b;
    tmp = tmp + a*x._odata[ss+s];
    vstream(z._odata[ss+s],tmp);
  }
@@ -66,12 +68,13 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,RealD a,const Lattice<vobj> &x,RealD b,const
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp1;
    vobj tmp2;
    tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
-    multGamma5(tmp2(),tmp1());
+    tmp2 = G5*tmp1;
    vstream(z._odata[ss+s],tmp2);
  }
 }
@@ -117,12 +120,13 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
  z.checkerboard = x.checkerboard;
  conformable(x,z);
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
    for(int s=0;s<Ls;s++){
      int sp = Ls-1-s;
-      multGamma5(tmp(),x._odata[ss+s]());
+      tmp = G5*x._odata[ss+s];
      vstream(z._odata[ss+sp],tmp);
    }
  }
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -372,7 +372,7 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    LatticeReal d(grid); d=zero;
    LatticeReal alpha(grid);
-    //    std::cout<<"xi "<<xi <<std::endl;
+    //    std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
    alpha = toReal(2.0*xi);
    do { 
@@ -468,11 +468,11 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    LatticeMatrix Vcheck(grid);
    Vcheck = zero;
    Vcheck = where(Accepted,V*adj(V) - 1.0,Vcheck);
-    //    std::cout << "SU3 check " <<norm2(Vcheck)<<std::endl;
+    //    std::cout<<GridLogMessage << "SU3 check " <<norm2(Vcheck)<<std::endl;
    assert(norm2(Vcheck)<1.0e-4);
    // Verify the link stays in SU(3)
-    //    std::cout <<"Checking the modified link"<<std::endl;
+    //    std::cout<<GridLogMessage <<"Checking the modified link"<<std::endl;
    Vcheck = link*adj(link) - 1.0;
    assert(norm2(Vcheck)<1.0e-4);
    /////////////////////////////////
@@ -483,42 +483,42 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    for(int gen=0;gen<generators();gen++){
      Matrix ta;
      generator(gen,ta);
-      std::cout<< "Nc = "<<ncolour<<" t_"<<gen<<std::endl;
+      std::cout<<GridLogMessage<< "Nc = "<<ncolour<<" t_"<<gen<<std::endl;
-      std::cout<<ta<<std::endl;
+      std::cout<<GridLogMessage<<ta<<std::endl;
    }
  }
  static void testGenerators(void){
    Matrix ta;
    Matrix tb;
-    std::cout<<"Checking trace ta tb is 0.5 delta_ab"<<std::endl;
+    std::cout<<GridLogMessage<<"Checking trace ta tb is 0.5 delta_ab"<<std::endl;
    for(int a=0;a<generators();a++){
      for(int b=0;b<generators();b++){
 	generator(a,ta);
 	generator(b,tb);
 	Complex tr =TensorRemove(trace(ta*tb)); 
-	std::cout<<tr<<" ";
+	std::cout<<GridLogMessage<<tr<<" ";
 	if(a==b) assert(abs(tr-Complex(0.5))<1.0e-6);
 	if(a!=b) assert(abs(tr)<1.0e-6);
      }
-      std::cout<<std::endl;
+      std::cout<<GridLogMessage<<std::endl;
    }
-    std::cout<<"Checking hermitian"<<std::endl;
+    std::cout<<GridLogMessage<<"Checking hermitian"<<std::endl;
    for(int a=0;a<generators();a++){
      generator(a,ta);
-      std::cout<<a<<" ";
+      std::cout<<GridLogMessage<<a<<" ";
      assert(norm2(ta-adj(ta))<1.0e-6);
    }    
-    std::cout<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<"Checking traceless"<<std::endl;
+    std::cout<<GridLogMessage<<"Checking traceless"<<std::endl;
    for(int a=0;a<generators();a++){
      generator(a,ta);
      Complex tr =TensorRemove(trace(ta)); 
-      std::cout<<a<<" ";
+      std::cout<<GridLogMessage<<a<<" ";
      assert(abs(tr)<1.0e-6);
    }    
-    std::cout<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;
  }
  // reunitarise??
@@ -554,9 +554,7 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    for(int a=0;a<generators();a++){
      gaussian(pRNG,ca); 
      generator(a,ta);
      la=toComplex(ca)*ci*ta;
      out += la; 
    }
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -7,7 +7,6 @@ namespace QCD {
 template<class GaugeMat,class GaugeLorentz>
 class WilsonLoops {
 public:
  //////////////////////////////////////////////////
  // directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,14 +1,16 @@
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
-  @brief Optimization libraries for NEON (ARM) instructions set ARMv7
+  @brief Optimization libraries for NEON (ARM) instructions set ARMv8
  Experimental - Using intrinsics - DEVELOPING! 
 */
-// Time-stamp: <2015-06-09 15:25:40 neo>
+// Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------
 #include <arm_neon.h>
 // ARMv8 supports double precision
 namespace Optimization {
  template<class vtype>
@@ -22,50 +24,47 @@ namespace Optimization {
    float f[4];
  };
  union u128d {
-    float32x4_t v;
+    float64x2_t v;
-    float f[4];
+    double f[4];
  };
  struct Vsplat{
    //Complex float
    inline float32x4_t operator()(float a, float b){
-      float32x4_t foo;
+      float tmp[4]={a,b,a,b};
-      return foo;
+      return vld1q_f32(tmp);
    }
    // Real float
    inline float32x4_t operator()(float a){
-      float32x4_t foo;
+      return vld1q_dup_f32(&a);
-      return foo;
    }
    //Complex double
    inline float32x4_t operator()(double a, double b){
-      float32x4_t foo;
+      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
-      return foo;
+      return vld1q_f32(tmp);
    }
    //Real double
    inline float32x4_t operator()(double a){
-      float32x4_t foo;
+      return vld1q_dup_f32(&a);
-      return foo;
    }
    //Integer
    inline uint32x4_t operator()(Integer a){
-      uint32x4_t foo;
+      return vld1q_dup_u32(&a);
-      return foo;
    }
  };
  struct Vstore{
    //Float 
    inline void operator()(float32x4_t a, float* F){
-      
+      vst1q_f32(F, a);
    }
    //Double
    inline void operator()(float32x4_t a, double* D){
-      
+      vst1q_f32((float*)D, a);
    }
    //Integer
    inline void operator()(uint32x4_t a, Integer* I){
-     
+      vst1q_u32(I, a);
    }
  };
@@ -130,36 +129,30 @@ namespace Optimization {
  struct Sum{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
+      return vaddq_f32(a,b);
-      return foo;
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-    //  float32x4_t foo;
+      return vaddq_f64(a,b);
-    //  return foo;
+    }
-    //}
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
+      return vaddq_u32(a,b);
-      return foo;
    }
  };
  struct Sub{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
+      return vsubq_f32(a,b);
-      return foo;
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-    //  float32x4_t foo;
+      return vsubq_f64(a,b);
-    //  return foo;
+    }
-    //}
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
+      return vsubq_u32(a,b);
-      return foo;
    }
  };
@@ -170,24 +163,24 @@ namespace Optimization {
      return foo;
    }
    // Complex double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-    //  float32x4_t foo;
+      float32x4_t foo;
-    //  return foo;
+      return foo;
-    //}
+    }
  };
  struct Mult{
    // Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return a;
+      return vmulq_f32(a,b);
    }
    // Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-    //  return 0;
+      return vmulq_f64(a,b);
-    //}
+    }
    // Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return a;
+      return vmulq_u32(a,b);
    }
  };
@@ -219,6 +212,7 @@ namespace Optimization {
  struct TimesI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
      //need shuffle
      return in;
    }
    //Complex double
@@ -242,20 +236,25 @@ namespace Optimization {
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x2_t high = vget_high_f32(in);
+    float32x2_t low = vget_low_f32(in);
+    float32x2_t tmp = vadd_f32(low, high);
+    float32x2_t sum = vpadd_f32(tmp, tmp);
+    return vget_lane_f32(sum,0);
  }
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
+  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
    return 0;
  }
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
+  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    return 0;
+    float64x2_t sum = vpaddq_f64(in, in);
+    return vgetq_lane_f64(sum,0);
  }
  //Integer Reduce
@@ -272,7 +271,7 @@ namespace Optimization {
 namespace Grid {
  typedef float32x4_t  SIMD_Ftype; // Single precision type
-  typedef float32x4_t  SIMD_Dtype; // Double precision type - no double on ARMv7
+  typedef float64x2_t  SIMD_Dtype; // Double precision type
  typedef uint32x4_t   SIMD_Itype; // Integer type
  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
--- a/Show More
+++ b/Show More
`@@ -1,4 +1,4 @@`

	HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_exp.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_logical.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_index.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_determinant.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/hmc/HMC.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/ActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h 
./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Old/Tensor_poke.h ./Old/Tensor_peek.h ./Threads.h ./Grid.h ./algorithms/Preconditioner.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/AdefGeneric.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h	HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h 
./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./MacroMagic.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/NerscIO.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h 
./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h

	CCFILES=./qcd/hmc/integrators/Integrator.cc ./qcd/hmc/HMC.cc ./qcd/utils/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/spin/Dirac.cc ./GridInit.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc	CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/hmc/integrators/Integrator.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc