Merge pull request #1 from paboyle/master

Sync with Peter
2025-09-18 17:21:05 +01:00 · 2015-08-19 17:27:31 +02:00
parent b0eedfd7ba fdfe194c41
commit dd498f993e
168 changed files with 30055 additions and 2530 deletions
--- a/76
+++ b/76
@@ -1,15 +1,61 @@
+- PseudoFermions
+
+Done: Cayley, Partial, ContFrac force terms.
+
+Done:
+  - TwoFlavour
+  - TwoFlavourEvenOdd        
+  - TwoFlavourRatio
+  - TwoFlavourRatioEvenOdd
+
+Done:
+  - OneFlavourRationalEvenOdd
+  - OneFlavourRationalRatioEvenOdd
+  - OneFlavourRationalRatio
+
+TODO:
+=> generalise to non-const EE
+=> Test DWF HMC
+=> Clean up HMC
+   - Fix a threading bug that has been introduced and prevents HMC from running in hybrid OMP mode
+
+=> Integrators
+  - Force Gradient
+  - Multi-timescale looks broken and is operating on a single timescale for now.
+    Fix/debug/rewrite this.
+  - Sign of force term.
+  - Prefer "RefreshInternal" or suchlike over "init" in naming
+  - Rename "Ta", as the name is too unclear
+
+- MacroMagic -> virtual reader class.
+
+- Link smearing/boundary conds; Policy class based implementation
+
+- Rectangle gauge actions.
+  Iwasaki,
+  Symanzik,
+  ... etc...
+
+- Prepare multigrid for HMC.
+  Alternate setup schemes.
+
+- RNG filling from sparser grid, lower dim grid.
+
 ================================================================
 *** Hacks and bug fixes to clean up and Audits
 ================================================================

 *  Extract/merge/set cleanup ; too many variants; rationalise and call simpler ones
-*  Used #define repetitive sequences to minimise code.
+
 *  Rewrite core tensor arithmetic support to be more systematic
+ =  Use #define'd repetitive sequences to minimise code; decreasing the line count by thousands is possible,
+    with a more robust and maintainable implementation.
+
 *  Ensure we ET as much as possible; move unop functions into ET framework.
   - tests with expression args to all functions

-
 * FIXME audit
+
 * const audit

 Insert/Extract
@@ -22,10 +68,10 @@ Insert/Extract

 * Thread scaling tests Xeon, XeonPhi

-** Make the Tensor types and Complex etc... play more nicely.
+* Make the Tensor types and Complex etc... play more nicely.
  - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
-  QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
-  want to introduce a syntax that does not require this.
+    QDP forces use of "toDouble" to get back to a non-tensor scalar. This role is presently taken by TensorRemove, but I
+    want to introduce a syntax that does not require this.

  - Reductions that contract indices on a site should always demote the tensor structure.
    norm2(), innerProduct.
@@ -54,13 +100,6 @@ Insert/Extract
   // localMaxAbs
   // Fourier transform equivalent.]

-================================================================
-*** New Functionality
-================================================================
-
-* - BinaryWriter, TextWriter etc...
-  - use protocol buffers? replace xmlReader/Writer ec..
-  - Binary use htonll, htonl

 * CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)

@@ -103,14 +142,15 @@ Algorithms (lots of reuse/port from BFM)
 * Gauge
  - Wilson, symanzik, iwasaki

-* rb4d support for 5th dimension in Mobius.
-
 * Flavour matrices?
-* Pauli, SU subgroup, etc.. 
-* su3 exponentiation & log etc.. [Jamie's code?]
-* TaProj
-* FFTnD ?

+* Pauli, SU subgroup, etc.. 
+
+* su3 exponentiation & log etc.. [Jamie's code?]
+
+* TaProj
+
+* FFTnD ?

 ======================================================================================================
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -11,15 +11,15 @@ int main (int argc, char ** argv)
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  int Nloop=10;
  int nmu=0;
  for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;



@@ -87,15 +87,15 @@ int main (int argc, char ** argv)

      double time = stop-start; // microseconds

-      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    


-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;


  for(int lat=4;lat<=32;lat+=2){
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)

      double time = stop-start;

-      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }  

--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -21,7 +21,7 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=8;
@@ -79,9 +79,9 @@ int main (int argc, char ** argv)

  RealD mass=0.1;
  RealD M5  =1.8;
-  DomainWallFermion Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  
-  std::cout << "Calling Dw"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=10;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
@@ -92,12 +92,12 @@ int main (int argc, char ** argv)
  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
  double flops=1344*volume*ncall;
  
-  std::cout << "Called Dw"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;


  if (1)
@@ -120,11 +120,11 @@ int main (int argc, char ** argv)
    ref = -0.5*ref;
  }
  Dw.Dhop(src,result,1);
-  std::cout << "Called DwDag"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
@@ -133,24 +133,32 @@ int main (int argc, char ** argv)
  LatticeFermion r_eo  (FGrid);


-  std::cout << "Calling Deo and Doe"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);

+  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
+
+
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
-  Dw.Dhop(src,result,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
+
+  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
+  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
+  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;

  setCheckerboard(r_eo,r_o);
  setCheckerboard(r_eo,r_e);

  err = r_eo-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
-  std::cout << "norm diff even  "<< norm2(src_e)<<std::endl;
-  std::cout << "norm diff odd   "<< norm2(src_o)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;

  Grid_finalize();
 }
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -17,13 +17,13 @@ int main (int argc, char ** argv)
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=32;lat+=4){

@@ -49,15 +49,15 @@ int main (int argc, char ** argv)
      
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking a*x + y bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  
  for(int lat=4;lat<=32;lat+=4){

@@ -81,14 +81,14 @@ int main (int argc, char ** argv)
     
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SCALE bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SCALE bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;

  for(int lat=4;lat<=32;lat+=4){

@@ -114,15 +114,15 @@ int main (int argc, char ** argv)
      
      double bytes=2*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*1;// mul
-      std::cout <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

  }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking READ bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=32;lat+=4){

@@ -147,7 +147,7 @@ int main (int argc, char ** argv)
      
      double bytes=vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

  }    

--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -14,15 +14,15 @@ int main (int argc, char ** argv)
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -43,18 +43,18 @@ int main (int argc, char ** argv)
      double bytes=3.0*vol*Nc*Nc*sizeof(Complex);
      double footprint=2.0*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6.0+8.0+8.0)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }


-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  z= x*y"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x*y"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -75,17 +75,17 @@ int main (int argc, char ** argv)
      
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -106,17 +106,17 @@ int main (int argc, char ** argv)
      
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  mac(z,x,y)"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mac(z,x,y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -137,7 +137,7 @@ int main (int argc, char ** argv)
      
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(8+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -28,10 +28,10 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-  std::cout << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
-  std::cout << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
-  std::cout << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);
@@ -58,8 +58,8 @@ int main (int argc, char ** argv)
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
    if(0) {
-      if (nn==-1) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
-      else       { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
+      if (nn==-1) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
+      else       { U[nn] = cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
    }
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
@@ -87,9 +87,9 @@ int main (int argc, char ** argv)
  }
  ref = -0.5*ref;
  RealD mass=0.1;
-  WilsonFermion Dw(Umu,Grid,RBGrid,mass);
+  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  
-  std::cout << "Calling Dw"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=10000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
@@ -98,12 +98,12 @@ int main (int argc, char ** argv)
  double t1=usecond();
  double flops=1344*volume*ncall;
  
-  std::cout << "Called Dw"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;


  //  for(int ss=0;ss<10;ss++ ){
@@ -112,7 +112,7 @@ int main (int argc, char ** argv)
      for(int j=0;j<Nc;j++){
 	ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);
 	ComplexF * res_p = (ComplexF *)&result._odata[ss]()(i)(j);
-	std::cout << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
+	std::cout<<GridLogMessage << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
      }
    }
  }
@@ -136,11 +136,11 @@ int main (int argc, char ** argv)
  }
  ref = -0.5*ref;
  Dw.Dhop(src,result,1);
-  std::cout << "Called DwDag"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  Grid_finalize();
 }
--- a/8112
+++ b/8112
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-06-09 15:26:39 neo>
+# Time-stamp: <2015-07-10 17:46:21 neo>

 AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -11,7 +11,7 @@ AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([lib/Grid.h])
-AC_CONFIG_HEADERS([lib/GridConfig.h])
+AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

 AC_MSG_NOTICE([
@@ -26,10 +26,9 @@ AC_LANG(C++)
 AC_PROG_CXX
 AC_OPENMP
 AC_PROG_RANLIB
-AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
+#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
 AX_EXT

-
 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)

@@ -66,7 +65,6 @@ AC_CHECK_LIB([mpfr],[mpfr_init],,
 Please install or provide the correct path to your installation
 Info at: http://www.mpfr.org/)])

-
 AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\
 	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\
 	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
@@ -106,9 +104,9 @@ case ${ac_SIMD} in
       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
       supported="cross compilation"
     ;;
-     NEONv7)
-       echo Configuring for experimental ARMv7 support 
-       AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
+     NEONv8)
+       echo Configuring for experimental ARMv8a support 
+       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
       supported="cross compilation"
     ;;
     DEBUG)
@@ -155,15 +153,15 @@ AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
-echo
-echo Checking doxygen support 
-echo :::::::::::::::::::::::::::::::::::::::::::
-AC_PROG_DOXYGEN
+#echo
+#echo Checking doxygen support 
+#echo :::::::::::::::::::::::::::::::::::::::::::
+#AC_PROG_DOXYGEN

-if test -n "$DOXYGEN"
-then
-AC_CONFIG_FILES([docs/doxy.cfg])
-fi
+#if test -n "$DOXYGEN"
+#then
+#AC_CONFIG_FILES([docs/doxy.cfg])
+#fi

 echo
 echo Creating configuration files
--- a/gcc-bug-report/broken.cc
+++ b/gcc-bug-report/broken.cc
@@ -29,12 +29,12 @@ public:

 template<int N,class obj,typename std::enable_if<N==obj::NestLevel >::type * = nullptr > auto function(const obj &arg)-> obj
 {
-  std::cout<<"Leaf "<<obj::NestLevel<<std::endl;
+  std::cout<<GridLogMessage<<"Leaf "<<obj::NestLevel<<std::endl;
  return arg;
 }
 template<int N,class obj,typename std::enable_if<N!=obj::NestLevel >::type * = nullptr > auto function(const obj &arg)-> obj
 {
-  std::cout<<"Node "<<obj::NestLevel<<std::endl;
+  std::cout<<GridLogMessage<<"Node "<<obj::NestLevel<<std::endl;
  obj ret;
  ret.internal=function<N>(arg.internal);
  return ret;
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -1,6 +1,13 @@
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H

+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
 #include <immintrin.h>
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
--- a/lib/GridConfig.h
+++ b/lib/GridConfig.h
@@ -1,5 +1,5 @@
-/* lib/GridConfig.h.  Generated from GridConfig.h.in by configure.  */
-/* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
+/* lib/Config.h.  Generated from Config.h.in by configure.  */
+/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */

 /* AVX Intrinsics */
 /* #undef AVX1 */
@@ -34,9 +34,6 @@
 /* Support AVX2 (Advanced Vector Extensions 2) instructions */
 /* #undef HAVE_AVX2 */

-/* define if the compiler supports basic C++11 syntax */
-/* #undef HAVE_CXX11 */
-
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #define HAVE_DECL_BE64TOH 1
@@ -120,8 +117,8 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1

-/* NEON ARMv7 Experimental support */
-/* #undef NEONv7 */
+/* NEON ARMv8 Experimental support */
+/* #undef NEONv8 */

 /* Name of package */
 #define PACKAGE "grid"
--- a/lib/GridConfig.h.in
+++ b/lib/GridConfig.h.in
@@ -1,4 +1,4 @@
-/* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
+/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */

 /* AVX Intrinsics */
 #undef AVX1
@@ -33,9 +33,6 @@
 /* Support AVX2 (Advanced Vector Extensions 2) instructions */
 #undef HAVE_AVX2

-/* define if the compiler supports basic C++11 syntax */
-#undef HAVE_CXX11
-
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #undef HAVE_DECL_BE64TOH
@@ -119,8 +116,8 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H

-/* NEON ARMv7 Experimental support */
-#undef NEONv7
+/* NEON ARMv8 Experimental support */
+#undef NEONv8

 /* Name of package */
 #undef PACKAGE
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -6,92 +6,48 @@
 //  Copyright (c) 2014 University of Edinburgh. All rights reserved.
 //

-
 #ifndef GRID_H
 #define GRID_H

+///////////////////
+// Std C++ dependencies
+///////////////////
 #include <cassert>
-
 #include <complex>
 #include <vector>
-
 #include <iostream>
 #include <iomanip>
 #include <random>
 #include <functional>
-
 #include <stdio.h>
 #include <stdlib.h>
-#include <sys/time.h>
 #include <stdio.h>
 #include <signal.h>
+#include <ctime>
+#include <sys/time.h>
+#include <chrono>

-#ifndef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif
-
-#define strong_inline __attribute__((always_inline)) inline
-
-#include <GridConfig.h>
-
-////////////////////////////////////////////////////////////
-// Tunable header includes
-////////////////////////////////////////////////////////////
-
-#ifdef HAVE_MALLOC_MALLOC_H
-#include <malloc/malloc.h>
-#endif
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-
+///////////////////
+// Grid headers
+///////////////////
+#include <MacroMagic.h>
+#include <Config.h>
+#include <Timer.h>
+#include <Log.h>
 #include <AlignedAllocator.h>
-
 #include <Simd.h>
 #include <Threads.h>
-
-#include <Communicator.h> // subdir aggregate
-#include <Cartesian.h> // subdir aggregate
-#include <Tensors.h>   // subdir aggregate
-#include <Lattice.h>   // subdir aggregate
-#include <Cshift.h>    // subdir aggregate
-#include <Stencil.h>   // subdir aggregate
-#include <Algorithms.h>// subdir aggregate
-
+#include <Communicator.h> 
+#include <Cartesian.h>    
+#include <Tensors.h>      
+#include <Lattice.h>      
+#include <Cshift.h>       
+#include <Stencil.h>      
+#include <Algorithms.h>   
 #include <qcd/QCD.h>
 #include <parallelIO/NerscIO.h>

-namespace Grid {
+#include <Init.h>

-  void Grid_init(int *argc,char ***argv);
-  void Grid_finalize(void);
-  // internal, controled with --handle
-  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
-  void Grid_debug_handler_init(void);
-  void Grid_quiesce_nodes(void);
-  void Grid_unquiesce_nodes(void);
-
-  // C++11 time facilities better?
-  double usecond(void);
-
-  const std::vector<int> GridDefaultSimd(int dims,int nsimd);
-  const std::vector<int> &GridDefaultLatt(void);
-  const std::vector<int> &GridDefaultMpi(void);
-  const int              &GridThreads(void)  ;
-  void                 GridSetThreads(int t) ;
-
-  // Common parsing chores
-  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
-  bool        GridCmdOptionExists(char** begin, char** end, const std::string& option);
-  std::string GridCmdVectorIntToString(const std::vector<int> & vec);
-
-  void GridParseLayout(char **argv,int argc,
-		       std::vector<int> &latt,
-		       std::vector<int> &simd,
-		       std::vector<int> &mpi);
-
-
-};

 #endif
--- a/lib/GridInit.cc
+++ b/lib/GridInit.cc
@@ -25,17 +25,19 @@

 namespace Grid {

-  //////////////////////////////////////////////////////
-  // Convenience functions to access stadard command line arg
-  // driven parallelism controls
-  //////////////////////////////////////////////////////
-  static std::vector<int> Grid_default_latt;
-  static std::vector<int> Grid_default_mpi;
+//////////////////////////////////////////////////////
+// Convenience functions to access standard command line arg
+// driven parallelism controls
+//////////////////////////////////////////////////////
+static std::vector<int> Grid_default_latt;
+static std::vector<int> Grid_default_mpi;
+int GridThread::_threads;

-  int GridThread::_threads;

-  const std::vector<int> GridDefaultSimd(int dims,int nsimd)
-  {
+const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
+const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
+const std::vector<int> GridDefaultSimd(int dims,int nsimd)
+{
    std::vector<int> layout(dims);
    int nn=nsimd;
    for(int d=dims-1;d>=0;d--){
@@ -48,15 +50,11 @@ namespace Grid {
    }
    assert(nn==1);
    return layout;
-  }
+}
  
-  
-  const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
-  const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
-
-  ////////////////////////////////////////////////////////////
-  // Command line parsing assist for stock controls
-  ////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+// Command line parsing assist for stock controls
+////////////////////////////////////////////////////////////
 std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option)
 {
  char ** itr = std::find(begin, end, option);
@@ -70,6 +68,23 @@ bool GridCmdOptionExists(char** begin, char** end, const std::string& option)
 {
  return std::find(begin, end, option) != end;
 }
+  // Split the comma separated list in str into vec; previous contents of vec are discarded
+void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec) // str is taken by value and consumed below
+{
+  size_t pos = 0;
+  std::string token;
+  std::string delimiter(",");
+
+  vec.resize(0);
+  while ((pos = str.find(delimiter)) != std::string::npos) { // peel off one token per delimiter found
+    token = str.substr(0, pos);
+    vec.push_back(token);
+    str.erase(0, pos + delimiter.length()); // drop the token together with its delimiter
+  }
+  token = str; // remainder after the last delimiter; pushed even if empty, so "" yields one empty token
+  vec.push_back(token);
+  return;
+}

 void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
 {
@@ -84,6 +99,7 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
  return;
 }

+
 void GridParseLayout(char **argv,int argc,
 		     std::vector<int> &latt,
 		     std::vector<int> &mpi)
@@ -117,8 +133,9 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
  std::copy(vec.begin(), vec.end(),std::ostream_iterator<int>(oss, " "));
  return oss.str();
 }
-  /////////////////////////////////////////////////////////
-  /////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////
+//
+/////////////////////////////////////////////////////////
 void Grid_init(int *argc,char ***argv)
 {
 #ifdef GRID_COMMS_MPI
@@ -126,15 +143,33 @@ void Grid_init(int *argc,char ***argv)
 #endif
  // Parse command line args.

+  GridLogger::StopWatch.Start();
+
+  std::string arg;
+  std::vector<std::string> logstreams;
+  std::string defaultLog("Error,Warning,Message,Performance");
+
+  GridCmdOptionCSL(defaultLog,logstreams);
+  GridLogConfigure(logstreams);
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
-    std::cout<<"--help : this message"<<std::endl;
-    std::cout<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
-    std::cout<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
-    std::cout<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
-    std::cout<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
-    std::cout<<"--omp n         : default number of OMP threads"<<std::endl;    
-    std::cout<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
+    std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
+    std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
+    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
+    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
+    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
+    std::cout<<GridLogMessage<<"--omp n         : default number of OMP threads"<<std::endl;    
+    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
+    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Debug"<<std::endl;    
  }
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
+    arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log");
+    GridCmdOptionCSL(arg,logstreams);
+    GridLogConfigure(logstreams);
+  }
+
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -142,8 +177,7 @@ void Grid_init(int *argc,char ***argv)
    Grid_quiesce_nodes();
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-    WilsonFermion::HandOptDslash=1;
-    WilsonFermion5D::HandOptDslash=1;
+    WilsonFermionStatic::HandOptDslash=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
@@ -152,39 +186,19 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<"Grid Decomposition\n";
-    std::cout<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
-    std::cout<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
-    std::cout<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
-    std::cout<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
-    std::cout<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
-    std::cout<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
+    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }

+
 }

  
-  ////////////////////////////////////////////////////////////
-  // Verbose limiter on MPI tasks
-  ////////////////////////////////////////////////////////////
-  void Grid_quiesce_nodes(void)
-  {
-#ifdef GRID_COMMS_MPI
-    int me;
-    MPI_Comm_rank(MPI_COMM_WORLD,&me);
-    if ( me ) { 
-      std::cout.setstate(std::ios::badbit);
-    }
-#endif
-  }
-  void Grid_unquiesce_nodes(void)
-  {
-#ifdef GRID_COMMS_MPI
-    std::cout.clear();
-#endif
-  }
-
-  
 void Grid_finalize(void)
 {
 #ifdef GRID_COMMS_MPI
--- a/lib/Init.h
+++ b/lib/Init.h
@@ -0,0 +1,32 @@
+#ifndef GRID_INIT_H
+#define GRID_INIT_H
+
+namespace Grid {
+
+  void Grid_init(int *argc,char ***argv);
+  void Grid_finalize(void);
+  // internal, controlled with --handle
+  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
+  void Grid_debug_handler_init(void);
+  void Grid_quiesce_nodes(void);
+  void Grid_unquiesce_nodes(void);
+
+  const std::vector<int> GridDefaultSimd(int dims,int nsimd);
+  const std::vector<int> &GridDefaultLatt(void);
+  const std::vector<int> &GridDefaultMpi(void);
+  const int              &GridThreads(void)  ;
+  void                    GridSetThreads(int t) ;
+
+  // Common parsing chores
+  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
+  bool        GridCmdOptionExists(char** begin, char** end, const std::string& option);
+  std::string GridCmdVectorIntToString(const std::vector<int> & vec);
+
+  void GridParseLayout(char **argv,int argc,
+		       std::vector<int> &latt,
+		       std::vector<int> &simd,
+		       std::vector<int> &mpi);
+
+
+};
+#endif
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -0,0 +1,62 @@
+#include <Grid.h>
+
+namespace Grid {
+
+GridStopWatch GridLogger::StopWatch;
+std::ostream  GridLogger::devnull(0);
+
+GridLogger GridLogError      (1,"Error");
+GridLogger GridLogWarning    (1,"Warning");
+GridLogger GridLogMessage    (1,"Message");
+GridLogger GridLogDebug      (1,"Debug");
+GridLogger GridLogPerformance(1,"Performance");
+GridLogger GridLogIterative  (1,"Iterative");
+
+void GridLogConfigure(std::vector<std::string> &logstreams) // enable exactly the log streams named in logstreams
+{
+  GridLogError.Active(0); // first switch every stream off ...
+  GridLogWarning.Active(0);
+  GridLogMessage.Active(0);
+  GridLogIterative.Active(0);
+  GridLogDebug.Active(0);
+  GridLogPerformance.Active(0);
+
+  for(int i=0;i<logstreams.size();i++){ // ... then re-enable by exact (case-sensitive) name; unrecognised names are ignored
+    if ( logstreams[i]== std::string("Error")       ) GridLogError.Active(1);
+    if ( logstreams[i]== std::string("Warning")     ) GridLogWarning.Active(1);
+    if ( logstreams[i]== std::string("Message")     ) GridLogMessage.Active(1);
+    if ( logstreams[i]== std::string("Iterative")   ) GridLogIterative.Active(1);
+    if ( logstreams[i]== std::string("Debug")       ) GridLogDebug.Active(1);
+    if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1);
+  }
+}
+
+////////////////////////////////////////////////////////////
+// Verbose limiter on MPI tasks
+////////////////////////////////////////////////////////////
+void Grid_quiesce_nodes(void) // silence std::cout on non-zero ranks; compiled to a no-op without GRID_COMMS_MPI
+{
+#ifdef GRID_COMMS_MPI
+  int me;
+  MPI_Comm_rank(MPI_COMM_WORLD,&me);
+  if ( me ) { // every rank except rank 0
+    std::cout.setstate(std::ios::badbit); // put std::cout into a failed state so later inserts on this rank do nothing
+  }
+#endif
+}
+
+void Grid_unquiesce_nodes(void) // counterpart of Grid_quiesce_nodes; compiled to a no-op without GRID_COMMS_MPI
+{
+#ifdef GRID_COMMS_MPI
+    std::cout.clear(); // reset std::cout's error state (clears the badbit set when quiescing)
+#endif
+}
+
+std::ostream& operator<< (std::ostream& stream, const GridTime& time) // print a GridTime as "<count> ms"
+{
+  stream << time.count()<<" ms"; // GridTime is typedef'd to std::chrono::milliseconds in Timer.h
+  return stream;
+}
+
+}
+
--- a/lib/Log.h
+++ b/lib/Log.h
@@ -0,0 +1,46 @@
+#ifndef GRID_LOG_H
+#define GRID_LOG_H
+namespace Grid {
+
+// Dress the output; use std::chrono for time stamping via the StopWatch class
+
+std::ostream& operator<< (std::ostream& stream, const GridTime& time);
+
+class GridLogger { // named log stream tag: inserting it into an ostream writes a prefix, or diverts the chain to devnull
+  int active; // non-zero: prefix is written; zero: output is diverted to devnull
+  std::string name; // label printed in the prefix
+public:
+
+  static GridStopWatch StopWatch; // shared by all loggers; supplies the time stamp in the prefix
+  static std::ostream devnull; // sink returned while inactive (defined in Log.cc with a null stream buffer)
+  
+  GridLogger(int on, std::string nm): active(on), name(nm) { 
+  };
+  
+  void Active(int on) {active = on;}; // switch this stream on (non-zero) or off (zero)
+
+  friend std::ostream& operator<< (std::ostream& stream, const GridLogger& log){
+    if ( log.active ) {
+      StopWatch.Stop(); // NOTE(review): Stop() asserts the watch is running, so StopWatch.Start() must have been called first (Grid_init does this)
+      GridTime now = StopWatch.Elapsed(); // Elapsed() asserts the watch is stopped, hence the Stop/Start bracket
+      StopWatch.Start();
+      stream << "Grid : "<<log.name << " : " << now << " : ";
+      return stream;
+    } else { 
+      return devnull; // the remainder of the << chain is then applied to devnull instead of stream
+    }
+  }
+
+};
+
+void GridLogConfigure(std::vector<std::string> &logstreams);
+
+extern GridLogger GridLogError;
+extern GridLogger GridLogWarning;
+extern GridLogger GridLogMessage;
+extern GridLogger GridLogDebug  ;
+extern GridLogger GridLogPerformance;
+extern GridLogger GridLogIterative  ;
+
+}
+#endif
--- a/lib/MacroMagic.h
+++ b/lib/MacroMagic.h
@@ -0,0 +1,83 @@
+#ifndef GRID_MACRO_MAGIC_H
+#define GRID_MACRO_MAGIC_H
+
+#define strong_inline __attribute__((always_inline)) inline
+
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#define MIN(x,y) ((x)>(y)?(y):(x))
+#endif
+
+#define GRID_MACRO_FIRST(a, ...) a
+#define GRID_MACRO_SECOND(a, b, ...) b
+
+#define GRID_MACRO_EMPTY()
+
+#define GRID_MACRO_EVAL(...)     GRID_MACRO_EVAL1024(__VA_ARGS__)
+#define GRID_MACRO_EVAL1024(...) GRID_MACRO_EVAL512(GRID_MACRO_EVAL512(__VA_ARGS__))
+#define GRID_MACRO_EVAL512(...)  GRID_MACRO_EVAL256(GRID_MACRO_EVAL256(__VA_ARGS__))
+#define GRID_MACRO_EVAL256(...)  GRID_MACRO_EVAL128(GRID_MACRO_EVAL128(__VA_ARGS__))
+#define GRID_MACRO_EVAL128(...)  GRID_MACRO_EVAL64(GRID_MACRO_EVAL64(__VA_ARGS__))
+#define GRID_MACRO_EVAL64(...)   GRID_MACRO_EVAL32(GRID_MACRO_EVAL32(__VA_ARGS__))
+#define GRID_MACRO_EVAL32(...)   GRID_MACRO_EVAL16(GRID_MACRO_EVAL16(__VA_ARGS__))
+#define GRID_MACRO_EVAL16(...)   GRID_MACRO_EVAL8(GRID_MACRO_EVAL8(__VA_ARGS__))
+#define GRID_MACRO_EVAL8(...)    GRID_MACRO_EVAL4(GRID_MACRO_EVAL4(__VA_ARGS__))
+#define GRID_MACRO_EVAL4(...)    GRID_MACRO_EVAL2(GRID_MACRO_EVAL2(__VA_ARGS__))
+#define GRID_MACRO_EVAL2(...)    GRID_MACRO_EVAL1(GRID_MACRO_EVAL1(__VA_ARGS__))
+#define GRID_MACRO_EVAL1(...) __VA_ARGS__
+
+#define GRID_MACRO_DEFER1(m) m GRID_MACRO_EMPTY()
+#define GRID_MACRO_DEFER2(m) m GRID_MACRO_EMPTY GRID_MACRO_EMPTY()()
+#define GRID_MACRO_DEFER3(m) m GRID_MACRO_EMPTY GRID_MACRO_EMPTY GRID_MACRO_EMPTY()()()
+#define GRID_MACRO_DEFER4(m) m GRID_MACRO_EMPTY GRID_MACRO_EMPTY GRID_MACRO_EMPTY GRID_MACRO_EMPTY()()()()
+
+#define GRID_MACRO_IS_PROBE(...) GRID_MACRO_SECOND(__VA_ARGS__, 0)
+#define GRID_MACRO_PROBE() ~, 1
+
+#define GRID_MACRO_CAT(a,b) a ## b
+
+#define GRID_MACRO_NOT(x) GRID_MACRO_IS_PROBE(GRID_MACRO_CAT(_GRID_MACRO_NOT_, x))
+#define _GRID_MACRO_NOT_0 GRID_MACRO_PROBE()
+
+#define GRID_MACRO_BOOL(x) GRID_MACRO_NOT(GRID_MACRO_NOT(x))
+
+#define GRID_MACRO_IF_ELSE(condition) _GRID_MACRO_IF_ELSE(GRID_MACRO_BOOL(condition))
+#define _GRID_MACRO_IF_ELSE(condition) GRID_MACRO_CAT(_GRID_MACRO_IF_, condition)
+
+#define _GRID_MACRO_IF_1(...) __VA_ARGS__ _GRID_MACRO_IF_1_ELSE
+#define _GRID_MACRO_IF_0(...)             _GRID_MACRO_IF_0_ELSE
+
+#define _GRID_MACRO_IF_1_ELSE(...)
+#define _GRID_MACRO_IF_0_ELSE(...) __VA_ARGS__
+
+#define GRID_MACRO_HAS_ARGS(...) GRID_MACRO_BOOL(GRID_MACRO_FIRST(_GRID_MACRO_END_OF_ARGUMENTS_ __VA_ARGS__)())
+#define _GRID_MACRO_END_OF_ARGUMENTS_() 0
+
+#define GRID_MACRO_MAP(m, first, second, ...)   \
+  m(first,second)                           \
+  GRID_MACRO_IF_ELSE(GRID_MACRO_HAS_ARGS(__VA_ARGS__))(				       \
+				 GRID_MACRO_DEFER4(_GRID_MACRO_MAP)()(m, __VA_ARGS__)   \
+				     )(                                 \
+				       /* Do nothing, just terminate */ \
+									)
+
+#define _GRID_MACRO_MAP() GRID_MACRO_MAP
+
+#define GRID_MACRO_MEMBER(A,B)        A B;
+
+#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
+
+#define GRID_DECL_CLASS_MEMBERS(cname,...)		\
+  \
+  \
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
+  \
+  \
+  friend std::ostream & operator << (std::ostream &os, const cname &obj ) {	\
+    os<<"class "<<#cname<<" {"<<std::endl;\
+    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
+      os<<"}";								\
+    return os;\
+  };  
+
+#endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@

-HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_exp.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_logical.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_index.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_determinant.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/hmc/HMC.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/ActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h 
./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Old/Tensor_poke.h ./Old/Tensor_peek.h ./Threads.h ./Grid.h ./algorithms/Preconditioner.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/AdefGeneric.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./MacroMagic.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/NerscIO.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h 
./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h 
./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h

-CCFILES=./qcd/hmc/integrators/Integrator.cc ./qcd/hmc/HMC.cc ./qcd/utils/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/spin/Dirac.cc ./GridInit.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/hmc/integrators/Integrator.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -41,6 +41,12 @@

 namespace Grid {
  
+  struct StencilEntry { 
+    int _offset;
+    int _is_local;
+    int _permute;
+    int _around_the_world;
+  };

  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
  public:
@@ -58,9 +64,9 @@ namespace Grid {
      std::vector<int>                  _permute_type;

      // npoints x Osites() of these
-      std::vector<std::vector<int>    > _offsets;
-      std::vector<std::vector<int>    > _is_local;
-      std::vector<std::vector<int> >    _permute;
+      std::vector<std::vector<StencilEntry> > _entries;
+
+      inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }

      int _unified_buffer_size;
      int _request_count;
@@ -77,8 +83,8 @@ namespace Grid {
      // Can this be avoided with simpler coding of comms?
      void Local     (int point, int dimension,int shift,int cbmask);
      void Comms     (int point, int dimension,int shift,int cbmask);
-      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
-      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
+      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
+      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);

      // Could allow a functional munging of the halo to another type during the comms.
      // this could implement the 16bit/32bit/64bit compression.
--- a/lib/Timer.h
+++ b/lib/Timer.h
@@ -0,0 +1,52 @@
+#ifndef GRID_TIME_H
+#define GRID_TIME_H
+
+#include <sys/time.h>
+#include <ctime>
+#include <chrono>
+
+namespace Grid {
+
+
+  // Dress the output; use std::chrono
+
+// C++11 time facilities better?
+double usecond(void);
+
+typedef  std::chrono::system_clock          GridClock;
+typedef  std::chrono::time_point<GridClock> GridTimePoint;
+typedef  std::chrono::milliseconds          GridTime;
+
+ 
+class GridStopWatch { // accumulating stopwatch: sums the Start()..Stop() intervals as GridTime (milliseconds)
+private:
+  bool running; // true between Start() and Stop()
+  GridTimePoint start; // time of the most recent Start() (or Reset())
+  GridTime accumulator; // total of all completed intervals since construction or Reset()
+public:
+  GridStopWatch () { 
+    Reset();
+  }
+  void     Start(void) { // begin an interval; asserts the watch is not already running
+    assert(running == false);
+    start = GridClock::now(); 
+    running = true;
+  }
+  void     Stop(void)  { // end the current interval and add it to the total; asserts the watch is running
+    assert(running == true);
+    accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start); // NOTE(review): each interval is truncated to whole ms before summing, so sub-ms intervals add 0
+    running = false; 
+  };
+  void     Reset(void){ // mark as stopped and zero the accumulated total
+    running = false;
+    start = GridClock::now();
+    accumulator = std::chrono::duration_cast<GridTime>(start-start); // start-start is a zero duration
+  }
+  GridTime Elapsed(void) { // accumulated total; asserts the watch is stopped
+    assert(running == false);
+    return accumulator;
+  }
+};
+
+}
+#endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -32,12 +32,12 @@ namespace Grid {
      displacements[2*_d]=0;
      
      //// report back
-      std::cout<<"directions    :";
+      std::cout<<GridLogMessage<<"directions    :";
      for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
      std::cout <<std::endl;
-      std::cout<<"displacements :";
+      std::cout<<GridLogMessage<<"displacements :";
      for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
-      std::cout <<std::endl;
+      std::cout<<std::endl;
    }
  
    /*
@@ -100,9 +100,9 @@ namespace Grid {
 	  eProj._odata[ss](i)=CComplex(1.0);
 	}
 	eProj=eProj - iProj;
-	std::cout<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
+	std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
      }
-      std::cout <<"CheckOrthog done"<<std::endl;
+      std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
    }
    void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
      blockProject(CoarseVec,FineVec,subspace);
@@ -113,27 +113,27 @@ namespace Grid {
    void CreateSubspaceRandom(GridParallelRNG &RNG){
      for(int i=0;i<nbasis;i++){
 	random(RNG,subspace[i]);
-	std::cout<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
+	std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
      }
      Orthogonalise();
    }
-    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop) {
+    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {

      RealD scale;

-      ConjugateGradient<FineField> CG(1.0e-4,10000);
+      ConjugateGradient<FineField> CG(1.0e-2,10000);
      FineField noise(FineGrid);
      FineField Mn(FineGrid);

-      for(int b=0;b<nbasis;b++){
+      for(int b=0;b<nn;b++){
 	
 	gaussian(RNG,noise);
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;

-	hermop.Op(noise,Mn); std::cout << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

-	for(int i=0;i<2;i++){
+	for(int i=0;i<1;i++){

 	  CG(hermop,noise,subspace[b]);

@@ -143,8 +143,9 @@ namespace Grid {

 	}

-	hermop.Op(noise,Mn); std::cout << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-	subspace[b] = noise;
+	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
+	subspace[b]   = noise;
+
      }

      Orthogonalise();
@@ -188,24 +189,22 @@ namespace Grid {
      SimpleCompressor<siteVector> compressor;
      Stencil.HaloExchange(in,comm_buf,compressor);

-      //PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
      for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
 	siteVector nbr;
-	int offset,local,perm,ptype;
-
+	int ptype;
+	StencilEntry *SE;
 	for(int point=0;point<geom.npoint;point++){
-	  offset = Stencil._offsets [point][ss];
-	  local  = Stencil._is_local[point][ss];
-	  perm   = Stencil._permute [point][ss];
-	  ptype  = Stencil._permute_type[point];

-	  if(local&&perm) { 
-	    permute(nbr,in._odata[offset],ptype);
-	  } else if(local) { 
-	    nbr = in._odata[offset];
+	  SE=Stencil.GetEntry(ptype,point,ss);
+	  
+	  if(SE->_is_local&&SE->_permute) { 
+	    permute(nbr,in._odata[SE->_offset],ptype);
+	  } else if(SE->_is_local) { 
+	    nbr = in._odata[SE->_offset];
 	  } else {
-	    nbr = comm_buf[offset];
+	    nbr = comm_buf[SE->_offset];
 	  }
 	  res = res + A[point]._odata[ss]*nbr;
 	}
@@ -251,10 +250,6 @@ namespace Grid {

      // Orthogonalise the subblocks over the basis
      blockOrthogonalise(InnerProd,Subspace.subspace);
-      //Subspace.Orthogonalise();
-      //      Subspace.CheckOrthogonal();
-      //Subspace.Orthogonalise();
-      //      Subspace.CheckOrthogonal();

      // Compute the matrix elements of linop between this orthonormal
      // set of vectors.
@@ -305,6 +300,7 @@ namespace Grid {
 	  Subspace.ProjectToSubspace(oProj,oblock);
 	  //	  blockProject(iProj,iblock,Subspace.subspace);
 	  //	  blockProject(oProj,oblock,Subspace.subspace);
+PARALLEL_FOR_LOOP
 	  for(int ss=0;ss<Grid()->oSites();ss++){
 	    for(int j=0;j<nbasis;j++){
 	      if( disp!= 0 ) {
@@ -320,12 +316,12 @@ namespace Grid {
      ///////////////////////////
      // test code worth preserving in if block
      ///////////////////////////
-      std::cout<< " Computed matrix elements "<< self_stencil <<std::endl;
+      std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
      for(int p=0;p<geom.npoint;p++){
-	std::cout<< "A["<<p<<"]" << std::endl;
-	std::cout<< A[p] << std::endl;
+	std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
+	std::cout<<GridLogMessage<< A[p] << std::endl;
      }
-      std::cout<< " picking by block0 "<< self_stencil <<std::endl;
+      std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;

      phi=Subspace.subspace[0];
      std::vector<int> bc(FineGrid->_ndimension,0);
@@ -333,9 +329,9 @@ namespace Grid {
      blockPick(Grid(),phi,tmp,bc);      // Pick out a block
      linop.Op(tmp,Mphi);                // Apply big dop
      blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
-      std::cout<< " Computed matrix elements from block zero only "<<std::endl;
-      std::cout<< iProj <<std::endl;
-      std::cout<<"Computed Coarse Operator"<<std::endl;
+      std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
+      std::cout<<GridLogMessage<< iProj <<std::endl;
+      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
      //      ForceHermitian();
      AssertHermitian();
@@ -344,9 +340,9 @@ namespace Grid {
    void ForceDiagonal(void) {


-      std::cout<<"**************************************************"<<std::endl;
-      std::cout<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
-      std::cout<<"**************************************************"<<std::endl;
+      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
+      std::cout<<GridLogMessage<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
+      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
      for(int p=0;p<8;p++){
 	A[p]=zero;
      }
@@ -386,13 +382,13 @@ namespace Grid {
 	
 	Diff = AA - adj(AAc);

-	std::cout<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
-	std::cout<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
+	std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
+	std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
 	  
      }
      Diff = A[8] - adj(A[8]);
-      std::cout<<"Norm diff local "<< norm2(Diff)<<std::endl;
-      std::cout<<"Norm local "<< norm2(A[8])<<std::endl;
+      std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
+      std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
    }
    
  };
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -71,6 +71,47 @@ namespace Grid {
      }
    };

+    ////////////////////////////////////////////////////////////////////
+    // Hermitian operator (MdagM + shift): built from Matrix::MdagM plus shift*in; intended for the mgrid smoother
+    ////////////////////////////////////////////////////////////////////
+    template<class Matrix,class Field>
+    class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
+      Matrix &_Mat;  // wrapped matrix, held by reference
+      RealD _shift;  // real shift added to MdagM in HermOpAndNorm
+    public:
+    ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
+      // The next four entry points forward to the UNSHIFTED matrix and then assert(0): they are not supported here.
+      void OpDiag (const Field &in, Field &out) {
+	_Mat.Mdiag(in,out);
+	assert(0); // if assert is compiled out (NDEBUG) the unshifted result is returned
+      }
+      void OpDir  (const Field &in, Field &out,int dir,int disp) {
+	_Mat.Mdir(in,out,dir,disp);
+	assert(0); // unsupported, see note above OpDiag
+      }
+      void Op     (const Field &in, Field &out){
+	_Mat.M(in,out);
+	assert(0); // unsupported, see note above OpDiag
+      }
+      void AdjOp     (const Field &in, Field &out){
+	_Mat.Mdag(in,out);
+	assert(0); // unsupported, see note above OpDiag
+      }
+      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+	_Mat.MdagM(in,out,n1,n2); // n1,n2 written here are overwritten below
+	out = out + _shift*in;
+	// recompute both outputs from the shifted result
+	ComplexD dot;	
+	dot= innerProduct(in,out);
+	n1=real(dot);   // real part of innerProduct(in,out)
+	n2=norm2(out);  // norm2 of the shifted output
+      }
+      void HermOp(const Field &in, Field &out){
+	RealD n1,n2; // norms are computed and discarded
+	HermOpAndNorm(in,out,n1,n2);
+      }
+    };
+
    ////////////////////////////////////////////////////////////////////
    // Wrap an already herm matrix
    ////////////////////////////////////////////////////////////////////
@@ -147,6 +188,7 @@ namespace Grid {
    };
    template<class Matrix,class Field>
      class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
+    protected:
      Matrix &_Mat;
    public:
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
@@ -173,6 +215,7 @@ namespace Grid {
    };
    template<class Matrix,class Field>
      class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
+    protected:
      Matrix &_Mat;
    public:
      SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
@@ -199,6 +242,7 @@ namespace Grid {
      }
    };

+
    /////////////////////////////////////////////////////////////
    // Base classes for functions of operators
    /////////////////////////////////////////////////////////////
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -50,6 +50,17 @@ namespace Grid {
      return;
    }

+    // Tabulate approx(x) from lo up to (not including) hi in steps of (hi-lo)/50, one "x<TAB>value" line each
+    void   PlotApprox(std::ostream &out) {
+      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
+      const double step=(hi-lo)/50.0;
+      double x=lo;
+      while(x<hi){ out <<x<<"\t"<<approx(x)<<std::endl; x+=step; }
+    };
+
+    
+    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
+    //
    Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
      lo=_lo;
      hi=_hi;
@@ -68,7 +79,34 @@ namespace Grid {
 	Coeffs[j] = s * 2.0/order;
      }
    };
+    void JacksonSmooth(void){ // multiply Coeffs[m] (m>=1) by Jackson damping factors g[m] built from a[i]=U[i]/|U|
+      const int M=order;
+      const double alpha = M_PI/(M+2);
+      const double theta = std::acos(std::cos(alpha)); // angle used in every U[n] term, computed once
+      double sumUsq =0;
+      std::vector<double> U(M+1); // indices 0..M inclusive are written below, so M+1 entries are required
+      std::vector<double> a(M+1);
+      std::vector<double> g(M+1);
+      for(int n=0;n<=M;n++){
+	U[n] = std::sin((n+1)*theta)/std::sin(theta);
+	sumUsq += U[n]*U[n];
+      }      
+      sumUsq = std::sqrt(sumUsq);

+      for(int i=0;i<=M;i++){ // start at i=0: a[0] enters every g[m] sum and must not stay zero
+	a[i] = U[i]/sumUsq;
+      }
+      g[0] = 1.0;
+      for(int m=1;m<=M;m++){
+	g[m] = 0;
+	for(int i=0;i<=M-m;i++){
+	  g[m]+= a[i]*a[m+i];
+	}
+      }
+      for(int m=1;m<order;m++){ // presumably Coeffs holds `order` entries (operator() loops n<order) - TODO confirm
+	Coeffs[m]*=g[m];
+      }
+    }
    double approx(double x) // Convenience for plotting the approximation
    {
      double Tn;
@@ -95,41 +133,34 @@ namespace Grid {
      return sum;
    };

-    // Convenience for plotting the approximation
-    void   PlotApprox(std::ostream &out) {
-      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
-      for(double x=lo;x<hi;x+=(hi-lo)/50.0){
-	out <<x<<"\t"<<approx(x)<<std::endl;
-      }
-    };
-
-    // Implement the required interface; could require Lattice base class
+    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

-      Field T0 = in;
-      Field T1 = T0; // Field T1(T0._grid); more efficient but hardwires Lattice class
-      Field T2 = T1;
+      GridBase *grid=in._grid;
+
+      int vol=grid->gSites();
+
+      Field T0(grid); T0 = in;  
+      Field T1(grid); 
+      Field T2(grid);
+      Field y(grid);
      
-      // use a pointer trick to eliminate copies
      Field *Tnm = &T0;
      Field *Tn  = &T1;
      Field *Tnp = &T2;
-      Field y   = in;

+      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
+      // Tn=T1 = (xscale M + mscale)in
      double xscale = 2.0/(hi-lo);
      double mscale = -(hi+lo)/(hi-lo);
-
-      // Tn=T1 = (xscale M + mscale)in
-      Linop.Op(T0,y);
-
+      Linop.HermOp(T0,y);
      T1=y*xscale+in*mscale;

      // sum = .5 c[0] T0 + c[1] T1
      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
-
      for(int n=2;n<order;n++){
 	
-	Linop.Op(*Tn,y);
+	Linop.HermOp(*Tn,y);

 	y=xscale*y+mscale*(*Tn);

--- a/lib/algorithms/approx/MultiShiftFunction.h
+++ b/lib/algorithms/approx/MultiShiftFunction.h
@@ -1,6 +1,8 @@
 #ifndef MULTI_SHIFT_FUNCTION
 #define MULTI_SHIFT_FUNCTION
+
 namespace Grid {
+
 class MultiShiftFunction {
 public:
  int order;
@@ -9,20 +11,29 @@ public:
  std::vector<RealD> tolerances;
  RealD norm;
  RealD lo,hi;
+
  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
-  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse) :
-      order(remez.getDegree()),
-      tolerances(remez.getDegree(),tol),
-      poles(remez.getDegree()),
-      residues(remez.getDegree())
+
+  void Init(AlgRemez & remez,double tol,bool inverse) // (re)load order, bounds, poles, residues, norm from remez; every shift gets tolerance tol
   {
+    order=remez.getDegree();
+    tolerances.assign(order,tol); // assign, not resize: a repeated Init must overwrite earlier tolerances
+    poles.resize(order);
+    residues.resize(order);
     remez.getBounds(lo,hi);
     if ( inverse ) remez.getIPFE (&residues[0],&poles[0],&norm);
-    else remez.getPFE (&residues[0],&poles[0],&norm);
+    else           remez.getPFE (&residues[0],&poles[0],&norm);
   }
+  // Allow deferred initialisation: scalars start at zero and vectors empty until Init() is called
+  MultiShiftFunction(void): order(0), norm(0.0), lo(0.0), hi(0.0) {};
+  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse) // construct and Init() in one step
+  {
+    Init(remez,tol,inverse);
+  }
+
 };
 }
 #endif
--- a/lib/algorithms/approx/Remez.cc
+++ b/lib/algorithms/approx/Remez.cc
@@ -757,3 +757,4 @@ void AlgRemez::csv(std::ostream & os)
  }
  return;
 }
+
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -15,7 +15,9 @@
 #ifndef INCLUDED_ALG_REMEZ_H
 #define INCLUDED_ALG_REMEZ_H

-#include <algorithms/approx/bigfloat_double.h>
+#include <stddef.h>
+
+#include <algorithms/approx/bigfloat.h>

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
 #define SUM_MAX 10 // Maximum number of terms in exponential
@@ -28,6 +30,7 @@
  remez.getIPFE(res,pole,&norm);
  remez.csv(ostream &os);
 */
+
 class AlgRemez
 {
 private:
--- a/lib/algorithms/iterative/AdefGeneric.h
+++ b/lib/algorithms/iterative/AdefGeneric.h
@@ -149,7 +149,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
      }

      RealD rrn=sqrt(rn/ssq);
-      std::cout<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
+      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;

      // Stopping condition
      if ( rn <= rsq ) { 
@@ -161,8 +161,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
 	RealD srcnorm = sqrt(norm2(src));
 	RealD tmpnorm = sqrt(norm2(tmp));
 	RealD true_residual = tmpnorm/srcnorm;
-	std::cout<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
-	std::cout<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
 	return k;
      }
    }
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -13,9 +13,7 @@ namespace Grid {
 public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
-    int verbose;
    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
-      verbose=1;
    };


@@ -42,14 +40,12 @@ public:
      cp =a;
      ssq=norm2(src);

-      if ( verbose ) {
-	std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
-      }
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;

      RealD rsq =  Tolerance* Tolerance*ssq;
      
@@ -58,7 +54,7 @@ public:
 	return;
      }
      
-      if(verbose) std::cout << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
+      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
      
      int k;
      for (k=1;k<=MaxIterations;k++){
@@ -80,7 +76,7 @@ public:
 	psi= a*p+psi;
 	p  = p*b+r;
 	  
-	if (verbose) std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	
 	// Stopping condition
 	if ( cp <= rsq ) { 
@@ -94,14 +90,14 @@ public:
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;

-	  std::cout<<"ConjugateGradient: Converged on iteration " <<k
+	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 		   <<" computed residual "<<sqrt(cp/ssq)
 		   <<" true residual     "<<true_residual
 		   <<" target "<<Tolerance<<std::endl;
 	  return;
 	}
      }
-      std::cout<<"ConjugateGradient did NOT converge"<<std::endl;
+      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
      assert(0);
    }
  };
--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -27,10 +27,14 @@ public:

 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
 {
-
  GridBase *grid = src._grid;
  int nshift = shifts.order;
  std::vector<Field> results(nshift,grid);
+  (*this)(Linop,src,results,psi);
+}
+void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
+{
+  int nshift = shifts.order;

  (*this)(Linop,src,results);
  
@@ -91,7 +95,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  cp = norm2(src);
  for(int s=0;s<nshift;s++){
    rsq[s] = cp * mresidual[s] * mresidual[s];
-    std::cout<<"ConjugateGradientMultiShift: shift "<<s
+    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
 	     <<" target resid "<<rsq[s]<<std::endl;
    ps[s] = src;
  }
@@ -109,7 +113,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  // p and mmp is equal to d after this since
  // the d computation is tricky
  //  qq = real(innerProduct(p,mmp));
-  //  std::cout << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
+  //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
  
  b = -cp /d;
  
@@ -214,7 +218,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	
 	if(css<rsq[s]){
 	  if ( ! converged[s] )
-	    std::cout<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	      converged[s]=1;
 	} else {
 	  all_converged=0;
@@ -225,8 +229,8 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    
    if ( all_converged ){

-      std::cout<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
-      std::cout<< "CGMultiShift: Checking solutions"<<std::endl;
+      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
+      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
      
      // Check answers 
      for(int s=0; s < nshift; s++) { 
@@ -235,13 +239,13 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	axpy(r,-alpha[s],src,tmp);
 	RealD rn = norm2(r);
 	RealD cn = norm2(src);
-	std::cout<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
      return;
    }
  }
  // ugly hack
-  std::cout<<"CG multi shift did not converge"<<std::endl;
+  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
  assert(0);
 }

--- a/lib/algorithms/iterative/ConjugateResidual.h
+++ b/lib/algorithms/iterative/ConjugateResidual.h
@@ -41,7 +41,7 @@ namespace Grid {
      ssq=norm2(src);
      rsq=Tolerance*Tolerance*ssq;

-      if (verbose) std::cout<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+      if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;

      for(int k=1;k<MaxIterations;k++){

@@ -60,13 +60,13 @@ namespace Grid {
 	axpy(p,b,p,r);
 	pAAp=axpy_norm(Ap,b,Ap,Ar);
 	
-	if(verbose) std::cout<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;

 	if(cp<rsq) {
 	  Linop.HermOp(psi,Ap);
 	  axpy(r,-1.0,src,Ap);
 	  RealD true_resid = norm2(r)/ssq;
-	  std::cout<<"ConjugateResidual: Converged on iteration " <<k
+	  std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "<<sqrt(true_resid)
 	           << " target "       <<Tolerance <<std::endl;
@@ -75,7 +75,7 @@ namespace Grid {

      }

-      std::cout<<"ConjugateResidual did NOT converge"<<std::endl;
+      std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
      assert(0);
    }
  };
--- a/lib/algorithms/iterative/PrecConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecConjugateResidual.h
@@ -0,0 +1,92 @@
+#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
+#define GRID_PREC_CONJUGATE_RESIDUAL_H
+
+namespace Grid {
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for iterative processes based on operators
+    // single input vec, single output vec.
+    /////////////////////////////////////////////////////////////
+
+  template<class Field> 
+    class PrecConjugateResidual : public OperatorFunction<Field> { // iterative solver functor using Linop.HermOp* and a supplied Preconditioner
+  public:                                                
+    RealD   Tolerance;      // relative target: loop stops when cp < Tolerance^2 * norm2(src)
+    Integer MaxIterations;  // upper bound on loop iterations; running out reaches assert(0)
+    int verbose;            // non-zero => log the residual every iteration (constructor sets 1)
+    LinearFunction<Field> &Preconditioner; // held by reference
+    // tol: relative tolerance, maxit: iteration cap, Prec: preconditioner applied inside operator()
+    PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit),      Preconditioner(Prec)
+    { 
+      verbose=1;
+    };
+    // Iterate on psi for right-hand side src; psi is reset to zero first, so any incoming guess is ignored.
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+
+      RealD a, b, c, d; // NOTE(review): c and d are never used below
+      RealD cp, ssq,rsq;
+      
+      RealD rAr, rAAr, rArp; // NOTE(review): rAAr is written but never read
+      RealD pAp, pAAp;       // NOTE(review): pAAp is updated each iteration but only read once, before the loop
+
+      GridBase *grid = src._grid;
+      Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid);
+      
+      psi=zero;
+      r  = src;
+      Preconditioner(r,p); // first search direction p is the preconditioned source
+
+      // NOTE(review): Ar/rAr/rAAr are seeded from the p-quantities although p is Prec(r), not r - confirm intended
+
+      Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
+      Ar=Ap;
+      rAr=pAp;
+      rAAr=pAAp;
+
+      cp =norm2(r);
+      ssq=norm2(src);
+      rsq=Tolerance*Tolerance*ssq; // stopping threshold compared against cp
+
+      if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+      for(int k=0;k<MaxIterations;k++){
+
+	// z = Prec(Ap); rq = Re<Ap|z> is the denominator of the step length
+	Preconditioner(Ap,z);
+	RealD rq= real(innerProduct(Ap,z)); 
+
+	a = rAr/rq;
+	// presumably axpy(ret,a,x,y) means ret = a*x + y, so psi += a*p and r -= a*z; cp is the value axpy_norm returns
+   	axpy(psi,a,p,psi);
+   cp = axpy_norm(r,-a,z,r);
+
+	rArp=rAr; // keep the previous rAr for the ratio b below
+
+	Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
+
+	b   =rAr/rArp;
+	// update p and Ap with the same coefficient b
+	axpy(p,b,p,r);
+	pAAp=axpy_norm(Ap,b,Ap,Ar);
+	
+	if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+	if(cp<rsq) { // converged: recompute the residual from psi directly and report it (Ap and r are reused as scratch)
+	  Linop.HermOp(psi,Ap);
+	  axpy(r,-1.0,src,Ap);
+	  RealD true_resid = norm2(r)/ssq;
+	  std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
+		   << " computed residual "<<sqrt(cp/ssq)
+	           << " true residual "<<sqrt(true_resid)
+	           << " target "       <<Tolerance <<std::endl;
+	  return;
+	}
+
+      }
+      // falling out of the loop means the tolerance was not reached within MaxIterations
+      std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
+      assert(0); // if assert is compiled out (NDEBUG) the function returns with an unconverged psi
+    }
+  };
+}
+#endif
--- a/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -45,13 +45,13 @@ namespace Grid {

 	cp=GCRnStep(Linop,src,psi,rsq);

-	if ( verbose ) std::cout<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+	if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;

 	if(cp<rsq) {
 	  Linop.HermOp(psi,r);
 	  axpy(r,-1.0,src,r);
 	  RealD tr = norm2(r);
-	  std::cout<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
+	  std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "    <<sqrt(tr/ssq)
 	           << " target "           <<Tolerance <<std::endl;
@@ -59,7 +59,7 @@ namespace Grid {
 	}

      }
-      std::cout<<"Variable Preconditioned GCR did not converge"<<std::endl;
+      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
      assert(0);
    }
    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
@@ -96,21 +96,21 @@ namespace Grid {
      /////////////////////
      Preconditioner(r,z);

-      std::cout<< " Preconditioner in " << norm2(r)<<std::endl; 
-      std::cout<< " Preconditioner out " << norm2(z)<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl; 
      
      Linop.HermOp(z,tmp); 

-      std::cout<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
      ttmp=tmp;
      tmp=tmp-r;

-      std::cout<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
      /*
-      std::cout<<r<<std::endl;
-      std::cout<<z<<std::endl;
-      std::cout<<ttmp<<std::endl;
-      std::cout<<tmp<<std::endl;
+      std::cout<<GridLogMessage<<r<<std::endl;
+      std::cout<<GridLogMessage<<z<<std::endl;
+      std::cout<<GridLogMessage<<ttmp<<std::endl;
+      std::cout<<GridLogMessage<<tmp<<std::endl;
      */

      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
@@ -137,7 +137,7 @@ namespace Grid {

 	cp = axpy_norm(r,-a,q[peri_k],r);  

-	std::cout<< " VPCG_step resid" <<sqrt(cp/rsq)<<std::endl; 
+	std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl; 
 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
 	}
@@ -148,7 +148,7 @@ namespace Grid {

 	Linop.HermOp(z,tmp);
        tmp=tmp-r;
-	std::cout<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+	std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 

 	q[peri_kp]=Az;
 	p[peri_kp]=z;
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -89,7 +89,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
-      std::cout << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);

      ///////////////////////////////////////////////////
@@ -108,7 +108,7 @@ namespace Grid {
      RealD ns = norm2(in);
      RealD nr = norm2(resid);

-      std::cout << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
+      std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }     
  };

--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -8,7 +8,7 @@ class SimpleCompressor {
 public:
  void Point(int) {};

-  vobj operator() (const vobj &arg) {
+  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
    return arg;
  }
 };
@@ -36,7 +36,7 @@ PARALLEL_NESTED_LOOP2
      int bo = n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb &cbmask ) {
-	buffer[bo+b]=compress(rhs._odata[so+o+b]);
+	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      }
    }
  }
@@ -69,7 +69,7 @@ PARALLEL_NESTED_LOOP2
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
      if ( ocb & cbmask ) {
 	cobj temp; 
-	temp =compress(rhs._odata[so+o+b]);
+	temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	extract<cobj>(temp,pointers,offset);
      }
    }
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -132,18 +132,18 @@ inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
    assert(cb==lat.checkerboard);
  } 
  cb=lat.checkerboard;
-  //  std::cout<<"Lattice leaf cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
 template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
 inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
 {
-  //  std::cout<<"Non lattice leaf cb"<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
 inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
 {
  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
-  //  std::cout<<"Unary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }

 template <typename Op, typename T1, typename T2>
@@ -151,7 +151,7 @@ inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &ex
 {
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
  CBFromExpression(cb,std::get<1>(expr.second));
-  //  std::cout<<"Binary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
 inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
@@ -159,7 +159,7 @@ inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
  CBFromExpression(cb,std::get<1>(expr.second));
  CBFromExpression(cb,std::get<2>(expr.second));
-  //  std::cout<<"Trinary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }

 ////////////////////////////////////////////
@@ -370,7 +370,7 @@ using namespace Grid;
   tmp.func(eval(0,v1),eval(0,v2));

   auto var = v1+v2;
-   std::cout<<typeid(var).name()<<std::endl;
+   std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;

   v3=v1+v2;
   v3=v1+v2+v1*v2;
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -61,6 +61,11 @@ public:
    int checkerboard;
    std::vector<vobj,alignedAllocator<vobj> > _odata;
    
+    // to pthread need a computable loop where loop induction is not required
+    int begin(void) { return 0;};
+    int end(void)   { return _odata.size(); }
+    vobj & operator[](int i) { return _odata[i]; };
+
 public:
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
@@ -221,7 +226,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      std::cout<<"Lattice operator ="<<std::endl;
+      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
--- a/lib/lattice/Lattice_peekpoke.h
+++ b/lib/lattice/Lattice_peekpoke.h
@@ -14,6 +14,7 @@ namespace Grid {
       auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
+      ret.checkerboard=lhs.checkerboard;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
@@ -24,6 +25,7 @@ PARALLEL_FOR_LOOP
       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
+      ret.checkerboard=lhs.checkerboard;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -125,7 +125,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  assert(grid!=NULL);

  // FIXME
-  std::cout<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
+  std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;

  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -5,6 +5,37 @@

 namespace Grid {

+
+  //////////////////////////////////////////////////////////////
+  // Allow the RNG state to be less dense than the fine grid
+  //////////////////////////////////////////////////////////////
+  inline int RNGfillable(GridBase *coarse,GridBase *fine)
+  {
+    // Returns the product, over the dimensions of `coarse`, of fine/coarse _rdimensions ratios.
+    // `fine` may carry extra leading dimensions beyond those of `coarse`.
+    const int lowerdims = fine->_ndimension - coarse->_ndimension;
+    assert(lowerdims >= 0);
+
+    // The extra leading dimensions must have _simd_layout==1 and _processors==1.
+    for (int d = 0; d < lowerdims; ++d) {
+      assert(fine->_simd_layout[d]==1);
+      assert(fine->_processors[d]==1);
+    }
+
+    // Shared dimensions: identical processor and SIMD layout, and the coarse
+    // _rdimensions entry must divide the matching fine entry exactly.
+    int multiplicity = 1;
+    for (int d = 0; d < coarse->_ndimension; ++d) {
+      const int fd = d + lowerdims;   // index of the matching dimension on the fine grid
+      assert(coarse->_processors[d]  == fine->_processors[fd]);
+      assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+      assert((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[fd]); 
+      // Multiply first, then divide: the running product times this dimension's ratio.
+      multiplicity = (multiplicity * fine->_rdimensions[fd]) / coarse->_rdimensions[d];
+    }
+    return multiplicity;
+  }
+
  // Wrap seed_seq to give common interface with random_device
  class fixedSeed {
  public:
@@ -226,26 +257,32 @@ namespace Grid {
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
      
-      conformable(_grid,l._grid);
+      int multiplicity = RNGfillable(_grid,l._grid);

      int     Nsimd =_grid->Nsimd();
      int     osites=_grid->oSites();
      int words=sizeof(scalar_object)/sizeof(scalar_type);

-      std::vector<scalar_object> buf(Nsimd);

+PARALLEL_FOR_LOOP
      for(int ss=0;ss<osites;ss++){
-	for(int si=0;si<Nsimd;si++){

-	  int gdx = generator_idx(ss,si); // index of generator state
-	  scalar_type *pointer = (scalar_type *)&buf[si];
-	  for(int idx=0;idx<words;idx++){
-	    fillScalar(pointer[idx],dist,_generators[gdx]);
+	std::vector<scalar_object> buf(Nsimd);
+	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
+
+	  int sm=multiplicity*ss+m;      // Maps the generator site to the fine site
+
+	  for(int si=0;si<Nsimd;si++){
+	    int gdx = generator_idx(ss,si); // index of generator state
+	    scalar_type *pointer = (scalar_type *)&buf[si];
+	    for(int idx=0;idx<words;idx++){
+	      fillScalar(pointer[idx],dist,_generators[gdx]);
+	    }
 	  }

+	  // merge into SIMD lanes
+	  merge(l._odata[sm],buf);
 	}
-	// merge into SIMD lanes
-	merge(l._odata[ss],buf);
      }
    };

--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -17,13 +17,14 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  }
 }

+
  ////////////////////////////////////////////////////////////////////////////////////////////
  // remove and insert a half checkerboard
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
    int ssh=0;
-PARALLEL_FOR_LOOP
+    //PARALLEL_FOR_LOOP
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -40,7 +41,7 @@ PARALLEL_FOR_LOOP
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
    int ssh=0;
-PARALLEL_FOR_LOOP
+    //PARALLEL_FOR_LOOP
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -158,6 +159,7 @@ template<class vobj,class CComplex>

  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
+PARALLEL_FOR_LOOP
  for(int ss=0;ss<coarse->oSites();ss++){
    CoarseInner._odata[ss] = coarse_inner._odata[ss];
  }
@@ -297,5 +299,42 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }


+template<class vobj>
+void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+{
+  // Fills every global site of `fine` with the value of `coarse` at the site whose
+  // coordinate is the fine coordinate taken modulo the coarse grid's _gdimensions,
+  // so the coarse field is repeated periodically across the fine lattice.
+  //
+  // coarse : field that is read (one peekSite per fine global site).
+  // fine   : field that is overwritten (one pokeSite per fine global site).
+  //
+  // Both grids must have the same number of dimensions, and must satisfy whatever
+  // subdivides() checks for the pair.
+  typedef typename vobj::scalar_object sobj;
+
+  GridBase *cg = coarse._grid;
+  GridBase *fg =   fine._grid;
+
+  int nd = cg->_ndimension;
+
+  subdivides(cg,fg); 
+  assert(cg->_ndimension==fg->_ndimension);
+
+  // Scratch coordinates, allocated once and reused for every site.
+  std::vector<int> fcoor(nd);
+  std::vector<int> ccoor(nd);
+  for(int g=0;g<fg->gSites();g++){
+    fg->GlobalIndexToGlobalCoor(g,fcoor);
+    for(int d=0;d<nd;d++){
+      ccoor[d] = fcoor[d]%cg->_gdimensions[d]; // wrap onto the coarse extent
+    }
+    sobj tmp;
+    peekSite(tmp,coarse,ccoor);
+    pokeSite(tmp,fine,fcoor);
+  }
+}
+
+
 }
 #endif
--- a/lib/lattice/Lattice_where.h
+++ b/lib/lattice/Lattice_where.h
@@ -22,7 +22,6 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  typedef typename iobj::vector_type mask_type;

  const int Nsimd = grid->Nsimd();
-  const int words = sizeof(vobj)/sizeof(vector_type);

  std::vector<Integer> mask(Nsimd);
  std::vector<scalar_object> truevals (Nsimd);
--- a/lib/pugixml/README.md
+++ b/lib/pugixml/README.md
@@ -0,0 +1,44 @@
+pugixml [![Build Status](https://travis-ci.org/zeux/pugixml.svg?branch=master)](https://travis-ci.org/zeux/pugixml) [![Build status](https://ci.appveyor.com/api/projects/status/9hdks1doqvq8pwe7/branch/master?svg=true)](https://ci.appveyor.com/project/zeux/pugixml)
+=======
+
+pugixml is a C++ XML processing library, which consists of a DOM-like interface with rich traversal/modification
+capabilities, an extremely fast XML parser which constructs the DOM tree from an XML file/buffer, and an XPath 1.0
+implementation for complex data-driven tree queries. Full Unicode support is also available, with Unicode interface
+variants and conversions between different Unicode encodings (which happen automatically during parsing/saving).
+
+pugixml is used by a lot of projects, both open-source and proprietary, for performance and easy-to-use interface.
+
+## Documentation
+
+Documentation for the current release of pugixml is available on-line as two separate documents:
+
+* [Quick-start guide](http://pugixml.org/docs/quickstart.html), that aims to provide enough information to start using the library;
+* [Complete reference manual](http://pugixml.org/docs/manual.html), that describes all features of the library in detail.
+
+You’re advised to start with the quick-start guide; however, many important library features are either not described in it at all or only mentioned briefly; if you require more information you should read the complete manual.
+
+## License
+This library is available to anybody free of charge, under the terms of MIT License:
+
+Copyright (c) 2006-2015 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
--- a/lib/pugixml/pugiconfig.hpp
+++ b/lib/pugixml/pugiconfig.hpp
@@ -0,0 +1,71 @@
+/**
+ * pugixml parser - version 1.6
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+
+// Uncomment this to enable long long support
+// #define PUGIXML_HAS_LONG_LONG
+
+#endif
+
+/**
+ * Copyright (c) 2006-2015 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
--- a/lib/pugixml/pugixml.cpp
+++ b/lib/pugixml/pugixml.cpp
--- a/lib/pugixml/pugixml.hpp
+++ b/lib/pugixml/pugixml.hpp
--- a/lib/pugixml/readme.txt
+++ b/lib/pugixml/readme.txt
@@ -0,0 +1,52 @@
+pugixml 1.6 - an XML processing library
+
+Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+Report bugs and download new versions at http://pugixml.org/
+
+This is the distribution of pugixml, which is a C++ XML processing library,
+which consists of a DOM-like interface with rich traversal/modification
+capabilities, an extremely fast XML parser which constructs the DOM tree from
+an XML file/buffer, and an XPath 1.0 implementation for complex data-driven
+tree queries. Full Unicode support is also available, with Unicode interface
+variants and conversions between different Unicode encodings (which happen
+automatically during parsing/saving).
+
+The distribution contains the following folders:
+
+	contrib/ - various contributions to pugixml
+
+	docs/ - documentation
+		docs/samples - pugixml usage examples
+		docs/quickstart.html - quick start guide
+		docs/manual.html - complete manual
+
+	scripts/ - project files for IDE/build systems
+
+	src/ - header and source files
+
+	readme.txt - this file.
+
+This library is distributed under the MIT License:
+
+Copyright (c) 2006-2015 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -19,6 +19,7 @@ namespace QCD {
    static const int Nd=4;
    static const int Nhs=2; // half spinor
    static const int Nds=8; // double stored gauge field
+    static const int Ngp=2; // gparity index range

    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
@@ -28,6 +29,15 @@ namespace QCD {
    static const int SpinIndex   = 1;
    static const int LorentzIndex= 0;

+    // Useful trait: is this a spin index?
+    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
+
+    const int SpinorIndex = 2;
+    template<typename T> struct isSpinor {
+      static const bool value = (SpinorIndex==T::TensorLevel);
+    };
+    template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
+    template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;

    // ChrisK very keen to add extra space for Gparity doubling.
    //
@@ -49,6 +59,10 @@ namespace QCD {
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;

+    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+
+
    // Spin matrix
    typedef iSpinMatrix<Complex  >          SpinMatrix;
    typedef iSpinMatrix<ComplexF >          SpinMatrixF;
--- a/lib/qcd/action/ActionBase.h
+++ b/lib/qcd/action/ActionBase.h
@@ -7,14 +7,30 @@ template<class GaugeField>
 class Action { 

 public:
-  virtual void  init(const GaugeField &U, GridParallelRNG& pRNG) = 0;
-  virtual RealD S(const GaugeField &U)                           = 0;  // evaluate the action
-  virtual void  deriv(const GaugeField &U,GaugeField & dSdU )    = 0;  // evaluate the action derivative
-  //virtual void  refresh(const GaugeField & ) {}                ; 
+  virtual void  init (const GaugeField &U, GridParallelRNG& pRNG) = 0;  // 
+  virtual RealD S    (const GaugeField &U)                        = 0;  // evaluate the action
+  virtual void  deriv(const GaugeField &U,GaugeField & dSdU )     = 0;  // evaluate the action derivative
+  virtual void  refresh(const GaugeField & ) {};                        // Default to no-op for actions with no internal fields
  // Boundary conditions?
  // Heatbath?
  virtual ~Action() {};
 };

+// Could derive PseudoFermion action with a PF field, FermionField, and a Grid; implement refresh
+template<class GaugeField, class FermionField>
+class PseudoFermionAction : public Action<GaugeField> {
+ public:
+  FermionField Phi;        // pseudofermion field; redrawn by refresh()
+  GridParallelRNG &pRNG;   // random number generator used by refresh(); held by reference
+  GridBase &Grid;          // grid on which Phi is constructed; held by reference
+  // Binds grid and RNG by reference and builds Phi on _Grid. Initialisers follow the
+  // member declaration order (Phi, pRNG, Grid), which is the order C++ executes them in.
+  PseudoFermionAction(GridBase &_Grid,GridParallelRNG &_pRNG) : Phi(&_Grid), pRNG(_pRNG), Grid(_Grid) {
+  };
+  // Refills Phi via gaussian(Phi,pRNG); the gauge argument is not used here.
+  virtual void refresh(const GaugeField &gauge) {
+    gaussian(Phi,pRNG);
+  };
+};
 }}
 #endif
--- a/lib/qcd/action/ActionParams.h
+++ b/lib/qcd/action/ActionParams.h
@@ -0,0 +1,29 @@
+#ifndef GRID_QCD_ACTION_PARAMS_H
+#define GRID_QCD_ACTION_PARAMS_H
+
+namespace Grid {
+namespace QCD {
+
+    // These can move into a params header and be given MacroMagic serialisation
+    struct GparityWilsonImplParams {
+      std::vector<int> twists; 
+    };
+
+    struct WilsonImplParams { };
+
+    struct OneFlavourRationalParams {
+      RealD lo;                 // lower endpoint (presumably of the approximation range - confirm with users)
+      RealD hi;                 // upper endpoint of the same range
+      int   MaxIter;            // iteration cap; Vector?
+      RealD tolerance;          // Vector?
+      int   degree    = 10;
+      int   precision = 64;
+      // Every field is set from the arguments; tol, _degree and _precision are optional.
+      OneFlavourRationalParams(RealD _lo, RealD _hi, int _maxit, RealD tol = 1.0e-8,
+                               int _degree = 10, int _precision = 64)
+        : lo(_lo), hi(_hi), MaxIter(_maxit), tolerance(tol), degree(_degree), precision(_precision) {}
+    };
+
+}}
+
+#endif
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -6,19 +6,15 @@
 // are separating the concept of the operator from that of action.
 //
 // The FermAction contains methods to create 
-//
 // * Linear operators             (Hermitian and non-hermitian)  .. my LinearOperator
 // * System solvers               (Hermitian and non-hermitian)  .. my OperatorFunction
 // * MultiShift System solvers    (Hermitian and non-hermitian)  .. my OperatorFunction

-
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
 #include <qcd/action/ActionBase.h>
-
-#include <qcd/action/fermion/FermionOperator.h>
-
+#include <qcd/action/ActionParams.h>

 ////////////////////////////////////////////
 // Gauge Actions
@@ -30,53 +26,146 @@
 // Utility functions
 ////////////////////////////////////////////
 #include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <qcd/action/fermion/FermionOperatorImpl.h>
+#include <qcd/action/fermion/FermionOperator.h>
 #include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions

+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Explicit template instantiation is still required in the .cc files
+//
+// - CayleyFermion5D.cc
+// - PartialFractionFermion5D.cc
+// - WilsonFermion5D.cc
+// - WilsonKernelsHand.cc
+// - ContinuedFractionFermion5D.cc
+// - WilsonFermion.cc
+// - WilsonKernels.cc
+//
+// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
+// for EVERY .cc file. This define centralises the list and restores global push of impl cases
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define FermOpTemplateInstantiate(A) \
+  template class A<GparityWilsonImplF>;		\
+  template class A<GparityWilsonImplD>;		\
+  template class A<WilsonImplF>;		\
+  template class A<WilsonImplD>;
+
 ////////////////////////////////////////////
-// 4D formulations
+// Fermion operators / actions
 ////////////////////////////////////////////
-#include <qcd/action/fermion/WilsonFermion.h>
+
+#include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+
 //#include <qcd/action/fermion/CloverFermion.h>

-////////////////////////////////////////////
-// 5D formulations...
-////////////////////////////////////////////
-
-#include <qcd/action/fermion/WilsonFermion5D.h> // used by all 5d overlap types
-
-//////////
-// Cayley
-//////////
-#include <qcd/action/fermion/CayleyFermion5D.h>
-
+#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 #include <qcd/action/fermion/DomainWallFermion.h>
 #include <qcd/action/fermion/DomainWallFermion.h>
-
 #include <qcd/action/fermion/MobiusFermion.h>
 #include <qcd/action/fermion/ScaledShamirFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
-
 #include <qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>

-//////////////////////
-// Continued fraction
-//////////////////////
-#include <qcd/action/fermion/ContinuedFractionFermion5D.h>
-#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
+#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
+#include <qcd/action/fermion/OverlapWilsonContFracTanhFermion.h>
+#include <qcd/action/fermion/OverlapWilsonContFracZolotarevFermion.h>

-//////////////////////
-// Partial fraction
-//////////////////////
-#include <qcd/action/fermion/PartialFractionFermion5D.h>
+#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
 #include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// More maintainable to maintain the following typedef list centrally, as more "impl" targets
+// are added, (e.g. extension for gparity, half precision project in comms etc..)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Cayley 5d
+namespace Grid {
+  namespace QCD {
+
+typedef WilsonFermion<WilsonImplR> WilsonFermionR;
+typedef WilsonFermion<WilsonImplF> WilsonFermionF;
+typedef WilsonFermion<WilsonImplD> WilsonFermionD;
+
+typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
+typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
+typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
+typedef MobiusFermion<WilsonImplR> MobiusFermionR;
+typedef MobiusFermion<WilsonImplF> MobiusFermionF;
+typedef MobiusFermion<WilsonImplD> MobiusFermionD;
+typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
+typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
+typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
+
+typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
+typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
+typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
+typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
+typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
+typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
+
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
+
+// Continued fraction
+typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
+
+// Partial fraction
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
+
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
+
+// Gparity cases; partial list until tested
+typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
+typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
+typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
+typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
+typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
+typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
+
+  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
 #include <qcd/action/fermion/g5HermitianLinop.h>

+////////////////////////////////////////
+// Pseudo fermion combinations for HMC
+////////////////////////////////////////
+#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+
+#include <qcd/action/pseudofermion/TwoFlavour.h>
+#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+
+//IroIro inserted general "Nf" param; could also be done,
+//but not clear why unless into large Nf BSM studies
+//Even there, don't want the explicit (2) on power denominator
+//if even number of flavours, so further generalised interface
+//would be required but easy.
+#include <qcd/action/pseudofermion/OneFlavourRational.h>
+#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+
 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -2,27 +2,27 @@
 namespace Grid {
 namespace QCD {

- CayleyFermion5D::CayleyFermion5D(LatticeGaugeField &_Umu,
-				  GridCartesian         &FiveDimGrid,
-				  GridRedBlackCartesian &FiveDimRedBlackGrid,
-				  GridCartesian         &FourDimGrid,
-				  GridRedBlackCartesian &FourDimRedBlackGrid,
-				  RealD _mass,RealD _M5) :
-   WilsonFermion5D(_Umu,
+ template<class Impl>
+ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
+					GridCartesian         &FiveDimGrid,
+					GridRedBlackCartesian &FiveDimRedBlackGrid,
+					GridCartesian         &FourDimGrid,
+					GridRedBlackCartesian &FourDimRedBlackGrid,
+					RealD _mass,RealD _M5,const ImplParams &p) :
+   WilsonFermion5D<Impl>(_Umu,
 		   FiveDimGrid,
 		   FiveDimRedBlackGrid,
 		   FourDimGrid,
-		   FourDimRedBlackGrid,_M5),
+ 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
 {
 }

-  // override multiply
-  RealD CayleyFermion5D::M    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
  {
-    LatticeFermion Din(psi._grid);
-
    // Assemble Din
+    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	Din = bs psi[s] + cs[s] psi[s+1}
@@ -37,11 +37,57 @@ namespace QCD {
 	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
      }
    }
+  }
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
+  { // Builds Din from psi one s-slice at a time (s in [0,Ls)), coupling slice s to slices s+1 and s-1.
+    int Ls=this->Ls; // local copy of the member Ls
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) { // first slice: the "s-1" neighbour wraps to Ls-1 and its coefficient carries a factor -mass
+	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
+	axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1); // Din is both output and first input (coefficient 1.0); presumably accumulates - confirm
+      } else if ( s==(Ls-1)) {  // last slice: the "s+1" neighbour wraps to 0 and its coefficient carries a factor -mass
+	axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
+	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
+      } else { // interior slice: plain cs[s+1] and cs[s-1] neighbour coefficients
+	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
+	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
+      }
+    }
+  }

-    DW(Din,chi,DaggerNo);
+  // override multiply
+ template<class Impl>
+  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+  {
+    int Ls=this->Ls;
+
+    FermionField Din(psi._grid);
+
+    // Assemble Din
+    /*
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) {
+	//	Din = bs psi[s] + cs[s] psi[s+1}
+	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
+	//      Din+= -mass*cs[s] psi[s+1}
+	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
+      } else if ( s==(Ls-1)) { 
+	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
+	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
+      } else {
+	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
+	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
+      }
+    }
+    */
+    Meooe5D(psi,Din);
+
+    this->DW(Din,chi,DaggerNo);
    // ((b D_W + D_w hop terms +1) on s-diag
    axpby(chi,1.0,1.0,chi,psi); 

+    // Call Mooee??
    for(int s=0;s<Ls;s++){
      if ( s==0 ){
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
@@ -57,20 +103,26 @@ namespace QCD {
    return norm2(chi);
  }

-  RealD CayleyFermion5D::Mdag (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi) // adjoint multiply: chi = M^dag psi
  {
    // Under adjoint
    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
    //D2- P+     D2+            P-D1-^dag D2+dag

-    LatticeFermion Din(psi._grid);
+    FermionField Din(psi._grid);
    // Apply Dw
-    DW(psi,Din,DaggerYes); 
+    this->DW(psi,Din,DaggerYes); 

+    MeooeDag5D(Din,chi); // adjoint 5d term, same structure as the commented-out reference code below; Meooe5D is the non-dagger form
+
+    int Ls=this->Ls; // Ls lives in the dependent base class, so it must be reached through this->
    for(int s=0;s<Ls;s++){
+
      // Collect the terms in DW
      //	Chi = bs Din[s] + cs[s] Din[s+1}
      //    Chi+= -mass*cs[s] psi[s+1}
+      /*
      if ( s==0 ) {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
@@ -81,6 +133,10 @@ namespace QCD {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
      }
+      */
+
+      // FIXME just call MooeeDag??
+
      // Collect the terms indept of DW
      if ( s==0 ){
 	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
@@ -99,10 +155,17 @@ namespace QCD {
  }

  // half checkerboard operations
-  void CayleyFermion5D::Meooe       (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
  {
-    LatticeFermion tmp(psi._grid);
+    int Ls=this->Ls;
+
+    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
+    Meooe5D(psi,tmp); 
+
+#if 0
+    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	tmp = bs psi[s] + cs[s] psi[s+1}
@@ -117,24 +180,33 @@ namespace QCD {
 	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
      }
    }
+    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
+#endif
+
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
-      DhopEO(tmp,chi,DaggerNo);
+      this->DhopEO(tmp,chi,DaggerNo);
    } else {
-      DhopOE(tmp,chi,DaggerNo);
+      this->DhopOE(tmp,chi,DaggerNo);
    }
  }

-  void CayleyFermion5D::MeooeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+  template<class Impl>
+  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi) // adjoint of Meooe: 4d dslash (dagger) first, then the adjoint 5d term
  {
-    LatticeFermion tmp(psi._grid);
+    FermionField tmp(psi._grid);
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
-      DhopEO(psi,tmp,DaggerYes);
+      this->DhopEO(psi,tmp,DaggerYes);
    } else {
-      DhopOE(psi,tmp,DaggerYes);
+      this->DhopOE(psi,tmp,DaggerYes);
    }
+
+    MeooeDag5D(tmp,chi); // adjoint 5d term (pplus on s+1, pminus on s-1) like the disabled reference code below; Meooe5D is the non-dagger form
+#if 0
+    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
    // Assemble the 5d matrix
+    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
@@ -147,10 +219,15 @@ namespace QCD {
 	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
      }
    }
+    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
+#endif
+
  }

-  void CayleyFermion5D::Mooee       (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    for (int s=0;s<Ls;s++){
      if ( s==0 ) {
 	axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
@@ -165,8 +242,10 @@ namespace QCD {
    }
  }

-  void  CayleyFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
-    LatticeFermion tmp(psi._grid);
+ template<class Impl>
+  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+    int Ls=this->Ls;
+    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
@@ -183,11 +262,13 @@ namespace QCD {
      }
    }
    // Apply 4d dslash fragment
-    DhopDir(tmp,chi,dir,disp);
+    this->DhopDir(tmp,chi,dir,disp);
  }

-  void CayleyFermion5D::MooeeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    for (int s=0;s<Ls;s++){
      // Assemble the 5d matrix
      if ( s==0 ) {
@@ -203,8 +284,10 @@ namespace QCD {
    }
  }

-  void CayleyFermion5D::MooeeInv    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    // Apply (L^{\prime})^{-1}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
@@ -227,8 +310,10 @@ namespace QCD {
    }
  }

-  void CayleyFermion5D::MooeeInvDag (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    // Apply (U^{\prime})^{-dagger}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
@@ -250,15 +335,65 @@ namespace QCD {
    }
  }

+  // force terms; of the five FermionOperator routines, MDeriv/MoeDeriv/MeoDeriv are overridden here
+  template<class Impl>
+  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) // result goes to mat via DhopDeriv; dag selects which operand gets the 5d transform
+  {
+    FermionField Din(V._grid); // scratch field holding the Meooe5D-transformed operand
+
+    if ( dag == DaggerNo ) {
+      //      U d/du [D_w D5] V = U d/du DW D5 V
+      Meooe5D(V,Din); // transform the right-hand operand V
+      this->DhopDeriv(mat,U,Din,dag);
+    } else {
+      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag V // implicit adj on U in call
+      Meooe5D(U,Din); // transform the left-hand operand U; V is passed through unchanged
+      this->DhopDeriv(mat,Din,V,dag);
+    }
+  };
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) // same scheme as MDeriv but through DhopDerivOE
+  {
+    FermionField Din(V._grid); // scratch field holding the Meooe5D-transformed operand
+
+    if ( dag == DaggerNo ) {
+      //      U d/du [D_w D5] V = U d/du DW D5 V
+      Meooe5D(V,Din); // transform the right-hand operand V
+      this->DhopDerivOE(mat,U,Din,dag);
+    } else {
+      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag V // implicit adj on U in call
+      Meooe5D(U,Din); // transform the left-hand operand U; V is passed through unchanged
+      this->DhopDerivOE(mat,Din,V,dag);
+    }
+  };
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) // same scheme as MDeriv but through DhopDerivEO
+  {
+    FermionField Din(V._grid); // scratch field holding the Meooe5D-transformed operand
+
+    if ( dag == DaggerNo ) {
+      //      U d/du [D_w D5] V = U d/du DW D5 V
+      Meooe5D(V,Din); // transform the right-hand operand V
+      this->DhopDerivEO(mat,U,Din,dag);
+    } else {
+      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag V // implicit adj on U in call
+      Meooe5D(U,Din); // transform the left-hand operand U; V is passed through unchanged
+      this->DhopDerivEO(mat,Din,V,dag);
+    }
+  };
+  
  // Tanh
-  void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
    SetCoefficientsZolotarev(1.0,zdata,b,c);

  }
  //Zolo
-  void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
+    int Ls=this->Ls;

    ///////////////////////////////////////////////////////////
    // The Cayley coeffs (unprec)
@@ -308,8 +443,8 @@ namespace QCD {
    ceo.resize(Ls);
    
    for(int i=0;i<Ls;i++){
-      bee[i]=as[i]*(bs[i]*(4.0-M5) +1.0);
-      cee[i]=as[i]*(1.0-cs[i]*(4.0-M5));
+      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
+      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
      beo[i]=as[i]*bs[i];
      ceo[i]=-as[i]*cs[i];
    }
@@ -362,6 +497,8 @@ namespace QCD {
    }
  }

+  FermOpTemplateInstantiate(CayleyFermion5D);
+
 }}


--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -5,25 +5,36 @@ namespace Grid {

  namespace QCD {

-    class CayleyFermion5D : public WilsonFermion5D
+    template<class Impl>
+    class CayleyFermion5D : public WilsonFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);

      // half checkerboard operations
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
      virtual void   Instantiatable(void)=0;

+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+      
+      void   Meooe5D       (const FermionField &in, FermionField &out);
+      void   MeooeDag5D    (const FermionField &in, FermionField &out);

      //    protected:
      RealD mass;
@@ -48,12 +59,12 @@ namespace Grid {
      std::vector<RealD> dee;    

      // Constructors
-      CayleyFermion5D(LatticeGaugeField &_Umu,
+      CayleyFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
-		      RealD _mass,RealD _M5);
+		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -3,20 +3,22 @@
 namespace Grid {
  namespace QCD {

-    void ContinuedFractionFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
    {
      SetCoefficientsZolotarev(1.0/scale,zdata);
    }
-    void ContinuedFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
    {
      // How to check Ls matches??
-      //      std::cout << Ls << " Ls"<<std::endl;
-      //      std::cout << zdata->n  << " - n"<<std::endl;
-      //      std::cout << zdata->da << " -da "<<std::endl;
-      //      std::cout << zdata->db << " -db"<<std::endl;
-      //      std::cout << zdata->dn << " -dn"<<std::endl;
-      //      std::cout << zdata->dd << " -dd"<<std::endl;
-
+      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+      int Ls = this->Ls;
      assert(zdata->db==Ls);// Beta has Ls coeffs

      R=(1+this->mass)/(1-this->mass);
@@ -39,7 +41,7 @@ namespace Grid {


      ZoloHiInv =1.0/zolo_hi;
-      dw_diag = (4.0-M5)*ZoloHiInv;
+      dw_diag = (4.0-this->M5)*ZoloHiInv;
    
      See.resize(Ls);
      Aee.resize(Ls);
@@ -55,17 +57,20 @@ namespace Grid {
 	See[s] = Aee[s] - 1.0/See[s-1];
      }
      for(int s=0;s<Ls;s++){
-	std::cout <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
+	std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
      }
    }



-    RealD  ContinuedFractionFermion5D::M           (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
    {
-      LatticeFermion D(psi._grid);
+      int Ls = this->Ls;

-      DW(psi,D,DaggerNo); 
+      FermionField D(psi._grid);
+
+      this->DW(psi,D,DaggerNo); 

      int sign=1;
      for(int s=0;s<Ls;s++){
@@ -83,15 +88,20 @@ namespace Grid {
      }
      return norm2(chi);
    }
-    RealD  ContinuedFractionFermion5D::Mdag        (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
    {
      // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
      // The rest of matrix is symmetric.
      // Can ignore "dag"
      return M(psi,chi);
    }
-    void  ContinuedFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
-      DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+    template<class Impl>
+    void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+      int Ls = this->Ls;
+
+      this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+
      int sign=1;
      for(int s=0;s<Ls;s++){
 	if ( s==(Ls-1) ){
@@ -102,13 +112,16 @@ namespace Grid {
 	sign=-sign; 
      }
    }
-    void   ContinuedFractionFermion5D::Meooe       (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
    {
+      int Ls = this->Ls;
+
      // Apply 4d dslash
      if ( psi.checkerboard == Odd ) {
-	DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
+	this->DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
      } else {
-	DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
+	this->DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
      }
      
      int sign=1;
@@ -121,12 +134,16 @@ namespace Grid {
 	sign=-sign; 
      }
    }
-    void   ContinuedFractionFermion5D::MeooeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
    {
-      Meooe(psi,chi);
+      this->Meooe(psi,chi);
    }
-    void   ContinuedFractionFermion5D::Mooee       (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
    {
+      int Ls = this->Ls;
+
      int sign=1;
      for(int s=0;s<Ls;s++){
 	if ( s==0 ) {
@@ -144,12 +161,16 @@ namespace Grid {
      }
    }

-    void   ContinuedFractionFermion5D::MooeeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
    {
-      Mooee(psi,chi);
+      this->Mooee(psi,chi);
    }
-    void   ContinuedFractionFermion5D::MooeeInv    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
    {
+      int Ls = this->Ls;
+
      // Apply Linv
      axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); 
      for(int s=1;s<Ls;s++){
@@ -165,27 +186,88 @@ namespace Grid {
 	axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
      }
    }
-    void   ContinuedFractionFermion5D::MooeeInvDag (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
    {
-      MooeeInv(psi,chi);
+      this->MooeeInv(psi,chi);
    }

+  // force terms; of the five FermionOperator routines, MDeriv/MoeDeriv/MeoDeriv are overridden here
+    template<class Impl>
+   void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) // dag is never read in this body
+  {
+    int Ls = this->Ls;
+
+    FermionField D(V._grid); // scratch field: per-slice rescaled copy of U
+
+    int sign=1; // flips on every slice
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){ // last slice: no cc[s] and no sign factor
+	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s); // second coefficient is 0.0; presumably D[s] = coeff * g5 * U[s] - TODO confirm
+      } else {
+	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+      }
+      sign=-sign; 
+    }
+    this->DhopDeriv(mat,D,V,DaggerNo); // always DaggerNo; NOTE(review): presumably because the operator is hermitian (Mdag just calls M) - confirm
+  };
+    template<class Impl>
+   void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) // same scheme as MDeriv but through DhopDerivOE; dag is never read
+  {
+    int Ls = this->Ls;
+
+    FermionField D(V._grid); // scratch field: per-slice rescaled copy of U
+
+    int sign=1; // flips on every slice
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){ // last slice: no cc[s] and no sign factor
+	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s); // second coefficient is 0.0; presumably D[s] = coeff * g5 * U[s] - TODO confirm
+      } else {
+	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+      }
+      sign=-sign; 
+    }
+    this->DhopDerivOE(mat,D,V,DaggerNo); // always DaggerNo regardless of the dag argument
+  };
+  template<class Impl>
+  void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) // same scheme as MDeriv but through DhopDerivEO; dag is never read
+  {
+    int Ls = this->Ls;
+
+    FermionField D(V._grid); // scratch field: per-slice rescaled copy of U
+
+    int sign=1; // flips on every slice
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){ // last slice: no cc[s] and no sign factor
+	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s); // second coefficient is 0.0; presumably D[s] = coeff * g5 * U[s] - TODO confirm
+      } else {
+	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+      }
+      sign=-sign; 
+    }
+    this->DhopDerivEO(mat,D,V,DaggerNo); // always DaggerNo regardless of the dag argument
+  };
+    
    // Constructors
-    ContinuedFractionFermion5D::ContinuedFractionFermion5D(
-							   LatticeGaugeField &_Umu,
+    template<class Impl>
+    ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
+							   GaugeField &_Umu,
 							   GridCartesian         &FiveDimGrid,
 							   GridRedBlackCartesian &FiveDimRedBlackGrid,
 							   GridCartesian         &FourDimGrid,
 							   GridRedBlackCartesian &FourDimRedBlackGrid,
-							   RealD _mass,RealD M5) :
-      WilsonFermion5D(_Umu,
-		      FiveDimGrid, FiveDimRedBlackGrid,
-		      FourDimGrid, FourDimRedBlackGrid,M5),
+							   RealD _mass,RealD M5,const ImplParams &p) :
+      WilsonFermion5D<Impl>(_Umu,
+			    FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid,M5,p),
      mass(_mass)
    {
+      int Ls = this->Ls;
      assert((Ls&0x1)==1); // Odd Ls required
    }

+    FermOpTemplateInstantiate(ContinuedFractionFermion5D);
+
  }
 }

--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -5,35 +5,43 @@ namespace Grid {

  namespace QCD {

-    class ContinuedFractionFermion5D : public WilsonFermion5D
+    template<class Impl>
+    class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);

      // half checkerboard operaions
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+
+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

      //      virtual void   Instantiatable(void)=0;
      virtual void   Instantiatable(void) =0;

      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      // Constructors
-      ContinuedFractionFermion5D(LatticeGaugeField &_Umu,
+      ContinuedFractionFermion5D(GaugeField &_Umu,
 				 GridCartesian         &FiveDimGrid,
 				 GridRedBlackCartesian &FiveDimRedBlackGrid,
 				 GridCartesian         &FourDimGrid,
 				 GridRedBlackCartesian &FourDimRedBlackGrid,
-				 RealD _mass,RealD M5);
+				 RealD _mass,RealD M5,const ImplParams &p= ImplParams());

    protected:

--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@@ -7,24 +7,27 @@ namespace Grid {

  namespace QCD {

-    class DomainWallFermion : public CayleyFermion5D
+    template<class Impl>
+    class DomainWallFermion : public CayleyFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void) {};
      // Constructors
-      DomainWallFermion(LatticeGaugeField &_Umu,
+      DomainWallFermion(GaugeField &_Umu,
 			GridCartesian         &FiveDimGrid,
 			GridRedBlackCartesian &FiveDimRedBlackGrid,
 			GridCartesian         &FourDimGrid,
 			GridRedBlackCartesian &FourDimRedBlackGrid,
-			RealD _mass,RealD _M5) : 
+			RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) : 

-      CayleyFermion5D(_Umu,
-		      FiveDimGrid,
-		      FiveDimRedBlackGrid,
-		      FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)

      {
 	RealD eps = 1.0;
@@ -32,9 +35,9 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
 	
-	std::cout << "DomainWallFermion with Ls="<<Ls<<std::endl;
+	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsTanh(zdata,1.0,0.0);
+	this->SetCoefficientsTanh(zdata,1.0,0.0);

 	Approx::zolotarev_free(zdata);
      }
--- a/lib/qcd/action/fermion/FermionOperator.h
+++ b/lib/qcd/action/fermion/FermionOperator.h
@@ -5,16 +5,20 @@ namespace Grid {

  namespace QCD {

-    //////////////////////////////////////////////////////////////////////////////
-    // Four component fermions
-    // Should type template the vector and gauge types
-    // Think about multiple representations
-    //////////////////////////////////////////////////////////////////////////////
-    template<class FermionField,class GaugeField>
-    class FermionOperator : public CheckerBoardedSparseMatrixBase<FermionField>
+    ////////////////////////////////////////////////////////////////
+    // Allow to select  between gauge representation rank bc's, flavours etc.
+    // and single/double precision.
+    ////////////////////////////////////////////////////////////////
+    
+    template<class Impl>
+    class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
    {
    public:

+      INHERIT_IMPL_TYPES(Impl);
+
+      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
+
      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

@@ -28,6 +32,8 @@ namespace Grid {
      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;

      // half checkerboard operaions
+      virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
+
      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
      virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
@@ -39,13 +45,31 @@ namespace Grid {
      virtual void Dhop  (const FermionField &in, FermionField &out,int dag)=0;
      virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
      virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
+      virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D

-      virtual void  Mdiag(const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
-      virtual void  DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D
+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDeriv(mat,U,V,dag);};
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivOE(mat,U,V,dag);};
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivEO(mat,U,V,dag);};
+      virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;}; // Clover can override these
+      virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;};
+
+      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+
+
+      virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
+      virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+
+      ///////////////////////////////////////////////
+      // Updates gauge field during HMC
+      ///////////////////////////////////////////////
+      virtual void ImportGauge(const GaugeField & _U)=0;

    };

  }
 }
+
 #endif
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -0,0 +1,356 @@
+#ifndef  GRID_QCD_FERMION_OPERATOR_IMPL_H
+#define  GRID_QCD_FERMION_OPERATOR_IMPL_H
+
+namespace Grid {
+
+  namespace QCD {
+
+
+    //////////////////////////////////////////////
+    // Template parameter ("Impl") classes that package the
+    // externally controlled aspects of a Fermion implementation,
+    // so that they can be varied in orthogonal directions
+    //
+    // Ultimately need Impl to always define types where XXX is opaque
+    //
+    //    typedef typename XXX               Simd;
+    //    typedef typename XXX     GaugeLinkField;	
+    //    typedef typename XXX         GaugeField;
+    //    typedef typename XXX      GaugeActField;
+    //    typedef typename XXX       FermionField;
+    //    typedef typename XXX  DoubledGaugeField;
+    //    typedef typename XXX         SiteSpinor;
+    //    typedef typename XXX     SiteHalfSpinor;	
+    //    typedef typename XXX         Compressor;	
+    //
+    // and Methods:
+    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
+    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+    //
+    //
+    // To acquire the typedefs from "Base" (either a base class or template param) use:
+    //
+    // INHERIT_GIMPL_TYPES(Base)
+    // INHERIT_FIMPL_TYPES(Base)
+    // INHERIT_IMPL_TYPES(Base)
+    //
+    // The Fermion operators will do the following:
+    //
+    // struct MyOpParams { 
+    //   RealD mass;
+    // };
+    //
+    //
+    // template<class Impl>
+    // class MyOp : public Impl { 
+    // public:
+    //
+    //    INHERIT_IMPL_TYPES(Impl);
+    //
+    //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
+    //    {
+    //
+    //    };
+    //    
+    //  }
+    //////////////////////////////////////////////
+
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Implementation dependent gauge types
+    ////////////////////////////////////////////////////////////////////////
+
+// Pulls in both the gauge typedefs (INHERIT_GIMPL_TYPES) and the fermion
+// typedefs (INHERIT_FIMPL_TYPES) from Base.  The two macros it expands to are
+// defined further down this header; they only need to exist at the point of use.
+#define INHERIT_IMPL_TYPES(Base) \
+    INHERIT_GIMPL_TYPES(Base)\
+    INHERIT_FIMPL_TYPES(Base)
+
+// Re-exports the gauge-side typedefs of GImpl (Simd, GaugeLinkField, GaugeField)
+// into the scope where the macro is expanded.
+#define INHERIT_GIMPL_TYPES(GImpl) \
+    typedef typename GImpl::Simd                           Simd;\
+    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
+    typedef typename GImpl::GaugeField               GaugeField;	
+    
+    // Composition with smeared link, bc's etc.. probably need multiple inheritance
+    // Variable precision "S" and variable Nc
+    // Gauge-field type bundle used as the base class of the fermion Impl classes in this header.
+    //   S               : SIMD vector type, re-exported as Simd
+    //   Nrepresentation : dimension of the square iMatrix stored per link (defaults to Nc)
+    // Exposes the per-site types SiteGaugeLink / SiteGaugeField and the
+    // corresponding Lattice<> types GaugeLinkField / GaugeField.
+    template<class S,int Nrepresentation=Nc>
+    class ImplGauge { 
+    public:
+    
+      typedef S Simd;
+    
+      // One link: a single Nrepresentation x Nrepresentation matrix wrapped in two iScalar levels.
+      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+      // All links at a site: an iVector of Nd such matrices.
+      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd  >;
+    
+      typedef iImplGaugeLink    <Simd>           SiteGaugeLink;
+      typedef iImplGaugeField   <Simd>           SiteGaugeField;
+    
+      typedef Lattice<SiteGaugeLink>                GaugeLinkField; // bit ugly naming; polarised gauge field, lorentz... all ugly
+      typedef Lattice<SiteGaugeField>                   GaugeField;
+
+    };
+
+    ////////////////////////////////////////////////////////////////////////
+    // Implementation dependent fermion types
+    ////////////////////////////////////////////////////////////////////////
+
+// Re-exports the fermion-side typedefs of Impl (FermionField, DoubledGaugeField,
+// SiteSpinor, SiteHalfSpinor, Compressor, ImplParams) into the scope where the
+// macro is expanded.
+#define INHERIT_FIMPL_TYPES(Impl)\
+    typedef typename Impl::FermionField           FermionField;		\
+    typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
+    typedef typename Impl::SiteSpinor               SiteSpinor;		\
+    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
+    typedef typename Impl::Compressor               Compressor;		\
+    typedef typename Impl::ImplParams ImplParams;
+
+    ///////
+    // Single flavour four spinors with colour index
+    ///////
+    // Fermion implementation policy without a flavour index.
+    // Adds the spinor, half-spinor and doubled-gauge-field types on top of ImplGauge,
+    // stores a copy of the ImplParams it was constructed with, and provides
+    // multLink / DoubleStore / InsertForce4D / InsertForce5D.
+    template<class S,int Nrepresentation=Nc>
+    class WilsonImpl :  public ImplGauge<S,Nrepresentation> { 
+    public:
+
+      typedef ImplGauge<S,Nrepresentation> Gimpl;
+
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+
+      // Per-site tensor layouts: Ns (full) or Nhs (half) vectors of length Nrepresentation,
+      // and Nds link matrices for the doubled gauge field.
+      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
+    
+      typedef iImplSpinor    <Simd>           SiteSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
+      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
+
+      typedef Lattice<SiteSpinor>                 FermionField;
+      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+      typedef WilsonImplParams ImplParams;
+      ImplParams Params;
+      WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+
+      // phi = U(mu) * chi for one site.  SE and St are not used by this implementation;
+      // they are part of the signature shared with GparityWilsonImpl::multLink.
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
+        mult(&phi(),&U(mu),&chi());
+      }
+
+      // Fills Uds from Umu: for each mu < Nd, slot mu receives the link in direction mu and
+      // slot mu+4 receives adj() of that link Cshift-ed by -1 in direction mu.
+      // Both Uds and Umu must live on GaugeGrid (checked with conformable).
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+      {
+        conformable(Uds._grid,GaugeGrid);
+        conformable(Umu._grid,GaugeGrid);
+        GaugeLinkField U(GaugeGrid);
+        for(int mu=0;mu<Nd;mu++){
+  	  U = PeekIndex<LorentzIndex>(Umu,mu);
+	  PokeIndex<LorentzIndex>(Uds,U,mu);
+	  U = adj(Cshift(U,mu,-1));
+	  // NOTE(review): offset is the literal 4 while the loop bound is Nd; the two agree only if Nd==4.
+	  PokeIndex<LorentzIndex>(Uds,U,mu+4);
+	}
+      }
+      
+      // Writes TraceIndex<SpinIndex>(outerProduct(Btilde,A)) into component mu of mat.
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	GaugeLinkField link(mat._grid);
+	link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+	PokeIndex<LorentzIndex>(mat,link,mu);
+      }   
+
+      // Five-dimensional variant: for every outer site sU of mat's grid, sums
+      // traceIndex<SpinIndex>(outerProduct(Btilde,Atilde)) over the Ls fermion sites
+      // sF = s + Ls*sU, then writes the sum into component mu of mat.
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+
+	// Ls is taken from the first entry of the fermion grid's _fdimensions.
+	int Ls=Btilde._grid->_fdimensions[0];
+
+	GaugeLinkField tmp(mat._grid);
+	tmp = zero;
+PARALLEL_FOR_LOOP
+	for(int sss=0;sss<tmp._grid->oSites();sss++){
+	  int sU=sss;
+	  for(int s=0;s<Ls;s++){
+	    // index into the fermion field; relies on s being the fastest-running index
+	    int sF = s+Ls*sU;
+	    tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
+	  }
+	}
+	PokeIndex<LorentzIndex>(mat,tmp,mu);
+	
+      }
+
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // Flavour doubled spinors; is Gparity the only? what about C*?
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    // Fermion implementation policy whose spinor and doubled-gauge types carry an extra
+    // outer index of size Ngp (the flavour index referred to in the comments below).
+    // Provides the same interface as WilsonImpl; the two InsertForce routines are not
+    // implemented yet and hit assert(0).
+    // Unlike WilsonImpl, Nrepresentation has no default value here.
+    template<class S,int Nrepresentation>
+    class GparityWilsonImpl : public ImplGauge<S,Nrepresentation> { 
+    public:
+
+      typedef ImplGauge<S,Nrepresentation> Gimpl;
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+
+      // Same layouts as WilsonImpl but with the outermost level an iVector of length Ngp.
+      template<typename vtype> using iImplSpinor             = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp >;
+      template<typename vtype> using iImplHalfSpinor         = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp >;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >, Ngp >;
+    
+      typedef iImplSpinor    <Simd>           SiteSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
+      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
+
+      typedef Lattice<SiteSpinor>                 FermionField;
+      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+
+      typedef GparityWilsonImplParams ImplParams;
+      ImplParams Params;
+      GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+      
+      // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
+      //
+      // phi(f) = U(f)(mu) * chi'(f) for f = 0,1, where chi' is chi itself unless the stencil
+      // entry is flagged _around_the_world and Params.twists[mmu] is set; in that case the
+      // two flavour components of chi are exchanged first (for all SIMD lanes when sl==1,
+      // for selected lanes only when sl==2).
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
+
+	typedef SiteHalfSpinor vobj;
+	typedef typename SiteHalfSpinor::scalar_object sobj;
+
+	vobj vtmp;
+	sobj stmp;
+	
+	GridBase *grid = St._grid;
+      
+	const int Nsimd = grid->Nsimd();
+	
+	int direction    = St._directions[mu];
+	int distance     = St._distances[mu];
+	// NOTE(review): ptype is read here but not used anywhere else in this routine.
+	int ptype        = St._permute_type[mu]; 
+	int sl           = St._grid->_simd_layout[direction];
+
+	// Fixme X.Y.Z.T hardcode in stencil
+	int mmu          = mu % Nd;
+
+	// assert our assumptions
+	assert((distance==1)||(distance==-1)); // nearest neighbour stencil hard code
+	assert((sl==1)||(sl==2));
+	
+	std::vector<int> icoor;
+      
+	if ( SE->_around_the_world && Params.twists[mmu] ) {
+
+	  if ( sl == 2 ) {
+
+	    // SIMD layout of 2 in this direction: unpack chi into per-lane scalar objects,
+	    // exchange the flavours only in the lanes selected below, then repack into vtmp.
+	    std::vector<sobj> vals(Nsimd);
+
+	    extract(chi,vals);
+	    for(int s=0;s<Nsimd;s++){
+
+	      grid->iCoorFromIindex(icoor,s);
+	      
+	      assert((icoor[direction]==0)||(icoor[direction]==1));
+	      
+	      // distance==+1 selects lanes with inner coordinate 1, distance==-1 lanes with inner coordinate 0
+	      int permute_lane;
+	      if ( distance == 1) {
+		permute_lane = icoor[direction]?1:0;
+	      } else {
+		permute_lane = icoor[direction]?0:1;
+	      }
+	      
+	      if ( permute_lane ) { 
+		stmp(0) = vals[s](1);
+		stmp(1) = vals[s](0);
+		vals[s] = stmp;
+	      }
+	    }
+	    merge(vtmp,vals);
+
+	  } else { 
+	    // sl==1: exchange the flavours of the whole vector object
+	    vtmp(0) = chi(1);
+	    vtmp(1) = chi(0);
+	  }
+	  mult(&phi(0),&U(0)(mu),&vtmp(0));
+	  mult(&phi(1),&U(1)(mu),&vtmp(1));
+	  
+	} else { 
+	  // no boundary twist involved: plain flavour-diagonal multiply
+	  mult(&phi(0),&U(0)(mu),&chi(0));
+	  mult(&phi(1),&U(1)(mu),&chi(1));
+	}
+	
+      }
+
+      // Fills Uds from Umu.  For each mu < Nd:
+      //   slot mu   : flavour 0 holds U_mu, flavour 1 holds conjugate(U_mu); when
+      //               Params.twists[mu] is set the conjugate is negated on sites whose
+      //               coordinate in direction mu equals GlobalDimensions()[mu]-1.
+      //   slot mu+4 : adj(Cshift(.,mu,-1)) of the two fields above; when twisted, the
+      //               values of the two flavours are exchanged on sites with coordinate 0
+      //               in direction mu.
+      // (Description assumes where(c,a,b) selects a where c holds -- confirm.)
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+      {
+	
+	conformable(Uds._grid,GaugeGrid);
+	conformable(Umu._grid,GaugeGrid);
+	
+	GaugeLinkField Utmp(GaugeGrid);
+	GaugeLinkField U(GaugeGrid);
+	GaugeLinkField Uconj(GaugeGrid);
+	
+	Lattice<iScalar<vInteger> > coor(GaugeGrid);
+
+	
+	for(int mu=0;mu<Nd;mu++){
+	  
+	  LatticeCoordinate(coor,mu);
+	  
+	  U     = PeekIndex<LorentzIndex>(Umu,mu);
+	  Uconj = conjugate(U);
+
+	  // This phase could come from a simple bc 1,1,-1,1 ..
+	  int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
+	  if ( Params.twists[mu] ) { 
+	    Uconj = where(coor==neglink,-Uconj,Uconj);
+	  }
+
+	  
+	  // forward links: flavour 0 <- U, flavour 1 <- Uconj
+PARALLEL_FOR_LOOP
+	  for(auto ss=U.begin();ss<U.end();ss++){
+	    Uds[ss](0)(mu) = U[ss]();
+	    Uds[ss](1)(mu) = Uconj[ss]();
+	  }
+	  
+	  U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
+	  Uconj = adj(Cshift(Uconj,mu,-1));
+	  
+	  // backward link for flavour 0 (slot mu+4; literal 4, see loop bound Nd)
+	  Utmp = U;
+	  if ( Params.twists[mu] ) { 
+	    Utmp = where(coor==0,Uconj,Utmp);
+	  }
+	  
+PARALLEL_FOR_LOOP
+	  for(auto ss=U.begin();ss<U.end();ss++){
+	    Uds[ss](0)(mu+4) = Utmp[ss]();
+	  }
+	  
+	  // backward link for flavour 1
+	  Utmp = Uconj;
+	  if ( Params.twists[mu] ) { 
+	    Utmp = where(coor==0,U,Utmp);
+	  }
+	  
+PARALLEL_FOR_LOOP
+	  for(auto ss=U.begin();ss<U.end();ss++){
+	    Uds[ss](1)(mu+4) = Utmp[ss]();
+	  }
+	  
+	}
+      }
+
+      // Not implemented: aborts via assert(0) (and silently does nothing if asserts are compiled out).
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	assert(0);
+	// Fixme
+	return;
+      }
+      // Not implemented: aborts via assert(0) (and silently does nothing if asserts are compiled out).
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	assert(0);
+	// Fixme
+	return;
+      }
+    };
+
+    // Concrete instantiations with Nrepresentation = Nc, one per SIMD complex type:
+    // suffix R -> vComplex, F -> vComplexF, D -> vComplexD.
+    typedef WilsonImpl<vComplex ,Nc> WilsonImplR; // Real.. whichever prec
+    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
+    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
+
+    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
+    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
+    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
+
+  }
+}
+#endif
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@@ -7,35 +7,38 @@ namespace Grid {

  namespace QCD {

-    class MobiusFermion : public CayleyFermion5D
+    template<class Impl>
+    class MobiusFermion : public CayleyFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void) {};
      // Constructors
-      MobiusFermion(LatticeGaugeField &_Umu,
+      MobiusFermion(GaugeField &_Umu,
 		    GridCartesian         &FiveDimGrid,
 		    GridRedBlackCartesian &FiveDimRedBlackGrid,
 		    GridCartesian         &FourDimGrid,
 		    GridRedBlackCartesian &FourDimRedBlackGrid,
 		    RealD _mass,RealD _M5,
-		    RealD b, RealD c) : 
+		    RealD b, RealD c,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D(_Umu,
-		      FiveDimGrid,
-		      FiveDimRedBlackGrid,
-		      FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)

      {
 	RealD eps = 1.0;

-	std::cout << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<Ls<<" Tanh approx"<<std::endl;
+	std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
 	
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsTanh(zdata,b,c);
+	this->SetCoefficientsTanh(zdata,b,c);

 	Approx::zolotarev_free(zdata);
 
--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -7,26 +7,29 @@ namespace Grid {

  namespace QCD {

-    class MobiusZolotarevFermion : public CayleyFermion5D
+    template<class Impl>
+    class MobiusZolotarevFermion : public CayleyFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void) {};
      // Constructors
-       MobiusZolotarevFermion(LatticeGaugeField &_Umu,
+       MobiusZolotarevFermion(GaugeField &_Umu,
 			      GridCartesian         &FiveDimGrid,
 			      GridRedBlackCartesian &FiveDimRedBlackGrid,
 			      GridCartesian         &FourDimGrid,
 			      GridRedBlackCartesian &FourDimRedBlackGrid,
 			      RealD _mass,RealD _M5,
 			      RealD b, RealD c,
-			      RealD lo, RealD hi) : 
+			      RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D(_Umu,
-		      FiveDimGrid,
-		      FiveDimRedBlackGrid,
-		      FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)

      {
 	RealD eps = lo/hi;
@@ -34,10 +37,10 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0);
 	assert(zdata->n==this->Ls);

-	std::cout << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
+	std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
 	
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsZolotarev(hi,zdata,b,c);
+	this->SetCoefficientsZolotarev(hi,zdata,b,c);
 
 	Approx::zolotarev_free(zdata);
      }
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -7,25 +7,28 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonCayleyTanhFermion : public MobiusFermion
+    template<class Impl>
+    class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // Constructors
-    OverlapWilsonCayleyTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
 				   GridCartesian         &FiveDimGrid,
 				   GridRedBlackCartesian &FiveDimRedBlackGrid,
 				   GridCartesian         &FourDimGrid,
 				   GridRedBlackCartesian &FourDimRedBlackGrid,
 				   RealD _mass,RealD _M5,
-				   RealD scale) :
+				   RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      MobiusFermion(_Umu,
-		    FiveDimGrid,
-		    FiveDimRedBlackGrid,
-		    FourDimGrid,
-		    FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale)
+      MobiusFermion<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale,p)
 	{
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@@ -7,25 +7,28 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion
+    template<class Impl>
+    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // Constructors

-    OverlapWilsonCayleyZolotarevFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
 					GridCartesian         &FiveDimGrid,
 					GridRedBlackCartesian &FiveDimRedBlackGrid,
 					GridCartesian         &FourDimGrid,
 					GridRedBlackCartesian &FourDimRedBlackGrid,
 					RealD _mass,RealD _M5,
-					RealD lo, RealD hi) : 
+					RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      // b+c=1.0, b-c = 0 <=> b =c = 1/2
-      MobiusZolotarevFermion(_Umu,
-			     FiveDimGrid,
-			     FiveDimRedBlackGrid,
-			     FourDimGrid,
-			     FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi)
+      MobiusZolotarevFermion<Impl>(_Umu,
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi,p)

      {}

--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -7,31 +7,34 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonContFracTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
 				     GridCartesian         &FiveDimGrid,
 				     GridRedBlackCartesian &FiveDimRedBlackGrid,
 				     GridCartesian         &FourDimGrid,
 				     GridRedBlackCartesian &FourDimRedBlackGrid,
 				     RealD _mass,RealD _M5,
-				     RealD scale) :
+				     RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D(_Umu,
-				 FiveDimGrid,
-				 FiveDimRedBlackGrid,
-				 FourDimGrid,
-				 FourDimRedBlackGrid,_mass,_M5)
+      ContinuedFractionFermion5D<Impl>(_Umu,
+				       FiveDimGrid,
+				       FiveDimRedBlackGrid,
+				       FourDimGrid,
+				       FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls-1;// Even rational order
+	  assert((this->Ls&0x1)==1); // Odd Ls required
+	  int nrational=this->Ls-1;// Even rational order
 	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  SetCoefficientsTanh(zdata,scale);
+	  this->SetCoefficientsTanh(zdata,scale);
 	  Approx::zolotarev_free(zdata);
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -7,34 +7,36 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonContFracZolotarevFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
 					  GridCartesian         &FiveDimGrid,
 					  GridRedBlackCartesian &FiveDimRedBlackGrid,
 					  GridCartesian         &FourDimGrid,
 					  GridRedBlackCartesian &FourDimRedBlackGrid,
 					  RealD _mass,RealD _M5,
-					  RealD lo,RealD hi):
+					  RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D(_Umu,
-				 FiveDimGrid,
-				 FiveDimRedBlackGrid,
-				 FourDimGrid,
-				 FourDimRedBlackGrid,_mass,_M5)
+      ContinuedFractionFermion5D<Impl>(_Umu,
+				       FiveDimGrid,
+				       FiveDimRedBlackGrid,
+				       FourDimGrid,
+				       FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=Ls;// Odd rational order
+	  int nrational=this->Ls;// Odd rational order
 	  RealD eps = lo/hi;

 	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  SetCoefficientsZolotarev(hi,zdata);
+	  this->SetCoefficientsZolotarev(hi,zdata);
 	  Approx::zolotarev_free(zdata);

 	}
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -7,31 +7,34 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonPartialFractionTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
 					    GridCartesian         &FiveDimGrid,
 					    GridRedBlackCartesian &FiveDimRedBlackGrid,
 					    GridCartesian         &FourDimGrid,
 					    GridRedBlackCartesian &FourDimRedBlackGrid,
 					    RealD _mass,RealD _M5,
-					    RealD scale) :
+					    RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D(_Umu,
-			       FiveDimGrid,
-			       FiveDimRedBlackGrid,
-			       FourDimGrid,
-			       FourDimRedBlackGrid,_mass,_M5)
+      PartialFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls-1;// Even rational order
+	  assert((this->Ls&0x1)==1); // Odd Ls required
+	  int nrational=this->Ls-1;// Even rational order
 	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  SetCoefficientsTanh(zdata,scale);
+	  this->SetCoefficientsTanh(zdata,scale);
 	  Approx::zolotarev_free(zdata);
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -7,34 +7,36 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonPartialFractionZolotarevFermion(LatticeGaugeField &_Umu,
-					  GridCartesian         &FiveDimGrid,
-					  GridRedBlackCartesian &FiveDimRedBlackGrid,
-					  GridCartesian         &FourDimGrid,
-					  GridRedBlackCartesian &FourDimRedBlackGrid,
-					  RealD _mass,RealD _M5,
-					  RealD lo,RealD hi):
+    OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
+						 GridCartesian         &FiveDimGrid,
+						 GridRedBlackCartesian &FiveDimRedBlackGrid,
+						 GridCartesian         &FourDimGrid,
+						 GridRedBlackCartesian &FourDimRedBlackGrid,
+						 RealD _mass,RealD _M5,
+						 RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D(_Umu,
-			       FiveDimGrid,
-			       FiveDimRedBlackGrid,
-			       FourDimGrid,
-			       FourDimRedBlackGrid,_mass,_M5)
+      PartialFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=Ls;// Odd rational order
+	  int nrational=this->Ls;// Odd rational order
 	  RealD eps = lo/hi;

 	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  SetCoefficientsZolotarev(hi,zdata);
+	  this->SetCoefficientsZolotarev(hi,zdata);
 	  Approx::zolotarev_free(zdata);

 	}
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -2,12 +2,15 @@
 namespace Grid {
  namespace QCD {

-    void  PartialFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
       // this does both dag and undag but is trivial; make a common helper routine

      int sign = 1;
+      int Ls = this->Ls;

-      DhopDir(psi,chi,dir,disp);
+      this->DhopDir(psi,chi,dir,disp);

      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -18,15 +21,16 @@ namespace Grid {
      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);

    }
-    void   PartialFractionFermion5D::Meooe_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
    {
-      // this does both dag and undag but is trivial; make a common helper routing
+      int Ls = this->Ls;
      int sign = dag ? (-1) : 1;

      if ( psi.checkerboard == Odd ) {
-	DhopEO(psi,chi,DaggerNo);
+	this->DhopEO(psi,chi,DaggerNo);
      } else {
-	DhopOE(psi,chi,DaggerNo);
+	this->DhopOE(psi,chi,DaggerNo);
      }

      int nblock=(Ls-1)/2;
@@ -38,10 +42,12 @@ namespace Grid {
      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
    }

-    void   PartialFractionFermion5D::Mooee_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
    {
      // again dag and undag are trivially related
      int sign = dag ? (-1) : 1;
+      int Ls = this->Ls;
      
      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -69,11 +75,13 @@ namespace Grid {
      }
    }

-    void   PartialFractionFermion5D::MooeeInv_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
    {
      int sign = dag ? (-1) : 1;
+      int Ls = this->Ls;

-      LatticeFermion tmp(psi._grid);
+      FermionField tmp(psi._grid);
      
      ///////////////////////////////////////////////////////////////////////////////////////
      //Linv
@@ -129,10 +137,12 @@ namespace Grid {
      axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
    }

-    void   PartialFractionFermion5D::M_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
    {
-      LatticeFermion D(psi._grid);
+      FermionField D(psi._grid);
  
+      int Ls = this->Ls;
      int sign = dag ? (-1) : 1;

      // For partial frac Hw case (b5=c5=1) chroma quirkily computes
@@ -186,7 +196,7 @@ namespace Grid {
      //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
      //

-      DW(psi,D,DaggerNo); 
+      this->DW(psi,D,DaggerNo); 

      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -217,61 +227,127 @@ namespace Grid {

    }

-    RealD  PartialFractionFermion5D::M    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
    {
      M_internal(in,out,DaggerNo);
      return norm2(out);
    }
-    RealD  PartialFractionFermion5D::Mdag (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
    {
      M_internal(in,out,DaggerYes);
      return norm2(out);
    }

-    void PartialFractionFermion5D::Meooe       (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
    {
      Meooe_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MeooeDag    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
    {
      Meooe_internal(in,out,DaggerYes);
    }
-    void PartialFractionFermion5D::Mooee       (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
    {
      Mooee_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MooeeDag    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
    {
      Mooee_internal(in,out,DaggerYes);
    }

-    void PartialFractionFermion5D::MooeeInv    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
    {
      MooeeInv_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MooeeInvDag (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
    {
      MooeeInv_internal(in,out,DaggerYes);
    }

-    void  PartialFractionFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
+
+  // force terms: the three hopping-term derivatives (MDeriv, MoeDeriv, MeoDeriv) are defined here
+  //
+  // MDeriv: builds D from U slice by slice with ag5xpby_ssp and then calls
+  // DhopDeriv(mat,D,V,DaggerNo).  Slices come in pairs (2b, 2b+1) with coefficients
+  // -scale and +scale; the last slice Ls-1 uses p[nblock]*scale/amax.
+  // (ag5xpby_ssp is presumably a*gamma5*x + b*y on one fifth-dimension slice -- confirm.)
+  // NOTE(review): the dag argument is not used; DaggerNo is always passed on.
+    template<class Impl>
+   void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+
+    FermionField D(V._grid);
+
+    // (Ls-1)/2 pairs of slices, plus the final slice Ls-1 handled after the loop
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
+      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
+    }
+    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+
+    this->DhopDeriv(mat,D,V,DaggerNo); 
+  };
+  // MoeDeriv: same slice-by-slice construction of D from U as MDeriv
+  // (-scale / +scale on slice pairs, p[nblock]*scale/amax on slice Ls-1),
+  // followed by DhopDerivOE(mat,D,V,DaggerNo).
+  // NOTE(review): the dag argument is not used; DaggerNo is always passed on.
+    template<class Impl>
+   void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+
+    FermionField D(V._grid);
+
+    // (Ls-1)/2 pairs of slices, plus the final slice Ls-1 handled after the loop
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
+      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
+    }
+    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+
+    this->DhopDerivOE(mat,D,V,DaggerNo); 
+  };
+  // MeoDeriv: same slice-by-slice construction of D from U as MDeriv
+  // (-scale / +scale on slice pairs, p[nblock]*scale/amax on slice Ls-1),
+  // followed by DhopDerivEO(mat,D,V,DaggerNo).
+  // NOTE(review): the dag argument is not used; DaggerNo is always passed on.
+    template<class Impl>
+   void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+
+    FermionField D(V._grid);
+
+    // (Ls-1)/2 pairs of slices, plus the final slice Ls-1 handled after the loop
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
+      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
+    }
+    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+
+    this->DhopDerivEO(mat,D,V,DaggerNo); 
+  };
+
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
      SetCoefficientsZolotarev(1.0/scale,zdata);
    }
-    void  PartialFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){

      // check on degree matching
-      //      std::cout << Ls << " Ls"<<std::endl;
-      //      std::cout << zdata->n  << " - n"<<std::endl;
-      //      std::cout << zdata->da << " -da "<<std::endl;
-      //      std::cout << zdata->db << " -db"<<std::endl;
-      //      std::cout << zdata->dn << " -dn"<<std::endl;
-      //      std::cout << zdata->dd << " -dd"<<std::endl;
+      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+      int Ls = this->Ls;
+
      assert(Ls == (2*zdata->da -1) );

      // Part frac
      //      RealD R;
      R=(1+mass)/(1-mass);
-      dw_diag = (4.0-M5);
+      dw_diag = (4.0-this->M5);

      //      std::vector<RealD> p; 
      //      std::vector<RealD> q;
@@ -291,18 +367,22 @@ namespace Grid {
    }

      // Constructors
-    PartialFractionFermion5D::PartialFractionFermion5D(LatticeGaugeField &_Umu,
-						       GridCartesian         &FiveDimGrid,
-						       GridRedBlackCartesian &FiveDimRedBlackGrid,
-						       GridCartesian         &FourDimGrid,
-						       GridRedBlackCartesian &FourDimRedBlackGrid,
-						       RealD _mass,RealD M5) :
-      WilsonFermion5D(_Umu,
-		      FiveDimGrid, FiveDimRedBlackGrid,
-		      FourDimGrid, FourDimRedBlackGrid,M5),
+    template<class Impl>
+    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							     GridCartesian         &FiveDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
+							     RealD _mass,RealD M5,
+							     const ImplParams &p) :
+      WilsonFermion5D<Impl>(_Umu,
+			    FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid,M5,p),
      mass(_mass)

    {
+      int Ls = this->Ls;
+
      assert((Ls&0x1)==1); // Odd Ls required
      int nrational=Ls-1;

@@ -321,6 +401,8 @@ namespace Grid {

    }
 
+    FermOpTemplateInstantiate(PartialFractionFermion5D); // macro defined elsewhere; presumably emits the explicit instantiations of this template -- TODO confirm
+
 }
 }

--- a/lib/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.h
@@ -5,41 +5,48 @@ namespace Grid {

  namespace QCD {

-    class PartialFractionFermion5D : public WilsonFermion5D
+    template<class Impl>
+    class PartialFractionFermion5D : public WilsonFermion5D<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      const int part_frac_chroma_convention=1;

-      void   Meooe_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void   Mooee_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void   MooeeInv_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void   M_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
+      void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
+      void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
+      void   M_internal(const FermionField &in, FermionField &out,int dag);

      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);

      // half checkerboard operaions
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+
+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

      virtual void   Instantiatable(void) =0; // ensure no make-eee

      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      // Constructors
-      PartialFractionFermion5D(LatticeGaugeField &_Umu,
-				    GridCartesian         &FiveDimGrid,
-				    GridRedBlackCartesian &FiveDimRedBlackGrid,
-				    GridCartesian         &FourDimGrid,
-				    GridRedBlackCartesian &FourDimRedBlackGrid,
-				    RealD _mass,RealD M5);
+      PartialFractionFermion5D(GaugeField &_Umu,
+			       GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       RealD _mass,RealD M5,const ImplParams &p= ImplParams());

    protected:

--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@@ -7,12 +7,14 @@ namespace Grid {

  namespace QCD {

-    class ScaledShamirFermion : public MobiusFermion
+    template<class Impl>
+    class ScaledShamirFermion : public MobiusFermion<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      // Constructors
-    ScaledShamirFermion(LatticeGaugeField &_Umu,
+    ScaledShamirFermion(GaugeField &_Umu,
 			GridCartesian         &FiveDimGrid,
 			GridRedBlackCartesian &FiveDimRedBlackGrid,
 			GridCartesian         &FourDimGrid,
@@ -21,7 +23,7 @@ namespace Grid {
 			RealD scale) :
      
      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
-      MobiusFermion(_Umu,
+      MobiusFermion<Impl>(_Umu,
 		    FiveDimGrid,
 		    FiveDimRedBlackGrid,
 		    FourDimGrid,
--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@@ -7,27 +7,29 @@ namespace Grid {

  namespace QCD {

-    class ShamirZolotarevFermion : public MobiusZolotarevFermion
+    template<class Impl>
+    class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      // Constructors


-    ShamirZolotarevFermion(LatticeGaugeField &_Umu,
+    ShamirZolotarevFermion(GaugeField &_Umu,
 			   GridCartesian         &FiveDimGrid,
 			   GridRedBlackCartesian &FiveDimRedBlackGrid,
 			   GridCartesian         &FourDimGrid,
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD _M5,
-			   RealD lo, RealD hi) : 
+			   RealD lo, RealD hi,const ImplParams &p= ImplParams()) : // p defaults to ImplParams() and is forwarded to the base
      
      // b+c = 1; b-c = 1 => b=1, c=0
-      MobiusZolotarevFermion(_Umu,
-			     FiveDimGrid,
-			     FiveDimRedBlackGrid,
-			     FourDimGrid,
-			     FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi)
+      MobiusZolotarevFermion<Impl>(_Umu, // all work is delegated to the base constructor; the body is empty
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi,p) // the literals 1.0 and 0.0 are the b and c of the comment above
      
      {}

--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -4,6 +4,7 @@
 namespace Grid {
 namespace QCD {

+  template<class SiteHalfSpinor,class SiteSpinor>
  class WilsonCompressor {
  public:
    int mu;
@@ -18,9 +19,13 @@ namespace QCD {
      mu=p;
    };

-    vHalfSpinColourVector operator () (const vSpinColourVector &in)
+    virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) { // compressor call operator
+      return spinproject(in); // dim, plane, osite and grid are not used by this implementation; the function is virtual
+    }
+
+    SiteHalfSpinor spinproject(const SiteSpinor &in)
    {
-      vHalfSpinColourVector ret;
+      SiteHalfSpinor ret;
      int mudag=mu;
      if (dag) {
 	mudag=(mu+Nd)%(2*Nd);
@@ -57,5 +62,7 @@ namespace QCD {
      return ret;
    }
  };
+
+
 }} // namespace close
 #endif
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -3,179 +3,291 @@
 namespace Grid {
 namespace QCD {

-const std::vector<int> WilsonFermion::directions   ({0,1,2,3, 0, 1, 2, 3});
-const std::vector<int> WilsonFermion::displacements({1,1,1,1,-1,-1,-1,-1});
+  const std::vector<int> WilsonFermionStatic::directions   ({0,1,2,3, 0, 1, 2, 3}); // dimension of each of the 8 stencil legs
+  const std::vector<int> WilsonFermionStatic::displacements({1,1,1,1,-1,-1,-1,-1}); // legs 0-3 hop by +1, legs 4-7 by -1
+  int WilsonFermionStatic::HandOptDslash; // no initialiser, so zero-initialised; DhopInternal tests it to pick the DiracOptHand* kernels

-int WilsonFermion::HandOptDslash;
+  /////////////////////////////////
+  // Constructor and gauge import
+  /////////////////////////////////

-WilsonFermion::WilsonFermion(LatticeGaugeField &_Umu,
-			     GridCartesian         &Fgrid,
-			     GridRedBlackCartesian &Hgrid, 
-			     RealD _mass) :
-  _grid(&Fgrid),
-  _cbgrid(&Hgrid),
-  Stencil    (&Fgrid,npoint,Even,directions,displacements),
-  StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
-  mass(_mass),
-  Umu(&Fgrid),
-  UmuEven(&Hgrid),
-  UmuOdd (&Hgrid)
-{
-  // Allocate the required comms buffer
-  comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
-  DoubleStore(Umu,_Umu);
-  pickCheckerboard(Even,UmuEven,Umu);
-  pickCheckerboard(Odd ,UmuOdd,Umu);
-}
-      
-void WilsonFermion::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
-{
-  conformable(Uds._grid,GaugeGrid());
-  conformable(Umu._grid,GaugeGrid());
-  LatticeColourMatrix U(GaugeGrid());
-  for(int mu=0;mu<Nd;mu++){
-    U = PeekIndex<LorentzIndex>(Umu,mu);
-    PokeIndex<LorentzIndex>(Uds,U,mu);
-    U = adj(Cshift(U,mu,-1));
-    PokeIndex<LorentzIndex>(Uds,U,mu+4);
-  }
-}
-
-RealD WilsonFermion::M(const LatticeFermion &in, LatticeFermion &out)
-{
-  out.checkerboard=in.checkerboard;
-  Dhop(in,out,DaggerNo);
-  return axpy_norm(out,4+mass,in,out);
-}
-RealD WilsonFermion::Mdag(const LatticeFermion &in, LatticeFermion &out)
-{
-  out.checkerboard=in.checkerboard;
-  Dhop(in,out,DaggerYes);
-  return axpy_norm(out,4+mass,in,out);
-}
-
-void WilsonFermion::Meooe(const LatticeFermion &in, LatticeFermion &out)
-{
-  if ( in.checkerboard == Odd ) {
-    DhopEO(in,out,DaggerNo);
-  } else {
-    DhopOE(in,out,DaggerNo);
-  }
-}
-void WilsonFermion::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
-{
-  if ( in.checkerboard == Odd ) {
-    DhopEO(in,out,DaggerYes);
-  } else {
-    DhopOE(in,out,DaggerYes);
-  }
-}
-void WilsonFermion::Mooee(const LatticeFermion &in, LatticeFermion &out)
-{
-  out.checkerboard = in.checkerboard;
-  out = (4.0+mass)*in;
-  return ;
-}
-void WilsonFermion::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
-{
-  out.checkerboard = in.checkerboard;
-  Mooee(in,out);
-}
-void WilsonFermion::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
-{
-  out.checkerboard = in.checkerboard;
-  out = (1.0/(4.0+mass))*in;
-  return ;
-}
-void WilsonFermion::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
-{
-  out.checkerboard = in.checkerboard;
-  MooeeInv(in,out);
-}
-void WilsonFermion::Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp)
-{
-  DhopDir(in,out,dir,disp);
-}
-void WilsonFermion::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp){
-  WilsonCompressor compressor(DaggerNo);
-  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
-  
-  assert( (disp==1)||(disp==-1) );
-
-  int skip = (disp==1) ? 0 : 1;
-
-  int dirdisp = dir+skip*4;
-
-PARALLEL_FOR_LOOP
-  for(int sss=0;sss<in._grid->oSites();sss++){
-    DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp);
+  template<class Impl> // Sets up the stencils and gauge storage on both grids, then imports _Umu.
+  WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu,
+				     GridCartesian         &Fgrid,
+				     GridRedBlackCartesian &Hgrid, 
+				     RealD _mass,const ImplParams &p) :
+        Kernels(p), // p goes to the WilsonKernels<Impl> base
+        _grid(&Fgrid), // full grid
+	_cbgrid(&Hgrid), // checkerboarded grid
+	Stencil    (&Fgrid,npoint,Even,directions,displacements), // full-grid stencil
+	StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
+	StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
+	mass(_mass),
+	Umu(&Fgrid), // doubled gauge field on the full grid; filled by ImportGauge below
+	UmuEven(&Hgrid), // half-grid copies, also filled by ImportGauge
+	UmuOdd (&Hgrid) 
+  {
+    // Allocate the required comms buffer
+    comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
+    ImportGauge(_Umu); // double-store _Umu and split it into the even/odd copies
  }

-};
+  template<class Impl>
+  void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+  { // Rebuild all three internal gauge copies (Umu, UmuEven, UmuOdd) from _Umu.
+    Impl::DoubleStore(this->GaugeGrid(),this->Umu,_Umu); // full-grid doubled field first ...
+    pickCheckerboard(Odd ,this->UmuOdd ,this->Umu);      // ... then its odd subset
+    pickCheckerboard(Even,this->UmuEven,this->Umu);      // ... and its even subset
+  }
  
-void WilsonFermion::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U,
-				const LatticeFermion &in, LatticeFermion &out,int dag)
-{
-  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  WilsonCompressor compressor(dag);
-  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+  /////////////////////////////
+  // Implement the interface
+  /////////////////////////////
      
-  if ( dag == DaggerYes ) {
-    if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-      for(int sss=0;sss<in._grid->oSites();sss++){
-        DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
-      }
+  template<class Impl> RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
+    // No dagger: Dhop fills out, then axpy_norm (defined elsewhere; presumably out = diag*in + out) gives the return value.
+    const RealD diag = 4+mass;
+    out.checkerboard = in.checkerboard;
+    this->Dhop(in,out,DaggerNo);
+    return axpy_norm(out,diag,in,out);
+  }
+
+  template<class Impl> RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+    // Daggered counterpart of M: identical except that Dhop is called with DaggerYes.
+    const RealD diag = 4+mass;
+    out.checkerboard = in.checkerboard;
+    this->Dhop(in,out,DaggerYes);
+    return axpy_norm(out,diag,in,out);
+  }
+
+  template<class Impl> // Checkerboard-changing hop, no dagger; the branch is chosen from in's checkerboard.
+  void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
+  {
+    if ( in.checkerboard == Odd ) {
+      DhopEO(in,out,DaggerNo); // odd input: DhopEO tags out as Even
    } else {
-PARALLEL_FOR_LOOP
-      for(int sss=0;sss<in._grid->oSites();sss++){
-        DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
-      }
-    }
-  } else {
-    if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-      for(int sss=0;sss<in._grid->oSites();sss++){
-        DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
-      }
-    } else { 
-PARALLEL_FOR_LOOP
-      for(int sss=0;sss<in._grid->oSites();sss++){
-        DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
-      }
+      DhopOE(in,out,DaggerNo); // otherwise DhopOE, which asserts in is Even and tags out as Odd
+    }
+  }
+  template<class Impl> // Daggered checkerboard-changing hop; same dispatch as Meooe but with DaggerYes.
+  void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
+  {
+    if ( in.checkerboard == Odd ) {
+      DhopEO(in,out,DaggerYes); // odd input: DhopEO tags out as Even
+    } else {
+      DhopOE(in,out,DaggerYes); // otherwise DhopOE, which asserts in is Even and tags out as Odd
    }
  }
-}
-void WilsonFermion::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag)
-{
-  conformable(in._grid,_cbgrid);    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check

-  assert(in.checkerboard==Even);
-  out.checkerboard = Odd;
+  template<class Impl> void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+    // Site-diagonal block: scale in by (4+mass); out takes in's checkerboard tag.
+    out.checkerboard = in.checkerboard;
+    out = (4.0+mass)*in;
+  }
  
-  DhopInternal(StencilEven,UmuOdd,in,out,dag);
-}
-void WilsonFermion::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag)
-{
-  conformable(in._grid,_cbgrid);    // verifies half grid
-  conformable(in._grid,out._grid); // drops the cb check
+  template<class Impl> void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+    // The daggered diagonal block is implemented by the same routine as Mooee.
+    out.checkerboard = in.checkerboard;
+    this->Mooee(in,out);
+  }
  
-  assert(in.checkerboard==Odd);
-  out.checkerboard = Even;
+  template<class Impl> void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+    // Inverse of the site-diagonal block: scale in by 1/(4+mass); out takes in's checkerboard tag.
+    out.checkerboard = in.checkerboard;
+    out = (1.0/(4.0+mass))*in;
+  }
  
-  DhopInternal(StencilOdd,UmuEven,in,out,dag);
-}
-void WilsonFermion::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
-{
-  conformable(in._grid,_grid); // verifies full grid
-  conformable(in._grid,out._grid);
+  template<class Impl> void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+    // The daggered inverse diagonal block is implemented by the same routine as MooeeInv.
+    out.checkerboard = in.checkerboard;
+    this->MooeeInv(in,out);
+  }
  
-  out.checkerboard = in.checkerboard;
+  ///////////////////////////////////
+  // Internal
+  ///////////////////////////////////

-  DhopInternal(Stencil,Umu,in,out,dag);
-}
+  template<class Impl>
+  void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
+					  DoubledGaugeField & U,
+					  GaugeField &mat,
+					  const FermionField &A,
+					  const FermionField &B,int dag)
+  {
+    // Shared worker behind DhopDeriv, DhopDerivOE and DhopDerivEO.
+    //   st  : stencil used for the halo exchange and for each single hop
+    //   U   : doubled gauge field handed to the hopping kernel
+    //   mat : output, written once per direction by Impl::InsertForce4D
+    //   A,B : fermion fields; B is hopped, a copy of A is the other factor
+    //   dag : DaggerNo or DaggerYes, anything else trips the assert
+
+    assert((dag==DaggerYes) || (dag==DaggerNo));
+
+    // One halo exchange of B precedes all Nd single-direction hops below.
+    Compressor compressor(dag);
+    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
+
+    FermionField Atilde(B._grid); // working copy of A, allocated on B's grid
+    FermionField Btilde(B._grid); // receives the hopped B, one direction at a time
+    Atilde = A;
+
+    for(int mu=0;mu<Nd;mu++){
+
+      // Flip gamma (1+g)<->(1-g) if dag: the gamma index is offset by Nd.
+      const int gamma = dag ? mu+Nd : mu;
+
+      // Call the single hop for leg mu at every site.
+PARALLEL_FOR_LOOP
+      for(int site=0;site<B._grid->oSites();site++){
+	Kernels::DiracOptDhopDir(st,U,comm_buf,site,site,B,Btilde,mu,gamma);
+      }
+
+      // Spin trace outer product of Btilde and Atilde for this mu goes into mat.
+      Impl::InsertForce4D(mat,Btilde,Atilde,mu);
+
+    }
+  }
+  
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    // Full-grid derivative of the hopping term: U, V and mat are all checked against _grid.
+    conformable(U._grid,_grid);
+    conformable(U._grid,V._grid);
+    conformable(U._grid,mat._grid);
+
+    mat.checkerboard = U.checkerboard;
+    this->DerivInternal(this->Stencil,this->Umu,mat,U,V,dag); // full stencil, full doubled gauge field
+  }
+  
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    // Half-grid derivative taking an odd U and an even V; mat is tagged Odd.
+    conformable(U._grid,_cbgrid);
+    conformable(U._grid,V._grid);
+    conformable(U._grid,mat._grid);
+
+    assert(V.checkerboard==Even);
+    assert(U.checkerboard==Odd);
+    mat.checkerboard = Odd;
+    this->DerivInternal(this->StencilEven,this->UmuOdd,mat,U,V,dag); // same stencil/gauge pairing as DhopOE
+  }
+  
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    // Half-grid derivative taking an even U and an odd V; mat is tagged Even.
+    conformable(U._grid,_cbgrid);
+    conformable(U._grid,V._grid);
+    conformable(U._grid,mat._grid);
+
+    assert(V.checkerboard==Odd);
+    assert(U.checkerboard==Even);
+    mat.checkerboard = Even;
+    this->DerivInternal(this->StencilOdd,this->UmuEven,mat,U,V,dag); // same stencil/gauge pairing as DhopEO
+  }
+  
+
+  template<class Impl>
+  void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
+  {
+    // Hopping term on the full grid; out keeps the checkerboard tag of in.
+    conformable(in._grid,_grid);     // verifies full grid
+    conformable(in._grid,out._grid); // out must share in's grid
+    out.checkerboard = in.checkerboard;
+    this->DhopInternal(this->Stencil,this->Umu,in,out,dag);
+  }
+  
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
+  {
+    // Hop from an Even input to an Odd output on the checkerboarded grid.
+    conformable(in._grid,_cbgrid);   // verifies half grid
+    conformable(in._grid,out._grid); // drops the cb check
+    assert(in.checkerboard==Even);
+    out.checkerboard = Odd;
+    this->DhopInternal(this->StencilEven,this->UmuOdd,in,out,dag); // even-source stencil with the odd gauge subset
+  }
+  
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
+  {
+    // Hop from an Odd input to an Even output on the checkerboarded grid.
+    conformable(in._grid,_cbgrid);   // verifies half grid
+    conformable(in._grid,out._grid); // drops the cb check
+    assert(in.checkerboard==Odd);
+    out.checkerboard = Even;
+    this->DhopInternal(this->StencilOdd,this->UmuEven,in,out,dag); // odd-source stencil with the even gauge subset
+  }
+  
+  template<class Impl> void WilsonFermion<Impl>::Mdir (const FermionField &in, FermionField &out,int dir,int disp)
+  {
+    this->DhopDir(in,out,dir,disp); // Mdir adds nothing of its own: it only forwards to DhopDir
+  }
+  
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir,int disp){
+    // Single-leg hop, no dagger: maps (dir,disp) to one of the eight stencil legs and forwards to DhopDirDisp.
+    assert( (disp==1)||(disp==-1) );      // restored check: without it any other disp is silently treated as -1
+    int skip = (disp==1) ? 0 : 1;         // legs 0..3 are the +1 hops, legs 4..7 the -1 hops
+    int dirdisp = dir+skip*4;
+    assert( (dirdisp>=0)&&(dirdisp<=7) ); // must index one of the eight legs in directions/displacements
+    DhopDirDisp(in,out,dirdisp,dirdisp,DaggerNo); // the same index is passed as both the leg and the gamma argument
+  }
+  
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag)
+  {
+    // Apply one stencil leg (dirdisp) with gamma index gamma at every site of in, using the
+    // full-grid stencil and gauge field.  dag is only used to configure the compressor.
+    Compressor halo_compressor(dag);
+    Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,halo_compressor);
+
+PARALLEL_FOR_LOOP
+    for(int site=0;site<in._grid->oSites();site++){
+      Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,site,site,in,out,dirdisp,gamma);
+    }
+  }
+
+
+  template<class Impl>
+  void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
+					 const FermionField &in, FermionField &out,int dag)
+  {
+    // Hopping-term worker shared by Dhop, DhopOE and DhopEO: one halo exchange of in,
+    // then one of four site kernels chosen by (dag, HandOptDslash).
+    assert((dag==DaggerYes) || (dag==DaggerNo));
+
+    Compressor compressor(dag);
+    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
+
+    const bool daggered = (dag == DaggerYes);
+    const bool hand_opt = (HandOptDslash != 0);
+    if ( daggered && hand_opt ) {
+PARALLEL_FOR_LOOP
+      for(int site=0;site<in._grid->oSites();site++){
+	Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,site,site,in,out);
+      }
+    } else if ( daggered ) {
+PARALLEL_FOR_LOOP
+      for(int site=0;site<in._grid->oSites();site++){
+	Kernels::DiracOptDhopSiteDag(st,U,comm_buf,site,site,in,out);
+      }
+    } else if ( hand_opt ) {
+PARALLEL_FOR_LOOP
+      for(int site=0;site<in._grid->oSites();site++){
+	Kernels::DiracOptHandDhopSite(st,U,comm_buf,site,site,in,out);
+      }
+    } else {
+PARALLEL_FOR_LOOP
+      for(int site=0;site<in._grid->oSites();site++){
+	Kernels::DiracOptDhopSite(st,U,comm_buf,site,site,in,out);
+      }
+    }
+  }
+ 
+  FermOpTemplateInstantiate(WilsonFermion); // macro defined elsewhere; presumably emits the explicit instantiations of WilsonFermion<Impl> -- TODO confirm

 }}

--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -5,9 +5,21 @@ namespace Grid {

  namespace QCD {

-    class WilsonFermion : public FermionOperator<LatticeFermion,LatticeGaugeField>
+    class WilsonFermionStatic { // non-template holder for the statics that WilsonFermion<Impl> inherits
+    public:
+      static const int npoint=8;                   // number of stencil legs
+      static const std::vector<int> directions;    // per-leg dimension; defined in WilsonFermion.cc
+      static const std::vector<int> displacements; // per-leg displacement; defined in WilsonFermion.cc
+      static int HandOptDslash; // temporary hack: non-zero selects the DiracOptHand* kernels in DhopInternal
+      static int MortonOrder;   // temporary hack; NOTE(review): no definition appears beside HandOptDslash's in WilsonFermion.cc -- confirm
+    };
+
+    template<class Impl>
+    class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
    {
    public:
+    INHERIT_IMPL_TYPES(Impl);
+    typedef WilsonKernels<Impl> Kernels;

      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
@@ -17,75 +29,102 @@ namespace Grid {
      GridBase *FermionGrid(void)            { return _grid;}
      GridBase *FermionRedBlackGrid(void)    { return _cbgrid;}

-      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      //////////////////////////////////////////////////////////////////
+      // override multiply; passing a dagger argument would cut the number of
+      // routines and also make the interface more uniform
+      //////////////////////////////////////////////////////////////////
+      RealD M(const FermionField &in, FermionField &out);
+      RealD Mdag(const FermionField &in, FermionField &out);

-      // half checkerboard operaions
-      void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
-      void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out); // remain virtual so we 
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out); // can derive Clover
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out); // from Wilson base
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      /////////////////////////////////////////////////////////
+      // half checkerboard operations
+      // could remain virtual so we  can derive Clover from Wilson base
+      /////////////////////////////////////////////////////////
+      void Meooe(const FermionField &in, FermionField &out) ;
+      void MeooeDag(const FermionField &in, FermionField &out) ;
+      void Mooee(const FermionField &in, FermionField &out) ;
+      void MooeeDag(const FermionField &in, FermionField &out) ;
+      void MooeeInv(const FermionField &in, FermionField &out) ;
+      void MooeeInvDag(const FermionField &in, FermionField &out) ;

+      ////////////////////////
+      // Derivative interface
+      ////////////////////////
+      // Interface calls an internal routine
+      void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+
+      ///////////////////////////////////////////////////////////////
      // non-hermitian hopping term; half cb or both
-      void Dhop  (const LatticeFermion &in, LatticeFermion &out,int dag);
-      void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
+      ///////////////////////////////////////////////////////////////
+      void Dhop(const FermionField &in, FermionField &out,int dag) ;
+      void DhopOE(const FermionField &in, FermionField &out,int dag) ;
+      void DhopEO(const FermionField &in, FermionField &out,int dag) ;

-      // Multigrid assistance
-      void   Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
-      void DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      ///////////////////////////////////////////////////////////////
+      // Multigrid assistance; also used by the force term
+      ///////////////////////////////////////////////////////////////
+      void Mdir (const FermionField &in, FermionField &out,int dir,int disp) ;
+      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+      void DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) ;

      ///////////////////////////////////////////////////////////////
      // Extra methods added by derived
      ///////////////////////////////////////////////////////////////
-      void DhopInternal(CartesianStencil & st,
-			LatticeDoubledGaugeField &U,
-			const LatticeFermion &in, 
-			LatticeFermion &out,
-			int dag);
+      void DerivInternal(CartesianStencil & st,
+			 DoubledGaugeField & U,
+			 GaugeField &mat,
+			 const FermionField &A,
+			 const FermionField &B,
+			 int dag);
+
+      void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
+			const FermionField &in, FermionField &out,int dag) ;
+

      // Constructor
-      WilsonFermion(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid,RealD _mass);
+      WilsonFermion(GaugeField &_Umu,
+		    GridCartesian         &Fgrid,
+		    GridRedBlackCartesian &Hgrid, 
+		    RealD _mass,
+		    const ImplParams &p= ImplParams()
+		    ) ;

-      // DoubleStore
-      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
+      // DoubleStore impl dependent
+      void ImportGauge(const GaugeField &_Umu);

      ///////////////////////////////////////////////////////////////
      // Data members require to support the functionality
      ///////////////////////////////////////////////////////////////
-      static int HandOptDslash; // these are a temporary hack
-      static int MortonOrder;

-    protected:
+      //    protected:
+    public:

      RealD                        mass;

      GridBase                     *    _grid; 
      GridBase                     *  _cbgrid;

-      static const int npoint=8;
-      static const std::vector<int> directions   ;
-      static const std::vector<int> displacements;
-
      //Defines the stencils for even and odd
      CartesianStencil Stencil; 
      CartesianStencil StencilEven; 
      CartesianStencil StencilOdd; 

      // Copy of the gauge field , with even and odd subsets
-      LatticeDoubledGaugeField Umu;
-      LatticeDoubledGaugeField UmuEven;
-      LatticeDoubledGaugeField UmuOdd;
+      DoubledGaugeField Umu;
+      DoubledGaugeField UmuEven;
+      DoubledGaugeField UmuOdd;

      // Comms buffer
-      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;
-
+      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
      
    };

+    typedef WilsonFermion<WilsonImplF> WilsonFermionF; // WilsonImplF is declared elsewhere; presumably the single-precision implementation -- TODO confirm
+    typedef WilsonFermion<WilsonImplD> WilsonFermionD; // WilsonImplD is declared elsewhere; presumably the double-precision implementation -- TODO confirm
+
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -3,19 +3,20 @@
 namespace Grid {
 namespace QCD {
  
-  // S-direction is INNERMOST and takes no part in the parity.
-  const std::vector<int> WilsonFermion5D::directions   ({1,2,3,4, 1, 2, 3, 4});
-  const std::vector<int> WilsonFermion5D::displacements({1,1,1,1,-1,-1,-1,-1});
-
-  int WilsonFermion5D::HandOptDslash;
+// S-direction is INNERMOST and takes no part in the parity.
+const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4}); // the 8 legs use dimensions 1..4 only; dimension 0 never appears
+const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1}); // legs 0-3 hop by +1, legs 4-7 by -1
+int WilsonFermion5DStatic::HandOptDslash; // no initialiser, so zero-initialised

  // 5d lattice for DWF.
-  WilsonFermion5D::WilsonFermion5D(LatticeGaugeField &_Umu,
-					   GridCartesian         &FiveDimGrid,
-					   GridRedBlackCartesian &FiveDimRedBlackGrid,
-					   GridCartesian         &FourDimGrid,
-					   GridRedBlackCartesian &FourDimRedBlackGrid,
-					   RealD _M5) :
+template<class Impl>
+WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
+				       GridCartesian         &FiveDimGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridCartesian         &FourDimGrid,
+				       GridRedBlackCartesian &FourDimRedBlackGrid,
+				       RealD _M5,const ImplParams &p) :
+  Kernels(p),
  _FiveDimGrid(&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid(&FourDimGrid),
@@ -66,32 +67,25 @@ namespace QCD {
  // Allocate the required comms buffer
  comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO

-  DoubleStore(Umu,_Umu);
+  ImportGauge(_Umu);
+}  
+template<class Impl> // Rebuilds Umu from _Umu via Impl::DoubleStore, then refreshes the even and odd copies from Umu.
+void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
+{
+  Impl::DoubleStore(GaugeGrid(),Umu,_Umu); // must run first: both pickCheckerboard calls below read Umu
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
 }
-void WilsonFermion5D::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
-{
-  assert(GaugeGrid()->_ndimension==4);
-  conformable(Uds._grid,GaugeGrid());
-  conformable(Umu._grid,GaugeGrid());
-  LatticeColourMatrix U(GaugeGrid());
-  for(int mu=0;mu<Nd;mu++){
-    U = PeekIndex<LorentzIndex>(Umu,mu);
-    PokeIndex<LorentzIndex>(Uds,U,mu);
-    U = adj(Cshift(U,mu,-1));
-    PokeIndex<LorentzIndex>(Uds,U,mu+4);
-  }
-}
-void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir5,int disp)
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
 {
  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
                    // we drop off the innermost fifth dimension
  //  assert( (disp==1)||(disp==-1) );
  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;

-  WilsonCompressor compressor(DaggerNo);
-  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
  
  int skip = (disp==1) ? 0 : 1;

@@ -100,25 +94,133 @@ void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int
  assert(dirdisp<=7);
  assert(dirdisp>=0);

-//PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
-      DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp);
+      Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,dirdisp);
    }
  }
 };

-void WilsonFermion5D::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
-				   LatticeDoubledGaugeField & U,
-			   const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl> // DerivInternal: per direction mu, hop B into Btilde, then hand (mat,Btilde,Atilde,mu) to Impl::InsertForce5D.
+void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
+					  DoubledGaugeField & U,
+					  GaugeField &mat,
+					  const FermionField &A,
+					  const FermionField &B,
+					  int dag)
+{
+  assert((dag==DaggerNo) ||(dag==DaggerYes));
+
+  conformable(st._grid,A._grid);
+  conformable(st._grid,B._grid);
+
+  Compressor compressor(dag);
+  
+  FermionField Btilde(B._grid);
+  FermionField Atilde(B._grid);
+
+  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor); // fills comm_buf, which the per-site hops below read
+
+  Atilde=A; // working copy of A; this is what InsertForce5D receives
+
+  for(int mu=0;mu<Nd;mu++){
+      
+    ////////////////////////////////////////////////////////////////////////
+    // Flip gamma if dag: gamma starts as mu and has Nd added when dag is non-zero
+    ////////////////////////////////////////////////////////////////////////
+    int gamma = mu;
+    if ( dag ) gamma+= Nd;
+
+    ////////////////////////
+    // Call the single hop: stencil leg mu, spin projector gamma, output written to Btilde
+    ////////////////////////
+
+PARALLEL_FOR_LOOP
+    for(int sss=0;sss<U._grid->oSites();sss++){
+      for(int s=0;s<Ls;s++){
+	int sU=sss;
+	int sF = s+Ls*sU; // fermion-field site index: Ls consecutive s values per gauge site sU
+
+	assert ( sF< B._grid->oSites());
+	assert ( sU< U._grid->oSites());
+
+	Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
+
+    ////////////////////////////
+    // spin trace outer product: nothing is done per site here; Impl::InsertForce5D is called once per mu after this loop
+    ////////////////////////////
+
+      }
+
+    }
+
+    Impl::InsertForce5D(mat,Btilde,Atilde,mu);
+
+  }
+}
+
+template<class Impl> // DhopDeriv: full-grid entry point; checks grids, tags mat with A's checkerboard, then defers to DerivInternal.
+void WilsonFermion5D<Impl>::DhopDeriv(      GaugeField &mat,
+					    const FermionField &A,
+					    const FermionField &B,
+					    int dag)
+{
+  conformable(A._grid,FermionGrid());  
+  conformable(A._grid,B._grid);
+  conformable(GaugeGrid(),mat._grid);
+
+  mat.checkerboard = A.checkerboard;
+
+  DerivInternal(Stencil,Umu,mat,A,B,dag); // full-grid Stencil and Umu, the same pair Dhop hands to DhopInternal
+}
+
+template<class Impl> // DhopDerivEO: red-black variant; requires B on Odd and A on Even, and tags mat as Even.
+void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
+					const FermionField &A,
+					const FermionField &B,
+					int dag)
+{
+  conformable(A._grid,FermionRedBlackGrid());
+  conformable(GaugeRedBlackGrid(),mat._grid);
+  conformable(A._grid,B._grid);
+
+  assert(B.checkerboard==Odd);
+  assert(A.checkerboard==Even);
+  mat.checkerboard = Even;
+
+  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag); // same stencil/gauge pairing that DhopEO hands to DhopInternal
+}
+
+template<class Impl> // DhopDerivOE: red-black variant; requires B on Even and A on Odd, and tags mat as Odd.
+void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
+				  const FermionField &A,
+				  const FermionField &B,
+				  int dag)
+{
+  conformable(A._grid,FermionRedBlackGrid());
+  conformable(GaugeRedBlackGrid(),mat._grid);
+  conformable(A._grid,B._grid);
+
+  assert(B.checkerboard==Even);
+  assert(A.checkerboard==Odd);
+  mat.checkerboard = Odd;
+
+  DerivInternal(StencilEven,UmuOdd,mat,A,B,dag); // same stencil/gauge pairing that DhopOE hands to DhopInternal
+}
+
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
+					 DoubledGaugeField & U,
+					 const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));

-  WilsonCompressor compressor(dag);
+  Compressor compressor(dag);

-  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
  
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
@@ -126,13 +228,13 @@ void WilsonFermion5D::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
  if ( dag == DaggerYes ) {
-    if( HandOptDslash ) {
+    if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  int sU=ss;
 	  int sF = s+Ls*sU;
-	  DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
+	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
@@ -143,20 +245,20 @@ PARALLEL_FOR_LOOP
 	  for(sd=0;sd<Ls;sd++){
 	    int sU=ss;
 	    int sF = sd+Ls*sU;
-	    DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
+	    Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
-    if( HandOptDslash ) {
+    if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  //	  int sU=lo.Reorder(ss);
 	  int sU=ss;
 	  int sF = s+Ls*sU;
-	  DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
+	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }

@@ -167,13 +269,14 @@ PARALLEL_FOR_LOOP
 	  //	  int sU=lo.Reorder(ss);
 	  int sU=ss;
 	  int sF = s+Ls*sU; 
-	  DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
+	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
 }
-void WilsonFermion5D::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@@ -183,7 +286,8 @@ void WilsonFermion5D::DhopOE(const LatticeFermion &in, LatticeFermion &out,int d

  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 }
-void WilsonFermion5D::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@@ -193,7 +297,8 @@ void WilsonFermion5D::DhopEO(const LatticeFermion &in, LatticeFermion &out,int d

  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 }
-void WilsonFermion5D::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl>
+void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  conformable(in._grid,FermionGrid()); // verifies full grid
  conformable(in._grid,out._grid);
@@ -202,12 +307,16 @@ void WilsonFermion5D::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag

  DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 }
-void WilsonFermion5D::DW(const LatticeFermion &in, LatticeFermion &out,int dag)
+template<class Impl> // DW: copies in's checkerboard to out, applies Dhop, then folds in the (4.0-M5) multiple of in through axpy.
+void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag) // NOTE(review): presumably axpy(r,a,x,y) means r = a*x + y — confirm
 {
  out.checkerboard=in.checkerboard;
  Dhop(in,out,dag); // -0.5 is included
  axpy(out,4.0-M5,in,out);
 }
+
+FermOpTemplateInstantiate(WilsonFermion5D);
+
 }}


--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -14,22 +14,24 @@ namespace Grid {
    // i.e. even even contains fifth dim hopping term.
    //
    // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
-    ////////////////////////////
-    //ContFrac:
-    //  Ls always odd. Rational poly deg is either Ls or Ls-1
-    //PartFrac 
-    //  Ls always odd. Rational poly deg is either Ls or Ls-1
-    //
-    //Cayley: Ls always even, Rational poly deg is Ls
-    // 
-    // Just set nrational as Ls. Forget about Ls-1 cases.
-    //
-    // Require odd Ls for cont and part frac
-    ////////////////////////////
    ////////////////////////////////////////////////////////////////////////////////
-    class WilsonFermion5D : public FermionOperator<LatticeFermion,LatticeGaugeField>
+
+    class WilsonFermion5DStatic { // Non-template base of WilsonFermion5D<Impl>; holds the members that do not depend on Impl.
+    public:
+      // S-direction is INNERMOST and takes no part in the parity.
+      static int HandOptDslash; // temporary hack: when non-zero, DhopInternal calls the DiracOptHandDhopSite* kernels instead of DiracOptDhopSite*
+      static const std::vector<int> directions;
+      static const std::vector<int> displacements;
+      const int npoint = 8; // NOTE(review): per-object member, whereas the declaration this replaces was `static const int npoint=8` — confirm dropping `static` is intended
+    };
+
+    template<class Impl>
+    class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);
+     typedef WilsonKernels<Impl> Kernels;
+
      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////
@@ -39,54 +41,65 @@ namespace Grid {
      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}

      // full checkerboard operations; leave unimplemented as abstract for now
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out){assert(0); return 0.0;};
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out){assert(0); return 0.0;};
+      virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+      virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};

      // half checkerboard operations; leave unimplemented as abstract for now
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out){assert(0);};
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out){assert(0);};
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out){assert(0);};
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out){assert(0);};
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out){assert(0);};
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out){assert(0);};
+      virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+
+      // These can be overridden by fancy 5d chiral action
+      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

      // Implement hopping term non-hermitian hopping term; half cb or both
      // Implement s-diagonal DW
-      void DW    (const LatticeFermion &in, LatticeFermion &out,int dag);
-      void Dhop  (const LatticeFermion &in, LatticeFermion &out,int dag);
-      void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DW    (const FermionField &in, FermionField &out,int dag);
+      void Dhop  (const FermionField &in, FermionField &out,int dag);
+      void DhopOE(const FermionField &in, FermionField &out,int dag);
+      void DhopEO(const FermionField &in, FermionField &out,int dag);

      // add a DhopComm
      // -- suboptimal interface will presently trigger multiple comms.
-      void DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

      ///////////////////////////////////////////////////////////////
      // New methods added 
      ///////////////////////////////////////////////////////////////
+      void DerivInternal(CartesianStencil & st,
+			 DoubledGaugeField & U,
+			 GaugeField &mat,
+			 const FermionField &A,
+			 const FermionField &B,
+			 int dag);
+
      void DhopInternal(CartesianStencil & st,
 			LebesgueOrder &lo,
-			LatticeDoubledGaugeField &U,
-			const LatticeFermion &in, 
-			LatticeFermion &out,
+			DoubledGaugeField &U,
+			const FermionField &in, 
+			FermionField &out,
 			int dag);

      // Constructors
-      WilsonFermion5D(LatticeGaugeField &_Umu,
-			  GridCartesian         &FiveDimGrid,
-			  GridRedBlackCartesian &FiveDimRedBlackGrid,
-			  GridCartesian         &FourDimGrid,
-			  GridRedBlackCartesian &FourDimRedBlackGrid,
-			  double _M5);
+      WilsonFermion5D(GaugeField &_Umu,
+		      GridCartesian         &FiveDimGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridCartesian         &FourDimGrid,
+		      GridRedBlackCartesian &FourDimRedBlackGrid,
+		      double _M5,const ImplParams &p= ImplParams());

      // DoubleStore
-      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
+      void ImportGauge(const GaugeField &_Umu);

      ///////////////////////////////////////////////////////////////
      // Data members require to support the functionality
      ///////////////////////////////////////////////////////////////
-      static int HandOptDslash; // these are a temporary hack
-
    protected:

      // Add these to the support from Wilson
@@ -95,10 +108,6 @@ namespace Grid {
      GridBase *_FiveDimGrid;
      GridBase *_FiveDimRedBlackGrid;

-      static const int npoint=8;
-      static const std::vector<int> directions   ;
-      static const std::vector<int> displacements;
-
      double                        M5;
      int Ls;

@@ -108,15 +117,15 @@ namespace Grid {
      CartesianStencil StencilOdd; 

      // Copy of the gauge field , with even and odd subsets
-      LatticeDoubledGaugeField Umu;
-      LatticeDoubledGaugeField UmuEven;
-      LatticeDoubledGaugeField UmuOdd;
+      DoubledGaugeField Umu;
+      DoubledGaugeField UmuEven;
+      DoubledGaugeField UmuOdd;

      LebesgueOrder Lebesgue;
      LebesgueOrder LebesgueEvenOdd;

      // Comms buffer
-      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;
+      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
      
    };
  }
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -1,429 +1,374 @@
 #include <Grid.h>
-
 namespace Grid {
 namespace QCD {

-void DiracOptDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-			int sF,int sU,const LatticeFermion &in, LatticeFermion &out)
+template<class Impl> // DiracOptDhopSite: sums the eight stencil legs for fermion site sF (gauge site sU) and streams -0.5 times the sum to out.
+void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						  int sF,int sU,const FermionField &in, FermionField &out)
 {
-    vHalfSpinColourVector  tmp;    
-    vHalfSpinColourVector  chi;    
-    vSpinColourVector result;
-    vHalfSpinColourVector Uchi;
-    int offset,local,perm, ptype;
+  SiteHalfSpinor  tmp;    
+  SiteHalfSpinor  chi;    
+  SiteHalfSpinor Uchi;
+  SiteSpinor result;
+  StencilEntry *SE;
+  int ptype;

+  // Xp  (the seven legs that follow repeat this same three-way pattern)
+  SE=st.GetEntry(ptype,Xp,sF);
+  if ( SE->_is_local && SE->_permute ) { // local neighbour that needs a permute: project into tmp, permute into chi
+    spProjXp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) { // local neighbour, no permute: project straight into chi
+    spProjXp(chi,in._odata[SE->_offset]);
+  } else { // non-local: chi is read from buf with no projection here (presumably projected during the halo exchange — confirm)
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
+  spReconXp(result,Uchi); // only this first leg uses spRecon*; every later leg uses accumRecon*
    
-    // Xp
-    int ss = sF;
-    offset = st._offsets [Xp][ss];
-    local  = st._is_local[Xp][ss];
-    perm   = st._permute[Xp][ss];
+  // Yp
+  SE=st.GetEntry(ptype,Yp,sF);
+  if ( SE->_is_local && SE->_permute ) {
+    spProjYp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjYp(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
+  accumReconYp(result,Uchi);

-    ptype  = st._permute_type[Xp];
-    if ( local && perm ) {
-      spProjXp(tmp,in._odata[offset]);
+  // Zp
+  SE=st.GetEntry(ptype,Zp,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjZp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjZp(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
+  accumReconZp(result,Uchi);
+
+  // Tp
+  SE=st.GetEntry(ptype,Tp,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjTp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjTp(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
+  accumReconTp(result,Uchi);
+
+  // Xm
+  SE=st.GetEntry(ptype,Xm,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjXm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjXm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
+  accumReconXm(result,Uchi);
+  
+  // Ym
+  SE=st.GetEntry(ptype,Ym,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjYm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjYm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
+  accumReconYm(result,Uchi);
+  
+  // Zm
+  SE=st.GetEntry(ptype,Zm,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjZm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjZm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
+  accumReconZm(result,Uchi);
+
+  // Tm
+  SE=st.GetEntry(ptype,Tm,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjTm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjTm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
+  accumReconTm(result,Uchi);
+
+  vstream(out._odata[sF],result*(-0.5)); // the -0.5 factor is applied here, once, to the accumulated eight-leg result
+};
+
+template<class Impl> // DiracOptDhopSiteDag: same eight-leg sum as DiracOptDhopSite, but each spin projector is paired with the opposite-direction stencil leg.
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					      int sF,int sU,const FermionField &in, FermionField &out)
+{
+  SiteHalfSpinor  tmp;    
+  SiteHalfSpinor  chi;    
+  SiteSpinor result;
+  SiteHalfSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+
+  // Xp projector on the Xm stencil leg (the labels below name the projector; the stencil entry and link use the opposite direction)
+  SE=st.GetEntry(ptype,Xm,sF);
+  if (  SE->_is_local && SE->_permute ) { // local neighbour that needs a permute: project into tmp, permute into chi
+    spProjXp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) { // local neighbour, no permute: project straight into chi
+    spProjXp(chi,in._odata[SE->_offset]);
+  } else { // non-local: chi is read from buf with no projection here (presumably projected during the halo exchange — confirm)
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
+  spReconXp(result,Uchi); // only this first leg uses spRecon*; every later leg uses accumRecon*
+
+  // Yp
+  SE=st.GetEntry(ptype,Ym,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjYp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjYp(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
+  accumReconYp(result,Uchi);
+  
+  // Zp
+  SE=st.GetEntry(ptype,Zm,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjZp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjZp(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
+  accumReconZp(result,Uchi);
+  
+  // Tp
+  SE=st.GetEntry(ptype,Tm,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjTp(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjTp(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
+  accumReconTp(result,Uchi);
+  
+  // Xm
+  SE=st.GetEntry(ptype,Xp,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjXm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjXm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
+  accumReconXm(result,Uchi);
+
+  // Ym
+  SE=st.GetEntry(ptype,Yp,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjYm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjYm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
+  accumReconYm(result,Uchi);
+
+  // Zm
+  SE=st.GetEntry(ptype,Zp,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjZm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjZm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
+  accumReconZm(result,Uchi);
+    
+  // Tm
+  SE=st.GetEntry(ptype,Tp,sF);
+  if (  SE->_is_local && SE->_permute ) {
+    spProjTm(tmp,in._odata[SE->_offset]);
+    permute(chi,tmp,ptype);
+  } else if ( SE->_is_local ) {
+    spProjTm(chi,in._odata[SE->_offset]);
+  } else { 
+    chi=buf[SE->_offset];
+  }
+  Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
+  accumReconTm(result,Uchi);
+  
+  vstream(out._odata[sF],result*(-0.5)); // the -0.5 factor is applied here, once, to the accumulated eight-leg result
+}
+
+template<class Impl> // DiracOptDhopDir: single-leg hop for site sF; the stencil leg and link come from dir, the spin projector/reconstruct from gamma.
+void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
+					  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					  int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
+{
+  SiteHalfSpinor  tmp;    
+  SiteHalfSpinor  chi;    
+  SiteSpinor   result; // NOTE(review): assigned only inside the gamma==... branches below but always streamed out at the end — confirm callers keep gamma within Xp..Tm
+  SiteHalfSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+
+  SE=st.GetEntry(ptype,dir,sF); // looked up once; every branch below reuses this entry
+
+  // Xp
+  if(gamma==Xp){
+    if (  SE->_is_local && SE->_permute ) {
+      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjXp(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjXp(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Xp),&chi());
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
    spReconXp(result,Uchi);
+  }

-    // Yp
-    offset = st._offsets [Yp][ss];
-    local  = st._is_local[Yp][ss];
-    perm   = st._permute[Yp][ss];
-    ptype  = st._permute_type[Yp];
-    if ( local && perm ) {
-      spProjYp(tmp,in._odata[offset]);
+  // Yp
+  if ( gamma==Yp ){
+    if (  SE->_is_local && SE->_permute ) {
+      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjYp(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjYp(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Yp),&chi());
-    accumReconYp(result,Uchi);
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
+    spReconYp(result,Uchi);
+  }
  
-    // Zp
-    offset = st._offsets [Zp][ss];
-    local  = st._is_local[Zp][ss];
-    perm   = st._permute[Zp][ss];
-    ptype  = st._permute_type[Zp];
-    if ( local && perm ) {
-      spProjZp(tmp,in._odata[offset]);
+  // Zp
+  if ( gamma ==Zp ){
+    if (  SE->_is_local && SE->_permute ) {
+      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjZp(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjZp(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Zp),&chi());
-    accumReconZp(result,Uchi);
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
+    spReconZp(result,Uchi);
+  }
  
-    // Tp
-    offset = st._offsets [Tp][ss];
-    local  = st._is_local[Tp][ss];
-    perm   = st._permute[Tp][ss];
-    ptype  = st._permute_type[Tp];
-    if ( local && perm ) {
-      spProjTp(tmp,in._odata[offset]);
+  // Tp
+  if ( gamma ==Tp ){
+    if (  SE->_is_local && SE->_permute ) {
+      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjTp(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjTp(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Tp),&chi());
-    accumReconTp(result,Uchi);
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
+    spReconTp(result,Uchi);
+  }

-    // Xm
-    offset = st._offsets [Xm][ss];
-    local  = st._is_local[Xm][ss];
-    perm   = st._permute[Xm][ss];
-    ptype  = st._permute_type[Xm];
-
-    if ( local && perm ) {
-      spProjXm(tmp,in._odata[offset]);
+  // Xm
+  if ( gamma==Xm ){
+    if (  SE->_is_local && SE->_permute ) {
+      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjXm(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjXm(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Xm),&chi());
-    accumReconXm(result,Uchi);
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
+    spReconXm(result,Uchi);
+  }

-    // Ym
-    offset = st._offsets [Ym][ss];
-    local  = st._is_local[Ym][ss];
-    perm   = st._permute[Ym][ss];
-    ptype  = st._permute_type[Ym];
-
-    if ( local && perm ) {
-      spProjYm(tmp,in._odata[offset]);
+  // Ym
+  if ( gamma == Ym ){
+    if (  SE->_is_local && SE->_permute ) {
+      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjYm(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjYm(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Ym),&chi());
-    accumReconYm(result,Uchi);
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
+    spReconYm(result,Uchi);
+  }

-    // Zm
-    offset = st._offsets [Zm][ss];
-    local  = st._is_local[Zm][ss];
-    perm   = st._permute[Zm][ss];
-    ptype  = st._permute_type[Zm];
-    if ( local && perm ) {
-      spProjZm(tmp,in._odata[offset]);
+  // Zm
+  if ( gamma == Zm ){
+    if (  SE->_is_local && SE->_permute ) {
+      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjZm(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjZm(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Zm),&chi());
-    accumReconZm(result,Uchi);
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
+    spReconZm(result,Uchi);
+  }
  
-    // Tm
-    offset = st._offsets [Tm][ss];
-    local  = st._is_local[Tm][ss];
-    perm   = st._permute[Tm][ss];
-    ptype  = st._permute_type[Tm];
-    if ( local && perm ) {
-      spProjTm(tmp,in._odata[offset]);
+  // Tm
+  if ( gamma==Tm ) {
+    if (  SE->_is_local && SE->_permute ) {
+      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjTm(chi,in._odata[offset]);
+    } else if ( SE->_is_local ) {
+      spProjTm(chi,in._odata[SE->_offset]);
    } else { 
-      chi=buf[offset];
+      chi=buf[SE->_offset];
    }
-    mult(&Uchi(),&U._odata[sU](Tm),&chi());
-    accumReconTm(result,Uchi);
+    Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
+    spReconTm(result,Uchi);
+  }

-    vstream(out._odata[ss],result*(-0.5));
+  vstream(out._odata[sF],result*(-0.5)); // written unconditionally, whichever gamma branch (if any) ran
 }

-void DiracOptDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			   std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-			   int sF,int sU,const LatticeFermion &in, LatticeFermion &out)
-{
-    vHalfSpinColourVector  tmp;    
-    vHalfSpinColourVector  chi;    
-    vSpinColourVector result;
-    vHalfSpinColourVector Uchi;
-    int offset,local,perm, ptype;
-
-    // Xp
-    int ss=sF;
-    offset = st._offsets [Xm][ss];
-    local  = st._is_local[Xm][ss];
-    perm   = st._permute[Xm][ss];
-
-    ptype  = st._permute_type[Xm];
-    if ( local && perm ) {
-      spProjXp(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjXp(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Xm),&chi());
-    spReconXp(result,Uchi);
-
-    // Yp
-    offset = st._offsets [Ym][ss];
-    local  = st._is_local[Ym][ss];
-    perm   = st._permute[Ym][ss];
-    ptype  = st._permute_type[Ym];
-    if ( local && perm ) {
-      spProjYp(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjYp(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Ym),&chi());
-    accumReconYp(result,Uchi);
-
-    // Zp
-    offset = st._offsets [Zm][ss];
-    local  = st._is_local[Zm][ss];
-    perm   = st._permute[Zm][ss];
-    ptype  = st._permute_type[Zm];
-    if ( local && perm ) {
-      spProjZp(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjZp(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Zm),&chi());
-    accumReconZp(result,Uchi);
-
-    // Tp
-    offset = st._offsets [Tm][ss];
-    local  = st._is_local[Tm][ss];
-    perm   = st._permute[Tm][ss];
-    ptype  = st._permute_type[Tm];
-    if ( local && perm ) {
-      spProjTp(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjTp(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Tm),&chi());
-    accumReconTp(result,Uchi);
-
-    // Xm
-    offset = st._offsets [Xp][ss];
-    local  = st._is_local[Xp][ss];
-    perm   = st._permute[Xp][ss];
-    ptype  = st._permute_type[Xp];
-
-    if ( local && perm ) 
-    {
-      spProjXm(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjXm(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Xp),&chi());
-    accumReconXm(result,Uchi);
-
-    // Ym
-    offset = st._offsets [Yp][ss];
-    local  = st._is_local[Yp][ss];
-    perm   = st._permute[Yp][ss];
-    ptype  = st._permute_type[Yp];
-
-    if ( local && perm ) {
-      spProjYm(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjYm(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Yp),&chi());
-    accumReconYm(result,Uchi);
-
-    // Zm
-    offset = st._offsets [Zp][ss];
-    local  = st._is_local[Zp][ss];
-    perm   = st._permute[Zp][ss];
-    ptype  = st._permute_type[Zp];
-    if ( local && perm ) {
-      spProjZm(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjZm(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Zp),&chi());
-    accumReconZm(result,Uchi);
-
-    // Tm
-    offset = st._offsets [Tp][ss];
-    local  = st._is_local[Tp][ss];
-    perm   = st._permute[Tp][ss];
-    ptype  = st._permute_type[Tp];
-    if ( local && perm ) {
-      spProjTm(tmp,in._odata[offset]);
-      permute(chi,tmp,ptype);
-    } else if ( local ) {
-      spProjTm(chi,in._odata[offset]);
-    } else { 
-      chi=buf[offset];
-    }
-    mult(&Uchi(),&U._odata[sU](Tp),&chi());
-    accumReconTm(result,Uchi);
-
-    vstream(out._odata[ss],result*(-0.5));
-}
-
-void DiracOptDhopDir(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-		       int sF,int sU,const LatticeFermion &in, LatticeFermion &out,int dirdisp)
-{
-    vHalfSpinColourVector  tmp;    
-    vHalfSpinColourVector  chi;    
-    vSpinColourVector result;
-    vHalfSpinColourVector Uchi;
-    int offset,local,perm, ptype;
-    int ss=sF;
-    
-    offset = st._offsets [dirdisp][ss];
-    local  = st._is_local[dirdisp][ss];
-    perm   = st._permute[dirdisp][ss];
-    ptype  = st._permute_type[dirdisp];
-
-    // Xp
-    if(dirdisp==Xp){
-      if ( local && perm ) {
-	spProjXp(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjXp(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Xp),&chi());
-      spReconXp(result,Uchi);
-    }
-
-    // Yp
-    if ( dirdisp==Yp ){
-      if ( local && perm ) {
-	spProjYp(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjYp(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Yp),&chi());
-      spReconYp(result,Uchi);
-    }
-
-    // Zp
-    if ( dirdisp ==Zp ){
-      if ( local && perm ) {
-	spProjZp(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjZp(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Zp),&chi());
-      spReconZp(result,Uchi);
-    }
-
-    // Tp
-    if ( dirdisp ==Tp ){
-      if ( local && perm ) {
-	spProjTp(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjTp(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Tp),&chi());
-      spReconTp(result,Uchi);
-    }
-
-    // Xm
-    if ( dirdisp==Xm ){
-      if ( local && perm ) {
-	spProjXm(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjXm(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Xm),&chi());
-      spReconXm(result,Uchi);
-    }
-
-    // Ym
-    if ( dirdisp == Ym ){
-      if ( local && perm ) {
-	spProjYm(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjYm(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Ym),&chi());
-      spReconYm(result,Uchi);
-    }
-
-    // Zm
-    if ( dirdisp == Zm ){
-      if ( local && perm ) {
-	spProjZm(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjZm(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Zm),&chi());
-      spReconZm(result,Uchi);
-    }
-
-    // Tm
-    if ( dirdisp==Tm ) {
-      if ( local && perm ) {
-	spProjTm(tmp,in._odata[offset]);
-	permute(chi,tmp,ptype);
-      } else if ( local ) {
-	spProjTm(chi,in._odata[offset]);
-      } else { 
-	chi=buf[offset];
-      }
-      mult(&Uchi(),&U._odata[sU](Tm),&chi());
-      spReconTm(result,Uchi);
-    }
-
-    vstream(out._odata[ss],result*(-0.5));
-}
+  FermOpTemplateInstantiate(WilsonKernels);

 }}
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -6,44 +6,44 @@ namespace Grid {
  namespace QCD {

    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Helper classes that implement Wilson stencil for a single site.
+    // Helper routines that implement Wilson stencil for a single site.
+    // Common to both the WilsonFermion and WilsonFermion5D
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

-    // Generic version works for any Nc and with extra flavour indices
-    //    namespace DiracOpt {
+    template<class Impl> class WilsonKernels : public FermionOperator<Impl> { 
+    public:

-      // These ones will need to be package intelligently. WilsonType base class
-      // for use by DWF etc..
-      void DiracOptDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-			    int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
-      void DiracOptDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-			       int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
-      void DiracOptDhopDir(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			   std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-			   int sF,int sU,const LatticeFermion &in, LatticeFermion &out,int dirdisp);
+     INHERIT_IMPL_TYPES(Impl);
+     typedef FermionOperator<Impl> Base;
     
-      //  };
+    public:
+     void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			   int sF,int sU,const FermionField &in, FermionField &out);
      
-      // Hand unrolled for Nc=3, one flavour
-      //    namespace DiracOptHand {
-      // These ones will need to be package intelligently. WilsonType base class
-      // for use by DWF etc..
+     void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			      int sF,int sU,const FermionField &in,FermionField &out);

-      void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
-				std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-				int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
-      void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
-				   std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-				   int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
+     void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
+			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);

-      //    };
+     void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			       int sF,int sU,const FermionField &in, FermionField &out){
+       DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+     }

+     void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+				  int sF,int sU,const FermionField &in, FermionField &out){
+       DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+     }

-    void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
-				 std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-				 int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
+     WilsonKernels(const ImplParams &p= ImplParams()) : Base(p) {};
+
+    };

  }
 }
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -280,48 +280,50 @@
 namespace Grid {
 namespace QCD {

-void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-			    int sF,int sU,const LatticeFermion &in, LatticeFermion &out)
+#if 0
+template<class Simd>
+void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       int sF,int sU,const FermionField &in, FermionField &out)
 {
-  REGISTER vComplex result_00; // 12 regs on knc
-  REGISTER vComplex result_01;
-  REGISTER vComplex result_02;
+  REGISTER Simd result_00; // 12 regs on knc
+  REGISTER Simd result_01;
+  REGISTER Simd result_02;
  
-  REGISTER vComplex result_10;
-  REGISTER vComplex result_11;
-  REGISTER vComplex result_12;
+  REGISTER Simd result_10;
+  REGISTER Simd result_11;
+  REGISTER Simd result_12;

-  REGISTER vComplex result_20;
-  REGISTER vComplex result_21;
-  REGISTER vComplex result_22;
+  REGISTER Simd result_20;
+  REGISTER Simd result_21;
+  REGISTER Simd result_22;

-  REGISTER vComplex result_30;
-  REGISTER vComplex result_31;
-  REGISTER vComplex result_32; // 20 left
+  REGISTER Simd result_30;
+  REGISTER Simd result_31;
+  REGISTER Simd result_32; // 20 left

-  REGISTER vComplex Chi_00;    // two spinor; 6 regs
-  REGISTER vComplex Chi_01;
-  REGISTER vComplex Chi_02;
+  REGISTER Simd Chi_00;    // two spinor; 6 regs
+  REGISTER Simd Chi_01;
+  REGISTER Simd Chi_02;

-  REGISTER vComplex Chi_10;
-  REGISTER vComplex Chi_11;
-  REGISTER vComplex Chi_12;   // 14 left
+  REGISTER Simd Chi_10;
+  REGISTER Simd Chi_11;
+  REGISTER Simd Chi_12;   // 14 left

-  REGISTER vComplex UChi_00;  // two spinor; 6 regs
-  REGISTER vComplex UChi_01;
-  REGISTER vComplex UChi_02;
+  REGISTER Simd UChi_00;  // two spinor; 6 regs
+  REGISTER Simd UChi_01;
+  REGISTER Simd UChi_02;

-  REGISTER vComplex UChi_10;
-  REGISTER vComplex UChi_11;
-  REGISTER vComplex UChi_12;  // 8 left
+  REGISTER Simd UChi_10;
+  REGISTER Simd UChi_11;
+  REGISTER Simd UChi_12;  // 8 left

-  REGISTER vComplex U_00;  // two rows of U matrix
-  REGISTER vComplex U_10;
-  REGISTER vComplex U_20;  
-  REGISTER vComplex U_01;
-  REGISTER vComplex U_11;
-  REGISTER vComplex U_21;  // 2 reg left.
+  REGISTER Simd U_00;  // two rows of U matrix
+  REGISTER Simd U_10;
+  REGISTER Simd U_20;  
+  REGISTER Simd U_01;
+  REGISTER Simd U_11;
+  REGISTER Simd U_21;  // 2 reg left.

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -360,11 +362,6 @@ void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
    MULT_2SPIN(Xp);
  }
  XP_RECON;
-  //  std::cout << "XP_RECON"<<std::endl;
-  //  std::cout << result_00 <<" "<<result_01 <<" "<<result_02 <<std::endl;
-  //  std::cout << result_10 <<" "<<result_11 <<" "<<result_12 <<std::endl;
-  //  std::cout << result_20 <<" "<<result_21 <<" "<<result_22 <<std::endl;
-  //  std::cout << result_30 <<" "<<result_31 <<" "<<result_32 <<std::endl;

  // Yp
  offset = st._offsets [Yp][ss];
@@ -446,12 +443,6 @@ void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
    MULT_2SPIN(Xm);
  }
  XM_RECON_ACCUM;
-  //  std::cout << "XM_RECON_ACCUM"<<std::endl;
-  //  std::cout << result_00 <<" "<<result_01 <<" "<<result_02 <<std::endl;
-  //  std::cout << result_10 <<" "<<result_11 <<" "<<result_12 <<std::endl;
-  //  std::cout << result_20 <<" "<<result_21 <<" "<<result_22 <<std::endl;
-  //  std::cout << result_30 <<" "<<result_31 <<" "<<result_32 <<std::endl;
-  
  
  // Ym
  offset = st._offsets [Ym][ss];
@@ -530,48 +521,49 @@ void DiracOptHandDhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
  }
 }

-void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
-			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
-			       int ss,int sU,const LatticeFermion &in, LatticeFermion &out)
+template<class Simd>
+void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+							      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							      int ss,int sU,const FermionField &in, FermionField &out)
 {
-  REGISTER vComplex result_00; // 12 regs on knc
-  REGISTER vComplex result_01;
-  REGISTER vComplex result_02;
+  REGISTER Simd result_00; // 12 regs on knc
+  REGISTER Simd result_01;
+  REGISTER Simd result_02;

-  REGISTER vComplex result_10;
-  REGISTER vComplex result_11;
-  REGISTER vComplex result_12;
+  REGISTER Simd result_10;
+  REGISTER Simd result_11;
+  REGISTER Simd result_12;

-  REGISTER vComplex result_20;
-  REGISTER vComplex result_21;
-  REGISTER vComplex result_22;
+  REGISTER Simd result_20;
+  REGISTER Simd result_21;
+  REGISTER Simd result_22;

-  REGISTER vComplex result_30;
-  REGISTER vComplex result_31;
-  REGISTER vComplex result_32; // 20 left
+  REGISTER Simd result_30;
+  REGISTER Simd result_31;
+  REGISTER Simd result_32; // 20 left

-  REGISTER vComplex Chi_00;    // two spinor; 6 regs
-  REGISTER vComplex Chi_01;
-  REGISTER vComplex Chi_02;
+  REGISTER Simd Chi_00;    // two spinor; 6 regs
+  REGISTER Simd Chi_01;
+  REGISTER Simd Chi_02;

-  REGISTER vComplex Chi_10;
-  REGISTER vComplex Chi_11;
-  REGISTER vComplex Chi_12;   // 14 left
+  REGISTER Simd Chi_10;
+  REGISTER Simd Chi_11;
+  REGISTER Simd Chi_12;   // 14 left

-  REGISTER vComplex UChi_00;  // two spinor; 6 regs
-  REGISTER vComplex UChi_01;
-  REGISTER vComplex UChi_02;
+  REGISTER Simd UChi_00;  // two spinor; 6 regs
+  REGISTER Simd UChi_01;
+  REGISTER Simd UChi_02;

-  REGISTER vComplex UChi_10;
-  REGISTER vComplex UChi_11;
-  REGISTER vComplex UChi_12;  // 8 left
+  REGISTER Simd UChi_10;
+  REGISTER Simd UChi_11;
+  REGISTER Simd UChi_12;  // 8 left

-  REGISTER vComplex U_00;  // two rows of U matrix
-  REGISTER vComplex U_10;
-  REGISTER vComplex U_20;  
-  REGISTER vComplex U_01;
-  REGISTER vComplex U_11;
-  REGISTER vComplex U_21;  // 2 reg left.
+  REGISTER Simd U_00;  // two rows of U matrix
+  REGISTER Simd U_10;
+  REGISTER Simd U_20;  
+  REGISTER Simd U_01;
+  REGISTER Simd U_11;
+  REGISTER Simd U_21;  // 2 reg left.

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -752,7 +744,7 @@ void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
  TP_RECON_ACCUM;

  {
-    vSpinColourVector & ref (out._odata[ss]);
+    SiteSpinor & ref (out._odata[ss]);
    vstream(ref()(0)(0),result_00*(-0.5));
    vstream(ref()(0)(1),result_01*(-0.5));
    vstream(ref()(0)(2),result_02*(-0.5));
@@ -767,4 +759,5 @@ void DiracOptHandDhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
    vstream(ref()(3)(2),result_32*(-0.5));
  }
 }
+#endif
 }}
--- a/lib/qcd/action/fermion/g5HermitianLinop.h
+++ b/lib/qcd/action/fermion/g5HermitianLinop.h
@@ -1,7 +1,9 @@
 #ifndef G5_HERMITIAN_LINOP
 #define G5_HERMITIAN_LINOP
+
 namespace Grid {
  namespace QCD {
+
 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
--- a/lib/qcd/action/gauge/WilsonGaugeAction.h
+++ b/lib/qcd/action/gauge/WilsonGaugeAction.h
@@ -7,7 +7,7 @@ namespace Grid{
    ////////////////////////////////////////////////////////////////////////
    // Wilson Gauge Action .. should I template the Nc etc..
    ////////////////////////////////////////////////////////////////////////
-    template<class GaugeField,class MatrixField>
+    template<class GaugeField, class MatrixField>
      class WilsonGaugeAction : public Action<GaugeField> {
    private:
      RealD beta;
@@ -18,12 +18,13 @@ namespace Grid{
      
      virtual RealD S(const GaugeField &U) {
 	RealD plaq = WilsonLoops<MatrixField,GaugeField>::avgPlaquette(U);
-	std::cout << "Plaq : "<<plaq << "\n";
-	double vol = U._grid->gSites();
-	return beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
+	std::cout<<GridLogMessage << "Plaq : "<<plaq << "\n";
+	RealD vol = U._grid->gSites();
+	RealD action=beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
+	std::cout << GridLogMessage << "WilsonGauge action "<<action<<std::endl;
+	return action;
      };
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
 	//not optimal implementation FIXME
 	//extend Ta to include Lorentz indexes
 	RealD factor = 0.5*beta/RealD(Nc);
--- a/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+++ b/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
@@ -0,0 +1,112 @@
+#ifndef QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
+#define QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
+
+namespace Grid{
+  namespace QCD{
+
+    // Base even odd HMC on the normal Mee based schur decomposition.
+    //
+    //     M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
+    //         (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
+    //
+    // The determinant of M is the determinant of the middle (block diagonal) factor.
+    // This assumes Mee is independent of U.
+    //
+    template<class Impl>
+    class SchurDifferentiableOperator :  public SchurDiagMooeeOperator<FermionOperator<Impl>,typename Impl::FermionField> 
+      {
+      public:
+      INHERIT_IMPL_TYPES(Impl);
+
+ 	typedef FermionOperator<Impl> Matrix;
+
+	SchurDifferentiableOperator (Matrix &Mat) : SchurDiagMooeeOperator<Matrix,FermionField>(Mat) {};
+	// Force = -(ForceE + ForceO) from the Moe/Meo derivative terms (DaggerNo); U and V must both be Odd.
+	void MpcDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
+	
+	  GridBase *fgrid   = this->_Mat.FermionGrid();
+	  GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
+	  GridBase *ugrid   = this->_Mat.GaugeGrid();
+	  GridBase *ucbgrid = this->_Mat.GaugeRedBlackGrid();
+
+	  Real coeff = 1.0;
+
+	  FermionField tmp1(fcbgrid);
+	  FermionField tmp2(fcbgrid);
+
+	  conformable(fcbgrid,U._grid);
+	  conformable(fcbgrid,V._grid);
+
+	  // Only the odd checkerboard is coded for; both inputs must live on it
+	  assert(U.checkerboard==Odd);
+	  assert(V.checkerboard==U.checkerboard);
+
+	  GaugeField ForceO(ucbgrid);
+	  GaugeField ForceE(ucbgrid);
+
+	  //  X^dag Der_oe MeeInv Meo Y
+	  // Use Mooee as nontrivial but gauge field indept
+	  this->_Mat.Meooe   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
+	  this->_Mat.MooeeInv(tmp1,tmp2);   // even->even 
+	  this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerNo);
+	  
+	  //  Accumulate X^dag M_oe MeeInv Der_eo Y
+	  this->_Mat.MeooeDag   (U,tmp1);    // odd->even (U is asserted Odd) -- implicit -0.5 factor to be applied
+	  this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even 
+	  this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerNo);
+	  
+	  assert(ForceE.checkerboard==Even);
+	  assert(ForceO.checkerboard==Odd);
+
+	  setCheckerboard(Force,ForceE); 
+	  setCheckerboard(Force,ForceO);
+	  Force=-Force;
+	}
+
+	// Daggered counterpart of MpcDeriv: same structure, using the Dag operators and DaggerYes.
+	void MpcDagDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
+	
+	  GridBase *fgrid   = this->_Mat.FermionGrid();
+	  GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
+	  GridBase *ugrid   = this->_Mat.GaugeGrid();
+	  GridBase *ucbgrid = this->_Mat.GaugeRedBlackGrid();
+
+	  Real coeff = 1.0;
+
+	  FermionField tmp1(fcbgrid);
+	  FermionField tmp2(fcbgrid);
+
+	  conformable(fcbgrid,U._grid);
+	  conformable(fcbgrid,V._grid);
+
+	  // Only the odd checkerboard is coded for; U must match V (was a V==V tautology)
+	  assert(V.checkerboard==Odd);
+	  assert(U.checkerboard==V.checkerboard);
+
+	  GaugeField ForceO(ucbgrid);
+	  GaugeField ForceE(ucbgrid);
+
+	  //  Daggered form of the X^dag Der_oe MeeInv Meo Y term
+	  // Use Mooee as nontrivial but gauge field indept
+	  this->_Mat.MeooeDag   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
+	  this->_Mat.MooeeInvDag(tmp1,tmp2);   // even->even 
+	  this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
+	  
+	  //  Accumulate the daggered form of the X^dag M_oe MeeInv Der_eo Y term
+	  this->_Mat.Meooe   (U,tmp1);    // odd->even (U is asserted Odd) -- implicit -0.5 factor to be applied
+	  this->_Mat.MooeeInv(tmp1,tmp2); // even->even 
+	  this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
+
+	  assert(ForceE.checkerboard==Even);
+	  assert(ForceO.checkerboard==Odd);
+
+	  setCheckerboard(Force,ForceE); 
+	  setCheckerboard(Force,ForceO);
+	  Force=-Force;
+	}
+
+    };
+
+  }
+}
+#endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@@ -0,0 +1,185 @@
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // One flavour rational
+    ///////////////////////////////////////
+
+    // S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
+    //
+    // Here, Mpc is the even-odd (Schur) preconditioned operator 
+    // N and D make up the rational polynomial approximation
+    //
+  
+    template<class Impl>
+    class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef OneFlavourRationalParams Params;
+      Params param;
+
+      MultiShiftFunction PowerHalf   ;
+      MultiShiftFunction PowerNegHalf;
+      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegQuarter;
+
+    private:
+     
+      FermionOperator<Impl> & FermOp;// the basic operator
+
+      // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us historically
+      // and hasenbusch works better
+
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+                        
+
+    public:
+      // Copies p, binds Op, then builds the x^(+-1/2) and x^(+-1/4) approximations; initialisers follow declaration order.
+      OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl>  &Op, 
+						   Params & p ) : param(p), 
+	FermOp(Op), 
+	PhiEven(Op.FermionRedBlackGrid()), 
+	PhiOdd (Op.FermionRedBlackGrid()) 
+      {
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	// MdagM^(+- 1/2)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+	remez.generateApprox(param.degree,1,2);
+	PowerHalf.Init(remez,param.tolerance,false);
+	PowerNegHalf.Init(remez,param.tolerance,true);
+
+	// MdagM^(+- 1/4)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+	remez.generateApprox(param.degree,1,4);
+   	PowerQuarter.Init(remez,param.tolerance,false);
+	PowerNegQuarter.Init(remez,param.tolerance,true);
+      };
+      // Draws gaussian noise eta and sets PhiOdd from the PowerQuarter multishift solve on etaOdd; PhiEven is zeroed.
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
+	//        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
+	// Phi = MpcdagMpc^{1/4} eta 
+	//
+	// P(eta) = e^{- eta^dag eta}
+	//
+	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+	// 
+	// So eta should be of width sig = 1/sqrt(2).
+
+	RealD scale = std::sqrt(0.5);
+
+	FermionField eta    (FermOp.FermionGrid());
+	FermionField etaOdd (FermOp.FermionRedBlackGrid());
+	FermionField etaEven(FermOp.FermionRedBlackGrid());
+
+	gaussian(pRNG,eta);	eta=eta*scale;
+
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	FermOp.ImportGauge(U);
+
+	// multishift CG
+	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerQuarter);
+	msCG(Mpc,etaOdd,PhiOdd);
+
+	//////////////////////////////////////////////////////
+	// FIXME : Clover term not yet..
+	//////////////////////////////////////////////////////
+
+	assert(FermOp.ConstEE() == 1);
+	PhiEven = zero;
+	
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag (Mdag M)^-1/2 phi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	FermOp.ImportGauge(U);
+
+	FermionField Y(FermOp.FermionRedBlackGrid());
+	
+	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegQuarter);
+
+	msCG(Mpc,PhiOdd,Y);
+
+	RealD action = norm2(Y);
+	std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
+
+	return action;
+      };
+
+      //////////////////////////////////////////////////////
+      // Need
+      // dS_f/dU = chi^dag   d[N/D]  chi
+      //
+      // N/D is expressed as partial fraction expansion:
+      //
+      //           a0 + \sum_k ak/(M^dagM + bk)
+      //
+      // d[N/D] is then
+      //
+      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
+      //
+      // Need
+      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
+      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
+      //
+      // With these building blocks
+      //
+      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
+      //        S    = innerprodReal(Phi,Mf Phi);
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	const int Npole = PowerNegHalf.poles.size();
+
+	std::vector<FermionField> MPhi_k (Npole,FermOp.FermionRedBlackGrid());
+
+	FermionField X(FermOp.FermionRedBlackGrid());
+	FermionField Y(FermOp.FermionRedBlackGrid());
+
+	GaugeField   tmp(FermOp.GaugeGrid());
+
+	FermOp.ImportGauge(U);
+
+	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegHalf);
+
+	msCG(Mpc,PhiOdd,MPhi_k);
+
+	dSdU = zero;
+	for(int k=0;k<Npole;k++){
+
+	  RealD ak = PowerNegHalf.residues[k];
+
+	  X  = MPhi_k[k];
+
+	  Mpc.Mpc(X,Y);
+	  Mpc.MpcDeriv   (tmp , Y, X );  dSdU=dSdU+ak*tmp;
+	  Mpc.MpcDagDeriv(tmp , X, Y );  dSdU=dSdU+ak*tmp;
+
+	}
+
+	dSdU = Ta(dSdU);
+
+      };
+    };
+  }
+}
+
+
+#endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -0,0 +1,240 @@
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // One flavour rational ratio
+    ///////////////////////////////////////
+
+    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    //
+    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
+    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
+  
+    template<class Impl>
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef OneFlavourRationalParams Params;
+      Params param;
+
+      MultiShiftFunction PowerHalf   ;
+      MultiShiftFunction PowerNegHalf;
+      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegQuarter;
+
+    private:
+     
+      FermionOperator<Impl> & NumOp;// the basic operator
+      FermionOperator<Impl> & DenOp;// the basic operator
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+
+    public:
+      // Copies p, binds both operators, then builds the x^(+-1/2) and x^(+-1/4) approximations; initialisers follow declaration order.
+      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+					    FermionOperator<Impl>  &_DenOp, 
+					    Params & p
+					    ) : 
+      param(p), 
+      NumOp(_NumOp), 
+      DenOp(_DenOp), 
+      PhiEven(_NumOp.FermionRedBlackGrid()),
+      PhiOdd (_NumOp.FermionRedBlackGrid())
+      {
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	// MdagM^(+- 1/2)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+	remez.generateApprox(param.degree,1,2);
+	PowerHalf.Init(remez,param.tolerance,false);
+	PowerNegHalf.Init(remez,param.tolerance,true);
+
+	// MdagM^(+- 1/4)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+	remez.generateApprox(param.degree,1,4);
+   	PowerQuarter.Init(remez,param.tolerance,false);
+	PowerNegQuarter.Init(remez,param.tolerance,true);
+      };
+      // Draws gaussian noise eta, applies the DenOp PowerQuarter then NumOp PowerNegQuarter solves to etaOdd, zeroes PhiEven.
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+	//
+	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
+	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
+	//
+	// Phi =  (VdagV)^-1/4 (MdagM)^{1/4} eta 
+	//
+	// P(eta) = e^{- eta^dag eta}
+	//
+	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+	// 
+	// So eta should be of width sig = 1/sqrt(2).
+
+	RealD scale = std::sqrt(0.5);
+
+	FermionField eta(NumOp.FermionGrid());
+	FermionField etaOdd (NumOp.FermionRedBlackGrid());
+	FermionField etaEven(NumOp.FermionRedBlackGrid());
+	FermionField     tmp(NumOp.FermionRedBlackGrid());
+
+	gaussian(pRNG,eta);	eta=eta*scale;
+
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+
+	// MdagM^1/4 eta
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
+	msCG_M(MdagM,etaOdd,tmp);
+
+	// VdagV^-1/4 MdagM^1/4 eta
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
+	msCG_V(VdagV,tmp,PhiOdd);
+
+	assert(NumOp.ConstEE() == 1);
+	assert(DenOp.ConstEE() == 1);
+	PhiEven = zero;
+	
+      };
+
+      //////////////////////////////////////////////////////
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	FermionField X(NumOp.FermionRedBlackGrid());
+	FermionField Y(NumOp.FermionRedBlackGrid());
+
+	// VdagV^1/4 Phi
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+	msCG_V(VdagV,PhiOdd,X);
+
+	// MdagM^-1/4 VdagV^1/4 Phi
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
+	msCG_M(MdagM,X,Y);
+
+	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
+	RealD action = norm2(Y);
+
+	return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars field
+      // N and D make up the rat. poly of the M term and P and Q make up the rat. poly of the V term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvPhi   
+      //  
+
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	const int n_f  = PowerNegHalf.poles.size();
+	const int n_pv = PowerQuarter.poles.size();
+
+	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
+
+	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField           Y(NumOp.FermionRedBlackGrid());
+
+	GaugeField   tmp(NumOp.GaugeGrid());
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+
+	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
+	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
+	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+
+	RealD ak;
+
+	dSdU = zero;
+
+	// With these building blocks  
+	//  
+	//       dS/dU = 
+	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+
+	//(1)
+	for(int k=0;k<n_f;k++){
+	  ak = PowerNegHalf.residues[k];
+	  MdagM.Mpc(MfMpvPhi_k[k],Y);
+	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
+	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
+	}
+	
+	//(2)
+	//(3)
+	for(int k=0;k<n_pv;k++){
+
+          ak = PowerQuarter.residues[k];
+	  
+	  VdagV.Mpc(MpvPhi_k[k],Y);
+	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
+	  
+	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
+
+	}
+
+	dSdU = Ta(dSdU);
+
+      };
+    };
+  }
+}
+
+
+#endif
--- a/lib/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRational.h
@@ -0,0 +1,170 @@
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // One flavour rational
+    ///////////////////////////////////////
+
+    // S_f = chi^dag *  N(M^dag*M)/D(M^dag*M) * chi
+    //
+    // Here, M is some operator 
+    // N and D make up the rational polynomial approximation
+    //
+  
+    template<class Impl>
+    class OneFlavourRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+      // Remez and solver settings: lo, hi, precision, degree, tolerance and MaxIter are read below.
+      typedef OneFlavourRationalParams Params;
+      Params param;
+      // Multishift forms of the rational approximations to x^(+-1/2) and x^(+-1/4); set in the constructor.
+      MultiShiftFunction PowerHalf   ;
+      MultiShiftFunction PowerNegHalf;
+      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegQuarter;
+
+    private:
+
+      FermionOperator<Impl> & FermOp;// the basic operator
+
+      // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us historically
+      // and hasenbusch works better
+
+      FermionField Phi; // the pseudo fermion field for this trajectory
+
+    public:
+      // Copies p into param and generates the four rational approximations with AlgRemez.
+      OneFlavourRationalPseudoFermionAction(FermionOperator<Impl>  &Op, 
+					    Params & p
+					    ) : param(p), FermOp(Op), Phi(Op.FermionGrid()) // listed in declaration order
+      {
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	// MdagM^(+- 1/2): one Remez run feeds both PowerHalf (flag false) and PowerNegHalf (flag true)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+	remez.generateApprox(param.degree,1,2);
+	PowerHalf.Init(remez,param.tolerance,false);
+	PowerNegHalf.Init(remez,param.tolerance,true);
+
+	// MdagM^(+- 1/4): same pattern for PowerQuarter / PowerNegQuarter
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+	remez.generateApprox(param.degree,1,4);
+	PowerQuarter.Init(remez,param.tolerance,false);
+	PowerNegQuarter.Init(remez,param.tolerance,true);
+      };
+      // Draw a new pseudofermion field Phi for this trajectory from Gaussian noise on gauge field U.
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
+	//        = e^{- phi^dag (MdagM)^-1/4 (MdagM)^-1/4 phi}
+	// Phi = (MdagM)^{1/4} eta 
+	// P(eta) = e^{- eta^dag eta}
+	//
+	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+	// 
+	// So eta should be of width sig = 1/sqrt(2).
+
+	RealD scale = std::sqrt(0.5);
+
+	FermionField eta(FermOp.FermionGrid());
+
+	gaussian(pRNG,eta);
+
+	FermOp.ImportGauge(U);
+
+	// multishift CG applying the x^(1/4) approximation to eta; the result is written to Phi
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(FermOp);
+	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerQuarter);
+	msCG(MdagMOp,eta,Phi);
+
+	Phi=Phi*scale;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag (Mdag M)^-1/2 phi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	FermOp.ImportGauge(U);
+
+	FermionField Y(FermOp.FermionGrid());
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(FermOp);
+	// Y = (MdagM)^-1/4 Phi, so norm2(Y) is the action without needing a separate -1/2 solve.
+	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegQuarter);
+
+	msCG(MdagMOp,Phi,Y);
+
+	RealD action = norm2(Y);
+	std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
+	return action;
+      };
+
+      //////////////////////////////////////////////////////
+      // Need
+      // dS_f/dU = chi^dag   d[N/D]  chi
+      //
+      // N/D is expressed as partial fraction expansion:
+      //
+      //           a0 + \sum_k ak/(M^dagM + bk)
+      //
+      // d[N/D] is then
+      //
+      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
+      //
+      // Need
+      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
+      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
+      //
+      // With these building blocks
+      //
+      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
+      //        S    = innerprodReal(Phi,Mf Phi);
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	const int Npole = PowerNegHalf.poles.size();
+
+	std::vector<FermionField> MPhi_k (Npole,FermOp.FermionGrid());
+
+	// No scratch X field: each pole solution in MPhi_k is used in place, saving one field copy per pole.
+	FermionField Y(FermOp.FermionGrid());
+
+	GaugeField   tmp(FermOp.GaugeGrid());
+
+	FermOp.ImportGauge(U);
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(FermOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegHalf);
+	// One multishift solve fills MPhi_k[k] for every pole of the x^(-1/2) approximation.
+	msCG(MdagMOp,Phi,MPhi_k);
+	// NOTE(review): the header comment carries a factor -ak but the loop adds +ak*tmp; confirm the sign convention.
+	dSdU = zero;
+	for(int k=0;k<Npole;k++){
+
+	  RealD ak = PowerNegHalf.residues[k];
+
+	  FermionField &X = MPhi_k[k];   // alias of the k-th pole solution, not a copy
+
+	  FermOp.M(X,Y);
+
+	  FermOp.MDeriv(tmp , Y, X,DaggerNo );  dSdU=dSdU+ak*tmp;
+	  FermOp.MDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+ak*tmp;
+
+	}
+
+	dSdU = Ta(dSdU);
+
+      };
+    };
+  }
+}
+
+
+#endif
--- a/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@@ -0,0 +1,226 @@
+#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_ONE_FLAVOUR_RATIONAL_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // One flavour rational ratio
+    ///////////////////////////////////////
+
+    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    //
+    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
+    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
+  
+    template<class Impl>
+    class OneFlavourRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+      // Remez and solver settings: lo, hi, precision, degree, tolerance and MaxIter are read below.
+      typedef OneFlavourRationalParams Params;
+      Params param;
+      // Multishift forms of the rational approximations to x^(+-1/2) and x^(+-1/4); set in the constructor.
+      MultiShiftFunction PowerHalf   ;
+      MultiShiftFunction PowerNegHalf;
+      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegQuarter;
+
+    private:
+
+      FermionOperator<Impl> & NumOp;// the numerator operator (V in the comments below)
+      FermionOperator<Impl> & DenOp;// the denominator operator (M in the comments below)
+      FermionField Phi; // the pseudo fermion field for this trajectory
+
+    public:
+      // Copies p into param and generates the four rational approximations with AlgRemez.
+      OneFlavourRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+					    FermionOperator<Impl>  &_DenOp, 
+					    Params & p
+					    ) : param(p), NumOp(_NumOp), DenOp(_DenOp), Phi(_NumOp.FermionGrid()) // declaration order
+      {
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	// MdagM^(+- 1/2): one Remez run feeds both PowerHalf (flag false) and PowerNegHalf (flag true)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+	remez.generateApprox(param.degree,1,2);
+	PowerHalf.Init(remez,param.tolerance,false);
+	PowerNegHalf.Init(remez,param.tolerance,true);
+
+	// MdagM^(+- 1/4): same pattern for PowerQuarter / PowerNegQuarter
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+	remez.generateApprox(param.degree,1,4);
+	PowerQuarter.Init(remez,param.tolerance,false);
+	PowerNegQuarter.Init(remez,param.tolerance,true);
+      };
+      // Draw a new pseudofermion field Phi for this trajectory from Gaussian noise on gauge field U.
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+	//
+	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
+	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
+	//
+	// Phi =  (VdagV)^-1/4 (MdagM)^{1/4} eta 
+	//
+	// P(eta) = e^{- eta^dag eta}
+	//
+	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+	// 
+	// So eta should be of width sig = 1/sqrt(2).
+
+	RealD scale = std::sqrt(0.5);
+
+	FermionField tmp(NumOp.FermionGrid());
+	FermionField eta(NumOp.FermionGrid());
+
+	gaussian(pRNG,eta);
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	// MdagM^1/4 eta
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
+	msCG_M(MdagM,eta,tmp);
+
+	// VdagV^-1/4 MdagM^1/4 eta
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
+	msCG_V(VdagV,tmp,Phi);
+
+	Phi=Phi*scale;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	FermionField X(NumOp.FermionGrid());
+	FermionField Y(NumOp.FermionGrid());
+
+	// VdagV^1/4 Phi
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+	msCG_V(VdagV,Phi,X);
+
+	// MdagM^-1/4 VdagV^1/4 Phi
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
+	msCG_M(MdagM,X,Y);
+
+	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
+	RealD action = norm2(Y);
+
+	return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars field
+      // N and D make up the rational polynomial of the M term; P and Q make up the rational polynomial of the V term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvPhi   
+      //  
+
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	const int n_f  = PowerNegHalf.poles.size();
+	const int n_pv = PowerQuarter.poles.size();
+	// Per-pole solutions (the *_k vectors) and their summed counterparts, named as in the comment above.
+	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionGrid());
+	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid());
+	std::vector<FermionField> MfMpvPhi_k   (n_f,NumOp.FermionGrid());
+
+	FermionField      MpvPhi(NumOp.FermionGrid());
+	FermionField    MfMpvPhi(NumOp.FermionGrid());
+	FermionField MpvMfMpvPhi(NumOp.FermionGrid());
+	FermionField           Y(NumOp.FermionGrid());
+
+	GaugeField   tmp(NumOp.GaugeGrid());
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+	// Chain of three multishift solves: V^(1/4) on Phi, M^(-1/2) on that, V^(1/4) on that.
+	msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi);
+	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
+	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+
+	RealD ak;
+
+	dSdU = zero;
+
+	// With these building blocks  
+	//  
+	//       dS/dU = 
+	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+	// NOTE(review): the formula carries -ak while the loops below add +ak*tmp; confirm the sign convention.
+	//(1)
+	for(int k=0;k<n_f;k++){
+	  ak = PowerNegHalf.residues[k];
+	  DenOp.M(MfMpvPhi_k[k],Y);
+	  DenOp.MDeriv(tmp , MfMpvPhi_k[k], Y,DaggerYes );  dSdU=dSdU+ak*tmp;
+	  DenOp.MDeriv(tmp , Y, MfMpvPhi_k[k], DaggerNo );  dSdU=dSdU+ak*tmp;
+	}
+
+	//(2)
+	//(3)
+	for(int k=0;k<n_pv;k++){
+
+          ak = PowerQuarter.residues[k];
+
+	  NumOp.M(MpvPhi_k[k],Y);
+	  NumOp.MDeriv(tmp,MpvMfMpvPhi_k[k],Y,DaggerYes); dSdU=dSdU+ak*tmp;
+	  NumOp.MDeriv(tmp,Y,MpvMfMpvPhi_k[k],DaggerNo);  dSdU=dSdU+ak*tmp;     
+
+	  NumOp.M(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+	  NumOp.MDeriv(tmp,Y, MpvPhi_k[k], DaggerNo); dSdU=dSdU+ak*tmp;
+	  NumOp.MDeriv(tmp,MpvPhi_k[k], Y,DaggerYes); dSdU=dSdU+ak*tmp;
+
+	}
+
+	dSdU = Ta(dSdU);
+
+      };
+    };
+  }
+}
+
+
+#endif
--- a/lib/qcd/action/pseudofermion/TwoFlavour.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavour.h
@@ -0,0 +1,121 @@
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_H
+
+namespace Grid{
+  namespace QCD{
+
+    ////////////////////////////////////////////////////////////////////////
+    // Two flavour pseudofermion action for any dop
+    ////////////////////////////////////////////////////////////////////////
+    template<class Impl>
+    class TwoFlavourPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+    private:
+      // The operator and both solvers are held by reference; the caller owns them.
+      FermionOperator<Impl> & FermOp;// the basic operator
+
+      OperatorFunction<FermionField> &DerivativeSolver;
+
+      OperatorFunction<FermionField> &ActionSolver;
+
+      FermionField Phi; // the pseudo fermion field for this trajectory
+
+    public:
+      /////////////////////////////////////////////////
+      // Op: fermion operator; DS: solver used by deriv(); AS: solver used by S().
+      /////////////////////////////////////////////////
+    TwoFlavourPseudoFermionAction(FermionOperator<Impl>  &Op, 
+				  OperatorFunction<FermionField> & DS,
+				  OperatorFunction<FermionField> & AS
+				  ) : FermOp(Op), DerivativeSolver(DS), ActionSolver(AS), Phi(Op.FermionGrid()) {
+      };
+
+      //////////////////////////////////////////////////////////////////////////////////////
+      // Refresh Phi from Gaussian noise. U is pushed into the operator as given; any
+      // boundary conditions or smearing are assumed to have been applied already.
+      //////////////////////////////////////////////////////////////////////////////////////
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+	//
+	// Target distribution:  P(phi) = e^{- phi^dag (MdagM)^-1 phi}
+	// Change of variables:  Phi = Mdag eta
+	// giving                P(eta) = e^{- eta^dag eta}
+	//
+	// Matching e^{-x^2/(2 sig^2)} requires sig^2 = 0.5,
+	// i.e. the noise needs width sig = 1/sqrt(2),
+	// which is why the result is multiplied by 0.707....
+	//
+	// Chroma applies this factor too (two_flavor_monomial_w.h).
+	// IroIro omits it: a change of variables in the Phi integral absorbs it,
+	//         leaving only an irrelevant prefactor of the partition function.
+	//
+	//
+	FermionField noise(FermOp.FermionGrid());
+	RealD sigma = std::sqrt(0.5);
+
+	gaussian(pRNG,noise);
+
+	FermOp.ImportGauge(U);
+	FermOp.Mdag(noise,Phi);
+
+	Phi=Phi*sigma;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag (Mdag M)^-1 phi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+	// sol = (MdagM)^-1 Phi from ActionSolver; Msol = Op applied to sol; the action is norm2(Msol).
+	FermionField sol (FermOp.FermionGrid());
+	FermionField Msol(FermOp.FermionGrid());
+
+	FermOp.ImportGauge(U);
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(FermOp);
+
+	sol=zero;
+	ActionSolver(MdagMOp,Phi,sol);
+	MdagMOp.Op(sol,Msol);
+
+	RealD S_phi = norm2(Msol);
+	std::cout << GridLogMessage << "Pseudofermion action "<<S_phi<<std::endl;
+	return S_phi;
+      };
+
+      //////////////////////////////////////////////////////
+      // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
+      //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag (Mdag)^-1 phi 
+      //
+      //       = - Ydag dM X  - Xdag dMdag Y
+      //   with X = (MdagM)^-1 phi (sol below) and Y = M X (Msol below)
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+	// Same solve as in S(), but using DerivativeSolver.
+	FermionField sol (FermOp.FermionGrid());
+	FermionField Msol(FermOp.FermionGrid());
+	GaugeField   term(FermOp.GaugeGrid());
+
+	FermOp.ImportGauge(U);
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(FermOp);
+
+	sol=zero;
+	DerivativeSolver(MdagMOp,Phi,sol);
+	MdagMOp.Op(sol,Msol);
+
+	// By convention this is really U dS/dU: no derivative with respect to Udag is taken here,
+	// so dSdU - adj(dSdU) is formed and must be left-multiplied by the momentum to give dS/dt.
+	FermOp.MDeriv(term , Msol, sol,DaggerNo );
+	dSdU=term;
+	FermOp.MDeriv(term , sol, Msol,DaggerYes);
+	dSdU=dSdU+term;
+
+	dSdU = Ta(dSdU);
+
+      };
+
+    };
+    
+  }
+}
+
+#endif
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@@ -0,0 +1,157 @@
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_H
+
+namespace Grid{
+  namespace QCD{
+
+
+
+    ////////////////////////////////////////////////////////////////////////
+    // Two flavour pseudofermion action for any EO prec dop
+    ////////////////////////////////////////////////////////////////////////
+    template<class Impl>
+    class TwoFlavourEvenOddPseudoFermionAction : public Action<typename Impl::GaugeField> {
+
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+    private:
+      // The operator and both solvers are held by reference; the caller owns them.
+      FermionOperator<Impl> & FermOp;// the basic operator
+
+      OperatorFunction<FermionField> &DerivativeSolver;
+      OperatorFunction<FermionField> &ActionSolver;
+
+      FermionField PhiOdd;   // the pseudo fermion field for this trajectory
+      FermionField PhiEven;  // the pseudo fermion field for this trajectory
+
+    public:
+      /////////////////////////////////////////////////
+      // Pass in required objects. DS is used by deriv(), AS by S().
+      /////////////////////////////////////////////////
+      TwoFlavourEvenOddPseudoFermionAction(FermionOperator<Impl>  &Op, 
+					 OperatorFunction<FermionField> & DS,
+					 OperatorFunction<FermionField> & AS
+					   ) : 
+        FermOp(Op), 
+	DerivativeSolver(DS), 
+	ActionSolver(AS), 
+	PhiOdd(Op.FermionRedBlackGrid()),
+        PhiEven(Op.FermionRedBlackGrid())
+		  {};
+
+      //////////////////////////////////////////////////////////////////////////////////////
+      // Push the gauge field in to the dops. Assume any BC's and smearing already applied
+      //////////////////////////////////////////////////////////////////////////////////////
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
+	// Phi = MpcDag eta 
+	// P(eta) = e^{- eta^dag eta}
+	//
+	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+
+	RealD scale = std::sqrt(0.5);
+
+	FermionField eta    (FermOp.FermionGrid());
+	FermionField etaOdd (FermOp.FermionRedBlackGrid());
+	FermionField etaEven(FermOp.FermionRedBlackGrid());
+	// Noise is drawn on the full grid and then split into its two checkerboards.
+	gaussian(pRNG,eta);
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	FermOp.ImportGauge(U);
+	SchurDifferentiableOperator<Impl> PCop(FermOp);
+
+	// Odd sites: preconditioned operator. Even sites: the even-even block.
+	PCop.MpcDag(etaOdd,PhiOdd);
+
+	FermOp.MooeeDag(etaEven,PhiEven);
+
+	PhiOdd =PhiOdd*scale;
+	PhiEven=PhiEven*scale;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag (Mdag M)^-1 phi  (odd)
+      //   + phi^dag (Mdag M)^-1 phi  (even)
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	FermOp.ImportGauge(U);
+
+	FermionField X(FermOp.FermionRedBlackGrid());
+	FermionField Y(FermOp.FermionRedBlackGrid());
+
+	SchurDifferentiableOperator<Impl> PCop(FermOp);
+
+	X=zero;
+	ActionSolver(PCop,PhiOdd,X);
+	PCop.Op(X,Y);
+	RealD action = norm2(Y);
+
+	// The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
+	// Only really clover term that creates this.
+	FermOp.MooeeInvDag(PhiEven,Y);
+	action = action + norm2(Y);
+
+	std::cout << GridLogMessage << "Pseudofermion EO action "<<action<<std::endl;
+	return action;
+      };
+
+      //////////////////////////////////////////////////////
+      //
+      // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
+      //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag (Mdag)^-1 phi 
+      //
+      //       = - Ydag dM X  - Xdag dMdag Y
+      //
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	FermOp.ImportGauge(U);
+
+	FermionField X(FermOp.FermionRedBlackGrid());
+	FermionField Y(FermOp.FermionRedBlackGrid());
+	GaugeField tmp(FermOp.GaugeGrid());
+
+	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+
+	// Our conventions really make this UdSdU; We do not differentiate wrt Udag here.
+	// So must take dSdU - adj(dSdU) and left multiply by mom to get dS/dt.
+
+	X=zero;
+	DerivativeSolver(Mpc,PhiOdd,X);
+	Mpc.Mpc(X,Y);
+  	Mpc.MpcDeriv(tmp , Y, X );    dSdU=tmp;
+	Mpc.MpcDagDeriv(tmp , X, Y);  dSdU=dSdU+tmp;
+
+	// Treat the EE case. (MdagM)^-1 = Minv Minvdag
+	// Deriv defaults to zero: only the odd-odd force above is accumulated, and the
+	// assert below requires FermOp.ConstEE() == 1 before that is accepted.
+	// The disabled sketch in the block comment shows a force term for non-constant EE.
+	// NOTE(review): that sketch reads PhiOdd and overwrites dSdU; presumably it should
+	// read PhiEven and accumulate into dSdU -- confirm before enabling it.
+
+	assert(FermOp.ConstEE() == 1);
+
+	/*
+        FermOp.MooeeInvDag(PhiOdd,Y);
+        FermOp.MooeeInv(Y,X);
+  	FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
+	FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
+	*/
+
+	dSdU = Ta(dSdU);
+
+      };
+
+    };
+    
+  }
+}
+
+#endif
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -0,0 +1,168 @@
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // Two flavour ratio, even-odd preconditioned
+    ///////////////////////////////////////
+    template<class Impl>
+    class TwoFlavourEvenOddRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+    private:
+      FermionOperator<Impl> & NumOp;// the numerator operator (V in the comments below)
+      FermionOperator<Impl> & DenOp;// the denominator operator (M in the comments below)
+
+      OperatorFunction<FermionField> &DerivativeSolver;
+      OperatorFunction<FermionField> &ActionSolver;
+
+      FermionField PhiOdd;   // the pseudo fermion field for this trajectory
+      FermionField PhiEven;  // the pseudo fermion field for this trajectory
+      // DS is the solver used by deriv(); AS the one used by init() and S(). Both operators must share grids.
+    public:
+      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+						FermionOperator<Impl>  &_DenOp, 
+						OperatorFunction<FermionField> & DS,
+						OperatorFunction<FermionField> & AS) :
+      NumOp(_NumOp), 
+      DenOp(_DenOp), 
+      DerivativeSolver(DS), 
+      ActionSolver(AS),
+      PhiOdd(_NumOp.FermionRedBlackGrid()),
+      PhiEven(_NumOp.FermionRedBlackGrid()) 
+	{
+	  conformable(_NumOp.FermionGrid(), _DenOp.FermionGrid());
+	  conformable(_NumOp.FermionRedBlackGrid(), _DenOp.FermionRedBlackGrid());
+	  conformable(_NumOp.GaugeGrid(), _DenOp.GaugeGrid());
+	  conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
+	};
+      // Draw new pseudofermion fields PhiOdd / PhiEven for this trajectory from Gaussian noise.
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+	//
+	// NumOp == V
+	// DenOp == M
+	//
+	// Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
+	//
+	// P(eta_o) = e^{- eta_o^dag eta_o}
+	//
+	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+	// 
+	RealD scale = std::sqrt(0.5);
+
+	FermionField eta    (NumOp.FermionGrid());
+	FermionField etaOdd (NumOp.FermionRedBlackGrid());
+	FermionField etaEven(NumOp.FermionRedBlackGrid());
+	FermionField tmp    (NumOp.FermionRedBlackGrid());
+
+	gaussian(pRNG,eta);
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	SchurDifferentiableOperator<Impl> Mpc(DenOp);
+	SchurDifferentiableOperator<Impl> Vpc(NumOp);
+	// Odd det factors
+	Mpc.MpcDag(etaOdd,PhiOdd);
+	tmp=zero;                       // give the solver a defined solution field, as S() and deriv() do
+	ActionSolver(Vpc,PhiOdd,tmp);
+	Vpc.Mpc(tmp,PhiOdd);            
+
+	// Even det factors
+	DenOp.MooeeDag(etaEven,tmp);
+	NumOp.MooeeInvDag(tmp,PhiEven);
+
+	PhiOdd =PhiOdd*scale;
+	PhiEven=PhiEven*scale;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag V (Mdag M)^-1 Vdag phi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	SchurDifferentiableOperator<Impl> Mpc(DenOp);
+	SchurDifferentiableOperator<Impl> Vpc(NumOp);
+
+	FermionField X(NumOp.FermionRedBlackGrid());
+	FermionField Y(NumOp.FermionRedBlackGrid());
+
+	X=zero;
+	Vpc.MpcDag(PhiOdd,Y);           // Y= Vdag phi
+	ActionSolver(Mpc,Y,X);          // X= (MdagM)^-1 Vdag phi
+	Mpc.Mpc(X,Y);                   // Y=  Mdag^-1 Vdag phi
+
+	RealD action = norm2(Y);
+
+	// The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
+	// Only really clover term that creates this. Leave the EE portion as a future to do to make most
+	// rapid progress on DWF for now.
+	//
+	NumOp.MooeeDag(PhiEven,X);
+	DenOp.MooeeInvDag(X,Y);
+	action = action + norm2(Y);
+
+	return action;
+      };
+
+      //////////////////////////////////////////////////////
+      // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
+      //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
+      //       + phi^dag V (Mdag M)^-1 dV^dag  phi
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	SchurDifferentiableOperator<Impl> Mpc(DenOp);
+	SchurDifferentiableOperator<Impl> Vpc(NumOp);
+
+	FermionField  X(NumOp.FermionRedBlackGrid());
+	FermionField  Y(NumOp.FermionRedBlackGrid());
+
+	GaugeField   force(NumOp.GaugeGrid());	
+
+	X=zero;
+
+	//Y=Vdag phi
+	//X = (Mdag M)^-1 V^dag phi
+	//Y = (Mdag)^-1 V^dag  phi
+	Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi
+	DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi
+	Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi
+
+	// phi^dag V (Mdag M)^-1 dV^dag  phi
+	Vpc.MpcDagDeriv(force , X, PhiOdd );  dSdU=force;
+
+	// phi^dag dV (Mdag M)^-1 V^dag  phi
+	Vpc.MpcDeriv(force , PhiOdd, X );  dSdU=dSdU+force;
+
+	//    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
+	//    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
+	Mpc.MpcDeriv(force,Y,X);   dSdU=dSdU-force;
+	Mpc.MpcDagDeriv(force,X,Y);  dSdU=dSdU-force;
+
+	// FIXME No force contribution from EvenEven assumed here
+	// Needs a fix for clover.
+	assert(NumOp.ConstEE() == 1);
+	assert(DenOp.ConstEE() == 1);
+
+	dSdU = -Ta(dSdU);
+
+      };
+    };
+  }
+}
+#endif
--- a/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
@@ -0,0 +1,134 @@
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
+
+namespace Grid{
+  namespace QCD{
+
+    ///////////////////////////////////////
+    // Two flavour ratio
+    ///////////////////////////////////////
+    template<class Impl>
+    class TwoFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+      INHERIT_IMPL_TYPES(Impl);
+
+    private:
+      FermionOperator<Impl> & NumOp;// the numerator operator (V in the comments below)
+      FermionOperator<Impl> & DenOp;// the denominator operator (M in the comments below)
+
+      OperatorFunction<FermionField> &DerivativeSolver;
+      OperatorFunction<FermionField> &ActionSolver;
+
+      FermionField Phi; // the pseudo fermion field for this trajectory
+      // DS is the solver used by deriv(); AS the one used by init() and S().
+    public:
+      TwoFlavourRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+					 FermionOperator<Impl>  &_DenOp, 
+					 OperatorFunction<FermionField> & DS,
+					 OperatorFunction<FermionField> & AS
+					 ) : NumOp(_NumOp), DenOp(_DenOp), DerivativeSolver(DS), ActionSolver(AS), Phi(_NumOp.FermionGrid()) {};
+      // Draw a new pseudofermion field Phi for this trajectory from Gaussian noise.
+      virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
+	//
+	// NumOp == V
+	// DenOp == M
+	//
+	// Take phi = Vdag^{-1} Mdag eta  ; eta = Mdag^{-1} Vdag Phi
+	//
+	// P(eta) = e^{- eta^dag eta}
+	//
+	// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+	// 
+	// So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
+	//
+	RealD scale = std::sqrt(0.5);
+
+	FermionField eta(NumOp.FermionGrid());
+	FermionField tmp(NumOp.FermionGrid());
+
+	gaussian(pRNG,eta);
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	// Note: this hard codes normal equations type solvers; alternate implementation needed for 
+	// non-herm style solvers.
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagVOp(NumOp);  // wraps the numerator V
+	tmp=zero;                       // give the solver a defined solution field, as S() and deriv() do
+	DenOp.Mdag(eta,Phi);            // Mdag eta
+	ActionSolver(VdagVOp,Phi,tmp);  // (VdagV)^-1 Mdag eta = V^-1 Vdag^-1 Mdag eta
+	NumOp.M(tmp,Phi);               // Vdag^-1 Mdag eta
+
+	Phi=Phi*scale;
+
+      };
+
+      //////////////////////////////////////////////////////
+      // S = phi^dag V (Mdag M)^-1 Vdag phi
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	FermionField X(NumOp.FermionGrid());
+	FermionField Y(NumOp.FermionGrid());
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
+
+	X=zero;
+	NumOp.Mdag(Phi,Y);              // Y= Vdag phi
+	ActionSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
+	DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
+
+	RealD action = norm2(Y);
+
+	return action;
+      };
+
+      //////////////////////////////////////////////////////
+      // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
+      //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
+      //       + phi^dag V (Mdag M)^-1 dV^dag  phi
+      //////////////////////////////////////////////////////
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
+
+	FermionField  X(NumOp.FermionGrid());
+	FermionField  Y(NumOp.FermionGrid());
+
+	GaugeField   force(NumOp.GaugeGrid());	
+
+	X=zero;
+
+	//Y=Vdag phi
+	//X = (Mdag M)^-1 V^dag phi
+	//Y = (Mdag)^-1 V^dag  phi
+	NumOp.Mdag(Phi,Y);              // Y= Vdag phi
+	DerivativeSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
+	DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
+
+	// phi^dag V (Mdag M)^-1 dV^dag  phi
+	NumOp.MDeriv(force , X, Phi, DaggerYes );  dSdU=force;
+
+	// phi^dag dV (Mdag M)^-1 V^dag  phi
+	NumOp.MDeriv(force , Phi, X ,DaggerNo  );  dSdU=dSdU+force;
+
+	//    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
+	//    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
+	DenOp.MDeriv(force,Y,X,DaggerNo);   dSdU=dSdU-force;
+	DenOp.MDeriv(force,X,Y,DaggerYes);  dSdU=dSdU-force;
+
+	dSdU = - Ta(dSdU);
+
+      };
+    };
+  }
+}
+#endif
--- a/lib/qcd/hmc/HMC.cc
+++ b/lib/qcd/hmc/HMC.cc
@@ -7,8 +7,8 @@ namespace Grid{
 	// FIXME fill this constructor  now just default values
 	  
 	////////////////////////////// Default values
-	Nsweeps             = 100;
-	TotalSweeps         = 20;
+	Nsweeps             = 200;
+	TotalSweeps         = 220;
 	ThermalizationSteps = 20;
 	StartingConfig      = 0;
 	SaveInterval        = 1;
@@ -17,8 +17,5 @@ namespace Grid{
 	  
      }

-
-   
-
  }
 }
--- a/lib/qcd/hmc/HMC.h
+++ b/lib/qcd/hmc/HMC.h
@@ -1,15 +1,16 @@
 //--------------------------------------------------------------------
 /*! @file HMC.h
- * @brief Declaration of classes for Hybrid Monte Carlo update
+ * @brief Classes for Hybrid Monte Carlo update
 *
 * @author Guido Cossu
+ * Time-stamp: <2015-07-30 16:58:26 neo>
 */
 //--------------------------------------------------------------------
 #ifndef HMC_INCLUDED
 #define HMC_INCLUDED

 #include <string>
-#include <memory>
+

 namespace Grid{
  namespace QCD{
@@ -20,84 +21,98 @@ namespace Grid{
      Integer ThermalizationSteps;
      Integer StartingConfig;
      Integer SaveInterval; //Setting to 0 does not save configurations
-      std::string Filename_prefix; // To save configurations
+      std::string Filename_prefix; // To save configurations and rng seed
      
      HMCparameters();
    };
    
    template <class Algorithm> 
    class HybridMonteCarlo{
+
      const HMCparameters Params;
-      GridSerialRNG sRNG;
+
+      GridSerialRNG sRNG; // Fixme: need a RNG management strategy.
+
      Integrator<Algorithm>& MD;

+      /////////////////////////////////////////////////////////
+      // Metropolis step
+      /////////////////////////////////////////////////////////
      bool metropolis_test(const RealD DeltaH){
+
 	RealD rn_test;
+
 	RealD prob = std::exp(-DeltaH);
+
 	random(sRNG,rn_test);
      
-	std::cout<< "--------------------------------------------\n";
-	std::cout<< "dH = "<<DeltaH << "  Random = "<< rn_test 
-		 << "\nAcc. Probability = " << ((prob<1.0)? prob: 1.0)<< "   ";
+	std::cout<<GridLogMessage<< "--------------------------------------------\n";
+	std::cout<<GridLogMessage<< "dH = "<<DeltaH << "  Random = "<< rn_test <<"\n";
+	std::cout<<GridLogMessage<< "Acc. Probability = " << ((prob<1.0)? prob: 1.0)<< "   ";
      
 	if((prob >1.0) || (rn_test <= prob)){       // accepted
-	  std::cout <<"-- ACCEPTED\n";
+	  std::cout<<GridLogMessage <<"-- ACCEPTED\n";
 	  return true;
 	} else {                               // rejected
-	  std::cout <<"-- REJECTED\n";
+	  std::cout<<GridLogMessage <<"-- REJECTED\n";
 	  return false;
 	}
+
      }

-      RealD evolve_step(LatticeLorentzColourMatrix& U){
-
+      /////////////////////////////////////////////////////////
+      // Evolution
+      /////////////////////////////////////////////////////////
+      RealD evolve_step(LatticeGaugeField& U){
 	MD.init(U); // set U and initialize P and phi's 
+
 	RealD H0 = MD.S(U); // initial state action  
-	std::cout<<"Total H before = "<< H0 << "\n";
+	std::cout<<GridLogMessage<<"Total H before = "<< H0 << "\n";

 	MD.integrate(U);
      
 	RealD H1 = MD.S(U); // updated state action            
-	std::cout<<"Total H after = "<< H1 << "\n";
-      
+	std::cout<<GridLogMessage<<"Total H after = "<< H1 << "\n";
 	return (H1-H0);
      }
      
    public:
-    HybridMonteCarlo(HMCparameters Pms, 
-		     Integrator<Algorithm>& MolDyn):
-      Params(Pms),MD(MolDyn){
-	//FIXME

-	// initialize RNGs also with seed
+      /////////////////////////////////////////
+      // Constructor
+      /////////////////////////////////////////
+      HybridMonteCarlo(HMCparameters Pms,  Integrator<Algorithm>& MolDyn): Params(Pms),MD(MolDyn) {
+
+	//FIXME...  initialize RNGs also with seed ; RNG management strategy
 	sRNG.SeedRandomDevice();
+
      }
      ~HybridMonteCarlo(){};


-
-      void evolve(LatticeLorentzColourMatrix& Uin){
+      void evolve(LatticeGaugeField& Uin){
 	Real DeltaH;
 	
 	// Thermalizations
 	for(int iter=1; iter <= Params.ThermalizationSteps; ++iter){
-	  std::cout << "-- # Thermalization step = "<< iter <<  "\n";
+	  std::cout<<GridLogMessage << "-- # Thermalization step = "<< iter <<  "\n";
 	
 	  DeltaH = evolve_step(Uin);
-	  std::cout<< " dH = "<< DeltaH << "\n";
+	  std::cout<<GridLogMessage<< "dH = "<< DeltaH << "\n";
 	}

 	// Actual updates (evolve a copy Ucopy then copy back eventually)
-	LatticeLorentzColourMatrix Ucopy(Uin._grid);
+	LatticeGaugeField Ucopy(Uin._grid);
 	for(int iter=Params.StartingConfig; 
 	    iter < Params.Nsweeps+Params.StartingConfig; ++iter){
-	  std::cout << "-- # Sweep = "<< iter <<  "\n";
+	  std::cout<<GridLogMessage << "-- # Sweep = "<< iter <<  "\n";
 	  
 	  Ucopy = Uin;
+
 	  DeltaH = evolve_step(Ucopy);
 		
 	  if(metropolis_test(DeltaH)) Uin = Ucopy;
-	  //need sync?
+
 	}
      }
    };
--- a/lib/qcd/hmc/integrators/Integrator.cc
+++ b/lib/qcd/hmc/integrators/Integrator.cc
@@ -18,7 +18,7 @@ namespace Grid{
      Pmu = zero;
      for(int mu=0;mu<Nd;mu++){
 	SU3::GaussianLieAlgebraMatrix(pRNG, Pmu);
-	pokeLorentz(P, Pmu, mu);
+	PokeIndex<LorentzIndex>(P, Pmu, mu);
      }
      
    }
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -1,8 +1,9 @@
 //--------------------------------------------------------------------
 /*! @file Integrator.h
- * @brief Declaration of classes for the Molecular Dynamics integrator
+ * @brief Classes for the Molecular Dynamics integrator
 *
 * @author Guido Cossu
+ * Time-stamp: <2015-07-30 16:21:29 neo>
 */
 //--------------------------------------------------------------------

@@ -16,8 +17,16 @@ class Observer;
 namespace Grid{
  namespace QCD{

-    typedef Action<LatticeLorentzColourMatrix>*  ActPtr; // now force the same colours as the rest of the code
-    typedef std::vector<ActPtr> ActionLevel;
+    typedef Action<LatticeGaugeField>*  ActPtr; // now force the same colours as the rest of the code
+    struct ActionLevel{ // one integration level: a list of actions plus the step multiplier for that level
+      int multiplier; // read by the integrator step() routines to divide the step size and as the per-level loop count
+    public:
+      std::vector<ActPtr> actions; // raw Action pointers registered on this level
+      explicit ActionLevel(int mul = 1):multiplier(mul){assert (mul > 0);}; // mul must be strictly positive (asserted)
+      void push_back(ActPtr ptr){ // append an action to this level, mirroring std::vector::push_back
+	actions.push_back(ptr);
+      }
+    };
    typedef std::vector<ActionLevel> ActionSet;
    typedef std::vector<Observer*> ObserverList;
    
@@ -35,8 +44,8 @@ namespace Grid{


    namespace MDutils{
-      void generate_momenta(LatticeLorentzColourMatrix&,GridParallelRNG&);
-      void generate_momenta_su3(LatticeLorentzColourMatrix&,GridParallelRNG&);
+      void generate_momenta(LatticeGaugeField&,GridParallelRNG&);
+      void generate_momenta_su3(LatticeGaugeField&,GridParallelRNG&);
    }

    /*! @brief Class for Molecular Dynamics management */   
@@ -45,8 +54,7 @@ namespace Grid{
    private:
      IntegratorParameters Params;
      const ActionSet as;
-      const std::vector<int> Nrel; //relative step size per level
-      std::unique_ptr<LatticeLorentzColourMatrix> P;
+      std::unique_ptr<LatticeGaugeField> P;
      GridParallelRNG pRNG;
      //ObserverList observers; // not yet
     
@@ -55,59 +63,53 @@ namespace Grid{
      void register_observers();
      void notify_observers();

-      void update_P(LatticeLorentzColourMatrix&U, int level,double ep){
-	for(int a=0; a<as[level].size(); ++a){
-	  LatticeLorentzColourMatrix force(U._grid);
-	  as[level].at(a)->deriv(U,force);
+      void update_P(LatticeGaugeField&U, int level,double ep){
+	for(int a=0; a<as[level].actions.size(); ++a){
+	  LatticeGaugeField force(U._grid);
+	  as[level].actions.at(a)->deriv(U,force);
 	  *P -= force*ep;
 	}
      }

-
-      void update_U(LatticeLorentzColourMatrix&U, double ep){
+      void update_U(LatticeGaugeField&U, double ep){
 	//rewrite exponential to deal automatically  with the lorentz index?
 	LatticeColourMatrix Umu(U._grid);
 	LatticeColourMatrix Pmu(U._grid);
 	for (int mu = 0; mu < Nd; mu++){
-	  Umu=peekLorentz(U, mu);
-	  Pmu=peekLorentz(*P, mu);
+	  Umu=PeekIndex<LorentzIndex>(U, mu);
+	  Pmu=PeekIndex<LorentzIndex>(*P, mu);
 	  Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
-	  pokeLorentz(U, Umu, mu);
+	  PokeIndex<LorentzIndex>(U, Umu, mu);
 	}

      }
      
      
-      
-      friend void IntegratorAlgorithm::step (LatticeLorentzColourMatrix& U, 
+      friend void IntegratorAlgorithm::step (LatticeGaugeField& U, 
 					     int level, std::vector<int>& clock,
 					     Integrator<IntegratorAlgorithm>* Integ);
    public:
    Integrator(GridBase* grid, IntegratorParameters Par,
-		 ActionSet& Aset, std::vector<int> Nrel_):
-      Params(Par),as(Aset),Nrel(Nrel_),P(new LatticeLorentzColourMatrix(grid)),pRNG(grid){
-	assert(as.size() == Nrel.size());
+		 ActionSet& Aset):
+      Params(Par),as(Aset),P(new LatticeGaugeField(grid)),pRNG(grid){
 	pRNG.SeedRandomDevice();
      };
      
      ~Integrator(){}

-
      //Initialization of momenta and actions
-      void init(LatticeLorentzColourMatrix& U){
-	std::cout<< "Integrator init\n";
-
+      void init(LatticeGaugeField& U){
+	std::cout<<GridLogMessage<< "Integrator init\n";
 	MDutils::generate_momenta(*P,pRNG);
 	for(int level=0; level< as.size(); ++level){
-	  for(int actionID=0; actionID<as.at(level).size(); ++actionID){
-	    as[level].at(actionID)->init(U, pRNG);
+	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
+	    as[level].actions.at(actionID)->init(U, pRNG);
 	  }
 	}
      }

-      
      // Calculate action
-      RealD S(LatticeLorentzColourMatrix& U){
+      RealD S(LatticeGaugeField& U){
 	LatticeComplex Hloc(U._grid);
 	Hloc = zero;
 	// Momenta
@@ -119,17 +121,19 @@ namespace Grid{
 	
 	RealD H = Hsum.real();

-	std::cout << "H_p = "<< H << "\n";
+	std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";

 	// Actions
 	for(int level=0; level<as.size(); ++level)
-	  for(int actionID=0; actionID<as.at(level).size(); ++actionID)
-	    H += as[level].at(actionID)->S(U);
+	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID)
+	    H += as[level].actions.at(actionID)->S(U);
+
+	std::cout<<GridLogMessage << "Total action H = "<< H << "\n";
 	
 	return H;
      }

-      void integrate(LatticeLorentzColourMatrix& U){
+      void integrate(LatticeGaugeField& U){
 	std::vector<int> clock;
 	clock.resize(as.size(),0);
 	for(int step=0; step< Params.MDsteps; ++step)   // MD step
--- a/lib/qcd/hmc/integrators/Integrator_algorithm.h
+++ b/lib/qcd/hmc/integrators/Integrator_algorithm.h
@@ -27,41 +27,38 @@ namespace Grid{
 	int fl = Integ->as.size() -1;
 	double eps = Integ->Params.stepsize;
 	
-	for(int l=0; l<=level; ++l) eps/= 2.0*Integ->Nrel[l];
+	for(int l=0; l<=level; ++l) eps/= 2.0*Integ->as[l].multiplier;
 	
-	int fin = Integ->Nrel[0];
-	for(int l=1; l<=level; ++l) fin*= 2.0*Integ->Nrel[l];
+	int fin = Integ->as[0].multiplier;
+	for(int l=1; l<=level; ++l) fin*= 2.0*Integ->as[l].multiplier;
 	fin = 3*Integ->Params.MDsteps*fin -1;
 	
-	
-	for(int e=0; e<Integ->Nrel[level]; ++e){
-	  
+	for(int e=0; e<Integ->as[level].multiplier; ++e){
 	  if(clock[level] == 0){    // initial half step 
 	    Integ->update_P(U,level,lambda*eps);
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"P "<< clock[level] <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
 	  }
 	  
 	  if(level == fl){          // lowest level 
 	    Integ->update_U(U,0.5*eps);
-	    
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"U "<< (clock[level]+1) <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"U "<< (clock[level]+1) <<std::endl;
 	  }else{                 // recursive function call 
 	    step(U,level+1,clock, Integ);
 	  }
 	  
 	  Integ->update_P(U,level,(1.0-2.0*lambda)*eps);
 	  ++clock[level];
-	  for(int l=0; l<level;++l) std::cout<<"   ";
-	  std::cout<<"P "<< (clock[level]) <<std::endl;
+	  for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	  std::cout<<GridLogMessage<<"P "<< (clock[level]) <<std::endl;
 	  
 	  if(level == fl){          // lowest level 
 	    Integ->update_U(U,0.5*eps);
 	    
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"U "<< (clock[level]+1) <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"U "<< (clock[level]+1) <<std::endl;
 	  }else{                 // recursive function call 
 	    step(U,level+1,clock, Integ);
 	  }    
@@ -71,19 +68,17 @@ namespace Grid{
 	    Integ->update_P(U,level,lambda*eps);
 	    
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"P "<< clock[level] <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
 	  }else{                  // bulk step
 	    Integ->update_P(U,level,lambda*2.0*eps);
 	    
 	    clock[level]+=2;
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"P "<< clock[level] <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
 	  }
 	}
 		
-	
-	
      }
      
    };
@@ -93,6 +88,7 @@ namespace Grid{
      void step (LatticeLorentzColourMatrix& U, 
 		 int level, std::vector<int>& clock,
 		 Integrator<LeapFrog>* Integ){
+
 	// level  : current level
 	// fl     : final level
 	// eps    : current step size
@@ -101,45 +97,43 @@ namespace Grid{
 	double eps = Integ->Params.stepsize;
 	
 	// Get current level step size
-	for(int l=0; l<=level; ++l) eps/= Integ->Nrel[l];
+	for(int l=0; l<=level; ++l) eps/= Integ->as[l].multiplier;
 	
 	int fin = 1;
-	for(int l=0; l<=level; ++l) fin*= Integ->Nrel[l];
+	for(int l=0; l<=level; ++l) fin*= Integ->as[l].multiplier;
 	fin = 2*Integ->Params.MDsteps*fin - 1;
 	
-	for(int e=0; e<Integ->Nrel[level]; ++e){
+	for(int e=0; e<Integ->as[level].multiplier; ++e){
 	  
 	  if(clock[level] == 0){    // initial half step
 	    Integ->update_P(U, level,eps/2.0);
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"P "<< 0.5*clock[level] <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
 	  }
+
 	  if(level == fl){          // lowest level
 	    Integ->update_U(U, eps);
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"U "<< 0.5*(clock[level]+1) <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"U "<< 0.5*(clock[level]+1) <<std::endl;
 	  }else{                 // recursive function call
 	    step(U, level+1,clock, Integ);
 	  }
+
 	  if(clock[level] == fin){  // final half step
 	    Integ->update_P(U, level,eps/2.0);
-	    
 	    ++clock[level];
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"P "<< 0.5*clock[level] <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
 	  }else{                  // bulk step
 	    Integ->update_P(U, level,eps);
-	    
 	    clock[level]+=2;
-	    for(int l=0; l<level;++l) std::cout<<"   ";
-	    std::cout<<"P "<< 0.5*clock[level] <<std::endl;
+	    for(int l=0; l<level;++l) std::cout<<GridLogMessage<<"   ";
+	    std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
 	  }
+
 	}

-
-
-
      }
    };

--- a/lib/qcd/spin/Dirac.h
+++ b/lib/qcd/spin/Dirac.h
@@ -4,7 +4,6 @@ namespace Grid{

 namespace QCD {

-  const int SpinorIndex = 2;

  class Gamma {

@@ -344,14 +343,19 @@ namespace QCD {
      typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type 
 	{
 	  iVector<vtype,N> ret;
-	  ret._internal=G*arg._internal;
+	  for(int i=0;i<N;i++){
+	    ret._internal[i]=G*arg._internal[i];
+	  }
 	  return ret;
 	}
    template<class vtype,int N> inline auto operator * ( const Gamma &G,const iMatrix<vtype,N> &arg) ->
      typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type 
 	{
 	  iMatrix<vtype,N> ret;
-	  ret._internal=G*arg._internal;
+	  for(int i=0;i<N;i++){
+	  for(int j=0;j<N;j++){
+	    ret._internal[i][j]=G*arg._internal[i][j];
+	  }}
 	  return ret;
 	}

@@ -369,14 +373,19 @@ namespace QCD {
      typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type 
 	{
 	  iVector<vtype,N> ret;
-	  ret._internal=arg._internal*G;
+	  for(int i=0;i<N;i++){ // apply G on the right of every component in turn
+	    ret._internal[i]=arg._internal[i]*G; // index the destination too, matching the sibling overloads
+	  }
 	  return ret;
 	}
    template<class vtype,int N> inline auto operator * (const iMatrix<vtype,N> &arg, const Gamma &G) ->
      typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type 
 	{
 	  iMatrix<vtype,N> ret;
-	  ret._internal=arg._internal*G;
+	  for(int i=0;i<N;i++){
+	  for(int j=0;j<N;j++){
+	    ret._internal[i][j]=arg._internal[i][j]*G;
+	  }}
 	  return ret;
 	}

--- a/lib/qcd/spin/TwoSpinor.h
+++ b/lib/qcd/spin/TwoSpinor.h
--- a/lib/qcd/utils/LinalgUtils.h
+++ b/lib/qcd/utils/LinalgUtils.h
@@ -32,10 +32,11 @@ void ag5xpby_ssp(Lattice<vobj> &z,RealD a,const Lattice<vobj> &x,RealD b,const L
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
-    multGamma5(tmp(),a*x._odata[ss+s]());
+    tmp = G5*x._odata[ss+s]*a;
    tmp = tmp + b*y._odata[ss+sp];
    vstream(z._odata[ss+s],tmp);
  }
@@ -49,10 +50,11 @@ void axpbg5y_ssp(Lattice<vobj> &z,RealD a,const Lattice<vobj> &x,RealD b,const L
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
-    multGamma5(tmp(),b*y._odata[ss+sp]());
+    tmp = G5*y._odata[ss+sp]*b;
    tmp = tmp + a*x._odata[ss+s];
    vstream(z._odata[ss+s],tmp);
  }
@@ -66,12 +68,13 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,RealD a,const Lattice<vobj> &x,RealD b,const
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp1;
    vobj tmp2;
    tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
-    multGamma5(tmp2(),tmp1());
+    tmp2 = G5*tmp1;
    vstream(z._odata[ss+s],tmp2);
  }
 }
@@ -117,12 +120,13 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
  z.checkerboard = x.checkerboard;
  conformable(x,z);
  int Ls = grid->_rdimensions[0];
+  Gamma G5(Gamma::Gamma5);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
    for(int s=0;s<Ls;s++){
      int sp = Ls-1-s;
-      multGamma5(tmp(),x._odata[ss+s]());
+      tmp = G5*x._odata[ss+s];
      vstream(z._odata[ss+sp],tmp);
    }
  }
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -372,7 +372,7 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    LatticeReal d(grid); d=zero;
    LatticeReal alpha(grid);

-    //    std::cout<<"xi "<<xi <<std::endl;
+    //    std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
    alpha = toReal(2.0*xi);

    do { 
@@ -468,11 +468,11 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    LatticeMatrix Vcheck(grid);
    Vcheck = zero;
    Vcheck = where(Accepted,V*adj(V) - 1.0,Vcheck);
-    //    std::cout << "SU3 check " <<norm2(Vcheck)<<std::endl;
+    //    std::cout<<GridLogMessage << "SU3 check " <<norm2(Vcheck)<<std::endl;
    assert(norm2(Vcheck)<1.0e-4);
    
    // Verify the link stays in SU(3)
-    //    std::cout <<"Checking the modified link"<<std::endl;
+    //    std::cout<<GridLogMessage <<"Checking the modified link"<<std::endl;
    Vcheck = link*adj(link) - 1.0;
    assert(norm2(Vcheck)<1.0e-4);
    /////////////////////////////////
@@ -483,42 +483,42 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    for(int gen=0;gen<generators();gen++){
      Matrix ta;
      generator(gen,ta);
-      std::cout<< "Nc = "<<ncolour<<" t_"<<gen<<std::endl;
-      std::cout<<ta<<std::endl;
+      std::cout<<GridLogMessage<< "Nc = "<<ncolour<<" t_"<<gen<<std::endl;
+      std::cout<<GridLogMessage<<ta<<std::endl;
    }
  }

  static void testGenerators(void){
    Matrix ta;
    Matrix tb;
-    std::cout<<"Checking trace ta tb is 0.5 delta_ab"<<std::endl;
+    std::cout<<GridLogMessage<<"Checking trace ta tb is 0.5 delta_ab"<<std::endl;
    for(int a=0;a<generators();a++){
      for(int b=0;b<generators();b++){
 	generator(a,ta);
 	generator(b,tb);
 	Complex tr =TensorRemove(trace(ta*tb)); 
-	std::cout<<tr<<" ";
+	std::cout<<GridLogMessage<<tr<<" ";
 	if(a==b) assert(abs(tr-Complex(0.5))<1.0e-6);
 	if(a!=b) assert(abs(tr)<1.0e-6);
      }
-      std::cout<<std::endl;
+      std::cout<<GridLogMessage<<std::endl;
    }
-    std::cout<<"Checking hermitian"<<std::endl;
+    std::cout<<GridLogMessage<<"Checking hermitian"<<std::endl;
    for(int a=0;a<generators();a++){
      generator(a,ta);
-      std::cout<<a<<" ";
+      std::cout<<GridLogMessage<<a<<" ";
      assert(norm2(ta-adj(ta))<1.0e-6);
    }    
-    std::cout<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;

-    std::cout<<"Checking traceless"<<std::endl;
+    std::cout<<GridLogMessage<<"Checking traceless"<<std::endl;
    for(int a=0;a<generators();a++){
      generator(a,ta);
      Complex tr =TensorRemove(trace(ta)); 
-      std::cout<<a<<" ";
+      std::cout<<GridLogMessage<<a<<" ";
      assert(abs(tr)<1.0e-6);
    }    
-    std::cout<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;
  }

  // reunitarise??
@@ -554,9 +554,7 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    for(int a=0;a<generators();a++){
      gaussian(pRNG,ca); 
      generator(a,ta);
-      
      la=toComplex(ca)*ci*ta;
-   
      out += la; 
    }

--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -7,7 +7,6 @@ namespace QCD {
 template<class GaugeMat,class GaugeLorentz>
 class WilsonLoops {
 public:
-
  //////////////////////////////////////////////////
  // directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,14 +1,16 @@
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
-  @brief Optimization libraries for NEON (ARM) instructions set ARMv7
+  @brief Optimization libraries for NEON (ARM) instructions set ARMv8

  Experimental - Using intrinsics - DEVELOPING! 
 */
-// Time-stamp: <2015-06-09 15:25:40 neo>
+// Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------

 #include <arm_neon.h>

+// ARMv8 supports double precision
+
 namespace Optimization {

  template<class vtype>
@@ -22,50 +24,47 @@ namespace Optimization {
    float f[4];
  };
  union u128d {
-    float32x4_t v;
-    float f[4];
+    float64x2_t v;
+    double f[4];
  };
  
  struct Vsplat{
    //Complex float
    inline float32x4_t operator()(float a, float b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a,b,a,b};
+      return vld1q_f32(tmp);
    }
    // Real float
    inline float32x4_t operator()(float a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_dup_f32(&a);
    }
    //Complex double
    inline float32x4_t operator()(double a, double b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
+      return vld1q_f32(tmp);
    }
    //Real double
    inline float32x4_t operator()(double a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_dup_f32(&a);
    }
    //Integer
    inline uint32x4_t operator()(Integer a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_dup_u32(&a);
    }
  };

  struct Vstore{
    //Float 
    inline void operator()(float32x4_t a, float* F){
-      
+      vst1q_f32(F, a);
    }
    //Double
    inline void operator()(float32x4_t a, double* D){
-      
+      vst1q_f32((float*)D, a);
    }
    //Integer
    inline void operator()(uint32x4_t a, Integer* I){
-     
+      vst1q_u32(I, a);
    }

  };
@@ -130,36 +129,30 @@ namespace Optimization {
  struct Sum{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vaddq_f32(a,b);
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vaddq_f64(a,b);
+    }
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vaddq_u32(a,b);
    }
  };

  struct Sub{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vsubq_f32(a,b);
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vsubq_f64(a,b);
+    }
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vsubq_u32(a,b);
    }
  };

@@ -170,24 +163,24 @@ namespace Optimization {
      return foo;
    }
    // Complex double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      float64x2_t foo; // placeholder: complex product still unimplemented; declared with the return type so the stub compiles
+      return foo;
+    }
  };

  struct Mult{
    // Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return a;
+      return vmulq_f32(a,b);
    }
    // Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vmulq_f64(a,b);
+    }
    // Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return a;
+      return vmulq_u32(a,b);
    }
  };

@@ -219,6 +212,7 @@ namespace Optimization {
  struct TimesI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+      //need shuffle
      return in;
    }
    //Complex double
@@ -242,20 +236,25 @@ namespace Optimization {
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x2_t high = vget_high_f32(in);
+    float32x2_t low = vget_low_f32(in);
+    float32x2_t tmp = vadd_f32(low, high);
+    float32x2_t sum = vpadd_f32(tmp, tmp);
+    return vget_lane_f32(sum,0);
  }
  
  
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
+  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
    return 0;
  }
  
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
+    float64x2_t sum = vpaddq_f64(in, in);
+    return vgetq_lane_f64(sum,0);
  }

  //Integer Reduce
@@ -272,7 +271,7 @@ namespace Optimization {
 namespace Grid {

  typedef float32x4_t  SIMD_Ftype; // Single precision type
-  typedef float32x4_t  SIMD_Dtype; // Double precision type - no double on ARMv7
+  typedef float64x2_t  SIMD_Dtype; // Double precision type
  typedef uint32x4_t   SIMD_Itype; // Integer type

  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
--- a/Show More
+++ b/Show More