Merge branch 'master' of https://github.com/paboyle/Grid

2025-11-04 05:54:32 +00:00 · 2015-12-03 12:11:10 -05:00
parent ee9ecb6115 26161addd0
commit fb81acca3c
233 changed files with 33004 additions and 12555 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,55 +1,66 @@
-# Compiled Object files
+# Compiled Object files #
+#########################
 *.slo
 *.lo
 *.o
 *.obj
+
+
+# Editor files #
+################
 *~
-errs
 *#

-# Precompiled Headers
+# Precompiled Headers #
+#######################
 *.gch
 *.pch

-# Compiled Dynamic libraries
+# Compiled Dynamic libraries #
+##############################
 *.so
 *.dylib
 *.dll

-# Fortran module files
+# Fortran module files #
+########################
 *.mod

-# Compiled Static libraries
+# Compiled Static libraries #
+#############################
 *.lai
 *.la
 *.a
 *.lib

-# Executables
+# Executables #
+###############
 *.exe
 *.out
 *.app
-# http://www.gnu.org/software/automake

+# http://www.gnu.org/software/automake #
+########################################
 Makefile.in
 Makefile
+Config.h
 config.log
 config.status
 .deps

-# http://www.gnu.org/software/autoconf
-
-/autom4te.cache
-/aclocal.m4
-/compile
-/configure
-/depcomp
-/install-sh
-/missing
-/stamp-h1
-/config.sub
-/config.guess
-/INSTALL
+# http://www.gnu.org/software/autoconf #
+########################################
+autom4te.cache
+aclocal.m4
+compile
+configure
+depcomp
+install-sh
+missing
+stamp-h1
+config.sub
+config.guess
+INSTALL

 # Packages #
 ############
@@ -79,3 +90,12 @@ config.status
 .Trashes
 ehthumbs.db
 Thumbs.db
+
+# build directory #
+###################
+build/*
+
+# IDE related files #
+#####################
+*.xcodeproj/*
+build.sh
--- a/1
+++ b/1
@@ -1 +0,0 @@
-/usr/share/automake-1.14/INSTALL
--- a/125
+++ b/125
@@ -1,15 +1,61 @@
+RECENT
+---------------
+
+  - Clean up HMC                                                             -- DONE
+  - LorentzScalar<GaugeField> gets Gauge link type (cleaner).                -- DONE
+  - Simplified the integrators a bit.                                        -- DONE
+  - Multi-timescale looks broken; operating on a single timescale for now.   -- DONE
+  - pass GaugeField as template param.                        -- DONE
+  - Reunitarise                                               -- DONE
+  - Force Gradient                                            -- DONE
+  - Prefer "RefreshInternal" or such like to "init" in naming -- DONE
+  - Parallel io improvements                                  -- DONE
+  - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
+
+TODO:
+---------------
+Policies:
+* Link smearing/boundary conds; Policy class based implementation ; framework more in place
+* Support different boundary conditions (finite temp, chem. potential ... )
+* Support different fermion representations? 
+  - contained entirely within the integrator presently
+
+- Sign of force term.
+
+- Reversibility test.
+
+- Rename "Ta" as too unclear
+
+- Lanczos
+
+- Rectangle gauge actions.
+  Iwasaki,
+  Symanzik,
+  ... etc...
+
+- Prepare multigrid for HMC. - Alternate setup schemes.
+
+- Support for ILDG --- ugly, not done
+
+- Flavour matrices?
+
+- FFTnD ?
+
 ================================================================
-*** Hacks and bug fixes to clean up and Audits
+* Hacks and bug fixes to clean up and Audits
 ================================================================

 *  Extract/merge/set cleanup ; too many variants; rationalise and call simpler ones
-*  Used #define repetitive sequences to minimise code.
+
 *  Rewrite core tensor arithmetic support to be more systematic
+ =  Use #define for repetitive sequences to minimise code; decreasing the line count by thousands is possible,
+    with a more robust and maintainable implementation.
+
 *  Ensure we ET as much as possible; move unop functions into ET framework.
   - tests with expression args to all functions

-
 * FIXME audit
+
 * const audit

 Insert/Extract
@@ -22,10 +68,12 @@ Insert/Extract

 * Thread scaling tests Xeon, XeonPhi

-** Make the Tensor types and Complex etc... play more nicely.
+Not sure of status of this -- reverify. Things are working nicely now though.
+
+* Make the Tensor types and Complex etc... play more nicely.
  - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
-  QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
-  want to introduce a syntax that does not require this.
+    QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken by TensorRemove, but I
+    want to introduce a syntax that does not require this.

  - Reductions that contract indices on a site should always demote the tensor structure.
    norm2(), innerProduct.
@@ -38,6 +86,37 @@ Insert/Extract
    template specialize the scalar scalar scalar sum and SliceSum,  on the basis of being
    pure scalar.

+======================================================================
+======================================================================
+======================================================================
+======================================================================
+Done: Cayley, Partial , ContFrac force terms.
+
+DONE
+- PseudoFermions
+=> generalise to non-const EE ; likely defer (??) (NOT DONE)
+Done:
+  - TwoFlavour
+  - TwoFlavourEvenOdd        
+  - TwoFlavourRatio
+  - TwoFlavourRatioEvenOdd
+
+Done:
+  - OneFlavourRationalEvenOdd
+  - OneFlavourRationalRatioEvenOdd
+  - OneFlavourRationalRatio
+
+Done
+=> Test DWF HMC
+   - Fix a threading bug that has been introduced and prevents HMC running hybrid OMP mode
+
+Done:
+- RNG filling from sparser grid, lower dim grid.
+
+
+DONE
+  - MacroMagic -> virtual reader class.
+
 *** Expression template engine: -- DONE

 [   -- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice<vobj>
@@ -54,28 +133,13 @@ Insert/Extract
   // localMaxAbs
   // Fourier transform equivalent.]

-================================================================
-*** New Functionality
-================================================================
-
-* - BinaryWriter, TextWriter etc...
-  - use protocol buffers? replace xmlReader/Writer ec..
-  - Binary use htonll, htonl

 * CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)

-* Parallel io improvements
-  - optional parallel MPI2 IO
-  - move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
-
-* Support for ILDG
-
-* Support different boundary conditions (finite temp, chem. potential ... )
-
-* Support different fermion representations? 
-
-Actions -- coherent framework for implementing actions and their forces.
+-- coherent framework for implementing actions and their forces.
+Actions 

+DONE
 * Fermion
  - Wilson
  - Clover
@@ -83,6 +147,7 @@ Actions -- coherent framework for implementing actions and their forces.
  - Mobius
  - z-Mobius

+
 Algorithms (lots of reuse/port from BFM)
 * LinearOperator
 * LinearSolver
@@ -100,17 +165,10 @@ Algorithms (lots of reuse/port from BFM)
 * Integrators, leapfrog, omelyan, force gradient etc...
 * etc..

-* Gauge
-  - Wilson, symanzik, iwasaki
-
-* rb4d support for 5th dimension in Mobius.
-
-* Flavour matrices?
+Done
 * Pauli, SU subgroup, etc.. 
-* su3 exponentiation & log etc.. [Jamie's code?]
-* TaProj
-* FFTnD ?

+* su3 exponentiation & log etc.. [Jamie's code?]

 ======================================================================================================
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
@@ -144,7 +202,6 @@ FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me argua
  - lib/communicator
  - lib/algorithms
  - lib/qcd
- future
  - lib/io/   -- GridLog, GridIn, GridErr, GridDebug, GridMessage
  - lib/qcd/actions
  - lib/qcd/measurements
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -11,22 +11,25 @@ int main (int argc, char ** argv)
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  int Nloop=10;
  int nmu=0;
  for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;



  for(int lat=4;lat<=32;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

-      std::vector<int> latt_size  ({lat,lat,lat,lat});
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+      				    lat*mpi_layout[1],
+      				    lat*mpi_layout[2],
+      				    lat*mpi_layout[3]});

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

@@ -84,15 +87,15 @@ int main (int argc, char ** argv)

      double time = stop-start; // microseconds

-      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    


-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;


  for(int lat=4;lat<=32;lat+=2){
@@ -160,7 +163,7 @@ int main (int argc, char ** argv)

      double time = stop-start;

-      std::cout << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }  

--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -21,11 +21,11 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=8;
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  const int Ls=16;
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -59,7 +59,7 @@ int main (int argc, char ** argv)
  ////////////////////////////////////
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
-    U[mu] = peekIndex<LorentzIndex>(Umu5d,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }

  if (1)
@@ -79,25 +79,28 @@ int main (int argc, char ** argv)

  RealD mass=0.1;
  RealD M5  =1.8;
-  DomainWallFermion Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  
-  std::cout << "Calling Dw"<<std::endl;
-  int ncall=10;
-  double t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Dw.Dhop(src,result,0);
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall=10000;
+  {
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    double t1=usecond();
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    err = ref-result; 
+    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    Dw.Report();
  }
-  double t1=usecond();
-
-  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
-  
-  std::cout << "Called Dw"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;


  if (1)
@@ -120,11 +123,11 @@ int main (int argc, char ** argv)
    ref = -0.5*ref;
  }
  Dw.Dhop(src,result,1);
-  std::cout << "Called DwDag"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
@@ -133,24 +136,44 @@ int main (int argc, char ** argv)
  LatticeFermion r_eo  (FGrid);


-  std::cout << "Calling Deo and Doe"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);

+  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
+
+  {
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+    }
+    double t1=usecond();
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(1344.0*volume*ncall)/2;
+
+    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  }
+
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
-  Dw.Dhop(src,result,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
+
+  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
+  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
+  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;

  setCheckerboard(r_eo,r_o);
  setCheckerboard(r_eo,r_e);

  err = r_eo-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
-  std::cout << "norm diff even  "<< norm2(src_e)<<std::endl;
-  std::cout << "norm diff odd   "<< norm2(src_o)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;

  Grid_finalize();
 }
--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@@ -0,0 +1,84 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv) // Benchmark driver: every thread repeatedly sums all sites of its own lattice; one result row is printed per lattice size.
+{
+  Grid_init(&argc,&argv);
+
+  const int Nvec=8; // number of vReal entries in each per-site vector
+  typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
+  typedef iVector<vReal,Nvec> Vec;
+
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  int threads = GridThread::GetThreads(); // thread count; also the number of lattices allocated below
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"bytes/thread"<<"\t\t\t"<<"GB/s"<<"\t\t\t"<<"GB/s per thread"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+  const int lmax = 16536*16; // upper bound on lat; NOTE(review): 16536 is not a power of two, possibly a typo for 16384 -- confirm
+  for(int lat=4;lat<=lmax;lat*=2){ // lat doubles each pass, starting from 4
+
+    int Nloop=lmax*4/lat; // sweep count shrinks as lat grows, so Nloop*lat stays near lmax*4
+
+    std::vector<int> latt_size  ({2*mpi_layout[0],2*mpi_layout[1],4*mpi_layout[2],lat*mpi_layout[3]}); // only the last dimension scales with lat
+
+    GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]*threads; // site count summed over all `threads` lattices
+
+    Vec tsum; tsum = zero; // accumulates the per-thread sums; not read again after the reduction loop in this function
+
+    GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice(); // seeded, but the only use (random fill below) is commented out
+
+    std::vector<double> stop(threads); // finish timestamp recorded by each thread
+    Vector<Vec> sum(threads);          // running sum owned by each thread
+
+    std::vector<LatticeVec> x(threads,&Grid); // one lattice per thread, all on the same grid
+    for(int t=0;t<threads;t++){ // loop body is currently empty: the fill is disabled
+      //      random(pRNG,x[t]);
+    }
+
+    double start=usecond(); // common start time; presumably microseconds, going by the name -- confirm
+PARALLEL_FOR_LOOP
+    for(int t=0;t<threads;t++){ // PARALLEL_FOR_LOOP is defined elsewhere; presumably runs these iterations on separate threads -- confirm
+
+      sum[t] = x[t]._odata[0]; // start the sum from the first site
+      for(int i=0;i<Nloop;i++){
+	for(auto ss=x[t].begin();ss<x[t].end();ss++){ // ss is used directly as an index into _odata
+	  sum[t] = sum[t]+x[t]._odata[ss];
+	}
+      }
+      stop[t]=usecond(); // timestamp taken inside the loop body, once per t
+    }
+
+    double max_stop=stop[0];
+    double min_stop=stop[0];
+    
+    for(int t=0;t<threads;t++){ // reduce: total sum plus earliest and latest finish times
+      tsum+=sum[t];
+      if ( stop[t]<min_stop ) min_stop=stop[t];
+      if ( stop[t]>max_stop ) max_stop=stop[t];
+    }
+
+    
+
+    double max_time = (max_stop-start)/Nloop*1000; // time per sweep for the slowest thread; the *1000 presumably converts us to ns so bytes/time reads as GB/s -- confirm
+    double min_time = (min_stop-start)/Nloop*1000; // time per sweep for the fastest thread, same scaling
+      
+    double bytes=vol*Nvec*sizeof(Real); // byte count used for the rates below, taken over all threads' lattices
+    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/threads<<"\t\t"<<bytes/max_time<<" - "<< bytes/min_time<<"\t\t"<<bytes/min_time/threads <<std::endl;
+    
+  }    
+
+  Grid_finalize();
+}
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -10,28 +10,34 @@ int main (int argc, char ** argv)

  const int Nvec=8;
  typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
+  typedef iVector<vReal,Nvec> Vec;

-  int Nloop=1000;
+
+  Vec rn = zero;

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking fused AXPY bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
  
-  for(int lat=4;lat<=32;lat+=4){
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+  uint64_t lmax=44;
+#define NLOOP (1*lmax*lmax*lmax*lmax/vol)
+  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      uint64_t Nloop=NLOOP;
+
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
@@ -41,63 +47,70 @@ int main (int argc, char ** argv)

      double start=usecond();
      for(int i=0;i<Nloop;i++){
-	//   inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
 	axpy(z,a,x,y);
+        x._odata[0]=z._odata[0]; // serial loop dependence to prevent optimise
+        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
      
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking a*x + y bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  
-  for(int lat=4;lat<=32;lat+=4){
+  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      double a=2.0;

+      uint64_t Nloop=NLOOP;
+
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	z=a*x-y;
+        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
+        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
     
      double flops=vol*Nvec*2;// mul,add
      double bytes=3*vol*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SCALE bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SCALE bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
+
+  for(int lat=4;lat<=lmax;lat+=4){

-  for(int lat=4;lat<=32;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      uint64_t Nloop=NLOOP;

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();

      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
@@ -108,46 +121,47 @@ int main (int argc, char ** argv)
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	z=a*x;
+        x._odata[0]=z._odata[0]*2.0;
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
      
      double bytes=2*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*1;// mul
-      std::cout <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

  }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking READ bandwidth"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=4;lat<=32;lat+=4){
+  for(int lat=4;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      uint64_t Nloop=NLOOP;
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-      //GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
-
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      RealD a=2.0;
-      ComplexD nn;
-
+      Real nn;      
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	nn=norm2(x);
+	vsplat(x._odata[0]._internal[0],nn);
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
      
      double bytes=vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;

  }    

--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -14,15 +14,15 @@ int main (int argc, char ** argv)
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -43,18 +43,18 @@ int main (int argc, char ** argv)
      double bytes=3.0*vol*Nc*Nc*sizeof(Complex);
      double footprint=2.0*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6.0+8.0+8.0)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }


-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  z= x*y"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x*y"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -75,17 +75,17 @@ int main (int argc, char ** argv)
      
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -106,17 +106,17 @@ int main (int argc, char ** argv)
      
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "= Benchmarking SU3xSU3  mac(z,x,y)"<<std::endl;
-  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout << "----------------------------------------------------------"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mac(z,x,y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=24;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -137,7 +137,7 @@ int main (int argc, char ** argv)
      
      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(8+8+8)*vol;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -22,13 +22,16 @@ int main (int argc, char ** argv)


  std::vector<int> latt_size   = GridDefaultLatt();
-  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);

  int threads = GridThread::GetThreads();
-  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
+  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);
@@ -55,15 +58,15 @@ int main (int argc, char ** argv)
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
    if(0) {
-      if (nn==-1) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
-      else       { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
+      if (nn==-1) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
+      else       { U[nn] = cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
    }
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
 #endif

  for(int mu=0;mu<Nd;mu++){
-    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  
  { // Naive wilson implementation
@@ -84,10 +87,10 @@ int main (int argc, char ** argv)
  }
  ref = -0.5*ref;
  RealD mass=0.1;
-  WilsonFermion Dw(Umu,Grid,RBGrid,mass);
+  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  
-  std::cout << "Calling Dw"<<std::endl;
-  int ncall=10000;
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall=1000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.Dhop(src,result,0);
@@ -95,12 +98,12 @@ int main (int argc, char ** argv)
  double t1=usecond();
  double flops=1344*volume*ncall;
  
-  std::cout << "Called Dw"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;


  //  for(int ss=0;ss<10;ss++ ){
@@ -109,7 +112,7 @@ int main (int argc, char ** argv)
      for(int j=0;j<Nc;j++){
 	ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);
 	ComplexF * res_p = (ComplexF *)&result._odata[ss]()(i)(j);
-	std::cout << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
+	std::cout<<GridLogMessage << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
      }
    }
  }
@@ -133,11 +136,11 @@ int main (int argc, char ** argv)
  }
  ref = -0.5*ref;
  Dw.Dhop(src,result,1);
-  std::cout << "Called DwDag"<<std::endl;
-  std::cout << "norm result "<< norm2(result)<<std::endl;
-  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
-  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

  Grid_finalize();
 }
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@@ -1,5 +1,5 @@

-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson


 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -10,6 +10,10 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid


+Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
+Benchmark_memory_asynch_LDADD=-lGrid
+
+
 Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
 Benchmark_memory_bandwidth_LDADD=-lGrid

--- a/8439
+++ b/8439
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-06-09 15:26:39 neo>
+# Time-stamp: <2015-07-10 17:46:21 neo>

 AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -11,7 +11,7 @@ AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([lib/Grid.h])
-AC_CONFIG_HEADERS([lib/GridConfig.h])
+AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

 AC_MSG_NOTICE([
@@ -26,10 +26,9 @@ AC_LANG(C++)
 AC_PROG_CXX
 AC_OPENMP
 AC_PROG_RANLIB
-AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
+#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
 AX_EXT

-
 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)

@@ -39,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
 AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
+AC_CHECK_HEADERS(execinfo.h)
 AC_CHECK_HEADERS(gmp.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@@ -56,19 +56,18 @@ echo :::::::::::::::::::::::::::::::::::::::::::

 AC_CHECK_FUNCS([gettimeofday])

-AC_CHECK_LIB([gmp],[__gmpf_init],,
-        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
-Please install or provide the correct path to your installation
-Info at: http://www.gmplib.org)])
+#AC_CHECK_LIB([gmp],[__gmpf_init],,
+#        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
+#Please install or provide the correct path to your installation
+#Info at: http://www.gmplib.org)])

-AC_CHECK_LIB([mpfr],[mpfr_init],,
-        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
-Please install or provide the correct path to your installation
-Info at: http://www.mpfr.org/)])
+#AC_CHECK_LIB([mpfr],[mpfr_init],,
+#        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
+#Please install or provide the correct path to your installation
+#Info at: http://www.mpfr.org/)])

-
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\
-	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
+	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])

 supported=no
@@ -92,6 +91,15 @@ case ${ac_SIMD} in
       	AC_MSG_WARN([Your processor does not support AVX instructions])
       fi
     ;;
+     AVXFMA4) # AVX with FMA4 - defines the AVXFMA4 config macro
+       echo Configuring for AVXFMA4
+       AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
+       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
+       supported=yes # this host check covers plain AVX only - FMA4 is not probed in this arm
+       else
+       	AC_MSG_WARN([Your processor does not support AVX instructions])
+       fi
+     ;;
     AVX2)
       echo Configuring for AVX2
       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
@@ -101,14 +109,19 @@ case ${ac_SIMD} in
       AC_MSG_WARN([Your processor does not support AVX2 instructions])
       fi
     ;;
-     AVX512|MIC)
-       echo Configuring for AVX512 and MIC
-       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
+     AVX512)
+       echo Configuring for AVX512 
+       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
       supported="cross compilation"
     ;;
-     NEONv7)
-       echo Configuring for experimental ARMv7 support 
-       AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
+     IMCI)
+       echo Configuring for IMCI
+       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
+       supported="cross compilation"
+     ;;
+     NEONv8)
+       echo Configuring for experimental ARMv8a support 
+       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
       supported="cross compilation"
     ;;
     DEBUG)
@@ -120,6 +133,17 @@ case ${ac_SIMD} in
     ;;
 esac

+AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
+case ${ac_PRECISION} in # NOTE(review): no default arm - any value other than single or double defines neither macro below
+     single) # records GRID_DEFAULT_PRECISION_SINGLE in the generated config header
+       echo default precision is single
+       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
+     ;;
+     double) # also the value used when --enable-precision is not given
+       echo default precision is double
+       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
+     ;;
+esac

 AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

@@ -144,15 +168,15 @@ AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
-echo
-echo Checking doxygen support 
-echo :::::::::::::::::::::::::::::::::::::::::::
-AC_PROG_DOXYGEN
+#echo
+#echo Checking doxygen support 
+#echo :::::::::::::::::::::::::::::::::::::::::::
+#AC_PROG_DOXYGEN

-if test -n "$DOXYGEN"
-then
-AC_CONFIG_FILES([docs/doxy.cfg])
-fi
+#if test -n "$DOXYGEN"
+#then
+#AC_CONFIG_FILES([docs/doxy.cfg])
+#fi

 echo
 echo Creating configuration files
--- a/gcc-bug-report/broken.cc
+++ b/gcc-bug-report/broken.cc
@@ -29,12 +29,12 @@ public:

 template<int N,class obj,typename std::enable_if<N==obj::NestLevel >::type * = nullptr > auto function(const obj &arg)-> obj
 {
-  std::cout<<"Leaf "<<obj::NestLevel<<std::endl;
+  std::cout<<GridLogMessage<<"Leaf "<<obj::NestLevel<<std::endl;
  return arg;
 }
 template<int N,class obj,typename std::enable_if<N!=obj::NestLevel >::type * = nullptr > auto function(const obj &arg)-> obj
 {
-  std::cout<<"Node "<<obj::NestLevel<<std::endl;
+  std::cout<<GridLogMessage<<"Node "<<obj::NestLevel<<std::endl;
  obj ret;
  ret.internal=function<N>(arg.internal);
  return ret;
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -3,7 +3,7 @@

 #include <algorithms/SparseMatrix.h>
 #include <algorithms/LinearOperator.h>
-#include <algorithms/CoarsenedMatrix.h>
+#include <algorithms/Preconditioner.h>

 #include <algorithms/approx/Zolotarev.h>
 #include <algorithms/approx/Chebyshev.h>
@@ -17,6 +17,12 @@

 #include <algorithms/iterative/ConjugateGradientMultiShift.h>

+// Lanczos support
+#include <algorithms/iterative/MatrixUtils.h>
+#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+
+#include <algorithms/CoarsenedMatrix.h>
+
 // Eigen/lanczos
 // EigCg
 // MCR
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -1,6 +1,13 @@
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H

+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
 #include <immintrin.h>
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
@@ -66,6 +73,5 @@ operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return t
 template<typename _Tp>  inline bool
 operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
    
-    
 }; // namespace Grid
 #endif
--- a/lib/GridConfig.h.in
+++ b/lib/GridConfig.h.in
@@ -1,4 +1,4 @@
-/* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
+/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */

 /* AVX Intrinsics */
 #undef AVX1
@@ -6,9 +6,12 @@
 /* AVX2 Intrinsics */
 #undef AVX2

-/* AVX512 Intrinsics for Knights Corner */
+/* AVX512 Intrinsics for Knights Landing */
 #undef AVX512

+/* AVX Intrinsics with FMA4 */
+#undef AVXFMA4
+
 /* EMPTY_SIMD only for DEBUGGING */
 #undef EMPTY_SIMD

@@ -18,6 +21,12 @@
 /* GRID_COMMS_NONE */
 #undef GRID_COMMS_NONE

+/* GRID_DEFAULT_PRECISION is DOUBLE */
+#undef GRID_DEFAULT_PRECISION_DOUBLE
+
+/* GRID_DEFAULT_PRECISION is SINGLE */
+#undef GRID_DEFAULT_PRECISION_SINGLE
+
 /* Support Altivec instructions */
 #undef HAVE_ALTIVEC

@@ -27,9 +36,6 @@
 /* Support AVX2 (Advanced Vector Extensions 2) instructions */
 #undef HAVE_AVX2

-/* define if the compiler supports basic C++11 syntax */
-#undef HAVE_CXX11
-
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #undef HAVE_DECL_BE64TOH
@@ -41,6 +47,9 @@
 /* Define to 1 if you have the <endian.h> header file. */
 #undef HAVE_ENDIAN_H

+/* Define to 1 if you have the <execinfo.h> header file. */
+#undef HAVE_EXECINFO_H
+
 /* Support FMA3 (Fused Multiply-Add) instructions */
 #undef HAVE_FMA

@@ -53,12 +62,6 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H

-/* Define to 1 if you have the `gmp' library (-lgmp). */
-#undef HAVE_LIBGMP
-
-/* Define to 1 if you have the `mpfr' library (-lmpfr). */
-#undef HAVE_LIBMPFR
-
 /* Define to 1 if you have the <malloc.h> header file. */
 #undef HAVE_MALLOC_H

@@ -113,8 +116,11 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H

-/* NEON ARMv7 Experimental support */
-#undef NEONv7
+/* IMCI Intrinsics for Knights Corner */
+#undef IMCI
+
+/* NEON ARMv8 Experimental support */
+#undef NEONv8

 /* Name of package */
 #undef PACKAGE
@@ -131,9 +137,6 @@
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME

-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION

--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -6,92 +6,49 @@
 //  Copyright (c) 2014 University of Edinburgh. All rights reserved.
 //

-
 #ifndef GRID_H
 #define GRID_H

+///////////////////
+// Std C++ dependencies
+///////////////////
 #include <cassert>
-
 #include <complex>
 #include <vector>
-
 #include <iostream>
 #include <iomanip>
 #include <random>
 #include <functional>
-
 #include <stdio.h>
 #include <stdlib.h>
-#include <sys/time.h>
 #include <stdio.h>
 #include <signal.h>
+#include <ctime>
+#include <sys/time.h>
+#include <chrono>

-#ifndef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif
-
-#define strong_inline __attribute__((always_inline)) inline
-
-#include <GridConfig.h>
-
-////////////////////////////////////////////////////////////
-// Tunable header includes
-////////////////////////////////////////////////////////////
-
-#ifdef HAVE_MALLOC_MALLOC_H
-#include <malloc/malloc.h>
-#endif
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-
+///////////////////
+// Grid headers
+///////////////////
+#include <serialisation/Serialisation.h>
+#include <Config.h>
+#include <Timer.h>
+#include <Log.h>
 #include <AlignedAllocator.h>
-
 #include <Simd.h>
 #include <Threads.h>
-
-#include <Communicator.h> // subdir aggregate
-#include <Cartesian.h> // subdir aggregate
-#include <Tensors.h>   // subdir aggregate
-#include <Lattice.h>   // subdir aggregate
-#include <Cshift.h>    // subdir aggregate
-#include <Stencil.h>   // subdir aggregate
-#include <Algorithms.h>// subdir aggregate
-
+#include <Communicator.h> 
+#include <Cartesian.h>    
+#include <Tensors.h>      
+#include <Lattice.h>      
+#include <Cshift.h>       
+#include <Stencil.h>      
+#include <Algorithms.h>   
 #include <qcd/QCD.h>
+#include <parallelIO/BinaryIO.h>
 #include <parallelIO/NerscIO.h>

-namespace Grid {
+#include <Init.h>

-  void Grid_init(int *argc,char ***argv);
-  void Grid_finalize(void);
-  // internal, controled with --handle
-  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
-  void Grid_debug_handler_init(void);
-  void Grid_quiesce_nodes(void);
-  void Grid_unquiesce_nodes(void);
-
-  // C++11 time facilities better?
-  double usecond(void);
-
-  const std::vector<int> GridDefaultSimd(int dims,int nsimd);
-  const std::vector<int> &GridDefaultLatt(void);
-  const std::vector<int> &GridDefaultMpi(void);
-  const int              &GridThreads(void)  ;
-  void                 GridSetThreads(int t) ;
-
-  // Common parsing chores
-  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
-  bool        GridCmdOptionExists(char** begin, char** end, const std::string& option);
-  std::string GridCmdVectorIntToString(const std::vector<int> & vec);
-
-  void GridParseLayout(char **argv,int argc,
-		       std::vector<int> &latt,
-		       std::vector<int> &simd,
-		       std::vector<int> &mpi);
-
-
-};

 #endif
--- a/lib/GridConfig.h
+++ b/lib/GridConfig.h
@@ -1,169 +0,0 @@
-/* lib/GridConfig.h.  Generated from GridConfig.h.in by configure.  */
-/* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
-
-/* AVX Intrinsics */
-/* #undef AVX1 */
-
-/* AVX2 Intrinsics */
-/* #undef AVX2 */
-
-/* AVX512 Intrinsics for Knights Corner */
-/* #undef AVX512 */
-
-/* EMPTY_SIMD only for DEBUGGING */
-/* #undef EMPTY_SIMD */
-
-/* GRID_COMMS_MPI */
-/* #undef GRID_COMMS_MPI */
-
-/* GRID_COMMS_NONE */
-#define GRID_COMMS_NONE 1
-
-/* Support Altivec instructions */
-/* #undef HAVE_ALTIVEC */
-
-/* Support AVX (Advanced Vector Extensions) instructions */
-/* #undef HAVE_AVX */
-
-/* Support AVX2 (Advanced Vector Extensions 2) instructions */
-/* #undef HAVE_AVX2 */
-
-/* define if the compiler supports basic C++11 syntax */
-/* #undef HAVE_CXX11 */
-
-/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
-   don't. */
-#define HAVE_DECL_BE64TOH 1
-
-/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
-   */
-#define HAVE_DECL_NTOHLL 0
-
-/* Define to 1 if you have the <endian.h> header file. */
-#define HAVE_ENDIAN_H 1
-
-/* Support FMA3 (Fused Multiply-Add) instructions */
-/* #undef HAVE_FMA */
-
-/* Define to 1 if you have the `gettimeofday' function. */
-#define HAVE_GETTIMEOFDAY 1
-
-/* Define to 1 if you have the <gmp.h> header file. */
-#define HAVE_GMP_H 1
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#define HAVE_INTTYPES_H 1
-
-/* Define to 1 if you have the `gmp' library (-lgmp). */
-#define HAVE_LIBGMP 1
-
-/* Define to 1 if you have the `mpfr' library (-lmpfr). */
-#define HAVE_LIBMPFR 1
-
-/* Define to 1 if you have the <malloc.h> header file. */
-#define HAVE_MALLOC_H 1
-
-/* Define to 1 if you have the <malloc/malloc.h> header file. */
-/* #undef HAVE_MALLOC_MALLOC_H */
-
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
-/* Support mmx instructions */
-#define HAVE_MMX /**/
-
-/* Define to 1 if you have the <mm_malloc.h> header file. */
-#define HAVE_MM_MALLOC_H 1
-
-/* Support SSE (Streaming SIMD Extensions) instructions */
-#define HAVE_SSE /**/
-
-/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
-#define HAVE_SSE2 /**/
-
-/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
-#define HAVE_SSE3 /**/
-
-/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
-#define HAVE_SSE4_1 /**/
-
-/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
-#define HAVE_SSE4_2 /**/
-
-/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
-#define HAVE_SSSE3 /**/
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#define HAVE_STDINT_H 1
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#define HAVE_STDLIB_H 1
-
-/* Define to 1 if you have the <strings.h> header file. */
-#define HAVE_STRINGS_H 1
-
-/* Define to 1 if you have the <string.h> header file. */
-#define HAVE_STRING_H 1
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#define HAVE_SYS_STAT_H 1
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#define HAVE_SYS_TYPES_H 1
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#define HAVE_UNISTD_H 1
-
-/* NEON ARMv7 Experimental support */
-/* #undef NEONv7 */
-
-/* Name of package */
-#define PACKAGE "grid"
-
-/* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT "paboyle@ph.ed.ac.uk"
-
-/* Define to the full name of this package. */
-#define PACKAGE_NAME "Grid"
-
-/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "Grid 1.0"
-
-/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "grid"
-
-/* Define to the home page for this package. */
-#define PACKAGE_URL ""
-
-/* Define to the version of this package. */
-#define PACKAGE_VERSION "1.0"
-
-/* SSE4 Intrinsics */
-#define SSE4 1
-
-/* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
-
-/* Version number of package */
-#define VERSION "1.0"
-
-/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-/* #undef _UINT32_T */
-
-/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-/* #undef _UINT64_T */
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-/* #undef size_t */
-
-/* Define to the type of an unsigned integer type of width exactly 32 bits if
-   such a type exists and the standard includes do not define it. */
-/* #undef uint32_t */
-
-/* Define to the type of an unsigned integer type of width exactly 64 bits if
-   such a type exists and the standard includes do not define it. */
-/* #undef uint64_t */
--- a/lib/GridInit.cc
+++ b/lib/GridInit.cc
@@ -1,7 +1,6 @@
 /****************************************************************************/
 /* pab: Signal magic. Processor state dump is x86-64 specific               */
 /****************************************************************************/
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>
@@ -16,26 +15,29 @@
 #include <algorithm>
 #include <iterator>

-#undef __X86_64
-#define MAC
+#define __X86_64

-#ifdef MAC
+#ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
 #endif

 namespace Grid {

-  //////////////////////////////////////////////////////
-  // Convenience functions to access stadard command line arg
-  // driven parallelism controls
-  //////////////////////////////////////////////////////
-  static std::vector<int> Grid_default_latt;
-  static std::vector<int> Grid_default_mpi;
+//////////////////////////////////////////////////////
+// Convenience functions to access standard command line arg
+// driven parallelism controls
+//////////////////////////////////////////////////////
+static std::vector<int> Grid_default_latt;
+static std::vector<int> Grid_default_mpi;

-  int GridThread::_threads;
+int GridThread::_threads =1;
+int GridThread::_hyperthreads=1;
+int GridThread::_cores=1;

-  const std::vector<int> GridDefaultSimd(int dims,int nsimd)
-  {
+const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
+const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
+const std::vector<int> GridDefaultSimd(int dims,int nsimd)
+{
    std::vector<int> layout(dims);
    int nn=nsimd;
    for(int d=dims-1;d>=0;d--){
@@ -48,15 +50,11 @@ namespace Grid {
    }
    assert(nn==1);
    return layout;
-  }
+}
  
-  
-  const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
-  const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
-
-  ////////////////////////////////////////////////////////////
-  // Command line parsing assist for stock controls
-  ////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+// Command line parsing assist for stock controls
+////////////////////////////////////////////////////////////
 std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option)
 {
  char ** itr = std::find(begin, end, option);
@@ -70,6 +68,23 @@ bool GridCmdOptionExists(char** begin, char** end, const std::string& option)
 {
  return std::find(begin, end, option) != end;
 }
+  // Split the comma separated list in str into vec; vec is emptied first and always receives at least one entry
+void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec) // str is taken by value: it is consumed below
+{
+  size_t pos = 0;
+  std::string token;
+  std::string delimiter(",");
+
+  vec.resize(0); // discard whatever the caller passed in
+  while ((pos = str.find(delimiter)) != std::string::npos) { // one pass per comma still present in str
+    token = str.substr(0, pos); // text before the first remaining comma (empty for ",," or a leading comma)
+    vec.push_back(token);
+    str.erase(0, pos + delimiter.length()); // drop that token together with its comma
+  }
+  token = str; // remainder after the last comma: the whole input when it has no comma, possibly empty
+  vec.push_back(token);
+  return;
+}

 void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
 {
@@ -84,6 +99,7 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
  return;
 }

+
 void GridParseLayout(char **argv,int argc,
 		     std::vector<int> &latt,
 		     std::vector<int> &mpi)
@@ -102,13 +118,19 @@ void GridParseLayout(char **argv,int argc,
    arg= GridCmdOptionPayload(argv,argv+argc,"--grid");
    GridCmdOptionIntVector(arg,latt);
  }
-  if( GridCmdOptionExists(argv,argv+argc,"--omp") ){
+  if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
    std::vector<int> ompthreads(0);
-    arg= GridCmdOptionPayload(argv,argv+argc,"--omp");
+    arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
    GridCmdOptionIntVector(arg,ompthreads);
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }
+  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
+    std::vector<int> cores(0);
+    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
+    GridCmdOptionIntVector(arg,cores);
+    GridThread::SetCores(cores[0]);
+  }

 }

@@ -117,8 +139,9 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
  std::copy(vec.begin(), vec.end(),std::ostream_iterator<int>(oss, " "));
  return oss.str();
 }
-  /////////////////////////////////////////////////////////
-  /////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////
+//
+/////////////////////////////////////////////////////////
 void Grid_init(int *argc,char ***argv)
 {
 #ifdef GRID_COMMS_MPI
@@ -126,15 +149,33 @@ void Grid_init(int *argc,char ***argv)
 #endif
  // Parse command line args.

+  GridLogger::StopWatch.Start();
+
+  std::string arg;
+  std::vector<std::string> logstreams;
+  std::string defaultLog("Error,Warning,Message,Performance");
+
+  GridCmdOptionCSL(defaultLog,logstreams);
+  GridLogConfigure(logstreams);
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
-    std::cout<<"--help : this message"<<std::endl;
-    std::cout<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
-    std::cout<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
-    std::cout<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
-    std::cout<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
-    std::cout<<"--omp n         : default number of OMP threads"<<std::endl;    
-    std::cout<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
+    std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
+    std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
+    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;
+    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
+    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;
+    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl; // GridParseLayout reads --threads; --omp is no longer parsed
+    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;
+    std::cout<<GridLogMessage<<"--log list      : comma separated list of streams from Error,Warning,Message,Performance,Iterative,Debug"<<std::endl;
  }
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
+    arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log");
+    GridCmdOptionCSL(arg,logstreams);
+    GridLogConfigure(logstreams);
+  }
+
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -142,49 +183,34 @@ void Grid_init(int *argc,char ***argv)
    Grid_quiesce_nodes();
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-    WilsonFermion::HandOptDslash=1;
-    WilsonFermion5D::HandOptDslash=1;
+    QCD::WilsonFermionStatic::HandOptDslash=1;
+    QCD::WilsonFermion5DStatic::HandOptDslash=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
+    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
+  }
  GridParseLayout(*argv,*argc,
 		  Grid_default_latt,
 		  Grid_default_mpi);
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<"Grid Decomposition\n";
-    std::cout<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
-    std::cout<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
-    std::cout<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
-    std::cout<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
-    std::cout<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
-    std::cout<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
+    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }

+
 }

  
-  ////////////////////////////////////////////////////////////
-  // Verbose limiter on MPI tasks
-  ////////////////////////////////////////////////////////////
-  void Grid_quiesce_nodes(void)
-  {
-#ifdef GRID_COMMS_MPI
-    int me;
-    MPI_Comm_rank(MPI_COMM_WORLD,&me);
-    if ( me ) { 
-      std::cout.setstate(std::ios::badbit);
-    }
-#endif
-  }
-  void Grid_unquiesce_nodes(void)
-  {
-#ifdef GRID_COMMS_MPI
-    std::cout.clear();
-#endif
-  }
-
-  
 void Grid_finalize(void)
 {
 #ifdef GRID_COMMS_MPI
@@ -207,11 +233,14 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  printf("  mem address %llx\n",(unsigned long long)si->si_addr);
  printf("         code %d\n",si->si_code);

-#ifdef __X86_64
+  // The register dump below reads x86-64 Linux sigcontext fields (rip, rdi, ...),
+  // so compile it only for that OS/architecture pair.
+#if defined(__linux__) && defined(__x86_64__)
    ucontext_t * uc= (ucontext_t *)ptr;
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 #define REG(A)  printf("  %s %lx\n",#A,sc-> A);
+
  REG(rdi);
  REG(rsi);
  REG(rbp);
@@ -232,7 +261,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  REG(r14);
  REG(r15);
 #endif
-#ifdef MAC
+#ifdef HAVE_EXECINFO_H
  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);
  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
  for (int i = 0; i < symbols; i++){
--- a/lib/Init.h
+++ b/lib/Init.h
@@ -0,0 +1,32 @@
+#ifndef GRID_INIT_H
+#define GRID_INIT_H
+
+namespace Grid {
+
+  void Grid_init(int *argc,char ***argv);
+  void Grid_finalize(void);
+  // internal; Grid_init calls Grid_debug_handler_init when --debug-signals is given
+  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
+  void Grid_debug_handler_init(void);
+  void Grid_quiesce_nodes(void);
+  void Grid_unquiesce_nodes(void);
+
+  const std::vector<int> GridDefaultSimd(int dims,int nsimd);
+  const std::vector<int> &GridDefaultLatt(void);
+  const std::vector<int> &GridDefaultMpi(void);
+  const int              &GridThreads(void)  ;
+  void                    GridSetThreads(int t) ;
+
+  // Common parsing chores
+  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
+  bool        GridCmdOptionExists(char** begin, char** end, const std::string& option);
+  std::string GridCmdVectorIntToString(const std::vector<int> & vec);
+
+  // Form defined in Init.cc (no simd argument).
+  void GridParseLayout(char **argv,int argc,
+		       std::vector<int> &latt,
+		       std::vector<int> &mpi);
+  // NOTE(review): legacy overload taking simd; no definition is visible in Init.cc -- confirm and remove.
+  void GridParseLayout(char **argv,int argc,std::vector<int> &latt,std::vector<int> &simd,std::vector<int> &mpi);
+};
+#endif
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -0,0 +1,62 @@
+#include <Grid.h>
+
+namespace Grid {
+// Shared by every Logger: the stopwatch read for timestamps, and the stream returned by inactive loggers.
+GridStopWatch Logger::StopWatch;
+std::ostream  Logger::devnull(0);
+// Each stream is constructed active (first argument 1); GridLogConfigure later selects the enabled set.
+GridLogger GridLogError      (1,"Error");
+GridLogger GridLogWarning    (1,"Warning");
+GridLogger GridLogMessage    (1,"Message");
+GridLogger GridLogDebug      (1,"Debug");
+GridLogger GridLogPerformance(1,"Performance");
+GridLogger GridLogIterative  (1,"Iterative");
+
+// Switch every Grid log stream off, then enable those named in logstreams (unknown names are ignored).
+void GridLogConfigure(std::vector<std::string> &logstreams)
+{
+  GridLogError.Active(0);
+  GridLogWarning.Active(0);
+  GridLogMessage.Active(0);
+  GridLogIterative.Active(0);
+  GridLogDebug.Active(0);
+  GridLogPerformance.Active(0);
+  for(const std::string &stream : logstreams){ // range-for avoids the int vs size_t index comparison
+    if ( stream == "Error"       ) GridLogError.Active(1);
+    if ( stream == "Warning"     ) GridLogWarning.Active(1);
+    if ( stream == "Message"     ) GridLogMessage.Active(1);
+    if ( stream == "Iterative"   ) GridLogIterative.Active(1);
+    if ( stream == "Debug"       ) GridLogDebug.Active(1);
+    if ( stream == "Performance" ) GridLogPerformance.Active(1);
+  }
+}
+
+////////////////////////////////////////////////////////////
+// Verbose limiter on MPI tasks
+////////////////////////////////////////////////////////////
+// Put std::cout into the bad state on every MPI rank other than 0, which
+// suppresses its output there; does nothing unless GRID_COMMS_MPI is defined.
+void Grid_quiesce_nodes(void)
+{
+#ifdef GRID_COMMS_MPI
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+  if ( rank != 0 ) std::cout.setstate(std::ios::badbit);
+#endif
+}
+// Clear std::cout's state flags again; compiled in only when GRID_COMMS_MPI is defined.
+void Grid_unquiesce_nodes(void)
+{
+#ifdef GRID_COMMS_MPI
+    std::cout.clear();
+#endif
+}
+
+// Write a GridTime as the value of time.count() followed by the suffix " ms".
+std::ostream& operator<< (std::ostream& stream, const GridTime& time)
+{
+  return stream << time.count() << " ms";
+}
+
+}
+
--- a/lib/Log.h
+++ b/lib/Log.h
@@ -0,0 +1,54 @@
+#ifndef GRID_LOG_H
+#define GRID_LOG_H
+namespace Grid {
+
+// Dress the output; use std::chrono for time stamping via the StopWatch class
+
+std::ostream& operator<< (std::ostream& stream, const GridTime& time);
+
+// Named, switchable prefix for log lines: streaming a Logger into an ostream writes
+// "<topName> : <name> : <elapsed> : " when active, and returns devnull when inactive.
+class Logger {
+protected:
+    int active;
+    std::string name, topName;
+public:
+    static GridStopWatch StopWatch;
+    static std::ostream devnull;
+
+    Logger(std::string topNm, int on, std::string nm)
+    : active(on), name(nm), topName(topNm) {}
+
+    void Active(int on) { active = on; }
+    int  isActive(void) { return active; }
+
+    // The returned stream receives the rest of the caller's << chain.
+    friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
+        if ( !log.active ) return devnull;
+        // The shared watch is stopped to read Elapsed() and restarted straight away.
+        StopWatch.Stop();
+        GridTime now = StopWatch.Elapsed();
+        StopWatch.Start();
+        stream << std::setw(8)  << std::left << log.topName << " : "
+               << std::setw(12) << std::left << log.name    << " : "
+               << now << " : ";
+        return stream;
+    }
+};
+// Logger whose top-level tag is fixed to "Grid"; only the stream name and initial on/off state vary.
+class GridLogger: public Logger {
+public:
+  GridLogger(int on, std::string nm): Logger("Grid", on, nm){};
+};
+
+void GridLogConfigure(std::vector<std::string> &logstreams);
+
+extern GridLogger GridLogError;
+extern GridLogger GridLogWarning;
+extern GridLogger GridLogMessage;
+extern GridLogger GridLogDebug  ;
+extern GridLogger GridLogPerformance;
+extern GridLogger GridLogIterative  ;
+
+}
+#endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@

-HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_poke.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_logical.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_determinant.h ./tensors/Tensor_peek.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h 
./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Threads.h ./Grid.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h 
./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h 
./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h

-CCFILES=./qcd/utils/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/spin/Dirac.cc ./GridInit.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
--- a/lib/tensors/Tensor_peek.h
+++ b/lib/tensors/Tensor_peek.h
@@ -11,7 +11,7 @@ namespace Grid {
 //template<int Level> inline ComplexD peekIndex(const ComplexD arg) { return arg;}
 //template<int Level> inline RealF peekIndex(const RealF arg) { return arg;}
 //template<int Level> inline RealD peekIndex(const RealD arg) { return arg;}
-
+#if 0
 // Scalar peek, no indices
 template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  auto peekIndex(const iScalar<vtype> &arg) ->  iScalar<vtype> 
@@ -88,6 +88,7 @@ template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::T
  }
  return ret;
 }
+
 // matrix
 template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
 auto peekIndex(const iMatrix<vtype,N> &arg) ->   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> 
@@ -119,6 +120,7 @@ template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::T
  }}
  return ret;
 }
+#endif


 }
--- a/lib/tensors/Tensor_poke.h
+++ b/lib/tensors/Tensor_poke.h
@@ -5,7 +5,7 @@ namespace Grid {
 //////////////////////////////////////////////////////////////////////////////
 // Poke a specific index; 
 //////////////////////////////////////////////////////////////////////////////
-
+#if 0
 // Scalar poke
 template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  void pokeIndex(iScalar<vtype> &ret, const iScalar<vtype> &arg)
@@ -18,7 +18,7 @@ template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::Te
 {
  ret._internal[i] = arg._internal;
 }
-// Vector poke, two indices
+//Matrix poke, two indices
 template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  void pokeIndex(iMatrix<vtype,N> &ret, const iScalar<vtype> &arg,int i,int j)
 {
@@ -31,7 +31,6 @@ template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::Te
 // scalar
 template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
 void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal))>  &arg)
-		 
 {
  pokeIndex<Level>(ret._internal,arg._internal);
 }
@@ -95,7 +94,7 @@ template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::Te
    pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i,j);
  }}
 }
-
+#endif

 }
 #endif
--- a/lib/PerfCount.cc
+++ b/lib/PerfCount.cc
@@ -0,0 +1,28 @@
+
+#include <Grid.h>
+#include <PerfCount.h>
+
+namespace Grid {
+#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) // config word: cache id | op<<8 | result<<16
+const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = { // indexed by PerformanceCounterType, so row order must follow that enum
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." },
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." },
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." },
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." },
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......"},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS...."},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS....."},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS..."},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS.."},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS"},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS......."},
+  //  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS....."},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......"},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS...."},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS..."},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS."},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......"},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS...."}
+  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
+};
+}
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -0,0 +1,157 @@
+#ifndef GRID_PERFCOUNT_H
+#define GRID_PERFCOUNT_H
+
+#include <sys/time.h>
+#include <ctime>
+#include <chrono>
+#include <string.h>
+
+#include <sys/ioctl.h>
+
+#ifdef __linux__
+#include <syscall.h>
+#include <linux/perf_event.h>
+#endif
+namespace Grid {
+
+
+#ifdef __linux__
+// Wrapper for the perf_event_open(2) system call, invoked through syscall().
+// Returns the new event file descriptor, or -1 with errno set on failure.
+static inline long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+				   int cpu, int group_fd, unsigned long flags)
+{
+  // Return syscall()'s long directly, with no intermediate int.
+  return syscall(__NR_perf_event_open, hw_event, pid, cpu,
+		 group_fd, flags);
+}
+#endif
+
+
+// One Linux perf_event counter plus an __rdtsc() tick delta; off Linux the methods are stubs.
+// NOTE(review): owns fd yet remains copyable (a copy would close the same fd twice) -- confirm no caller copies.
+class PerformanceCounter {
+private:
+  typedef struct {
+  public:
+    uint32_t type;
+    uint64_t config;
+    const char *name;
+  } PerformanceCounterConfig;
+  static const PerformanceCounterConfig PerformanceCounterConfigs [];
+
+public:
+  // Values index PerformanceCounterConfigs (see Open), so the order must match that table.
+  enum PerformanceCounterType {
+    CPUCYCLES=0,
+    INSTRUCTIONS,
+    //    STALL_CYCLES,
+    CACHE_REFERENCES,
+    CACHE_MISSES,
+    L1D_READ_MISS,
+    L1D_READ_ACCESS,
+    L1D_WRITE_MISS,
+    L1D_WRITE_ACCESS,
+    L1D_PREFETCH_MISS,
+    L1D_PREFETCH_ACCESS,
+    LL_READ_MISS,
+    //    LL_READ_ACCESS,
+    LL_WRITE_MISS,
+    LL_WRITE_ACCESS,
+    LL_PREFETCH_MISS,
+    LL_PREFETCH_ACCESS,
+    L1I_READ_MISS,
+    L1I_READ_ACCESS,
+    PERFORMANCE_COUNTER_NUM_TYPES
+  };
+
+  int PCT;          // index into PerformanceCounterConfigs
+  long long count;  // counter value read back by Stop()
+  int fd;           // perf_event descriptor, -1 when none is open
+  uint64_t elapsed; // __rdtsc() ticks between Start() and Stop()
+  uint64_t begin;   // __rdtsc() value captured by Start()
+
+  static int NumTypes(void){
+    return PERFORMANCE_COUNTER_NUM_TYPES;
+  }
+
+  // Every member is initialised on all platforms so Report() never prints indeterminate values.
+  PerformanceCounter(int _pct) : PCT(_pct), count(0), fd(-1), elapsed(0), begin(0) {
+#ifdef __linux__
+    assert(_pct>=0);
+    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
+    Open();
+#endif
+  }
+
+  // Open the descriptor for counter PCT; on failure fd stays -1 and the error is printed.
+  void Open(void)
+  {
+#ifdef __linux__
+    if ( fd != -1 ) { close(fd); fd = -1; } // a repeated Open() must not leak the old descriptor
+    struct perf_event_attr pe;
+    memset(&pe, 0, sizeof(struct perf_event_attr));
+    pe.size = sizeof(struct perf_event_attr);
+
+    pe.disabled = 1;
+    pe.exclude_kernel = 1;
+    pe.exclude_hv = 1;
+    pe.inherit    = 1;
+    pe.type  = PerformanceCounterConfigs[PCT].type;
+    pe.config= PerformanceCounterConfigs[PCT].config;
+    const char * name = PerformanceCounterConfigs[PCT].name;
+    fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
+    if (fd == -1) {
+      fprintf(stderr, "Error opening leader %llx for event %s\n", (unsigned long long)pe.config,name);
+      perror("Error is");
+    }
+#endif
+  }
+
+  // Reset and enable the counter (when open) and record the starting tick.
+  void Start(void)
+  {
+#ifdef __linux__
+    if ( fd!= -1) {
+      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+    }
+    begin  =__rdtsc();
+#else
+    begin = 0;
+#endif
+  }
+
+  // Disable the counter and read it into count; a failed or short read leaves count at 0.
+  void Stop(void) {
+    count=0;
+#ifdef __linux__
+    if ( fd!= -1) {
+      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      if ( ::read(fd, &count, sizeof(count)) != (long)sizeof(count) ) count=0;
+    }
+    elapsed = __rdtsc() - begin;
+#else
+    elapsed = 0;
+#endif
+  }
+
+  // Casts keep the arguments in step with %llu on targets where uint64_t is unsigned long.
+  void Report(void) {
+#ifdef __linux__
+    printf("%llu cycles %s = %20llu\n", (unsigned long long)elapsed , PerformanceCounterConfigs[PCT].name, (unsigned long long)count);
+#else
+    printf("%llu cycles \n", (unsigned long long)elapsed );
+#endif
+  }
+
+  ~PerformanceCounter()
+  {
+#ifdef __linux__
+    if ( fd != -1 ) close(fd); // nothing to close when Open() failed
+#endif
+  }
+};
+
+}
+#endif
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -13,6 +13,11 @@

 typedef uint32_t Integer;

+#define _MM_SELECT_FOUR_FOUR(A,B,C,D) (((A)<<6)|((B)<<4)|((C)<<2)|(D)) // four 2-bit fields, A in bits 7:6 ... D in bits 1:0
+#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) (((A)<<7)|((B)<<6)|((C)<<5)|((D)<<4)|((E)<<3)|((F)<<2)|((G)<<1)|(H)) // eight 1-bit fields, A is bit 7, H is bit 0
+#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D) // no space before '(' so this is a function-like macro
+#define _MM_SELECT_TWO_TWO(A,B)      _MM_SELECT_FOUR_TWO(0,0,A,B)          // two 1-bit fields in bits 1:0
+
 namespace Grid {

  typedef  float  RealF;
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -41,11 +41,21 @@

 namespace Grid {
  
+  struct StencilEntry { // element of CartesianStencil::_entries[point][osite]; groups what were the separate _offsets/_is_local/_permute arrays
+    int _offset;
+    int _is_local;
+    int _permute;
+    int _around_the_world;
+  };

+  template<class vobj,class cobj, class compressor>
  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
  public:

      typedef uint32_t StencilInteger;
+      typedef typename cobj::vector_type vector_type;
+      typedef typename cobj::scalar_type scalar_type;
+      typedef typename cobj::scalar_object scalar_object;

      int                               _checkerboard;
      int                               _npoints; // Move to template param?
@@ -58,35 +68,336 @@ namespace Grid {
      std::vector<int>                  _permute_type;

      // npoints x Osites() of these
-      std::vector<std::vector<int>    > _offsets;
-      std::vector<std::vector<int>    > _is_local;
-      std::vector<std::vector<int> >    _permute;
+      std::vector<std::vector<StencilEntry> > _entries;
+
+      // Comms buffers
+      std::vector<std::vector<scalar_object> > send_buf_extract;
+      std::vector<std::vector<scalar_object> > recv_buf_extract;
+      std::vector<scalar_object *> pointers;
+      std::vector<scalar_object *> rpointers;
+      Vector<cobj> send_buf;
+
+      inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; } // writes the point's permute type to ptype and returns its entry for osite; no bounds checks

      int _unified_buffer_size;
      int _request_count;

+      double buftime;
+      double gathertime;
+      double commtime;
+      double commstime;
+      double halotime;
+      double scattertime;
+      double mergetime;
+      double gathermtime;
+      double splicetime;
+      double nosplicetime;

-      CartesianStencil(GridBase *grid,
-		       int npoints,
-		       int checkerboard,
-		       const std::vector<int> &directions,
-		       const std::vector<int> &distances);
+
+
+
+  // Build the neighbour tables for an npoints-leg stencil on grid.
+  //   directions[i] / distances[i] : dimension and signed displacement of leg i.
+  //   checkerboard                 : parity stored in _checkerboard and handed to
+  //                                  CheckerBoardShiftForCB when each shift is sized.
+  // Legs along a dimension with one processor are tabulated by Local(), the
+  // others by Comms() (which also grows _unified_buffer_size).
+  CartesianStencil(GridBase *grid,
+				     int npoints,
+				     int checkerboard,
+				     const std::vector<int> &directions,
+				     const std::vector<int> &distances) 
+    :   _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
+    {
+      // Every leg indexes directions[] and distances[]; catch short vectors early.
+      assert((int)directions.size() >= npoints);
+      assert((int)distances.size()  >= npoints);
+
+      // Timing accumulators all start from zero.
+      gathertime=0;
+      commtime=0;
+      commstime=0;
+      halotime=0;
+      scattertime=0;
+      mergetime=0;
+      gathermtime=0;
+      buftime=0;
+      splicetime=0;
+      nosplicetime=0;
+
+      _npoints = npoints;
+      _grid    = grid;
+      _directions = directions;
+      _distances  = distances;
+      _unified_buffer_size=0;
+      _request_count =0;
+      // Set once, ahead of the loop: Local()/Comms() read it, and it must also be
+      // defined when npoints==0 (the loop body then never runs).
+      _checkerboard = checkerboard;
+
+      int osites  = _grid->oSites();
+
+      for(int i=0;i<npoints;i++){
+
+	int point = i;
+
+	_entries[i].resize( osites);
+
+	int dimension    = directions[i];
+	int displacement = distances[i];
+	int shift = displacement;
+
+	_permute_type[point]=_grid->PermuteType(dimension);
+
+	int comm_dim        = _grid->_processors[dimension] >1 ;
+
+	// Underlying approach. For each local site build
+	// up a table containing the npoint "neighbours" and whether they 
+	// live in lattice or a comms buffer.
+	// The shift is worked out for both parities: when they agree a single
+	// pass (cbmask 0x3) covers the plane, otherwise each parity gets its own pass.
+	int sshift[2];
+	sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
+	sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
+
+	if ( !comm_dim ) {
+	  if ( sshift[0] == sshift[1] ) {
+	    Local(point,dimension,shift,0x3);
+	  } else {
+	    Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
+	    Local(point,dimension,shift,0x2);// both with block stride loop iteration
+	  }
+	} else { // All permute extract done in comms phase prior to Stencil application
+	  //        So tables are the same whether comm_dim or splice_dim
+	  if ( sshift[0] == sshift[1] ) {
+	    Comms(point,dimension,shift,0x3);
+	  } else {
+	    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
+	    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
+	  }
+	}
+      }
+    }
+
+
+    // Fill the stencil table for leg `point` entirely from the local lattice
+    // (the constructor calls this when `dimension` has a single processor).
+    //   shiftpm : signed displacement of the leg
+    //   cbmask  : 0x1 / 0x2 selects one parity, 0x3 both
+    void Local     (int point, int dimension,int shiftpm,int cbmask)
+    {
+      int fd = _grid->_fdimensions[dimension];
+      int rd = _grid->_rdimensions[dimension];
+      
+      // Map to always positive shift modulo global full dimension.
+      int shift = (shiftpm+fd)%fd;
+      
+      // the permute type
+      int permute_dim =_grid->PermuteDim(dimension);
+
+      // Neither the parity nor the checkerboarded shift depends on the plane
+      // index, so both are computed once here, as Comms() already does.
+      int cb= (cbmask==0x2)? Odd : Even;
+      int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
+
+      for(int x=0;x<rd;x++){       
+	int sx     = (x+sshift)%rd;
+
+	// Flag unit hops (shiftpm == +/-1) whose shifted plane index wrapped modulo rd.
+	int wraparound=0;
+	if ( (shiftpm==-1) && (sx>x)  ) {
+	  wraparound = 1;
+	}
+	if ( (shiftpm== 1) && (sx<x)  ) {
+	  wraparound = 1;
+	}
+
+	// Permute flag: `wrap` for planes with x < rd-num, 1-wrap for the others.
+	int permute_slice=0;
+	if(permute_dim){
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+	}
+
+  	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
+      }
+    }
+
+    // Fill the stencil table for leg `point` when `dimension` is split across
+    // processors: planes whose neighbour stays on this node are copied with
+    // CopyPlane(), the rest are pointed at a slot of the unified comms buffer.
+    //   shiftpm : signed displacement of the leg
+    //   cbmask  : 0x1 / 0x2 selects one parity, 0x3 both
+    void Comms     (int point,int dimension,int shiftpm,int cbmask)
+    {
+      GridBase *grid=_grid;
+      
+      int fd              = _grid->_fdimensions[dimension];
+      int rd              = _grid->_rdimensions[dimension];
+      int pd              = _grid->_processors[dimension];
+      int comm_dim        = _grid->_processors[dimension] >1 ;
+      
+      //      assert(simd_layout==1); // Why?
+      assert(comm_dim==1);
+      int shift = (shiftpm + fd) %fd;
+      assert(shift>=0);
+      assert(shift<fd);
+      
+      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
+      _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
+                                           // send to one or more remote nodes.
+
+      int cb= (cbmask==0x2)? Odd : Even;
+      int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
+      
+
+      for(int x=0;x<rd;x++){       
+
+	int sx        =  (x+sshift)%rd;
+	int comm_proc = ((x+sshift)/rd)%pd;
+    	int offnode = (comm_proc!= 0);
+
+	// A unit hop is flagged as wrapping only on the first / last processor along `dimension`.
+	int wraparound=0;
+	if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
+	  wraparound = 1;
+	}
+	if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
+	  wraparound = 1;
+	}
+
+	if (!offnode) {
+	  
+	  int permute_slice=0;
+	  CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); 
+	  
+	} else {
+	  
+	  // Half as many words are reserved for a single-parity pass.
+	  int words = buffer_size;
+	  if (cbmask != 0x3) words=words>>1;
+	  
+	  // Reserve this plane's slice of the unified buffer and record where it starts.
+	  int unified_buffer_offset = _unified_buffer_size;
+	  _unified_buffer_size    += words;
+	  ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
+	  
+	}
+      }
+    }
+  // For every site of plane lplane, record in _entries[point] that the neighbour is the matching site of plane rplane (local data).
+  void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
+    {
+      int rd = _grid->_rdimensions[dimension]; // NOTE(review): rd is not used below
+      
+      if ( !_grid->CheckerBoarded(dimension) ) {
+	
+	int o   = 0;                                     // relative offset to base within plane
+	int ro  = rplane*_grid->_ostride[dimension]; // base offset of the plane being read (rplane)
+	int lo  = lplane*_grid->_ostride[dimension]; // base offset of the plane whose entries are written (lplane)
+	
+	// Block-strided walk over the plane: every site gets an entry
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+	    _entries[point][lo+o+b]._offset  =ro+o+b;
+	    _entries[point][lo+o+b]._is_local=1;
+	    _entries[point][lo+o+b]._permute=permute;
+	    _entries[point][lo+o+b]._around_the_world=wrap;
+	  }
+	  o +=_grid->_slice_stride[dimension];
+	}
+	
+      } else {
+	
+	int ro  = rplane*_grid->_ostride[dimension]; // base offset of the plane being read (rplane)
+	int lo  = lplane*_grid->_ostride[dimension]; // base offset of the plane whose entries are written (lplane)
+	int o   = 0;                                     // relative offset to base within plane
+	
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+	    
+	    int ocb=1<<_grid->CheckerBoardFromOindex(o+b); // parity bit of the in-plane offset; presumably 0x1 or 0x2 -- confirm
+	    
+	    if ( ocb&cbmask ) { // only sites whose parity bit is selected by cbmask are written
+	      _entries[point][lo+o+b]._offset =ro+o+b;
+	      _entries[point][lo+o+b]._is_local=1;
+	      _entries[point][lo+o+b]._permute=permute;
+	      _entries[point][lo+o+b]._around_the_world=wrap;
+	    }
+	    
+	    }
+	  o +=_grid->_slice_stride[dimension];
+	}
+	
+      }
+    }
+  // For every site of plane `plane`, record in _entries[point] that the neighbour sits in the comms buffer, at consecutive slots starting from `offset`.
+   void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
+    {
+      int rd = _grid->_rdimensions[dimension]; // NOTE(review): rd is not used below
+      
+      if ( !_grid->CheckerBoarded(dimension) ) {
+	
+	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
+	int o   = 0;                                    // relative offset to base within plane
+	int bo  = 0;                                    // offset in buffer
+
+	// Block-strided walk over the plane: every site takes the next buffer slot
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+	    _entries[point][so+o+b]._offset  =offset+(bo++);
+	    _entries[point][so+o+b]._is_local=0;
+	    _entries[point][so+o+b]._permute=0;
+	    _entries[point][so+o+b]._around_the_world=wrap;
+	  }
+	  o +=_grid->_slice_stride[dimension];
+	}
+
+      } else { 
+	
+	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
+	int o   = 0;                                      // relative offset to base within plane
+	int bo  = 0;                                      // offset in buffer
+
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+
+	    int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	    if ( ocb & cbmask ) { // buffer slots advance only for sites whose parity bit is selected by cbmask
+	      _entries[point][so+o+b]._offset  =offset+(bo++);
+	      _entries[point][so+o+b]._is_local=0;
+	      _entries[point][so+o+b]._permute =0;
+	      _entries[point][so+o+b]._around_the_world=wrap;
+	    }
+	  }
+	  o +=_grid->_slice_stride[dimension];
+	}
+      }
+    }
+
+//      CartesianStencil(GridBase *grid,
+//		       int npoints,
+//		       int checkerboard,
+//		       const std::vector<int> &directions,
+//		       const std::vector<int> &distances);


      // Add to tables for various cases;  is this mistaken. only local if 1 proc in dim
      // Can this be avoided with simpler coding of comms?
-      void Local     (int point, int dimension,int shift,int cbmask);
-      void Comms     (int point, int dimension,int shift,int cbmask);
-      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
-      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
+   //      void Local     (int point, int dimension,int shift,int cbmask);
+   //      void Comms     (int point, int dimension,int shift,int cbmask);
+   //      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
+   //      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);

      // Could allow a functional munging of the halo to another type during the comms.
      // this could implement the 16bit/32bit/64bit compression.
-      template<class vobj,class cobj, class compressor> void 
-	HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
+      void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
      {
 	// conformable(source._grid,_grid);
 	assert(source._grid==_grid);
+	halotime-=usecond();
 	if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
 	int u_comm_offset=0;

@@ -120,24 +431,33 @@ namespace Grid {
 	    sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 	    if ( sshift[0] == sshift[1] ) {
 	      if (splice_dim) {
+		splicetime-=usecond();
 		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
+		splicetime+=usecond();
 	      } else { 
+		nosplicetime-=usecond();
 		GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
+		nosplicetime+=usecond();
 	      }
 	    } else {
+	      std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 	      if(splice_dim){
+		splicetime-=usecond();
 		GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
 		GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
+		splicetime+=usecond();
 	      } else {
+		nosplicetime-=usecond();
 		GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
 		GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
+		nosplicetime+=usecond();
 	      }
 	    }
 	  }
 	}
+	halotime+=usecond();
      }

-      template<class vobj,class cobj, class compressor> 
        void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 			      std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 			      int &u_comm_offset,compressor & compress)
@@ -161,8 +481,7 @@ namespace Grid {

 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];

-	  std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
-	  std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
+	  if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);

 	  int cb= (cbmask==0x2)? Odd : Even;
 	  int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@@ -174,12 +493,14 @@ namespace Grid {

 	    if (comm_proc) {
 	      
-	      int words = send_buf.size();
+	      int words = buffer_size;
 	      if (cbmask != 0x3) words=words>>1;
 	    
 	      int bytes = words * sizeof(cobj);

+	      gathertime-=usecond();
 	      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
+	      gathertime+=usecond();

 	      int rank           = _grid->_processor;
 	      int recv_from_rank;
@@ -189,31 +510,27 @@ namespace Grid {
 	      assert (recv_from_rank != _grid->ThisRank());

 	      //      FIXME Implement asynchronous send & also avoid buffer copy
+	      commtime-=usecond();
 	      _grid->SendToRecvFrom((void *)&send_buf[0],
 				   xmit_to_rank,
-				   (void *)&recv_buf[0],
+				    (void *)&u_comm_buf[u_comm_offset],
 				   recv_from_rank,
 				   bytes);
+	      commtime+=usecond();

-	      for(int i=0;i<buffer_size;i++){
-		u_comm_buf[u_comm_offset+i]=recv_buf[i];
-	      }
-	      u_comm_offset+=buffer_size;
+	      u_comm_offset+=words;
 	    }
 	  }
 	}


-      template<class vobj,class cobj, class compressor> 
 	void  GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 				   std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 				   int &u_comm_offset,compressor &compress)
 	{
+	  buftime-=usecond();
 	  const int Nsimd = _grid->Nsimd();

-	  typedef typename cobj::vector_type vector_type;
-	  typedef typename cobj::scalar_type scalar_type;
-	  typedef typename cobj::scalar_object scalar_object;
 	  
 	  int fd = _grid->_fdimensions[dimension];
 	  int rd = _grid->_rdimensions[dimension];
@@ -235,17 +552,23 @@ namespace Grid {
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 	  int words = sizeof(cobj)/sizeof(vector_type);

-	  /*
-	   * possibly slow to allocate
-	   * Doesn't matter in this test, but may want to preallocate in the 
-	   * dirac operators
-	   */
-	  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) ); 
-	  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+	  assert(cbmask==0x3); // Fixme think there is a latent bug if not true
+
+	  //	Should grow to max size and then cost very little thereafter
+	  send_buf_extract.resize(Nsimd);
+	  recv_buf_extract.resize(Nsimd);
+	  for(int l=0;l<Nsimd;l++){
+	    if( send_buf_extract[l].size() < buffer_size) {
+	      send_buf_extract[l].resize(buffer_size);
+	      recv_buf_extract[l].resize(buffer_size);
+	    }
+	  }
+	  pointers.resize(Nsimd);
+	  rpointers.resize(Nsimd);
+
 	  int bytes = buffer_size*sizeof(scalar_object);
 	  
-	  std::vector<scalar_object *> pointers(Nsimd);  //
-	  std::vector<scalar_object *> rpointers(Nsimd); // received pointers
+	  buftime+=usecond();
 	  
 	  ///////////////////////////////////////////
 	  // Work out what to send where
@@ -266,7 +589,9 @@ namespace Grid {
 	      }
 	      int sx   = (x+sshift)%rd;
 	      
+	      gathermtime-=usecond();
 	      Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
+	      gathermtime+=usecond();

 	      for(int i=0;i<Nsimd;i++){
 		
@@ -293,11 +618,13 @@ namespace Grid {
 		  
 		  _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 		  
+		  commstime-=usecond();
 		  _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
 					xmit_to_rank,
 					(void *)&recv_buf_extract[i][0],
 					recv_from_rank,
 					bytes);
+		  commstime+=usecond();
 		  
 		  rpointers[i] = &recv_buf_extract[i][0];

@@ -307,11 +634,13 @@ namespace Grid {
 	      }

 	      // Here we don't want to scatter, just place into a buffer.
+	      mergetime-=usecond();
+PARALLEL_FOR_LOOP
 	      for(int i=0;i<buffer_size;i++){
-		assert(u_comm_offset+i<_unified_buffer_size);
+		//		assert(u_comm_offset+i<_unified_buffer_size);
 		merge(u_comm_buf[u_comm_offset+i],rpointers,i);
 	      }
-
+	      mergetime+=usecond();
 	      u_comm_offset+=buffer_size;
 	    }
 	  }
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@@ -8,11 +8,12 @@
 #include <tensors/Tensor_outer.h>
 #include <tensors/Tensor_transpose.h>
 #include <tensors/Tensor_trace.h>
+#include <tensors/Tensor_index.h>
 #include <tensors/Tensor_Ta.h>
 #include <tensors/Tensor_determinant.h>
 #include <tensors/Tensor_exp.h>
-#include <tensors/Tensor_peek.h>
-#include <tensors/Tensor_poke.h>
+//#include <tensors/Tensor_peek.h>
+//#include <tensors/Tensor_poke.h>
 #include <tensors/Tensor_reality.h>
 #include <tensors/Tensor_unary.h>
 #include <tensors/Tensor_extract_merge.h>
--- a/lib/Threads.h
+++ b/lib/Threads.h
@@ -24,7 +24,16 @@ namespace Grid {
 class GridThread {
 public:
  static int _threads;
+  static int _hyperthreads;
+  static int _cores;

+  static void SetCores(int cr) { 
+#ifndef GRID_OMP
+    cr = 1; // without GRID_OMP the stored core count is always one
+#endif
+    _cores = cr;
+    // with GRID_OMP defined, cr is stored exactly as passed
+  }
  static void SetThreads(int thr) { 
 #ifdef GRID_OMP
    _threads = MIN(thr,omp_get_max_threads()) ;
@@ -35,22 +44,28 @@ class GridThread {
  };
  static void SetMaxThreads(void) { 
 #ifdef GRID_OMP
+    //    setenv("KMP_AFFINITY","balanced",1);
    _threads = omp_get_max_threads();
    omp_set_num_threads(_threads);
 #else 
    _threads = 1;
 #endif
  };
+  static int GetHyperThreads(void) { assert(_cores>0 && _threads%_cores ==0); return _threads/_cores; }; // _threads per core; the assert now rejects a zero _cores before taking the remainder
+  static int GetCores(void)   { return _cores; }; // value last stored by SetCores()
  static int GetThreads(void) { return _threads; };
  static int SumArraySize(void) {return _threads;};

  static void GetWork(int nwork, int me, int & mywork, int & myoff){
-    int basework = nwork/_threads;
-    int backfill = _threads-(nwork%_threads);
-    if ( me >= _threads ) { 
+    GetWork(nwork,me,mywork,myoff,_threads);
+  }
+  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
+    int basework = nwork/units;
+    int backfill = units-(nwork%units);
+    if ( me >= units ) { 
      mywork = myoff = 0;
    } else { 
-      mywork = (nwork+me)/_threads;
+      mywork = (nwork+me)/units;
      myoff  = basework * me;
      if ( me > backfill ) 
 	myoff+= (me-backfill);
--- a/lib/Timer.h
+++ b/lib/Timer.h
@@ -0,0 +1,52 @@
+#ifndef GRID_TIME_H
+#define GRID_TIME_H
+
+#include <sys/time.h>
+#include <cassert>
+#include <chrono>
+#include <ctime>
+namespace Grid {
+
+
+  // Dress the output; use std::chrono
+
+// C++11 time facilities better?
+double usecond(void);
+
+typedef  std::chrono::system_clock          GridClock;
+typedef  std::chrono::time_point<GridClock> GridTimePoint;
+typedef  std::chrono::milliseconds          GridTime;
+
+ 
+// Accumulating stopwatch on GridClock. Start()/Stop() may be called repeatedly;
+// Elapsed() reports the running total, truncated once to GridTime.
+class GridStopWatch {
+private:
+  bool running;
+  GridTimePoint start;
+  GridClock::duration accumulator; // kept in native clock ticks so short intervals are not rounded away one by one
+public:
+  GridStopWatch () { Reset(); }
+  void     Start(void) { // open an interval; the watch must be stopped
+    assert(running == false);
+    start = GridClock::now(); 
+    running = true;
+  }
+  void     Stop(void)  { // close the interval and add it to the total; the watch must be running
+    assert(running == true);
+    accumulator+= GridClock::now()-start; 
+    running = false; 
+  };
+  void     Reset(void){ // stop the watch and clear the total
+    running = false;
+    start = GridClock::now();
+    accumulator = GridClock::duration::zero(); 
+  }
+  GridTime Elapsed(void) { // total of all closed intervals; the watch must be stopped
+    assert(running == false);
+    return std::chrono::duration_cast<GridTime>(accumulator);
+  }
+};
+
+}
+#endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -12,9 +12,6 @@ namespace Grid {
    std::vector<int> directions   ;
    std::vector<int> displacements;

-    // FIXME -- don't like xposing the operator directions
-    // as different to the geometrical dirs
-    // Also don't like special casing five dim.. should pass an object in template
  Geometry(int _d)  {
  
      int base = (_d==5) ? 1:0;
@@ -35,12 +32,12 @@ namespace Grid {
      displacements[2*_d]=0;
      
      //// report back
-      std::cout<<"directions    :";
+      std::cout<<GridLogMessage<<"directions    :";
      for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
      std::cout <<std::endl;
-      std::cout<<"displacements :";
+      std::cout<<GridLogMessage<<"displacements :";
      for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
-      std::cout <<std::endl;
+      std::cout<<std::endl;
    }
  
    /*
@@ -64,6 +61,97 @@ namespace Grid {

  };
  
+  // Holds nbasis fine-grid vectors spanning the coarsening subspace, together
+  // with the block-wise project / promote operations between the two grids.
+  template<class Fobj,class CComplex,int nbasis>
+  class Aggregation   {
+  public:
+    typedef iVector<CComplex,nbasis >             siteVector;
+    typedef Lattice<siteVector>                 CoarseVector;
+    typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+
+    typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+    typedef Lattice<Fobj >        FineField;
+
+    GridBase *CoarseGrid;
+    GridBase *FineGrid;
+    std::vector<Lattice<Fobj> > subspace;
+
+    Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid) : 
+      CoarseGrid(_CoarseGrid),
+      FineGrid(_FineGrid),
+      subspace(nbasis,_FineGrid)
+	{
+	};
+    // CreateSubspace is virtual, so destruction through a base pointer must be safe.
+    virtual ~Aggregation() {};
+  
+    // Block-orthogonalise the subspace vectors in place.
+    void Orthogonalise(void){
+      CoarseScalar InnerProd(CoarseGrid); 
+      blockOrthogonalise(InnerProd,subspace);
+    } 
+    // Log, for each basis vector i, norm2 of (unit coarse vector e_i - projection of subspace[i]).
+    void CheckOrthogonal(void){
+      CoarseVector iProj(CoarseGrid); 
+      CoarseVector eProj(CoarseGrid); 
+
+      for(int i=0;i<nbasis;i++){
+	blockProject(iProj,subspace[i],subspace);
+
+	eProj=zero; 
+	for(int ss=0;ss<CoarseGrid->oSites();ss++){
+	  eProj._odata[ss](i)=CComplex(1.0);
+	}
+	eProj=eProj - iProj;
+	std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
+      }
+      std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
+    }
+    void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
+      blockProject(CoarseVec,FineVec,subspace);
+    }
+    void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+      blockPromote(CoarseVec,FineVec,subspace);
+    }
+    // Fill the subspace with random vectors drawn from RNG, then orthogonalise.
+    void CreateSubspaceRandom(GridParallelRNG &RNG){
+      for(int i=0;i<nbasis;i++){
+	random(RNG,subspace[i]);
+	std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
+      }
+      Orthogonalise();
+    }
+    // Build the first nn subspace vectors by running CG on hermop against
+    // normalised gaussian noise, then orthogonalise. nn must not exceed nbasis.
+    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
+      assert(nn<=nbasis); // subspace holds exactly nbasis vectors
+      RealD scale;
+
+      ConjugateGradient<FineField> CG(1.0e-2,10000);
+      FineField noise(FineGrid);
+      FineField Mn(FineGrid);
+
+      for(int b=0;b<nn;b++){
+	gaussian(RNG,noise);
+	scale = std::pow(norm2(noise),-0.5); 
+	noise=noise*scale;
+
+	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+	for(int i=0;i<1;i++){
+	  CG(hermop,noise,subspace[b]);
+	  noise = subspace[b];
+	  scale = std::pow(norm2(noise),-0.5); 
+	  noise=noise*scale;
+	}
+
+	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
+	subspace[b]   = noise;
+      }
+      Orthogonalise();
+    }
+  };
  // Fine Object == (per site) type of fine field
  // nbasis      == number of deflation vectors
  template<class Fobj,class CComplex,int nbasis>
@@ -82,7 +170,7 @@ namespace Grid {
    ////////////////////
    Geometry         geom;
    GridBase *       _grid; 
-    CartesianStencil Stencil; 
+    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil; 

    std::vector<CoarseMatrix> A;

@@ -101,24 +189,22 @@ namespace Grid {
      SimpleCompressor<siteVector> compressor;
      Stencil.HaloExchange(in,comm_buf,compressor);

-      //PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
      for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
 	siteVector nbr;
-	int offset,local,perm,ptype;
-
+	int ptype;
+	StencilEntry *SE;
 	for(int point=0;point<geom.npoint;point++){
-	  offset = Stencil._offsets [point][ss];
-	  local  = Stencil._is_local[point][ss];
-	  perm   = Stencil._permute [point][ss];
-	  ptype  = Stencil._permute_type[point];

-	  if(local&&perm) { 
-	    permute(nbr,in._odata[offset],ptype);
-	  } else if(local) { 
-	    nbr = in._odata[offset];
+	  SE=Stencil.GetEntry(ptype,point,ss);
+	  
+	  if(SE->_is_local&&SE->_permute) { 
+	    permute(nbr,in._odata[SE->_offset],ptype);
+	  } else if(SE->_is_local) { 
+	    nbr = in._odata[SE->_offset];
 	  } else {
-	    nbr = comm_buf[offset];
+	    nbr = comm_buf[SE->_offset];
 	  }
 	  res = res + A[point]._odata[ss]*nbr;
 	}
@@ -145,7 +231,8 @@ namespace Grid {
      comm_buf.resize(Stencil._unified_buffer_size);
    };

-    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,std::vector<Lattice<Fobj> > & subspace){
+    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
+			 Aggregation<Fobj,CComplex,nbasis> & Subspace){

      FineField iblock(FineGrid); // contributions from within this block
      FineField oblock(FineGrid); // contributions from outwith this block
@@ -162,8 +249,7 @@ namespace Grid {
      CoarseScalar InnerProd(Grid()); 

      // Orthogonalise the subblocks over the basis
-      blockOrthogonalise(InnerProd,subspace);
-      blockProject(iProj,subspace[0],subspace);
+      blockOrthogonalise(InnerProd,Subspace.subspace);

      // Compute the matrix elements of linop between this orthonormal
      // set of vectors.
@@ -177,7 +263,10 @@ namespace Grid {
      assert(self_stencil!=-1);

      for(int i=0;i<nbasis;i++){
-	phi=subspace[i];
+	phi=Subspace.subspace[i];
+	
+	std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+
 	for(int p=0;p<geom.npoint;p++){ 

 	  int dir   = geom.directions[p];
@@ -210,8 +299,11 @@ namespace Grid {
 	    assert(0);
 	  }

-	  blockProject(iProj,iblock,subspace);
-	  blockProject(oProj,oblock,subspace);
+	  Subspace.ProjectToSubspace(iProj,iblock);
+	  Subspace.ProjectToSubspace(oProj,oblock);
+	  //	  blockProject(iProj,iblock,Subspace.subspace);
+	  //	  blockProject(oProj,oblock,Subspace.subspace);
+PARALLEL_FOR_LOOP
 	  for(int ss=0;ss<Grid()->oSites();ss++){
 	    for(int j=0;j<nbasis;j++){
 	      if( disp!= 0 ) {
@@ -227,33 +319,33 @@ namespace Grid {
      ///////////////////////////
      // test code worth preserving in if block
      ///////////////////////////
-      std::cout<< " Computed matrix elements "<< self_stencil <<std::endl;
+      std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
      for(int p=0;p<geom.npoint;p++){
-	std::cout<< "A["<<p<<"]" << std::endl;
-	std::cout<< A[p] << std::endl;
+	std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
+	std::cout<<GridLogMessage<< A[p] << std::endl;
      }
-      std::cout<< " picking by block0 "<< self_stencil <<std::endl;
+      std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;

-      phi=subspace[0];
+      phi=Subspace.subspace[0];
      std::vector<int> bc(FineGrid->_ndimension,0);

      blockPick(Grid(),phi,tmp,bc);      // Pick out a block
      linop.Op(tmp,Mphi);                // Apply big dop
-      blockProject(iProj,Mphi,subspace); // project it and print it
-      std::cout<< " Computed matrix elements from block zero only "<<std::endl;
-      std::cout<< iProj <<std::endl;
-      std::cout<<"Computed Coarse Operator"<<std::endl;
+      blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
+      std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
+      std::cout<<GridLogMessage<< iProj <<std::endl;
+      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
-      //      AssertHermitian();
      //      ForceHermitian();
-      //      ForceDiagonal();
+      AssertHermitian();
+      // ForceDiagonal();
    }
    void ForceDiagonal(void) {


-      std::cout<<"**************************************************"<<std::endl;
-      std::cout<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
-      std::cout<<"**************************************************"<<std::endl;
+      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
+      std::cout<<GridLogMessage<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
+      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
      for(int p=0;p<8;p++){
 	A[p]=zero;
      }
@@ -263,7 +355,7 @@ namespace Grid {

      Complex one(1.0);

-      iMatrix<Complex,nbasis> ident;  ident=one;
+      iMatrix<CComplex,nbasis> ident;  ident=one;

      val = val*adj(val);
      val = val + 1.0;
@@ -279,7 +371,7 @@ namespace Grid {
 	int dd=d+1;
 	A[2*d] = adj(Cshift(A[2*d+1],dd,1));
      }
-      A[8] = 0.5*(A[8] + adj(A[8]));
+      //      A[8] = 0.5*(A[8] + adj(A[8]));
    }
    void AssertHermitian(void) {
      CoarseMatrix AA    (Grid());
@@ -293,13 +385,13 @@ namespace Grid {
 	
 	Diff = AA - adj(AAc);

-	std::cout<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
-	std::cout<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
+	std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
+	std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
 	  
      }
      Diff = A[8] - adj(A[8]);
-      std::cout<<"Norm diff local "<< norm2(Diff)<<std::endl;
-      std::cout<<"Norm local "<< norm2(A[8])<<std::endl;
+      std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
+      std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
    }
    
  };
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -71,6 +71,47 @@ namespace Grid {
      }
    };

+    ////////////////////////////////////////////////////////////////////
+    // Construct herm op and shift it for mgrid smoother
+    ////////////////////////////////////////////////////////////////////
+    template<class Matrix,class Field>
+    class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> { // Hermitian operator: HermOp(in) = MdagM(in) + _shift*in
+      Matrix &_Mat;   // wrapped matrix, held by reference
+      RealD _shift;   // real shift added as _shift*in in HermOpAndNorm
+    public:
+    ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
+      // Support for coarsening to a multigrid
+      void OpDiag (const Field &in, Field &out) {
+	_Mat.Mdiag(in,out);
+	assert(0); // always fires when asserts are enabled; the unshifted result above is not meant to be used
+      }
+      void OpDir  (const Field &in, Field &out,int dir,int disp) {
+	_Mat.Mdir(in,out,dir,disp);
+	assert(0); // as in OpDiag: forwards to the matrix, then asserts
+      }
+      void Op     (const Field &in, Field &out){
+	_Mat.M(in,out);
+	assert(0); // as in OpDiag: forwards to the matrix, then asserts
+      }
+      void AdjOp     (const Field &in, Field &out){
+	_Mat.Mdag(in,out);
+	assert(0); // as in OpDiag: forwards to the matrix, then asserts
+      }
+      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ // out = MdagM(in) + _shift*in
+	_Mat.MdagM(in,out,n1,n2);
+	out = out + _shift*in;
+	// n1,n2 written by MdagM above are overwritten below with values computed from the shifted out
+	ComplexD dot;	
+	dot= innerProduct(in,out);
+	n1=real(dot);   // n1 = real part of innerProduct(in,out)
+	n2=norm2(out);  // n2 = norm2 of the shifted result
+      }
+      void HermOp(const Field &in, Field &out){ // same as HermOpAndNorm with the two norms discarded
+	RealD n1,n2;
+	HermOpAndNorm(in,out,n1,n2);
+      }
+    };
+
    ////////////////////////////////////////////////////////////////////
    // Wrap an already herm matrix
    ////////////////////////////////////////////////////////////////////
@@ -147,6 +188,7 @@ namespace Grid {
    };
    template<class Matrix,class Field>
      class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
+    protected:
      Matrix &_Mat;
    public:
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
@@ -173,6 +215,7 @@ namespace Grid {
    };
    template<class Matrix,class Field>
      class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
+    protected:
      Matrix &_Mat;
    public:
      SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
@@ -199,6 +242,7 @@ namespace Grid {
      }
    };

+
    /////////////////////////////////////////////////////////////
    // Base classes for functions of operators
    /////////////////////////////////////////////////////////////
@@ -207,6 +251,11 @@ namespace Grid {
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
    };

+    template<class Field> class LinearFunction { // abstract functor taking an input field and writing an output field
+    public: // NOTE(review): no virtual destructor is declared - confirm objects are never deleted through a LinearFunction pointer
+      virtual void operator() (const Field &in, Field &out) = 0; // in is read-only, out receives the result
+    };
+
    /////////////////////////////////////////////////////////////
    // Base classes for Multishift solvers for operators
    /////////////////////////////////////////////////////////////
--- a/lib/algorithms/Preconditioner.h
+++ b/lib/algorithms/Preconditioner.h
@@ -0,0 +1,19 @@
+#ifndef GRID_PRECONDITIONER_H
+#define GRID_PRECONDITIONER_H
+
+namespace Grid {
+
+  template<class Field> class Preconditioner :  public LinearFunction<Field> { 
+  public: // without this the operator below is private by class default and cannot be called via a Preconditioner<Field>&
+    virtual void operator()(const Field &src, Field & psi)=0; // src is read-only, psi receives the result
+  };
+  template<class Field> class TrivialPrecon :  public Preconditioner<Field> { // preconditioner that copies its input
+  public:
+    void operator()(const Field &src, Field & psi){ // psi is overwritten with src
+      psi = src;
+    }
+    TrivialPrecon(void){};
+  };
+
+}
+#endif
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -9,23 +9,34 @@ namespace Grid {
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Simple general polynomial with user supplied coefficients
  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field>
+  class HermOpOperatorFunction : public OperatorFunction<Field> {
+  public: // the call operator was private by class default, so it could only be reached through an OperatorFunction<Field>&
+    // OperatorFunction that applies the Hermitian form of the supplied operator: out = Linop.HermOp(in)
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { Linop.HermOp(in,out); }
+  };
+
  template<class Field>
  class Polynomial : public OperatorFunction<Field> {
  private:
-    std::vector<double> Coeffs;
+    std::vector<RealD> Coeffs;
  public:
-    Polynomial(std::vector<double> &_Coeffs) : Coeffs(_Coeffs) {};
+    Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };

    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

-      Field AtoN = in;
+      Field AtoN(in._grid);
+      Field Mtmp(in._grid);
+      AtoN = in;
      out = AtoN*Coeffs[0];
-
+      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
+      //      std::cout <<"0 " <<norm2(out)<<std::endl;
      for(int n=1;n<Coeffs.size();n++){
-	Field Mtmp=AtoN;
-	Linop.Op(Mtmp,AtoN);
+	Mtmp = AtoN;
+	Linop.HermOp(Mtmp,AtoN);
 	out=out+AtoN*Coeffs[n];
+	//	std::cout << n<<" " <<norm2(out)<<std::endl;
      }
    };
  };
@@ -36,21 +47,36 @@ namespace Grid {
  template<class Field>
  class Chebyshev : public OperatorFunction<Field> {
  private:
-    std::vector<double> Coeffs;
+    std::vector<RealD> Coeffs;
    int order;
-    double hi;
-    double lo;
+    RealD hi;
+    RealD lo;

  public:
    void csv(std::ostream &out){
-      for (double x=lo; x<hi; x+=(hi-lo)/1000) {
-	double f = approx(x);
+      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
+	RealD f = approx(x);
 	out<< x<<" "<<f<<std::endl;
      }
      return;
    }

-    Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
+    // Convenience for plotting the approximation
+    void   PlotApprox(std::ostream &out) {
+      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
+      for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
+	out <<x<<"\t"<<approx(x)<<std::endl;
+      }
+    };
+
+    Chebyshev(){};
+    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
+    {
      lo=_lo;
      hi=_hi;
      order=_order;
@@ -58,29 +84,58 @@ namespace Grid {
      if(order < 2) exit(-1);
      Coeffs.resize(order);
      for(int j=0;j<order;j++){
-	double s=0;
+	RealD s=0;
 	for(int k=0;k<order;k++){
-	  double y=std::cos(M_PI*(k+0.5)/order);
-	  double x=0.5*(y*(hi-lo)+(hi+lo));
-	  double f=func(x);
+	  RealD y=std::cos(M_PI*(k+0.5)/order);
+	  RealD x=0.5*(y*(hi-lo)+(hi+lo));
+	  RealD f=func(x);
 	  s=s+f*std::cos( j*M_PI*(k+0.5)/order );
 	}
 	Coeffs[j] = s * 2.0/order;
      }
    };

-    double approx(double x) // Convenience for plotting the approximation
+    
+    void JacksonSmooth(void){ // Scale Coeffs[m] by g[m] = sum_i a[i]*a[i+m], with a[i] = U[i]/sqrt(sum_n U[n]^2) and lmax = cos(pi/(order+2))
+      const int M = order;        // largest index used by the loops below
+      RealD alpha = M_PI/(M+2);
+      RealD lmax = std::cos(alpha);
+      RealD sumUsq =0;
+      std::vector<RealD> U(M+1);  // indices 0..M are accessed, so M+1 entries are needed (was M: out-of-bounds at index M)
+      std::vector<RealD> a(M+1);
+      std::vector<RealD> g(M+1);
+      for(int n=0;n<=M;n++){
+	U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
+	sumUsq += U[n]*U[n];
+      }      
+      sumUsq = std::sqrt(sumUsq);
+      // a[0] enters every g[m] sum below (i starts at 0), so normalise all of U[0..M], not just U[1..M]
+      for(int i=0;i<=M;i++){
+	a[i] = U[i]/sumUsq;
+      }
+      g[0] = 1.0;
+      for(int m=1;m<=M;m++){
+	g[m] = 0;
+	for(int i=0;i<=M-m;i++){
+	  g[m]+= a[i]*a[m+i];
+	}
+      }
+      for(int m=1;m<order;m++){   // Coeffs has `order` entries (see Init), so the last valid index is order-1
+	Coeffs[m]*=g[m];
+      }
+    }
+    RealD approx(RealD x) // Convenience for plotting the approximation
    {
-      double Tn;
-      double Tnm;
-      double Tnp;
+      RealD Tn;
+      RealD Tnm;
+      RealD Tnp;
      
-      double y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+      RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
      
-      double T0=1;
-      double T1=y;
+      RealD T0=1;
+      RealD T1=y;
      
-      double sum;
+      RealD sum;
      sum = 0.5*Coeffs[0]*T0;
      sum+= Coeffs[1]*T1;
      
@@ -95,41 +150,33 @@ namespace Grid {
      return sum;
    };

-    // Convenience for plotting the approximation
-    void   PlotApprox(std::ostream &out) {
-      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
-      for(double x=lo;x<hi;x+=(hi-lo)/50.0){
-	out <<x<<"\t"<<approx(x)<<std::endl;
-      }
-    };
-
-    // Implement the required interface; could require Lattice base class
+    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

-      Field T0 = in;
-      Field T1 = T0; // Field T1(T0._grid); more efficient but hardwires Lattice class
-      Field T2 = T1;
+      GridBase *grid=in._grid;
+
+      int vol=grid->gSites();
+
+      Field T0(grid); T0 = in;  
+      Field T1(grid); 
+      Field T2(grid);
+      Field y(grid);
      
-      // use a pointer trick to eliminate copies
      Field *Tnm = &T0;
      Field *Tn  = &T1;
      Field *Tnp = &T2;
-      Field y   = in;
-  
-      double xscale = 2.0/(hi-lo);
-      double mscale = -(hi+lo)/(hi-lo);

      // Tn=T1 = (xscale M + mscale)in
-      Linop.Op(T0,y);
-
+      RealD xscale = 2.0/(hi-lo);
+      RealD mscale = -(hi+lo)/(hi-lo);
+      Linop.HermOp(T0,y);
      T1=y*xscale+in*mscale;

      // sum = .5 c[0] T0 + c[1] T1
      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
-
      for(int n=2;n<order;n++){
 	
-	Linop.Op(*Tn,y);
+	Linop.HermOp(*Tn,y);

 	y=xscale*y+mscale*(*Tn);

@@ -148,5 +195,121 @@ namespace Grid {
  };


+  template<class Field>
+  class ChebyshevLanczos : public Chebyshev<Field> {
+  private:
+    std::vector<RealD> Coeffs;
+    int order;
+    RealD alpha;
+    RealD beta;
+    RealD mu;
+
+  public:
+    ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
+    alpha(_alpha),
+      beta(_beta),
+          mu(_mu)
+    {
+      order=_order;
+      Coeffs.resize(order);
+      for(int i=0;i<_order;i++){
+	Coeffs[i] = 0.0;
+      }
+      Coeffs[order-1]=1.0;
+    };
+
+    void csv(std::ostream &out){
+      for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
+	RealD f = approx(x);
+	out<< x<<" "<<f<<std::endl;
+      }
+      return;
+    }
+
+    RealD approx(RealD xx) // Convenience for plotting the approximation
+    {
+      RealD Tn;
+      RealD Tnm;
+      RealD Tnp;
+      RealD aa = alpha * alpha;  // RealD as in AminusMuSq, so the scalar and operator versions use the same type (was Real)
+      RealD bb = beta  *  beta;
+      // Scalar form of the map AminusMuSq applies to the operator: (2 (xx-mu)^2 - (aa+bb)) / (aa-bb)
+      RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
+
+      RealD y= x;
+      
+      RealD T0=1;
+      RealD T1=y;
+      
+      RealD sum;
+      sum = 0.5*Coeffs[0]*T0;
+      sum+= Coeffs[1]*T1;
+      // Three-term recurrence T_{n+1} = 2 y T_n - T_{n-1}, accumulating Coeffs[i]*T_i for i = 2..order-1
+      Tn =T1;
+      Tnm=T0;
+      for(int i=2;i<order;i++){
+	Tnp=2*y*Tn-Tnm;
+	Tnm=Tn;
+	Tn =Tnp;
+	sum+= Tn*Coeffs[i];
+      }
+      return sum;
+    };
+
+    // shift_Multiply in Rudy's code
+    void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out) 
+    {
+      GridBase *grid=in._grid;
+      Field tmp(grid);
+
+      RealD aa= alpha*alpha;
+      RealD bb= beta * beta;
+
+      Linop.HermOp(in,out);
+      out = out - mu*in;
+
+      Linop.HermOp(out,tmp);
+      tmp = tmp - mu * out;
+
+      out = (2.0/ (aa-bb) ) * tmp -  ((aa+bb)/(aa-bb))*in;
+    };
+    // Implement the required interface
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+      GridBase *grid=in._grid;
+
+      int vol=grid->gSites();
+
+      Field T0(grid); T0 = in;  
+      Field T1(grid); 
+      Field T2(grid);
+      Field  y(grid);
+      
+      Field *Tnm = &T0;
+      Field *Tn  = &T1;
+      Field *Tnp = &T2;
+
+      // Tn=T1 = (xscale M )*in
+      AminusMuSq(Linop,T0,T1);
+
+      // sum = .5 c[0] T0 + c[1] T1
+      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
+      for(int n=2;n<order;n++){
+	
+	AminusMuSq(Linop,*Tn,y);
+
+	*Tnp=2.0*y-(*Tnm);
+
+	out=out+Coeffs[n]* (*Tnp);
+
+	// Cycle pointers to avoid copies
+	Field *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+  };
 }
 #endif
--- a/lib/algorithms/approx/MultiShiftFunction.h
+++ b/lib/algorithms/approx/MultiShiftFunction.h
@@ -1,6 +1,8 @@
 #ifndef MULTI_SHIFT_FUNCTION
 #define MULTI_SHIFT_FUNCTION
+
 namespace Grid {
+
 class MultiShiftFunction {
 public:
  int order;
@@ -9,20 +11,29 @@ public:
  std::vector<RealD> tolerances;
  RealD norm;
  RealD lo,hi;
+
  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
-  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse) :
-      order(remez.getDegree()),
-      tolerances(remez.getDegree(),tol),
-      poles(remez.getDegree()),
-      residues(remez.getDegree())
+
+  void Init(AlgRemez & remez,double tol,bool inverse) 
  {
+    order=remez.getDegree();
+    tolerances.resize(remez.getDegree(),tol);
+    poles.resize(remez.getDegree());
+    residues.resize(remez.getDegree());
    remez.getBounds(lo,hi);
    if ( inverse ) remez.getIPFE (&residues[0],&poles[0],&norm);
-    else remez.getPFE (&residues[0],&poles[0],&norm);
+    else           remez.getPFE (&residues[0],&poles[0],&norm);
  }
+  // Allow deferred initialisation
+  MultiShiftFunction(void){};
+  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse)
+  {
+    Init(remez,tol,inverse);
+  }
+
 };
 }
 #endif
--- a/lib/algorithms/approx/Remez.cc
+++ b/lib/algorithms/approx/Remez.cc
@@ -758,3 +758,4 @@ void AlgRemez::csv(std::ostream & os)
  }
  return;
 }
+
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -15,7 +15,10 @@
 #ifndef INCLUDED_ALG_REMEZ_H
 #define INCLUDED_ALG_REMEZ_H

-#include <algorithms/approx/bigfloat.h>
+#include <stddef.h>
+
+//#include <algorithms/approx/bigfloat.h>
+#include <algorithms/approx/bigfloat_double.h>

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
 #define SUM_MAX 10 // Maximum number of terms in exponential
@@ -28,6 +31,7 @@
  remez.getIPFE(res,pole,&norm);
  remez.csv(ostream &os);
 */
+
 class AlgRemez
 {
 private:
--- a/lib/algorithms/iterative/AdefGeneric.h
+++ b/lib/algorithms/iterative/AdefGeneric.h
@@ -0,0 +1,370 @@
+#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
+#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
+
+  /*
+   * Compared to Tang-2009:  P=Pleft. P^T = PRight Q=MssInv. 
+   * Script A = SolverMatrix 
+   * Script P = Preconditioner
+   *
+   * Deflation methods considered
+   *      -- Solve P A x = P b        [ like Luscher ]
+   * DEF-1        M P A x = M P b     [i.e. left precon]
+   * DEF-2        P^T M A x = P^T M b
+   * ADEF-1       Preconditioner = M P + Q      [ Q + M + M A Q]
+   * ADEF-2       Preconditioner = P^T M + Q
+   * BNN          Preconditioner = P^T M P + Q
+   * BNN2         Preconditioner = M P + P^TM +Q - M P A M 
+   * 
+   * Implement ADEF-2
+   *
+   * Vstart = P^Tx + Qb
+   * M1 = P^TM + Q
+   * M2=M3=1
+   * Vout = x
+   */
+
+// abstract base
+template<class Field, class CoarseField>
+class TwoLevelFlexiblePcg : public LinearFunction<Field>
+{
+ public:
+  int verbose;
+  RealD   Tolerance;
+  Integer MaxIterations;
+  const int mmax = 5;
+  GridBase *grid;
+  GridBase *coarsegrid;
+
+  LinearOperatorBase<Field>   *_Linop;         // initialised from the constructor's Linop argument
+  OperatorFunction<Field>     *_Smoother;      // each pointer is now its own terminated declaration (was missing ';' / had a stray ',')
+  LinearFunction<CoarseField> *_CoarseSolver;  // NOTE(review): _Smoother and _CoarseSolver are not set by the constructor's initialiser list
+
+  // Need something that knows how to get from Coarse to fine and back again
+  
+  // more most operator functions
+  TwoLevelFlexiblePcg(RealD tol,
+		     Integer maxit,
+		     LinearOperatorBase<Field> *Linop,
+		     LinearOperatorBase<Field> *SmootherLinop,
+		     OperatorFunction<Field>   *Smoother,
+		     OperatorFunction<CoarseField>  CoarseLinop
+		     ) : 
+      Tolerance(tol), 
+      MaxIterations(maxit),
+      _Linop(Linop),
+      _PreconditionerLinop(PrecLinop),
+      _Preconditioner(Preconditioner)
+  { 
+    verbose=0;
+  };
+
+  // The Pcg routine is common to all, but the various matrices differ from derived 
+  // implementation to derived implementation
+  void operator() (const Field &src, Field &psi){
+  void operator() (const Field &src, Field &psi){
+
+    psi.checkerboard = src.checkerboard;
+    grid             = src._grid;
+
+    RealD f;
+    RealD rtzp,rtz,a,d,b;
+    RealD rptzp;
+    RealD tn;
+    RealD guess = norm2(psi);
+    RealD ssq   = norm2(src);
+    RealD rsq   = ssq*Tolerance*Tolerance;
+    
+    /////////////////////////////
+    // Set up history vectors
+    /////////////////////////////
+    std::vector<Field> p  (mmax,grid);
+    std::vector<Field> mmp(mmax,grid);
+    std::vector<RealD> pAp(mmax);
+
+    Field x  (grid); x = psi;
+    Field z  (grid);
+    Field tmp(grid);
+    Field r  (grid);
+    Field mu (grid);
+  
+    //////////////////////////
+    // x0 = Vstart -- possibly modify guess
+    //////////////////////////
+    x=src;
+    Vstart(x,src);
+
+    // r0 = b -A x0
+    HermOp(x,mmp); // Shouldn't this be something else?
+    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
+
+    //////////////////////////////////
+    // Compute z = M1 x
+    //////////////////////////////////
+    M1(r,z,tmp,mp,SmootherMirs);
+    rtzp =real(innerProduct(r,z));
+
+    ///////////////////////////////////////
+    // Solve for Mss mu = P A z and set p = z-mu
+    // Def2: p = 1 - Q Az = Pright z 
+    // Other algos M2 is trivial
+    ///////////////////////////////////////
+    M2(z,p[0]);
+
+    for (int k=0;k<=MaxIterations;k++){
+    
+      int peri_k  = k % mmax;
+      int peri_kp = (k+1) % mmax;
+
+      rtz=rtzp;
+      d= M3(p[peri_k],mp,mmp[peri_k],tmp);
+      a = rtz/d;
+    
+      // Memorise this
+      pAp[peri_k] = d;
+
+      axpy(x,a,p[peri_k],x);
+      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
+
+      // Compute z = M x
+      M1(r,z,tmp,mp);
+
+      rtzp =real(innerProduct(r,z));
+
+      M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+
+      p[peri_kp]=p[peri_k];
+
+      // Standard search direction  p -> z + b p    ; b = 
+      b = (rtzp)/rtz;
+
+      int northog;
+      //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
+      northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
+    
+      for(int back=0; back < northog; back++){
+	int peri_back = (k-back)%mmax;
+	RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
+	RealD beta = -pbApk/pAp[peri_back];
+	axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
+      }
+
+      RealD rrn=sqrt(rn/ssq);
+      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
+
+      // Stopping condition
+      if ( rn <= rsq ) { 
+
+	HermOp(x,mmp); // Shouldn't this be something else?
+	axpy(tmp,-1.0,src,mmp[0]);
+	
+	RealD psinorm = sqrt(norm2(x));
+	RealD srcnorm = sqrt(norm2(src));
+	RealD tmpnorm = sqrt(norm2(tmp));
+	RealD true_residual = tmpnorm/srcnorm;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
+	return k;
+      }
+    }
+    // Non-convergence
+    assert(0);
+  }
+
+ public:
+
+  virtual void M(Field & in,Field & out,Field & tmp) {
+
+  }
+
+  virtual void M1(Field & in, Field & out) {// the smoother
+
+    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
+    Field tmp(grid);
+    Field Min(grid);
+
+    PcgM(in,Min); // Smoother call
+
+    HermOp(Min,out);
+    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min
+
+    ProjectToSubspace(tmp,PleftProj);     
+    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
+    PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+    axpy(out,1.0,Min,tmp); // Min+tmp
+  }
+
+  virtual void M2(const Field & in, Field & out) {
+    out=in;
+    // Must override for Def2 only
+    //  case PcgDef2:
+    //    Pright(in,out);
+    //    break;
+  }
+
+  virtual RealD M3(const Field & p, Field & mmp){
+    double d,dd;
+    HermOpAndNorm(p,mmp,d,dd);
+    return dd;
+    // Must override for Def1 only
+    //  case PcgDef1:
+    //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
+    //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
+    //    Pleft(mp,mmp);
+    //    d=real(linop_d->inner(p,mmp));
+  }
+
+  virtual void VstartDef2(Field & x,const Field & src){ // x: guess on entry, updated in place; src: right-hand side (read-only)
+    //case PcgDef2:
+    //case PcgAdef2: 
+    //case PcgAdef2f:
+    //case PcgV11f:
+    ///////////////////////////////////
+    // Choose x_0 such that 
+    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
+    //                               = [1 - Ass_inv A] Guess + Assinv src
+    //                               = P^T guess + Assinv src 
+    //                               = Vstart  [Tang notation]
+    // This gives:
+    // W^T (src - A x_0) = src_s - A guess_s - r_s
+    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
+    //                   = 0 
+    ///////////////////////////////////
+    Field r(grid);
+    Field mmp(grid);
+    
+    HermOp(x,mmp);
+    axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x
+    ProjectToSubspace(r,PleftProj);     
+    ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    PromoteFromSubspace(PleftMss_proj,mmp);  
+    x=x+mmp;
+
+  }
+
+  virtual void Vstart(Field & x,const Field & src){
+    return;
+  }
+
+  /////////////////////////////////////////////////////////////////////
+  // Only Def1 has non-trivial Vout. Override in Def1
+  /////////////////////////////////////////////////////////////////////
+  virtual void   Vout  (Field & in, Field & out,Field & src){
+    out = in;
+    //case PcgDef1:
+    //    //Qb + PT x
+    //    ProjectToSubspace(src,PleftProj);     
+    //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    //    PromoteFromSubspace(PleftMss_proj,tmp);  
+    //    
+    //    Pright(in,out);
+    //    
+    //    linop_d->axpy(out,tmp,out,1.0);
+    //    break;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // Pright and Pleft are common to all implementations
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  virtual void Pright(Field & in,Field & out){
+    // P_R  = [ 1              0 ] 
+    //        [ -Mss^-1 Msb    0 ] 
+    Field in_sbar(grid);
+
+    ProjectToSubspace(in,PleftProj);     
+    PromoteFromSubspace(PleftProj,out);  
+    axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s 
+
+    HermOp(in_sbar,out);
+    ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project)
+
+    ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
+    PromoteFromSubspace(PleftMss_proj,out);     // 
+
+    axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar
+  }
+  virtual void Pleft (Field & in,Field & out){
+    // P_L  = [ 1  -Mbs Mss^-1] 
+    //        [ 0   0         ] 
+    Field in_sbar(grid);
+    Field    tmp2(grid);
+    Field    Mtmp(grid);
+
+    ProjectToSubspace(in,PleftProj);     
+    PromoteFromSubspace(PleftProj,out);  
+    axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s
+
+    ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
+    PromoteFromSubspace(PleftMss_proj,out);
+
+    HermOp(out,Mtmp);
+
+    ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1}
+    PromoteFromSubspace(PleftProj,tmp2);
+
+    axpy(out,-1.0,tmp2,Mtmp);
+    axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s
+  }
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> { // NOTE(review): the base template is declared with two parameters (Field, CoarseField)
+ public: // Placeholder overrides with empty bodies
+  virtual void M(Field & in,Field & out,Field & tmp){
+
+  } 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){ // NOTE(review): the base-class M1 takes (Field&, Field&) only
+
+  }
+  virtual void M2(Field & in, Field & out){
+
+  }
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
+    return 0.0; // placeholder value so the stub no longer falls off the end of a non-void function
+  }
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
+
+  }
+};
+/*
+template<class Field>
+class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+  virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+*/
+#endif
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -13,9 +13,7 @@ namespace Grid {
 public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
-    int verbose;
    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
-      verbose=1;
    };


@@ -42,14 +40,12 @@ public:
      cp =a;
      ssq=norm2(src);

-      if ( verbose ) {
-	std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
-	std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
-      }
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;

      RealD rsq =  Tolerance* Tolerance*ssq;
      
@@ -58,7 +54,7 @@ public:
 	return;
      }
      
-      std::cout << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
+      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
      
      int k;
      for (k=1;k<=MaxIterations;k++){
@@ -69,22 +65,18 @@ public:

 	RealD    qqck = norm2(mmp);
 	ComplexD dck  = innerProduct(p,mmp);
-	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<" qqcheck "<< qqck<< " dck "<< dck<<std::endl;
      
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;

-
-	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
 	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
-	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,b "<<cp<< " "<<b <<std::endl;
 	
 	// Fuse these loops ; should be really easy
 	psi= a*p+psi;
 	p  = p*b+r;
 	  
-	if (verbose) std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	
 	// Stopping condition
 	if ( cp <= rsq ) { 
@@ -98,13 +90,14 @@ public:
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;

-	  std::cout<<"ConjugateGradient: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
-	  std::cout<<"ConjugateGradient: true   residual  is "<<true_residual<<" sol "<<psinorm<<" src "<<srcnorm<<std::endl;
-	  std::cout<<"ConjugateGradient: target residual was "<<Tolerance<<std::endl;
+	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
+		   <<" computed residual "<<sqrt(cp/ssq)
+		   <<" true residual     "<<true_residual
+		   <<" target "<<Tolerance<<std::endl;
 	  return;
 	}
      }
-      std::cout<<"ConjugateGradient did NOT converge"<<std::endl;
+      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
      assert(0);
    }
  };
--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -27,10 +27,14 @@ public:

 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
 {
-
  GridBase *grid = src._grid;
  int nshift = shifts.order;
  std::vector<Field> results(nshift,grid);
+  (*this)(Linop,src,results,psi);
+}
+void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
+{
+  int nshift = shifts.order;

  (*this)(Linop,src,results);
  
@@ -91,7 +95,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  cp = norm2(src);
  for(int s=0;s<nshift;s++){
    rsq[s] = cp * mresidual[s] * mresidual[s];
-    std::cout<<"ConjugateGradientMultiShift: shift "<<s
+    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
 	     <<" target resid "<<rsq[s]<<std::endl;
    ps[s] = src;
  }
@@ -109,7 +113,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  // p and mmp is equal to d after this since
  // the d computation is tricky
  //  qq = real(innerProduct(p,mmp));
-  //  std::cout << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
+  //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
  
  b = -cp /d;
  
@@ -214,7 +218,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	
 	if(css<rsq[s]){
 	  if ( ! converged[s] )
-	    std::cout<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	      converged[s]=1;
 	} else {
 	  all_converged=0;
@@ -225,8 +229,8 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    
    if ( all_converged ){

-      std::cout<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
-      std::cout<< "CGMultiShift: Checking solutions"<<std::endl;
+      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
+      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
      
      // Check answers 
      for(int s=0; s < nshift; s++) { 
@@ -235,13 +239,13 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	axpy(r,-alpha[s],src,tmp);
 	RealD rn = norm2(r);
 	RealD cn = norm2(src);
-	std::cout<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
      return;
    }
  }
  // ugly hack
-  std::cout<<"CG multi shift did not converge"<<std::endl;
+  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
  assert(0);
 }

--- a/lib/algorithms/iterative/ConjugateResidual.h
+++ b/lib/algorithms/iterative/ConjugateResidual.h
@@ -16,7 +16,7 @@ namespace Grid {
    int verbose;

    ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
-      verbose=1;
+      verbose=0;
    };

    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
@@ -37,14 +37,11 @@ namespace Grid {
      Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
      Linop.HermOpAndNorm(r,Ar,rAr,rAAr);

-      std::cout << "pAp, pAAp"<< pAp<<" "<<pAAp<<std::endl;
-      std::cout << "rAr, rAAr"<< rAr<<" "<<rAAr<<std::endl;
-
      cp =norm2(r);
      ssq=norm2(src);
      rsq=Tolerance*Tolerance*ssq;

-      std::cout<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+      if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;

      for(int k=1;k<MaxIterations;k++){

@@ -63,21 +60,22 @@ namespace Grid {
 	axpy(p,b,p,r);
 	pAAp=axpy_norm(Ap,b,Ap,Ar);
 	
-	std::cout<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;

 	if(cp<rsq) {
 	  Linop.HermOp(psi,Ap);
 	  axpy(r,-1.0,src,Ap);
-	  RealD true_resid = norm2(r);
-	  std::cout<<"ConjugateResidual: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
-	  std::cout<<"ConjugateResidual: true   residual  is "<<true_resid<<std::endl;
-	  std::cout<<"ConjugateResidual: target residual was "<<Tolerance <<std::endl;
+	  RealD true_resid = norm2(r)/ssq;
+	  std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
+		   << " computed residual "<<sqrt(cp/ssq)
+	           << " true residual "<<sqrt(true_resid)
+	           << " target "       <<Tolerance <<std::endl;
 	  return;
 	}

      }

-      std::cout<<"ConjugateResidual did NOT converge"<<std::endl;
+      std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
      assert(0);
    }
  };
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -0,0 +1,109 @@
+#ifndef GRID_DENSE_MATRIX_H
+#define GRID_DENSE_MATRIX_H
+
+namespace Grid {
+    /////////////////////////////////////////////////////////////
+    // Matrix utils
+    /////////////////////////////////////////////////////////////
+
+template<class T> using DenseVector = std::vector<T>;
+template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
+
+template<class T> void Size(DenseVector<T> & vec, int &N) 
+{ 
+  N= vec.size();
+}
+template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
+{ 
+  N= mat.size();
+  M= mat[0].size();
+}
+
+template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
+{ 
+  int M; Size(mat,N,M);
+  assert(N==M);
+}
+
+template<class T> void Resize(DenseVector<T > & mat, int N) { 
+  mat.resize(N);
+}
+template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
+  mat.resize(N);
+  for(int i=0;i<N;i++){
+    mat[i].resize(M);
+  }
+}
+template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
+  int N,M;
+  Size(mat,N,M);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<M;j++){
+    mat[i][j] = val;
+  }}
+}
+
+/** Transpose of a matrix **/
+template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
+  int N,M;
+  Size(mat,N,M);
+  DenseMatrix<T> C; Resize(C,M,N);
+  for(int i=0;i<M;i++){
+  for(int j=0;j<N;j++){
+    C[i][j] = mat[j][i];
+  }} 
+  return C;
+}
+/** Set DenseMatrix to unit matrix **/
+template<class T> void Unity(DenseMatrix<T> &A){
+  int N;  SizeSquare(A,N);
+  for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      if ( i==j ) A[i][j] = 1;
+      else        A[i][j] = 0;
+    } 
+  } 
+}
+
+/** Add C * I to matrix **/
+template<class T>
+void PlusUnit(DenseMatrix<T> & A,T c){
+  int dim;  SizeSquare(A,dim);
+  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
+}
+
+/** return the Hermitian conjugate of matrix **/
+template<class T>
+DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
+
+  int dim; SizeSquare(mat,dim);
+
+  DenseMatrix<T> C; Resize(C,dim,dim);
+
+  for(int i=0;i<dim;i++){
+    for(int j=0;j<dim;j++){
+      C[i][j] = conj(mat[j][i]);
+    } 
+  } 
+  return C;
+}
+/** Get a submatrix: rows [row_st,row_end), columns [col_st,col_end) **/
+template <class T>
+DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
+{
+  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
+
+  for(int i = row_st; i<row_end; i++){
+  for(int j = col_st; j<col_end; j++){
+    H[i-row_st][j-col_st]=A[i][j];
+  }}
+  return H;
+}
+
+}
+
+#include <algorithms/iterative/Householder.h>
+#include <algorithms/iterative/Francis.h>
+
+#endif
+
--- a/lib/algorithms/iterative/EigenSort.h
+++ b/lib/algorithms/iterative/EigenSort.h
@@ -0,0 +1,52 @@
+#ifndef GRID_EIGENSORT_H
+#define GRID_EIGENSORT_H
+
+
+namespace Grid {
+    /////////////////////////////////////////////////////////////
+    // Eigen sorter to begin with
+    /////////////////////////////////////////////////////////////
+
+template<class Field>
+class SortEigen {
+ private:
+  
+  static bool less_lmd(RealD left,RealD right){
+    return fabs(left) < fabs(right);
+  }  
+  static bool less_pair(std::pair<RealD,Field>& left,
+		 std::pair<RealD,Field>& right){
+    return fabs(left.first) < fabs(right.first);
+  }  
+  
+ public:
+
+  void push(DenseVector<RealD>& lmd,
+	    DenseVector<Field>& evec,int N) {
+
+    DenseVector<std::pair<RealD, Field> > emod;
+    typename DenseVector<std::pair<RealD, Field> >::iterator it;
+    
+    for(int i=0;i<lmd.size();++i){
+      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
+    }
+
+    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
+
+    it=emod.begin();
+    for(int i=0;i<N;++i){
+      lmd[i]=it->first;
+      evec[i]=it->second;
+      ++it;
+    }
+  }
+  void push(DenseVector<RealD>& lmd,int N) {
+    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
+  }
+  bool saturated(RealD lmd, RealD thrs) {
+    return fabs(lmd) > fabs(thrs);
+  }
+};
+
+}
+#endif
--- a/lib/algorithms/iterative/Francis.h
+++ b/lib/algorithms/iterative/Francis.h
@@ -0,0 +1,498 @@
+#ifndef FRANCIS_H
+#define FRANCIS_H
+
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <fstream>
+#include <complex>
+#include <algorithm>
+
+//#include <timer.h>
+//#include <lapacke.h>
+//#include <Eigen/Dense>
+
+namespace Grid {
+
+template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
+template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
+
+/**
+  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
+H =
+      x  x  x  x  x  x  x  x  x
+      x  x  x  x  x  x  x  x  x
+      0  x  x  x  x  x  x  x  x
+      0  0  x  x  x  x  x  x  x
+      0  0  0  x  x  x  x  x  x
+      0  0  0  0  x  x  x  x  x
+      0  0  0  0  0  x  x  x  x
+      0  0  0  0  0  0  x  x  x
+      0  0  0  0  0  0  0  x  x
+Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary.
+**/
+template <class T>
+int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
+{
+  DenseMatrix<T> H = Hin; 
+
+  int N ; SizeSquare(H,N);
+  int M = N;
+
+  Fill(evals,0);
+  Fill(evecs,0);
+
+  T s,t,x=0,y=0,z=0;
+  T u,d;
+  T apd,amd,bc;
+  DenseVector<T> p(N,0);
+  T nrm = Norm(H);    ///DenseMatrix Norm
+  int n, m;
+  int e = 0;
+  int it = 0;
+  int tot_it = 0;
+  int l = 0;
+  int r = 0;
+  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
+  DenseVector<int> trows(N,0);
+
+  /// Check if the matrix is really hessenberg, if not abort
+  RealD sth = 0;
+  for(int j=0;j<N;j++){
+    for(int i=j+2;i<N;i++){
+      sth = abs(H[i][j]);
+      if(sth > small){
+	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
+	exit(1);
+      }
+    }
+  }
+
+  do{
+    std::cout << "Francis QR Step N = " << N << std::endl;
+    /** Check for convergence
+      x  x  x  x  x
+      0  x  x  x  x
+      0  0  x  x  x
+      0  0  x  x  x
+      0  0  0  0  x
+      for this matrix l = 4
+     **/
+    do{
+      l = Chop_subdiag(H,nrm,e,small);
+      r = 0;    ///May have converged on more than one eval
+      ///Single eval
+      if(l == N-1){
+        evals[e] = H[l][l];
+        N--; e++; r++; it = 0;
+      }
+      ///RealD eval
+      if(l == N-2){
+        trows[l+1] = 1;    ///Needed for UTSolve
+        apd = H[l][l] + H[l+1][l+1];
+        amd = H[l][l] - H[l+1][l+1];
+        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
+        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
+        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
+        N-=2; e+=2; r++; it = 0;
+      }
+    } while(r>0);
+
+    if(N ==0) break;
+
+    DenseVector<T > ck; Resize(ck,3);
+    DenseVector<T> v;   Resize(v,3);
+
+    for(int m = N-3; m >= l; m--){
+      ///Starting vector essentially random shift.
+      if(it%10 == 0 && N >= 3 && it > 0){
+        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
+        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
+        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
+        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
+        z = H[m+1][m]*H[m+2][m+1];
+      }
+      ///Starting vector implicit Q theorem
+      else{
+        s = (H[N-2][N-2] + H[N-1][N-1]);
+        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
+        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
+        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
+        z = H[m+1][m]*H[m+2][m+1];
+      }
+      ck[0] = x; ck[1] = y; ck[2] = z;
+
+      if(m == l) break;
+
+      /** Some stupid thing from Numerical Recipes, seems to work**/
+      // PAB.. for heaven's sake quote page, purpose, evidence it works.
+      //       what sort of comment is that!?!?!?
+      u=abs(H[m][m-1])*(abs(y)+abs(z));
+      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
+      if ((T)abs(u+d) == (T)abs(d) ){
+	l = m; break;
+      }
+
+      //if (u < small){l = m; break;}
+    }
+    if(it > 100000){
+     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
+     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
+      exit(1);
+    }
+    normalize(ck);    ///Normalization cancels in PHP anyway
+    T beta;
+    Householder_vector<T >(ck, 0, 2, v, beta);
+    Householder_mult<T >(H,v,beta,0,l,l+2,0);
+    Householder_mult<T >(H,v,beta,0,l,l+2,1);
+    ///Accumulate eigenvector
+    Householder_mult<T >(P,v,beta,0,l,l+2,1);
+    int sw = 0;      ///Are we on the last row?
+    for(int k=l;k<N-2;k++){
+      x = H[k+1][k];
+      y = H[k+2][k];
+      z = (T)0.0;
+      if(k+3 <= N-1){
+	z = H[k+3][k];
+      } else{
+	sw = 1; 
+	v[2] = (T)0.0;
+      }
+      ck[0] = x; ck[1] = y; ck[2] = z;
+      normalize(ck);
+      Householder_vector<T >(ck, 0, 2-sw, v, beta);
+      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
+      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
+      ///Accumulate eigenvector
+      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
+    }
+    it++;
+    tot_it++;
+  }while(N > 1);
+  N = evals.size();
+  ///Annoying - UT solves in reverse order;
+  DenseVector<T> tmp; Resize(tmp,N);
+  for(int i=0;i<N;i++){
+    tmp[i] = evals[N-i-1];
+  } 
+  evals = tmp;
+  UTeigenvectors(H, trows, evals, evecs);
+  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
+  return tot_it;
+}
+
+template <class T>
+int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
+{
+  /**
+  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
+  H =
+  x  x  0  0  0  0
+  x  x  x  0  0  0
+  0  x  x  x  0  0
+  0  0  x  x  x  0
+  0  0  0  x  x  x
+  0  0  0  0  x  x
+  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary.  **/
+  return my_Wilkinson(Hin, evals, evecs, small, small);
+}
+
+template <class T>
+int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
+{
+  int N; SizeSquare(Hin,N);
+  int M = N;
+
+  ///I don't want to modify the input but matrices must be passed by reference
+  //Scale a matrix by its "norm"
+  //RealD Hnorm = abs( Hin.LargestDiag() ); H =  H*(1.0/Hnorm);
+  DenseMatrix<T> H;  H = Hin;
+  
+  RealD Hnorm = abs(Norm(Hin));
+  H = H * (1.0 / Hnorm);
+
+  // TODO use openmp and memset
+  Fill(evals,0);
+  Fill(evecs,0);
+
+  T s, t, x = 0, y = 0, z = 0;
+  T u, d;
+  T apd, amd, bc;
+  DenseVector<T> p; Resize(p,N); Fill(p,0);
+
+  T nrm = Norm(H);    ///DenseMatrix Norm
+  int n, m;
+  int e = 0;
+  int it = 0;
+  int tot_it = 0;
+  int l = 0;
+  int r = 0;
+  DenseMatrix<T> P; Resize(P,N,N);
+  Unity(P);
+  DenseVector<int> trows(N, 0);
+  /// Check if the matrix is really symm tridiag
+  RealD sth = 0;
+  for(int j = 0; j < N; ++j)
+  {
+    for(int i = j + 2; i < N; ++i)
+    {
+      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
+      {
+	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
+	std::cout << "Warning tridiagonalize and call again" << std::endl;
+        // exit(1); // see what is going on
+        //return;
+      }
+    }
+  }
+
+  do{
+    do{
+      //Jasper
+      //Check if the subdiagonal term is small enough (<small)
+      //if true then it is converged.
+      //check start from H.dim - e - 1
+      //How to deal with more than 2 are converged?
+      //What if Chop_symm_subdiag returns something in the middle?
+      //--------------
+      l = Chop_symm_subdiag(H,nrm, e, small);
+      r = 0;    ///May have converged on more than one eval
+      //Jasper
+      //In this case
+      // x  x  0  0  0  0
+      // x  x  x  0  0  0
+      // 0  x  x  x  0  0
+      // 0  0  x  x  x  0
+      // 0  0  0  x  x  0
+      // 0  0  0  0  0  x  <- l
+      //--------------
+      ///Single eval
+      if(l == N - 1)
+      {
+        evals[e] = H[l][l];
+        N--;
+        e++;
+        r++;
+        it = 0;
+      }
+      //Jasper
+      // x  x  0  0  0  0
+      // x  x  x  0  0  0
+      // 0  x  x  x  0  0
+      // 0  0  x  x  0  0
+      // 0  0  0  0  x  x  <- l
+      // 0  0  0  0  x  x
+      //--------------
+      ///RealD eval
+      if(l == N - 2)
+      {
+        trows[l + 1] = 1;    ///Needed for UTSolve
+        apd = H[l][l] + H[l + 1][ l + 1];
+        amd = H[l][l] - H[l + 1][l + 1];
+        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
+        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
+        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
+        N -= 2;
+        e += 2;
+        r++;
+        it = 0;
+      }
+    }while(r > 0);
+    //Jasper
+    //Already converged
+    //--------------
+    if(N == 0) break;
+
+    DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
+
+    for(int m = N - 3; m >= l; m--)
+    {
+      ///Starting vector essentially random shift.
+      if(it%10 == 0 && N >= 3 && it > 0)
+      {
+        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
+        x = H[m][m] - t;
+        z = H[m + 1][m];
+      } else {
+      ///Starting vector implicit Q theorem
+        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
+        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
+	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
+        x = H[m][m] - t;
+        z = H[m + 1][m];
+      }
+      //Jasper
+      //why it is here????
+      //-----------------------
+      if(m == l)
+        break;
+
+      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
+      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
+      if ((T)abs(u + d) == (T)abs(d))
+      {
+        l = m;
+        break;
+      }
+    }
+    //Jasper
+    if(it > 1000000)
+    {
+      std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
+      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
+      exit(1);
+    }
+    //
+    T s, c;
+    Givens_calc<T>(x, z, c, s);
+    Givens_mult<T>(H, l, l + 1, c, -s, 0);
+    Givens_mult<T>(H, l, l + 1, c,  s, 1);
+    Givens_mult<T>(P, l, l + 1, c,  s, 1);
+    //
+    for(int k = l; k < N - 2; ++k)
+    {
+      x = H.A[k + 1][k];
+      z = H.A[k + 2][k];
+      Givens_calc<T>(x, z, c, s);
+      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
+      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
+      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
+    }
+    it++;
+    tot_it++;
+  }while(N > 1);
+
+  N = evals.size();
+  ///Annoying - UT solves in reverse order;
+  DenseVector<T> tmp(N);
+  for(int i = 0; i < N; ++i)
+    tmp[i] = evals[N-i-1];
+  evals = tmp;
+  //
+  UTeigenvectors(H, trows, evals, evecs);
+  //UTSymmEigenvectors(H, trows, evals, evecs);
+  for(int i = 0; i < evals.size(); ++i)
+  {
+    evecs[i] = P * evecs[i];
+    normalize(evecs[i]);
+    evals[i] = evals[i] * Hnorm;
+  }
+  // // FIXME this is to test
+  // Hin.write("evecs3", evecs);
+  // Hin.write("evals3", evals);
+  // // check rsd
+  // for(int i = 0; i < M; i++) {
+  //   vector<T> Aevec = Hin * evecs[i];
+  //   RealD norm2(0.);
+  //   for(int j = 0; j < M; j++) {
+  //     norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
+  //   }
+  // }
+  return tot_it;
+}
+
+template <class T>
+void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
+
+  /**
+  turn a matrix A =
+  x  x  x  x  x
+  x  x  x  x  x
+  x  x  x  x  x
+  x  x  x  x  x
+  x  x  x  x  x
+  into
+  x  x  x  x  x
+  x  x  x  x  x
+  0  x  x  x  x
+  0  0  x  x  x
+  0  0  0  x  x
+  with householder rotations
+  Slow.
+  */
+  int N ; SizeSquare(A,N);
+  DenseVector<T > p; Resize(p,N); Fill(p,0);
+
+  for(int k=start;k<N-2;k++){
+    //cerr << "hess" << k << std::endl;
+    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
+    for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);}  ///kth column
+    normalize(ck);    ///Normalization cancels in PHP anyway
+    T beta;
+    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
+    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
+    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
+    ///Accumulate eigenvector
+    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
+  }
+  /*for(int l=0;l<N-2;l++){
+    for(int k=l+2;k<N;k++){
+    A(0,k,l);
+    }
+    }*/
+}
+
+template <class T>
+void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
+///Tridiagonalize a matrix
+  int N; SizeSquare(A,N);
+  Hess(A,Q,start);
+  /*for(int l=0;l<N-2;l++){
+    for(int k=l+2;k<N;k++){
+    A(0,l,k);
+    }
+    }*/
+}
+
+template <class T>
+void ForceTridiagonal(DenseMatrix<T> &A){
+///Force a matrix to tridiagonal form by zeroing every entry outside the three central diagonals
+  int N ; SizeSquare(A,N);
+  for(int l=0;l<N-2;l++){
+    for(int k=l+2;k<N;k++){
+      A[l][k]=0;
+      A[k][l]=0;
+    }
+  }
+}
+
+template <class T>
+int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+  ///Solve a symmetric eigensystem, not necessarily in tridiagonal form
+  int N; SizeSquare(Ain,N);
+  DenseMatrix<T > A; A = Ain;
+  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
+  Tri(A,Q,0);
+  int it = my_Wilkinson<T>(A, evals, evecs, small);
+  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
+  return it;
+}
+
+
+template <class T>
+int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+  return my_Wilkinson(Ain, evals, evecs, small);
+}
+
+template <class T>
+int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+  return my_SymmEigensystem(Ain, evals, evecs, small);
+}
+
+template <class T>
+int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+///Solve a general eigensystem, not necessarily in tridiagonal form
+  int N = Ain.dim;
+  DenseMatrix<T > A(N); A = Ain;
+  DenseMatrix<T > Q(N);Q.Unity();
+  Hess(A,Q,0);
+  int it = QReigensystem<T>(A, evals, evecs, small);
+  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
+  return it;
+}
+
+}
+#endif
--- a/lib/algorithms/iterative/Householder.h
+++ b/lib/algorithms/iterative/Householder.h
@@ -0,0 +1,215 @@
+#ifndef HOUSEHOLDER_H
+#define HOUSEHOLDER_H
+
+#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
+#define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
+#define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
+
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <fstream>
+#include <complex>
+#include <algorithm>
+
+namespace Grid {
+/** Comparison function for finding the max element in a vector **/
+template <class T> bool cf(T i, T j) { 
+  return abs(i) < abs(j); 
+}
+
+/** 
+	Calculate a real Givens angle 
+ **/
+template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
+
+  RealD mz = (RealD)abs(z);
+  
+  if(mz==0.0){
+    c = 1; s = 0;
+  }
+  if(mz >= (RealD)abs(y)){
+    T t = -y/z;
+    s = (T)1.0 / sqrt ((T)1.0 + t * t);
+    c = s * t;
+  } else {
+    T t = -z/y;
+    c = (T)1.0 / sqrt ((T)1.0 + t * t);
+    s = c * t;
+  }
+}
+
+template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
+{
+  int q ; SizeSquare(A,q);
+
+  if(dir == 0){
+    for(int j=0;j<q;j++){
+      T nu = A[i][j];
+      T w  = A[k][j];
+      A[i][j] = (c*nu + s*w);
+      A[k][j] = (-s*nu + c*w);
+    }
+  }
+
+  if(dir == 1){
+    for(int j=0;j<q;j++){
+      T nu = A[j][i];
+      T w  = A[j][k];
+      A[j][i] = (c*nu - s*w);
+      A[j][k] = (s*nu + c*w);
+    }
+  }
+}
+
+/**
+	from input = x;
+	Compute the complex Householder vector, v, such that
+	P = (I - b v transpose(v) )
+	b = 2/v.v
+
+	P | x |    | x | k = 0
+	| x |    | 0 | 
+	| x | =  | 0 |
+	| x |    | 0 | j = 3
+	| x |	   | x |
+
+	These are the "Unreduced" Householder vectors.
+
+ **/
+template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
+{
+  int N ; Size(input,N);
+  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
+
+  if(abs(m) > 0.0){
+    T alpha = 0;
+
+    for(int i=k; i<j+1; i++){
+      v[i] = input[i]/m;
+      alpha = alpha + v[i]*conj(v[i]);
+    }
+    alpha = sqrt(alpha);
+    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
+
+    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
+    else                 v[k] = -alpha;
+  } else{
+    for(int i=k; i<j+1; i++){
+      v[i] = 0.0;
+    } 
+  }
+}
+
+/**
+	from input = x;
+	Compute the complex Householder vector, v, such that
+	P = (I - b v transpose(v) )
+	b = 2/v.v
+
+	Px = alpha*e_dir
+
+	These are the "Unreduced" Householder vectors.
+
+ **/
+
+template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
+{
+  int N = input.size();
+  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
+  
+  if(abs(m) > 0.0){
+    T alpha = 0;
+
+    for(int i=k; i<j+1; i++){
+      v[i] = input[i]/m;
+      alpha = alpha + v[i]*conj(v[i]);
+    }
+    
+    alpha = sqrt(alpha);
+    beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
+	
+    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
+    else                  v[dir] = -alpha;
+  }else{
+    for(int i=k; i<j+1; i++){
+      v[i] = 0.0;
+    } 
+  }
+}
+
+/**
+	Compute the product PA if trans = 0
+	AP if trans = 1
+	P = (I - b v transpose(v) )
+	b = 2/v.v
+	start at element l of matrix A
+	only the first j - k + 1 elements of v are used
+ **/
+
+template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
+{
+  int N ; SizeSquare(A,N);
+
+  if(abs(beta) > 0.0){
+    for(int p=l; p<N; p++){
+      T s = 0;
+      if(trans==0){
+	for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
+	s *= beta;
+	for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
+      } else {
+	for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
+	s *= beta;
+	for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
+      }
+    }
+  }
+}
+
+/**
+	Compute the product PA if trans = 0
+	AP if trans = 1
+	P = (I - b v transpose(v) )
+	b = 2/v.v
+	start at element l of matrix A
+	only the first j - k + 1 elements of v are used
+	A is tridiagonal
+ **/
+template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
+{
+  if(abs(beta) > 0.0){
+
+    int N ; SizeSquare(A,N);
+
+    DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0); 
+
+    T s;
+    for(int p=l; p<M; p++){
+      s = 0;
+      if(trans==0){
+	for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
+      }else{
+	for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
+      }
+      s = beta*s;
+      if(trans==0){
+	for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
+      }else{
+	for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
+      }
+    }
+    for(int p=l; p<M; p++){
+      if(trans==0){
+	for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
+      }else{
+	for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
+      }
+    }
+  }
+}
+}
+#endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
--- a/lib/algorithms/iterative/Matrix.h
+++ b/lib/algorithms/iterative/Matrix.h
@@ -0,0 +1,426 @@
+#ifndef MATRIX_H
+#define MATRIX_H
+
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <complex>
+#include <typeinfo>
+#include <Grid.h>
+
+
+/** Sign function **/
+template <class T> T sign(T p){return ( p/abs(p) );}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////// Hijack STL containers for our wicked means /////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class T> using Vector = Vector<T>;
+template<class T> using Matrix = Vector<Vector<T> >;
+
+template<class T> void Resize(Vector<T > & vec, int N) { vec.resize(N); }
+
+template<class T> void Resize(Matrix<T > & mat, int N, int M) { 
+  mat.resize(N);
+  for(int i=0;i<N;i++){
+    mat[i].resize(M);
+  }
+}
+template<class T> void Size(Vector<T> & vec, int &N) 
+{ 
+  N= vec.size();
+}
+template<class T> void Size(Matrix<T> & mat, int &N,int &M) 
+{ 
+  N= mat.size();
+  M= mat[0].size();
+}
+template<class T> void SizeSquare(Matrix<T> & mat, int &N) 
+{ 
+  int M; Size(mat,N,M);
+  assert(N==M);
+}
+template<class T> void SizeSame(Matrix<T> & mat1,Matrix<T> &mat2, int &N1,int &M1) 
+{ 
+  int N2,M2;
+  Size(mat1,N1,M1);
+  Size(mat2,N2,M2);
+  assert(N1==N2);
+  assert(M1==M2);
+}
+
+//*****************************************
+//*	(Complex) Vector operations	*
+//*****************************************
+
+/**Conj of a Vector **/
+template <class T> Vector<T> conj(Vector<T> p){
+	Vector<T> q(p.size());
+	for(int i=0;i<p.size();i++){q[i] = conj(p[i]);}
+	return q;
+}
+
+/** Norm of a Vector**/
+template <class T> T norm(Vector<T> p){
+	T sum = 0;
+	for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
+	return abs(sqrt(sum));
+}
+
+/** Norm squared of a Vector **/
+template <class T> T norm2(Vector<T> p){
+	T sum = 0;
+	for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
+	return abs((sum));
+}
+
+/** Sum elements of a Vector **/
+template <class T> T trace(Vector<T> p){
+	T sum = 0;
+	for(int i=0;i<p.size();i++){sum = sum + p[i];}
+	return sum;
+}
+
+/** Fill a Vector with constant c **/
+template <class T> void Fill(Vector<T> &p, T c){
+	for(int i=0;i<p.size();i++){p[i] = c;}
+}
+/** Normalize a Vector **/
+template <class T> void normalize(Vector<T> &p){
+	T m = norm(p);
+	if( abs(m) > 0.0) for(int i=0;i<p.size();i++){p[i] /= m;}
+}
+/** Vector by scalar **/
+template <class T, class U> Vector<T> times(Vector<T> p, U s){
+	for(int i=0;i<p.size();i++){p[i] *= s;}
+	return p;
+}
+template <class T, class U> Vector<T> times(U s, Vector<T> p){
+	for(int i=0;i<p.size();i++){p[i] *= s;}
+	return p;
+}
+/** inner product of a and b = conj(a) . b **/
+template <class T> T inner(Vector<T> a, Vector<T> b){
+	T m = 0.;
+	for(int i=0;i<a.size();i++){m = m + conj(a[i])*b[i];}
+	return m;
+}
+/** sum of a and b = a + b **/
+template <class T> Vector<T> add(Vector<T> a, Vector<T> b){
+	Vector<T> m(a.size());
+	for(int i=0;i<a.size();i++){m[i] = a[i] + b[i];}
+	return m;
+}
+/** difference of a and b = a - b **/
+template <class T> Vector<T> sub(Vector<T> a, Vector<T> b){
+	Vector<T> m(a.size());
+	for(int i=0;i<a.size();i++){m[i] = a[i] - b[i];}
+	return m;
+}
+
+/** 
+ *********************************
+ *	Matrices	         *
+ *********************************
+ **/
+
+template<class T> void Fill(Matrix<T> & mat, T&val) { 
+  int N,M;
+  Size(mat,N,M);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<M;j++){
+    mat[i][j] = val;
+  }}
+}
+
+/** Transpose of a matrix **/
+Matrix<T> Transpose(Matrix<T> & mat){
+  int N,M;
+  Size(mat,N,M);
+  Matrix C; Resize(C,M,N);
+  for(int i=0;i<M;i++){
+  for(int j=0;j<N;j++){
+    C[i][j] = mat[j][i];
+  }} 
+  return C;
+}
+/** Set Matrix to unit matrix **/
+template<class T> void Unity(Matrix<T> &mat){
+  int N;  SizeSquare(mat,N);
+  for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      if ( i==j ) A[i][j] = 1;
+      else        A[i][j] = 0;
+    } 
+  } 
+}
+/** Add C * I to matrix **/
+template<class T>
+void PlusUnit(Matrix<T> & A,T c){
+  int dim;  SizeSquare(A,dim);
+  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
+}
+
+/** return the Hermitian conjugate of matrix **/
+Matrix<T> HermitianConj(Matrix<T> &mat){
+
+  int dim; SizeSquare(mat,dim);
+
+  Matrix<T> C; Resize(C,dim,dim);
+
+  for(int i=0;i<dim;i++){
+    for(int j=0;j<dim;j++){
+      C[i][j] = conj(mat[j][i]);
+    } 
+  } 
+  return C;
+}
+
+/** return diagonal entries as a Vector **/
+Vector<T> diag(Matrix<T> &A)
+{
+  int dim; SizeSquare(A,dim);
+  Vector<T> d; Resize(d,dim);
+
+  for(int i=0;i<dim;i++){
+    d[i] = A[i][i];
+  }
+  return d;
+}
+
+/** Left multiply by a Vector **/
+Vector<T> operator *(Vector<T> &B,Matrix<T> &A)
+{
+  int K,M,N; 
+  Size(B,K);
+  Size(A,M,N);
+  assert(K==M);
+  
+  Vector<T> C; Resize(C,N);
+
+  for(int j=0;j<N;j++){
+    T sum = 0.0;
+    for(int i=0;i<M;i++){
+      sum += B[i] * A[i][j];
+    }
+    C[j] =  sum;
+  }
+  return C; 
+}
+
+/** return 1/diagonal entries as a Vector **/
+Vector<T> inv_diag(Matrix<T> & A){
+  int dim; SizeSquare(A,dim);
+  Vector<T> d; Resize(d,dim);
+  for(int i=0;i<dim;i++){
+    d[i] = 1.0/A[i][i];
+  }
+  return d;
+}
+/** Matrix Addition: element-wise A + B; SizeSame supplies the common dimensions N,M **/
+template<class T> inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B)
+{
+  int N,M  ; SizeSame(A,B,N,M);
+  Matrix<T> C; Resize(C,N,M);
+  for(int i=0;i<N;i++){
+    for(int j=0;j<M;j++){
+      C[i][j] = A[i][j] +  B[i][j];
+    } 
+  } 
+  return C;
+} 
+/** Matrix Subtraction: element-wise A - B; SizeSame supplies the common dimensions N,M **/
+template<class T> inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){
+  int N,M  ; SizeSame(A,B,N,M);
+  Matrix<T> C; Resize(C,N,M);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<M;j++){
+    C[i][j] = A[i][j] -  B[i][j];
+  }}
+  return C;
+} 
+
+/** Matrix scalar multiplication: C[i][j] = A[i][j]*c (scalar on the right) **/
+template<class T> inline Matrix<T> operator* (Matrix<T> & A,T c){
+  int N,M; Size(A,N,M);
+  Matrix<T> C; Resize(C,N,M);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<M;j++){
+    C[i][j] = A[i][j]*c;
+  }} 
+  return C;
+} 
+/** Matrix Matrix multiplication: C[i][j] = sum_k A[i][k]*B[k][j]; asserts the inner dimensions agree **/
+template<class T> inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){
+  int K,L,N,M;
+  Size(A,K,L);
+  Size(B,N,M); assert(L==N);
+  Matrix<T> C; Resize(C,K,M);
+
+  for(int i=0;i<K;i++){
+    for(int j=0;j<M;j++){
+      T sum = 0.0;
+      for(int k=0;k<N;k++) sum += A[i][k]*B[k][j];
+      C[i][j] =sum;
+    }
+  }
+  return C; 
+} 
+/** Matrix Vector multiplication: C[i] = sum_j A[i][j]*B[j]; asserts the size of B equals the second dimension of A **/
+template<class T> inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){
+  int M,N,K;
+  Size(A,N,M);
+  Size(B,K); assert(K==M);
+  Vector<T> C; Resize(C,N);
+  for(int i=0;i<N;i++){
+    T sum = 0.0;
+    for(int j=0;j<M;j++) sum += A[i][j]*B[j];
+    C[i] =  sum;
+  }
+  return C; 
+} 
+
+/** Some version of Matrix norm **/
+/*
+inline T Norm(){ // this is not a usual L2 norm
+    T norm = 0;
+    for(int i=0;i<dim;i++){
+      for(int j=0;j<dim;j++){
+	norm += abs(A[i][j]);
+    }}
+    return norm;
+  }
+*/
+
+/** Largest abs(A[i][i]) found on the diagonal of the square matrix A **/
+template<class T> T LargestDiag(Matrix<T> &A)
+{
+  int n ; SizeSquare(A,n); 
+
+  T best = abs(A[0][0]); // entry 0 seeds the search, so A must be non-empty
+  for(int d=1;d<n;d++){
+    T candidate = abs(A[d][d]);
+    if(abs(candidate) > abs(best) ){best = candidate;}
+  }
+  return best;
+}
+
+/** Scan the first subdiagonal upwards from row dim-1-offset; zero the first entry below 'small' and return its row (0 if none) **/
+template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small)
+{
+  int dim; SizeSquare(A,dim);
+  // 'norm' is accepted but not used by this routine
+  for(int row = dim - 1 - offset; row >= 1; --row) {
+    if(!((U)abs(A[row][row - 1]) < (U)small)) continue;
+    A[row][row-1]=(U)0.0;
+    return row;
+  }
+  return 0;
+}
+
+/** As Chop_subdiag, but the mirrored superdiagonal entry A[l-1][l] is zeroed as well **/
+template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small) 
+{
+  int dim; SizeSquare(A,dim);
+  // 'norm' is accepted but not used by this routine
+  for(int row = dim - 1 - offset; row >= 1; --row) {
+    if(!((U)abs(A[row][row - 1]) < (U)small)) continue;
+    A[row][row - 1] = (U)0.0;
+    A[row - 1][row] = (U)0.0;
+    return row;
+  }
+  return 0;
+}
+/**Copy S into rows [row_st,row_end) and columns [col_st,col_end) of the larger matrix A**/
+template<class T>
+void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
+{
+  const int nrow = row_end - row_st, ncol = col_end - col_st; // extent of the copied block
+  for(int r = 0; r<nrow; r++){
+    for(int c = 0; c<ncol; c++){
+      A[row_st + r][col_st + c] = S[r][c];
+    }
+  }}
+
+/**Get the submatrix of A covering rows [row_st,row_end) and columns [col_st,col_end) (need not be square)**/
+template <class T>
+Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end)
+{
+  Matrix<T> H; Resize(H,row_end - row_st,col_end-col_st); // Resize needs the matrix it is to size
+
+  for(int i = row_st; i<row_end; i++){
+  for(int j = col_st; j<col_end; j++){
+    H[i-row_st][j-col_st]=A[i][j];
+  }}
+  return H;
+}
+  
+ /** NB remember Vector Vectors are transposes of the matrices they represent.
+  *
+  * A second, token-for-token identical definition of
+  *   template<class T> void AssignSubMtx(Matrix<T>&,int,int,int,int,Matrix<T>&)
+  * used to follow here. Defining the same function template twice in one
+  * translation unit is a redefinition error, so only the earlier definition
+  * (the one placed directly before GetSubMtx) is kept.
+  **/
+
+  
+/** compute b_i A_ij b_j **/ // surprised no Conj
+template<class T> T proj(Matrix<T> A, Vector<T> B){
+  int n; SizeSquare(A,n);
+  int nB; Size(B,nB);
+  assert(nB==n);
+  T total = 0;
+  for(int row=0;row<n;row++){
+    T rowdot = 0.0; // accumulates sum_col A[row][col]*B[col]
+    for(int col=0;col<n;col++){
+      rowdot += A[row][col]*B[col];
+    }
+    total +=  B[row]*rowdot; // No conj?
+  }
+  return total; 
+}
+
+
+/*
+ *************************************************************
+ *
+ * Matrix Vector products
+ *
+ *************************************************************
+ */
+// Instead make a linop and call my CG;
+
+/// q -> q Q, using all of Q: asserts that the square dimension of Q equals the number of entries in q
+template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q)
+{
+  int nQ; SizeSquare(Q,nQ);
+  int nq; Size(q,nq); 
+  assert(nQ==nq);
+
+  times(q,Q,nq); // forward to the three-argument overload with the full size
+}
+
+/// q -> q Q restricted to the leading N x N block: q[j] <- sum_{k<N} q[k]*Q[k][j] for j<N; q[j] with j>=N is left untouched
+template <class T> void times(multi1d<LatticeFermion> &q, Matrix<T> &Q, int N)
+{
+  GridBase *grid = q[0]._grid;
+  int M; SizeSquare(Q,M);
+  int K; Size(q,K); 
+  assert(N<=M); // indices k,j < N are used on Q, so N may equal its dimension
+  assert(N<=K); // likewise q[k] is read for k < N only
+  Vector<LatticeFermion> S(N,grid ); // results are staged so every q[k] is read before any is overwritten
+  for(int j=0;j<N;j++){
+    S[j] = zero;
+    for(int k=0;k<N;k++){
+      S[j] = S[j] +  q[k]* Q[k][j]; 
+    }
+  }
+  for(int j=0;j<N;j++){ // S holds exactly N entries; running to q.size() read past its end
+    q[j] = S[j];
+  }
+}
+#endif
--- a/lib/algorithms/iterative/MatrixUtils.h
+++ b/lib/algorithms/iterative/MatrixUtils.h
@@ -0,0 +1,48 @@
+#ifndef GRID_MATRIX_UTILS_H
+#define GRID_MATRIX_UTILS_H
+
+namespace Grid {
+
+  namespace MatrixUtils { 
+    // Size: N = number of rows of A, M = length of row 0; asserts A is non-empty and every row has length M
+    template<class T> inline void Size(Matrix<T>& A,int &N,int &M){
+      N=A.size(); assert(N>0);
+      M=A[0].size();
+      for(int i=0;i<N;i++){
+	assert((int)A[i].size()==M); // cast avoids a signed/unsigned comparison
+      }
+    }
+    // SizeSquare: as Size, and additionally asserts that A is square; N receives the dimension
+    template<class T> inline void SizeSquare(Matrix<T>& A,int &N)
+    {
+      int M;
+      Size(A,N,M);
+      assert(N==M);
+    }
+    // Fill: set every element of A to val (val is only read, so it is taken by const reference)
+    template<class T> inline void Fill(Matrix<T>& A,const T & val)
+    { 
+      int N,M;
+      Size(A,N,M);
+      for(int i=0;i<N;i++){
+      for(int j=0;j<M;j++){
+	A[i][j]=val;
+      }}
+    }
+    template<class T> inline void Diagonal(Matrix<T>& A,const T & val) // set A[i][i]=val; off-diagonal entries untouched
+    { 
+      int N;
+      SizeSquare(A,N);
+      for(int i=0;i<N;i++){
+	A[i][i]=val;
+      }
+    }
+    template<class T> inline void Identity(Matrix<T>& A) // A <- unit matrix; A must already be sized and square
+    {
+      const T nought(0.0), one(1.0); // typed constants: a bare 0.0/1.0 literal would make T deduce inconsistently for non-double T
+      Fill(A,nought); Diagonal(A,one);
+    }
+
+  }
+}
+#endif
--- a/lib/algorithms/iterative/PrecConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecConjugateResidual.h
@@ -0,0 +1,92 @@
+#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
+#define GRID_PREC_CONJUGATE_RESIDUAL_H
+
+namespace Grid {
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for iterative processes based on operators
+    // single input vec, single output vec.
+    /////////////////////////////////////////////////////////////
+
+  template<class Field> 
+    class PrecConjugateResidual : public OperatorFunction<Field> { // iterative solver functor; one Preconditioner call and one HermOpAndNorm call per iteration
+  public:                                                
+    RealD   Tolerance;      // relative tolerance: converged when cp < Tolerance^2 * norm2(src)
+    Integer MaxIterations;  // iteration cap; operator() asserts when it is exhausted
+    int verbose;            // non-zero: log the residual on every iteration
+    LinearFunction<Field> &Preconditioner; // stored by reference, so it must outlive this object
+
+    PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit),      Preconditioner(Prec)
+    { 
+      verbose=1;
+    };
+
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ // solve for psi given src; psi is overwritten
+
+      RealD a, b, c, d; // c and d are not used below
+      RealD cp, ssq,rsq;
+      
+      RealD rAr, rAAr, rArp;
+      RealD pAp, pAAp;
+
+      GridBase *grid = src._grid;
+      Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid);
+      
+      psi=zero; // any incoming guess in psi is discarded
+      r  = src;
+      Preconditioner(r,p); // first search direction p is the preconditioner applied to r
+
+      // presumably HermOpAndNorm(in,out,n1,n2) returns out = A in together with two norms of it -- TODO confirm against LinearOperatorBase
+
+      Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
+      Ar=Ap;
+      rAr=pAp;
+      rAAr=pAAp;
+
+      cp =norm2(r);
+      ssq=norm2(src);
+      rsq=Tolerance*Tolerance*ssq; // squared-norm target that cp is compared against
+
+      if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+      for(int k=0;k<MaxIterations;k++){
+
+	// NOTE(review): the order of the updates below matters; each line consumes values produced by the previous ones
+	Preconditioner(Ap,z);
+	RealD rq= real(innerProduct(Ap,z)); 
+
+	a = rAr/rq; // step length
+
+   	axpy(psi,a,p,psi); // presumably psi = a*p + psi -- axpy is defined elsewhere
+   cp = axpy_norm(r,-a,z,r); // presumably r = r - a*z, returning norm2 of the new r
+
+	rArp=rAr; // keep the previous rAr for the ratio b
+
+	Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
+
+	b   =rAr/rArp;
+ 
+	axpy(p,b,p,r);
+	pAAp=axpy_norm(Ap,b,Ap,Ar);
+	
+	if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+	if(cp<rsq) { // converged: recompute the residual from psi for the report
+	  Linop.HermOp(psi,Ap);
+	  axpy(r,-1.0,src,Ap); // presumably r = Ap - src
+	  RealD true_resid = norm2(r)/ssq;
+	  std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
+		   << " computed residual "<<sqrt(cp/ssq)
+	           << " true residual "<<sqrt(true_resid)
+	           << " target "       <<Tolerance <<std::endl;
+	  return;
+	}
+
+      }
+
+      std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
+      assert(0); // non-convergence is treated as fatal
+    }
+  };
+}
+#endif
--- a/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -0,0 +1,175 @@
+#ifndef GRID_PREC_GCR_H
+#define GRID_PREC_GCR_H
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+//VPGCR Abe and Zhang, 2005.
+//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
+//Computing and Information Volume 2, Number 2, Pages 147-161
+//NB. Likely not original reference since they are focussing on a preconditioner variant.
+//    but VPGCR was nicely written up in their paper
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+namespace Grid {
+
+  template<class Field>
+    class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> { // restarted GCR solver with a (possibly variable) preconditioner
+  public:                                                
+    RealD   Tolerance;      // relative tolerance: converged when cp < Tolerance^2 * norm2(src)
+    Integer MaxIterations;  // maximum number of GCRnStep restarts; operator() asserts when exhausted
+    int verbose;            // non-zero: log after every restart
+    int mmax;               // number of (p,q) direction pairs kept; history is indexed modulo mmax
+    int nstep;              // inner steps performed by one GCRnStep call
+    int steps;              // running count of inner steps; reset at the start of operator()
+    LinearFunction<Field> &Preconditioner; // stored by reference, so it must outlive this object
+
+   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
+      Tolerance(tol), 
+      MaxIterations(maxit),
+      verbose(1),
+      mmax(_mmax), nstep(_nstep), steps(0), // initialisers follow the member declaration order
+      Preconditioner(Prec)
+    { 
+      assert(mmax>0); // GCRnStep sizes its history with mmax and computes k % mmax
+    };
+
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ // solve for psi given src; psi is overwritten
+
+      psi=zero; // any incoming guess is discarded
+      RealD cp, ssq,rsq;
+      ssq=norm2(src);
+      rsq=Tolerance*Tolerance*ssq; // squared-norm target
+      
+      Field r(src._grid);
+
+      steps=0;
+      for(int k=0;k<MaxIterations;k++){
+
+	cp=GCRnStep(Linop,src,psi,rsq); // one restart cycle, continuing from the current psi
+
+	if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+
+	if(cp<rsq) { // converged: recompute the residual from psi for the report
+	  Linop.HermOp(psi,r);
+	  axpy(r,-1.0,src,r);
+	  RealD tr = norm2(r);
+	  std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
+		   << " computed residual "<<sqrt(cp/ssq)
+	           << " true residual "    <<sqrt(tr/ssq)
+	           << " target "           <<Tolerance <<std::endl;
+	  return;
+	}
+
+      }
+      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
+      assert(0); // non-convergence is treated as fatal
+    }
+    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){ // up to nstep steps from psi; returns norm2 of the final residual
+
+      RealD cp;
+      RealD a, b;
+      RealD zAz, zAAz;
+      RealD rq;
+
+      GridBase *grid = src._grid;
+
+      Field r(grid);   // residual src - A psi
+      Field z(grid);   // preconditioner applied to r
+      Field tmp(grid); // scratch used only for the diagnostic output
+      Field Az(grid);  // operator applied to z (initially to psi)
+      // A z is computed once per step and reused for the diagnostics, so no further scratch field is needed
+
+      ////////////////////////////////
+      // history for flexible orthog
+      ////////////////////////////////
+      std::vector<Field> q(mmax,grid);
+      std::vector<Field> p(mmax,grid);
+      std::vector<RealD> qq(mmax);
+      
+      //////////////////////////////////
+      // psi may be non-zero here (it carries over between restarts),
+      // so the starting residual is r0 = src - A psi
+      //////////////////////////////////
+      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
+      r=src-Az;
+      
+      /////////////////////
+      // z = Prec(r)
+      /////////////////////
+      Preconditioner(r,z);
+
+      std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl; 
+      std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl; 
+      
+      Linop.HermOpAndNorm(z,Az,zAz,zAAz); // the only operator application on z; Az is reused below
+
+      std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(Az)<<std::endl; 
+      // diagnostic only: |A z - r| / |r| measures how well the preconditioner inverted A on r
+      tmp=Az-r;
+
+      std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+      /*
+      std::cout<<GridLogMessage<<r<<std::endl;
+      std::cout<<GridLogMessage<<z<<std::endl;
+      std::cout<<GridLogMessage<<Az<<std::endl;
+      std::cout<<GridLogMessage<<tmp<<std::endl;
+      */
+
+      // Az and zAAz were obtained above; applying the operator to z a second time would only repeat that work
+
+      //p[0],q[0],qq[0] 
+      p[0]= z;
+      q[0]= Az;
+      qq[0]= zAAz;
+
+      cp =norm2(r);
+
+      for(int k=0;k<nstep;k++){
+
+	steps++;
+
+	int kp     = k+1;
+	int peri_k = k %mmax;   // slot of the current direction in the circular history
+	int peri_kp= kp%mmax;   // slot the next direction will be written to
+
+	rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
+	a = rq/qq[peri_k];
+
+	axpy(psi,a,p[peri_k],psi);         
+
+	cp = axpy_norm(r,-a,q[peri_k],r);  
+
+	std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl; 
+	if((k==nstep-1)||(cp<rsq)){ // last step of this cycle, or converged
+	  return cp;
+	}
+
+	Preconditioner(r,z);// solve Az = r
+	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
+
+	// diagnostic only: reuse Az from the line above instead of
+	// applying the operator to z a second time
+        tmp=Az-r;
+	std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+
+	q[peri_kp]=Az;
+	p[peri_kp]=z;
+
+	int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
+	for(int back=0;back<northog;back++){
+
+	  int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
+
+	  b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
+	  p[peri_kp]=p[peri_kp]+b*p[peri_back];
+	  q[peri_kp]=q[peri_kp]+b*q[peri_back];
+
+	}
+	qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
+
+
+      }
+      assert(0); // only reachable when nstep<=0: the loop returns on its last pass
+      return cp;
+    }
+  };
+}
+#endif
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -89,7 +89,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
-      std::cout << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);

      ///////////////////////////////////////////////////
@@ -108,7 +108,7 @@ namespace Grid {
      RealD ns = norm2(in);
      RealD nr = norm2(resid);

-      std::cout << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
+      std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }     
  };

--- a/lib/algorithms/iterative/bisec.c
+++ b/lib/algorithms/iterative/bisec.c
@@ -0,0 +1,122 @@
+#include <math.h>
+#include <stdlib.h>
+#include <vector>
+
+struct Bisection { // static helpers for eigenvalues of a symmetric tridiagonal matrix by bisection
+
+static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig)
+{
+  int i,j;
+  std::vector<RealD> evec1(row_num+3);
+  std::vector<RealD> evec2(row_num+3);
+  RealD eps2;
+  ALPHA[1]=0.;
+  BETA[1]=0.;
+  for(i=0;i<row_num-1;i++) { // NOTE(review): 'A' is not declared anywhere in this file; looks like a leftover from a version that took the full matrix -- confirm
+    ALPHA[i+1] = A[i*(row_num+1)].real();
+    BETA[i+2] = A[i*(row_num+1)+1].real();
+  }
+  ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real();
+  bisec(ALPHA,BETA,row_num,1,row_num,1e-10,1e-10,evec1,eps2);
+  bisec(ALPHA,BETA,row_num,1,row_num,1e-16,1e-16,evec2,eps2);
+
+  // Do we really need to sort here?
+  int begin=1; // two-directional bubble sort of evec2[1..row_num] by magnitude
+  int end = row_num;
+  int swapped=1;
+  while(swapped) {
+    swapped=0;
+    for(i=begin;i<end;i++){
+      if(mag(evec2[i])>mag(evec2[i+1]))	{ // NOTE(review): 'mag' is not declared in this file -- confirm where it comes from
+	{ RealD t=evec2[i]; evec2[i]=evec2[i+1]; evec2[i+1]=t; } // exchange neighbours (std::vector has no pointer arithmetic)
+	swapped=1;
+      }
+    }
+    end--;
+    for(i=end-1;i>=begin;i--){
+      if(mag(evec2[i])>mag(evec2[i+1]))	{
+	{ RealD t=evec2[i]; evec2[i]=evec2[i+1]; evec2[i+1]=t; }
+	swapped=1;
+      }
+    }
+    begin++;
+  }
+
+  for(i=0;i<row_num;i++){ // NOTE(review): 'H' is not declared and the 'eig' parameter is never written -- presumably eig should receive evec2[i+1]; confirm
+    for(j=0;j<row_num;j++) {
+      if(i==j) H[i*row_num+j]=evec2[i+1];
+      else H[i*row_num+j]=0.;
+    }
+  }
+}
+
+static void bisec(std::vector<RealD> &c,   // diagonal, read as c[1..n]
+		  std::vector<RealD> &b,   // off-diagonal, read as b[1..n]; b[1] is overwritten with 0
+		  int n,                   // matrix dimension
+		  int m1,                  // first eigenvalue index wanted
+		  int m2,                  // last eigenvalue index wanted
+		  RealD eps1,              // absolute tolerance; replaced by a computed value when <= 0
+		  RealD relfeh,            // relative tolerance used in the interval test
+		  std::vector<RealD> &x,   // output: x[m1..m2] receive the eigenvalue estimates
+		  RealD &eps2)             // output: error-bound estimate 0.5*eps1 + 7*(relfeh * spectrum bound)
+{
+  std::vector<RealD> wu(n+2); // per-index lower bounds found so far
+
+  RealD h,q,x1,xu,x0,xmin,xmax; 
+  int i,a,k;
+
+  b[1]=0.0;
+  xmin=c[n]-fabs(b[n]); // bound the spectrum from the diagonal +/- neighbouring off-diagonals
+  xmax=c[n]+fabs(b[n]);
+  for(i=1;i<n;i++){
+    h=fabs(b[i])+fabs(b[i+1]);
+    if(c[i]+h>xmax) xmax= c[i]+h;
+    if(c[i]-h<xmin) xmin= c[i]-h;
+  }
+  xmax *=2.;
+
+  eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin);
+  if(eps1<=0.0) eps1=eps2;
+  eps2=0.5*eps1+7.0*(eps2);
+  x0=xmax;
+  for(i=m1;i<=m2;i++){ // every wanted eigenvalue starts bracketed by [xmin,xmax]
+    x[i]=xmax;
+    wu[i]=xmin;
+  }
+
+  for(k=m2;k>=m1;k--){ // refine from the highest wanted index downwards
+    xu=xmin;
+    i=k;
+    do{ // take the first stored lower bound above xu, searching from index k down
+      if(xu<wu[i]){
+	xu=wu[i];
+	i=m1-1;
+      }
+      i--;
+    }while(i>=m1);
+    if(x0>x[k]) x0=x[k];
+    while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){ // bisect until the bracket [xu,x0] is small enough
+      x1=(xu+x0)/2;
+
+      a=0; // a counts the negative terms of the recurrence below
+      q=1.0;
+      for(i=1;i<=n;i++){
+	q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh); // a zero pivot is replaced by |b[i]|/relfeh
+	if(q<0) a++;
+      }
+      //			printf("x1=%e a=%d\n",x1,a);
+      if(a<k){ // too few negatives: raise the lower bound and record what was learnt
+	if(a<m1){
+	  xu=x1;
+	  wu[m1]=x1;
+	}else {
+	  xu=x1;
+	  wu[a+1]=x1;
+	  if(x[a]>x1) x[a]=x1;
+	}
+      }else x0=x1; // otherwise lower the upper bound
+    }
+    x[k]=(x0+xu)/2;
+  }
+}
+};
--- a/lib/algorithms/iterative/get_eig.c
+++ b/lib/algorithms/iterative/get_eig.c
@@ -0,0 +1 @@
+
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -87,6 +87,14 @@ class CartesianCommunicator {
 			void *recv,
 			int recv_from_rank,
 			int bytes);
+
+    void RecvFrom(void *recv,          // destination buffer for the incoming data
+		  int recv_from_rank,   // rank to receive from
+		  int bytes);           // number of bytes to receive
+    void SendTo(void *xmit,            // source buffer
+		int xmit_to_rank,       // rank to send to
+		int bytes);             // number of bytes to send
+
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -81,13 +81,30 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
 }
+void CartesianCommunicator::RecvFrom(void *recv,
+				     int from,
+				     int bytes) 
+{
+  MPI_Status stat; // filled in by MPI_Recv but not inspected
+  int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat); // source rank doubles as the message tag
+  assert(ierr==MPI_SUCCESS);
+}
+void CartesianCommunicator::SendTo(void *xmit,
+				   int dest,
+				   int bytes)
+{
+  int rank = _processor; // used for tag; must know who it comes from
+  int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,rank,communicator); // bytes are sent as MPI_CHAR elements
+  assert(ierr==MPI_SUCCESS);
+}
+
 // Basic Halo comms primitive
-  void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						  void *xmit,
-						  int dest,
-						  void *recv,
-						  int from,
-						  int bytes)
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
 {
  MPI_Request xrq;
  MPI_Request rrq;
@@ -100,7 +117,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,

  list.push_back(xrq);
  list.push_back(rrq);
-
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -22,6 +22,20 @@ void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}

+void CartesianCommunicator::RecvFrom(void *recv,
+				     int recv_from_rank,
+				     int bytes) 
+{
+  assert(0); // this implementation provides no receive: any call aborts
+}
+void CartesianCommunicator::SendTo(void *xmit,
+				   int xmit_to_rank,
+				   int bytes)
+{
+  assert(0); // this implementation provides no send: any call aborts
+}
+
+
 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -8,7 +8,7 @@ class SimpleCompressor {
 public:
  void Point(int) {};

-  vobj operator() (const vobj &arg) {
+  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
    return arg;
  }
 };
@@ -29,16 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+
+  if ( cbmask == 0x3 ) { 
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-      int o  = n*rhs._grid->_slice_stride[dimension];
-      int bo = n*rhs._grid->_slice_block[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
-      if ( ocb &cbmask ) {
-	buffer[bo+b]=compress(rhs._odata[so+o+b]);
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o  = n*rhs._grid->_slice_stride[dimension];
+	int bo = n*rhs._grid->_slice_block[dimension];
+	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      }
    }
+  } else { 
+     int bo=0;
+     for(int n=0;n<e1;n++){
+       for(int b=0;b<e2;b++){
+	 int o  = n*rhs._grid->_slice_stride[dimension];
+	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	 if ( ocb &cbmask ) {
+	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	 }
+       }
+     }
  }
 }

@@ -59,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+  
+  if ( cbmask ==0x3){
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){

-      int o=n*rhs._grid->_slice_stride[dimension];
-      int offset = b+n*rhs._grid->_slice_block[dimension];
+	int o=n*rhs._grid->_slice_stride[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];

-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb & cbmask ) {
-	cobj temp; 
-	temp =compress(rhs._odata[so+o+b]);
+	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	extract<cobj>(temp,pointers,offset);
+
+      }
+    }
+  } else { 
+
+    assert(0); //Fixme think this is buggy
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o=n*rhs._grid->_slice_stride[dimension];
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	int offset = b+n*rhs._grid->_slice_block[dimension];
+
+	if ( ocb & cbmask ) {
+	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	  extract<cobj>(temp,pointers,offset);
+	}
      }
    }
  }
@@ -109,16 +135,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+  
+  if ( cbmask ==0x3 ) {
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-      int o   =n*rhs._grid->_slice_stride[dimension];
-      int bo  =n*rhs._grid->_slice_block[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
-      if ( ocb & cbmask ) {
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o   =n*rhs._grid->_slice_stride[dimension];
+	int bo  =n*rhs._grid->_slice_block[dimension];
 	rhs._odata[so+o+b]=buffer[bo+b];
      }
    }
+  } else { 
+    int bo=0; // running read position in buffer, advanced once per selected site
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o   =n*rhs._grid->_slice_stride[dimension];
+	// no per-iteration 'bo' here: redeclaring it shadowed the running index, so every selected site read buffer[n*e2]
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	if ( ocb & cbmask ) {
+	  rhs._odata[so+o+b]=buffer[bo++];
+	}
+      }
+    }
  }
 }

@@ -137,16 +175,28 @@ PARALLEL_NESTED_LOOP2
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+
+  if(cbmask ==0x3 ) {
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-      int o      = n*rhs._grid->_slice_stride[dimension];
-      int offset = b+n*rhs._grid->_slice_block[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o      = n*rhs._grid->_slice_stride[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
 	merge(rhs._odata[so+o+b],pointers,offset);
      }
    }
+  } else { 
+    assert(0); // think this is buggy FIXME
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o      = n*rhs._grid->_slice_stride[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	if ( ocb&cbmask ) {
+	  merge(rhs._odata[so+o+b],pointers,offset);
+	}
+      }
+    }
  }
 }

@@ -166,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs

  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){

-      int o =n*rhs._grid->_slice_stride[dimension]+b;
-      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
-      if ( ocb&cbmask ) {
-	//lhs._odata[lo+o]=rhs._odata[ro+o];
+  if(cbmask == 0x3 ){
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+ 
+        int o =n*rhs._grid->_slice_stride[dimension]+b;
+  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
      }
+    }
+  } else { 
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
 
+        int o =n*rhs._grid->_slice_stride[dimension]+b;
+        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
+        if ( ocb&cbmask ) {
+  	//lhs._odata[lo+o]=rhs._odata[ro+o];
+	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	}
+      }
    }
  }
  
--- a/lib/cshift/Cshift_mpi.h
+++ b/lib/cshift/Cshift_mpi.h
@@ -26,10 +26,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension


  if ( !comm_dim ) {
+    //    std::cout << "Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
+    //    std::cout << "Cshift_comms_simd" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
+    //    std::cout << "Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  return ret;
@@ -42,9 +45,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

+  //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+
  if ( sshift[0] == sshift[1] ) {
+    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x3);
  } else {
+    //    std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
@@ -113,12 +120,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

+
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);

+      //      for(int i=0;i<words;i++){
+      //	std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl;
+      //      }
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
    }
  }
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -132,18 +132,18 @@ inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
    assert(cb==lat.checkerboard);
  } 
  cb=lat.checkerboard;
-  //  std::cout<<"Lattice leaf cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
 template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
 inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
 {
-  //  std::cout<<"Non lattice leaf cb"<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
 inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
 {
  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
-  //  std::cout<<"Unary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }

 template <typename Op, typename T1, typename T2>
@@ -151,7 +151,7 @@ inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &ex
 {
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
  CBFromExpression(cb,std::get<1>(expr.second));
-  //  std::cout<<"Binary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
 inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
@@ -159,7 +159,7 @@ inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
  CBFromExpression(cb,std::get<1>(expr.second));
  CBFromExpression(cb,std::get<2>(expr.second));
-  //  std::cout<<"Trinary node cb "<<cb<<std::endl;
+  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }

 ////////////////////////////////////////////
@@ -178,6 +178,7 @@ GridUnopClass(UnaryConj,conjugate(a));
 GridUnopClass(UnaryTrace,trace(a));
 GridUnopClass(UnaryTranspose,transpose(a));
 GridUnopClass(UnaryTa,Ta(a));
+GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
 GridUnopClass(UnaryReal,real(a));
 GridUnopClass(UnaryImag,imag(a));
 GridUnopClass(UnaryToReal,toReal(a));
@@ -290,13 +291,14 @@ GRID_DEF_UNOP(conjugate,UnaryConj);
 GRID_DEF_UNOP(trace,UnaryTrace);
 GRID_DEF_UNOP(transpose,UnaryTranspose);
 GRID_DEF_UNOP(Ta,UnaryTa);
+GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
 GRID_DEF_UNOP(real,UnaryReal);
 GRID_DEF_UNOP(imag,UnaryImag);
 GRID_DEF_UNOP(toReal,UnaryToReal);
 GRID_DEF_UNOP(toComplex,UnaryToComplex);
 GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
 GRID_DEF_UNOP(sqrt ,UnarySqrt);
-GRID_DEF_UNOP(rsqrt,UnarySqrt);
+GRID_DEF_UNOP(rsqrt,UnaryRsqrt);
 GRID_DEF_UNOP(sin  ,UnarySin);
 GRID_DEF_UNOP(cos  ,UnaryCos);
 GRID_DEF_UNOP(log  ,UnaryLog);
@@ -370,7 +372,7 @@ using namespace Grid;
   tmp.func(eval(0,v1),eval(0,v2));

   auto var = v1+v2;
-   std::cout<<typeid(var).name()<<std::endl;
+   std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;

   v3=v1+v2;
   v3=v1+v2+v1*v2;
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -29,6 +29,9 @@ extern int GridCshiftPermuteMap[4][16];
 class LatticeBase {};
 class LatticeExpressionBase {};

+template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
+template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
+
 template <typename Op, typename T1>                           
 class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
 public:
@@ -59,7 +62,12 @@ public:

    GridBase *_grid;
    int checkerboard;
-    std::vector<vobj,alignedAllocator<vobj> > _odata;
+    Vector<vobj> _odata;
+    
+    // to pthread need a computable loop where loop induction is not required
+    int begin(void) { return 0;};
+    int end(void)   { return _odata.size(); }
+    vobj & operator[](int i) { return _odata[i]; };

 public:
    typedef typename vobj::scalar_type scalar_type;
@@ -204,9 +212,10 @@ PARALLEL_FOR_LOOP
    // Constructor requires "grid" passed.
    // what about a default grid?
    //////////////////////////////////////////////////////////////////
- Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
+    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
      //        _odata.reserve(_grid->oSites());
      //        _odata.resize(_grid->oSites());
+    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
        assert((((uint64_t)&_odata[0])&0xF) ==0);
        checkerboard=0;
    }
@@ -221,7 +230,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      std::cout<<"Lattice operator ="<<std::endl;
+      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
--- a/lib/lattice/Lattice_peekpoke.h
+++ b/lib/lattice/Lattice_peekpoke.h
@@ -10,20 +10,11 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Peek internal indices of a Lattice object
    ////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<int Index,class vobj>
-     auto peekIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0]))>
-    {
-      Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = peekIndex<Index>(lhs._odata[ss]);
-        }
-        return ret;
-    };
    template<int Index,class vobj>
-       auto peekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
+       auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
+      ret.checkerboard=lhs.checkerboard;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
@@ -31,9 +22,10 @@ PARALLEL_FOR_LOOP
        return ret;
    };
    template<int Index,class vobj>
-       auto peekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
+       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
+      ret.checkerboard=lhs.checkerboard;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
@@ -45,15 +37,7 @@ PARALLEL_FOR_LOOP
    // Poke internal indices of a Lattice object
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj> 
-    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> & rhs)
-    {
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss]);
-	}      
-    }
-    template<int Index,class vobj> 
-    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
+    void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
    {
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
@@ -61,7 +45,7 @@ PARALLEL_FOR_LOOP
 	}      
    }
    template<int Index,class vobj>
-      void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
+      void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
    {
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -125,7 +125,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  assert(grid!=NULL);

  // FIXME
-  std::cout<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
+  std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;

  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -5,6 +5,37 @@

 namespace Grid {

+
+  //////////////////////////////////////////////////////////////
+  // Allow the RNG state to be less dense than the fine grid
+  //////////////////////////////////////////////////////////////
+  inline int RNGfillable(GridBase *coarse,GridBase *fine)
+  {
+    const int coarse_nd = coarse->_ndimension;
+    const int extra_nd  = fine->_ndimension - coarse_nd;  // leading fine dimensions with no coarse partner
+
+    // The extra (leading) fine dimensions must be neither SIMD-split nor spread over processors
+    assert(extra_nd >= 0);
+    for(int mu=0;mu<extra_nd;mu++){
+      assert(fine->_simd_layout[mu]==1);
+      assert(fine->_processors[mu]==1);
+    }
+
+    // Paired dimensions: identical decomposition, and the fine _rdimensions entry is a whole multiple of the coarse one
+    int fills=1;
+    for(int mu=0;mu<coarse_nd;mu++){
+      const int  fmu   = mu+extra_nd;                 // fine dimension paired with coarse dimension mu
+      const auto fsize = fine->_rdimensions[fmu];
+      const auto csize = coarse->_rdimensions[mu];
+      assert(coarse->_processors[mu]  == fine->_processors[fmu]);
+      assert(coarse->_simd_layout[mu] == fine->_simd_layout[fmu]);
+      assert((fsize / csize)* csize==fsize); 
+      fills = fills *fsize / csize;                   // running product of the per-dimension ratios
+    }
+
+    return fills;
+  }
+
  // Wrap seed_seq to give common interface with random_device
  class fixedSeed {
  public:
@@ -226,26 +257,32 @@ namespace Grid {
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
      
-      conformable(_grid,l._grid);
+      int multiplicity = RNGfillable(_grid,l._grid);

      int     Nsimd =_grid->Nsimd();
      int     osites=_grid->oSites();
      int words=sizeof(scalar_object)/sizeof(scalar_type);

-      std::vector<scalar_object> buf(Nsimd);

+PARALLEL_FOR_LOOP
      for(int ss=0;ss<osites;ss++){
-	for(int si=0;si<Nsimd;si++){

-	  int gdx = generator_idx(ss,si); // index of generator state
-	  scalar_type *pointer = (scalar_type *)&buf[si];
-	  for(int idx=0;idx<words;idx++){
-	    fillScalar(pointer[idx],dist,_generators[gdx]);
+	std::vector<scalar_object> buf(Nsimd);
+	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
+
+	  int sm=multiplicity*ss+m;      // Maps the generator site to the fine site
+
+	  for(int si=0;si<Nsimd;si++){
+	    int gdx = generator_idx(ss,si); // index of generator state
+	    scalar_type *pointer = (scalar_type *)&buf[si];
+	    for(int idx=0;idx<words;idx++){
+	      fillScalar(pointer[idx],dist,_generators[gdx]);
+	    }
 	  }

+	  // merge into SIMD lanes
+	  merge(l._odata[sm],buf);
 	}
-	// merge into SIMD lanes
-	merge(l._odata[ss],buf);
      }
    };

--- a/lib/lattice/Lattice_trace.h
+++ b/lib/lattice/Lattice_trace.h
@@ -26,7 +26,7 @@ PARALLEL_FOR_LOOP
    // Trace Index level dependent operation
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj>
-    inline auto traceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
+    inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
    {
      Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 PARALLEL_FOR_LOOP
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -17,13 +17,14 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  }
 }

+
  ////////////////////////////////////////////////////////////////////////////////////////////
  // remove and insert a half checkerboard
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
    int ssh=0;
-PARALLEL_FOR_LOOP
+    //PARALLEL_FOR_LOOP
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -40,7 +41,7 @@ PARALLEL_FOR_LOOP
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
    int ssh=0;
-PARALLEL_FOR_LOOP
+    //PARALLEL_FOR_LOOP
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -158,6 +159,7 @@ template<class vobj,class CComplex>

  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
+PARALLEL_FOR_LOOP
  for(int ss=0;ss<coarse->oSites();ss++){
    CoarseInner._odata[ss] = coarse_inner._odata[ss];
  }
@@ -168,7 +170,7 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
  GridBase *coarse = ip._grid;
  Lattice<vobj> zz(fineX._grid); zz=zero;
  blockInnerProduct(ip,fineX,fineX);
-  ip = rsqrt(ip);
+  ip = pow(ip,-0.5);
  blockZAXPY(fineX,ip,fineX,zz);
 }
 // useful in multigrid project;
@@ -297,5 +299,42 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }


+// Replicate: fill the fine lattice with periodic copies of the coarse lattice.
+//   coarse : source lattice, read through peekSite at global coordinates
+//   fine   : destination lattice, every global site is written through pokeSite
+// Both grids must have the same number of dimensions (asserted) and pass subdivides().
+// Each fine coordinate is reduced modulo the coarse _gdimensions to find its source site.
+template<class vobj>
+void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+{
+  typedef typename vobj::scalar_object sobj;
+
+  GridBase *cg = coarse._grid;
+  GridBase *fg =   fine._grid;
+
+  int nd = cg->_ndimension;
+
+  subdivides(cg,fg); 
+
+  assert(cg->_ndimension==fg->_ndimension);
+
+  std::vector<int> fcoor(nd);
+  std::vector<int> ccoor(nd);
+  for(int g=0;g<fg->gSites();g++){
+
+    fg->GlobalIndexToGlobalCoor(g,fcoor);
+    for(int d=0;d<nd;d++){
+      ccoor[d] = fcoor[d]%cg->_gdimensions[d]; // wrap the fine coordinate into the coarse volume
+    }
+    
+    // one scalar site object is moved per iteration through the global-coordinate accessors
+    sobj tmp;
+    peekSite(tmp,coarse,ccoor);
+    pokeSite(tmp,fine,fcoor);
+  }
+
+}
+
+
 }
 #endif
--- a/lib/lattice/Lattice_transpose.h
+++ b/lib/lattice/Lattice_transpose.h
@@ -24,7 +24,7 @@ PARALLEL_FOR_LOOP
    // Index level dependent transpose
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj>
-    inline auto transposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
+    inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
    {
      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 PARALLEL_FOR_LOOP
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -24,6 +24,17 @@ PARALLEL_FOR_LOOP
    return ret;
  }

+  template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){
+    Lattice<obj> quotient(rhs._grid);                  // result is built on the same grid as the input
+    quotient.checkerboard = rhs.checkerboard;
+    conformable(quotient,rhs);
+PARALLEL_FOR_LOOP
+    for(int site=0;site<rhs._grid->oSites();site++){
+      quotient._odata[site]=div(rhs._odata[site],y);   // site-wise div of each stored object by y
+    }
+    return quotient;
+  }
+
  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
--- a/lib/lattice/Lattice_where.h
+++ b/lib/lattice/Lattice_where.h
@@ -22,7 +22,6 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  typedef typename iobj::vector_type mask_type;

  const int Nsimd = grid->Nsimd();
-  const int words = sizeof(vobj)/sizeof(vector_type);

  std::vector<Integer> mask(Nsimd);
  std::vector<scalar_object> truevals (Nsimd);
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -0,0 +1,512 @@
+#ifndef GRID_BINARY_IO_H
+#define GRID_BINARY_IO_H
+
+
+#ifdef HAVE_ENDIAN_H
+#include <endian.h>
+#endif
+#include <arpa/inet.h>
+#include <algorithm>
+// 64bit endian swap is a portability pain
+#ifndef __has_builtin         // Optional of course.
+#define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif
+
+#if HAVE_DECL_BE64TOH 
+#undef Grid_ntohll
+#define Grid_ntohll be64toh
+#endif
+
+#if HAVE_DECL_NTOHLL
+#undef  Grid_ntohll
+#define Grid_ntohll ntohll
+#endif
+
+#ifndef Grid_ntohll
+// Undefined macros evaluate to 0 inside #if, so an unguarded BYTE_ORDER == BIG_ENDIAN is true when neither is defined
+#if !defined(BYTE_ORDER) || !defined(BIG_ENDIAN)
+#error "BYTE_ORDER/BIG_ENDIAN are not defined: cannot choose a 64-bit network-to-host conversion for Grid_ntohll"
+#elif BYTE_ORDER == BIG_ENDIAN
+#define Grid_ntohll(A) (A)
+#else 
+
+#if __has_builtin(__builtin_bswap64)
+#define Grid_ntohll(A) __builtin_bswap64(A)
+#else
+#error "No 64-bit byte swap available for Grid_ntohll: need be64toh, ntohll or __builtin_bswap64"
+#endif
+
+#endif
+
+#endif
+
+namespace Grid { 
+
+  // Erase every whitespace character from key in place. The character is passed as unsigned char because ::isspace is undefined for negative char values.
+  inline void removeWhitespace(std::string &key)
+  {
+    key.erase(std::remove_if(key.begin(), key.end(), [](unsigned char c){ return ::isspace(c)!=0; }),key.end());
+  }
+
+class BinaryIO {
+
+ public:
+
+
+  // Network is big endian. Each host-to-file routine forwards to the matching file-to-host routine (a byte reversal undoes itself)
+  static inline void htobe32_v(void *file_object,uint32_t bytes){ be32toh_v(file_object,bytes);} 
+  static inline void htobe64_v(void *file_object,uint32_t bytes){ be64toh_v(file_object,bytes);} 
+  static inline void htole32_v(void *file_object,uint32_t bytes){ le32toh_v(file_object,bytes);} 
+  static inline void htole64_v(void *file_object,uint32_t bytes){ le64toh_v(file_object,bytes);} 
+
+  static inline void be32toh_v(void *file_object,uint32_t bytes)
+  {
+    uint32_t *word = (uint32_t *)file_object;  // buffer is converted in place, one 32-bit word at a time
+    for(size_t done=0; done<bytes; done+=sizeof(uint32_t), word++){ // a trailing partial word is still converted as a whole word
+      *word = ntohl(*word);
+    }
+  }
+
+  // Little-endian file data: reverse the bytes of each 32-bit word into network order,
+  // then apply the ordinary network-to-host conversion. The buffer is converted in place.
+  static inline void le32toh_v(void *file_object,uint32_t bytes)
+  {
+    uint32_t *word = (uint32_t *)file_object;
+
+    for(size_t done=0; done<bytes; done+=sizeof(uint32_t), word++){
+      const uint32_t le = *word;
+      // file (little-endian) order -> network (big-endian) order
+      const uint32_t be = ((le&0xFF)<<24) | ((le&0xFF00)<<8) | ((le&0xFF0000)>>8) | ((le&0xFF000000UL)>>24) ; 
+      *word = ntohl(be);
+    }
+  }
+
+  // Big-endian file data is already in network order: only the network-to-host step is applied, in place
+  static inline void be64toh_v(void *file_object,uint32_t bytes)
+  {
+    uint64_t *word = (uint64_t *)file_object;
+    for(size_t done=0; done<bytes; done+=sizeof(uint64_t), word++){ // a trailing partial word is still converted as a whole word
+      *word = Grid_ntohll(*word);
+    }
+  }
+  
+  // Little-endian 64-bit file data: reverse all eight bytes of each word into network order,
+  // then apply the ordinary network-to-host conversion. The buffer is converted in place.
+  static inline void le64toh_v(void *file_object,uint32_t bytes)
+  {
+    uint64_t *word = (uint64_t *)file_object;
+    
+    for(size_t done=0; done<bytes; done+=sizeof(uint64_t), word++){
+      const uint64_t lo = *word;      // the masks below only pick out its low four bytes
+      const uint64_t hi = lo >> 32;   // the high four bytes, shifted down
+      // byte-reversed low half ...
+      uint64_t rev = ((lo&0xFF)<<24) | ((lo&0xFF00)<<8) | ((lo&0xFF0000)>>8) | ((lo&0xFF000000UL)>>24) ; 
+      // ... moves to the top, and the byte-reversed high half fills the bottom
+      rev = (rev<<32) | ((hi&0xFF)<<24) | ((hi&0xFF00)<<8) | ((hi&0xFF0000)>>8) | ((hi&0xFF000000UL)>>24) ; 
+      *word = Grid_ntohll(rev);
+    }
+  }
+
+  template<class vobj,class fobj,class munger> static inline void Uint32Checksum(Lattice<vobj> lat,munger munge,uint32_t &csum)
+  {
+    // Checksum over the locally held sites as accumulated by munge, then summed over ranks with GlobalSum
+    typedef typename vobj::scalar_object sobj;
+    GridBase *g = lat._grid ;
+    std::cout <<GridLogMessage<< "Uint32Checksum "<<norm2(lat)<<std::endl;
+    sobj site_value;               // scalar object peeked from the lattice
+    fobj file_value;               // handed to munge together with csum
+    std::vector<int> coor;         // local coordinate of the current site
+    csum = 0;
+    for(int idx=0;idx<g->lSites();idx++){
+      g->CoorFromIndex(coor,idx,g->_ldimensions);
+      peekLocalSite(site_value,lat,coor);
+      munge(site_value,file_value,csum);
+    }
+    g->GlobalSum(csum);
+  }
+    
+  static inline void Uint32Checksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum)
+  {
+    for(size_t done=0; done<buf_size_bytes; done+=sizeof(uint32_t)){ // csum is added to, never reset here
+      csum = csum + *buf++;
+    }
+  }
+  
+  // Serial reader: one fobj record per site is read from 'file' starting at byte 'offset', x fastest and t slowest
+  // over the four _fdimensions. Only the rank where grid->IsBoss() holds reads, converts and munges; pokeSite runs on all ranks.
+  template<class vobj,class fobj,class munger>
+  static inline uint32_t readObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
+  {
+    typedef typename vobj::scalar_object sobj;
+    GridBase *grid = Umu._grid;
+
+    std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
+
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64"));
+
+    // Find the location of each site and send to primary node
+    // Take loop order from Chroma; defines loop order now that NERSC doc no longer
+    // available (how short sighted is that?)
+    std::ifstream fin(file,std::ios::binary|std::ios::in);
+    fin.seekg(offset);
+
+    Umu = zero;
+    uint32_t csum=0;
+    fobj file_object;
+    sobj munged;
+    
+    for(int t=0;t<grid->_fdimensions[3];t++){
+    for(int z=0;z<grid->_fdimensions[2];z++){
+    for(int y=0;y<grid->_fdimensions[1];y++){
+    for(int x=0;x<grid->_fdimensions[0];x++){
+      std::vector<int> site({x,y,z,t});
+
+      if ( grid->IsBoss() ) {
+	fin.read((char *)&file_object,sizeof(file_object));
+	// at most one flag is set; an unrecognised format string leaves the bytes exactly as read
+	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
+	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
+	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
+	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
+	// NOTE(review): munge presumably fills 'munged' from file_object and updates csum - confirm against the munger types
+	munge(file_object,munged,csum);
+      }
+      // The boss who read the file has their value poked (NOTE(review): 'munged' is never assigned here on other ranks - confirm pokeSite uses the boss's value)
+      pokeSite(munged,Umu,site);
+    }}}}
+    return csum; // csum is only passed to munge inside the boss branch, so it is still 0 on every other rank
+  }
+
+  // Serial writer: every rank peeks and munges each site (x fastest, t slowest over the four _fdimensions),
+  // but only the rank where grid->IsBoss() holds byte-converts the fobj record and writes it, starting at
+  // byte 'offset' of a file opened with in|out. Returns csum as left by the munge calls.
+  template<class vobj,class fobj,class munger> 
+  static inline uint32_t writeObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string & format)
+  {
+    typedef typename vobj::scalar_object sobj;
+    GridBase *grid = Umu._grid;
+
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64"));
+
+    //////////////////////////////////////////////////
+    // Serialise through node zero
+    //////////////////////////////////////////////////
+    std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
+
+    std::ofstream fout;
+    if ( grid->IsBoss() ) {
+      fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+      fout.seekp(offset);
+    }
+    
+    uint32_t csum=0;
+    fobj file_object;
+    sobj unmunged;
+    for(int t=0;t<grid->_fdimensions[3];t++){
+    for(int z=0;z<grid->_fdimensions[2];z++){
+    for(int y=0;y<grid->_fdimensions[1];y++){
+    for(int x=0;x<grid->_fdimensions[0];x++){
+
+      std::vector<int> site({x,y,z,t});
+      // peek & write
+      peekSite(unmunged,Umu,site);
+
+      munge(unmunged,file_object,csum);
+
+      if ( grid->IsBoss() ) {
+	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+	
+	fout.write((char *)&file_object,sizeof(file_object));
+      }
+    }}}}
+
+    return csum;
+  }
+
+  // Parallel reader. Dimension 0 is never split between readers: a rank is an I/O node when its
+  // processor coordinate in dimension 0 is zero, and it then reads the records for every rank
+  // that shares its remaining processor coordinates. Each record is fetched by seeking to
+  // offset+g_idx*sizeof(fobj), converted according to 'format' and munged; the resulting site
+  // object is poked locally or sent to the owning rank. Requires no checkerboarded dimension.
+  // Returns the checksum accumulated by munge on the I/O nodes, summed with GlobalSum.
+  template<class vobj,class fobj,class munger>
+  static inline uint32_t readObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
+  {
+    typedef typename vobj::scalar_object sobj;
+    GridBase *grid = Umu._grid;
+
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64"));
+
+    // Take into account block size of parallel file systems want about
+    // 4-16MB chunks.
+    // Ideally one reader/writer per xy plane and read these contiguously
+    // with comms from nominated I/O nodes.
+    std::ifstream fin;
+
+    int nd = grid->_ndimension;
+    std::vector<int> parallel(nd,1);
+    std::vector<int> ioproc  (nd);
+    std::vector<int> start(nd);
+    std::vector<int> range(nd);
+
+    for(int d=0;d<nd;d++){
+      assert(grid->CheckerBoarded(d) == 0);
+    }
+
+    uint64_t slice_vol = 1;
+
+    int IOnode = 1;
+    for(int d=0;d<grid->_ndimension;d++) {
+      if ( d==0 ) parallel[d] = 0;
+      if (parallel[d]) {
+	range[d] = grid->_ldimensions[d];
+	start[d] = grid->_processor_coor[d]*range[d];
+	ioproc[d]= grid->_processor_coor[d];
+      } else {
+	range[d] = grid->_gdimensions[d];
+	start[d] = 0;
+	ioproc[d]= 0;
+
+	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+      }
+      slice_vol = slice_vol * range[d];
+    }
+
+    {
+      uint32_t tmp = IOnode;
+      grid->GlobalSum(tmp);
+      std::cout<< GridLogMessage<< "Parallel read I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
+      for(int d=0;d<grid->_ndimension;d++){
+	std::cout<< range[d];
+	if( d< grid->_ndimension-1 ) 
+	  std::cout<< " x ";
+      }
+      std::cout << std::endl;
+    }
+
+    int myrank = grid->ThisRank();
+    int iorank = grid->RankFromProcessorCoor(ioproc);
+
+    if ( IOnode ) { 
+      fin.open(file,std::ios::binary|std::ios::in);
+    }
+
+    //////////////////////////////////////////////////////////
+    // Find the location of each site and send to primary node
+    // Take loop order from Chroma; defines loop order now that NERSC doc no longer
+    // available (how short sighted is that?)
+    //////////////////////////////////////////////////////////
+    Umu = zero;
+    uint32_t csum=0;
+    fobj fileObj;
+    sobj siteObj;
+
+      // need to implement these loops in Nd independent way with a lexico conversion
+    for(int tlex=0;tlex<slice_vol;tlex++){
+      std::vector<int> tsite(nd); // temporary mixed up site
+      std::vector<int> gsite(nd);
+      std::vector<int> lsite(nd);
+
+      grid->CoorFromIndex(tsite,tlex,range);
+
+      for(int d=0;d<nd;d++){
+	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+	gsite[d] = tsite[d]+start[d];               // global site
+      }
+
+      /////////////////////////
+      // Get the rank of owner of data
+      /////////////////////////
+	int rank, o_idx,i_idx, g_idx;
+      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
+      grid->GlobalCoorToGlobalIndex(gsite,g_idx);
+      
+      ////////////////////////////////
+      // iorank reads from the seek
+      ////////////////////////////////
+      if (myrank == iorank) {
+	fin.seekg(offset+g_idx*sizeof(fileObj));
+	fin.read((char *)&fileObj,sizeof(fileObj));
+	
+	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+	
+	munge(fileObj,siteObj,csum);
+	
+	if ( rank != myrank ) {
+	  grid->SendTo((void *)&siteObj,rank,sizeof(siteObj));
+	} else { 
+	  pokeLocalSite(siteObj,Umu,lsite);
+	}
+	 
+      } else { 
+	if ( myrank == rank ) {
+	  grid->RecvFrom((void *)&siteObj,iorank,sizeof(siteObj));
+	  pokeLocalSite(siteObj,Umu,lsite);
+	} 
+      }
+      grid->Barrier(); // necessary?
+    }
+
+    grid->GlobalSum(csum);
+    
+    return csum;
+  }
+
+  //////////////////////////////////////////////////////////
+  // Parallel writer. Mirror image of the parallel reader: the rank whose processor coordinate in
+  // dimension 0 is zero gathers each site of its slice (peeked locally or received from the owner),
+  // munges and byte-converts it, then writes it at offset+g_idx*sizeof(fobj).
+  // The file is opened with in|out, so it is updated in place rather than truncated.
+  // Returns the checksum accumulated by munge on the I/O nodes, summed with GlobalSum.
+  //////////////////////////////////////////////////////////
+  template<class vobj,class fobj,class munger>
+  static inline uint32_t writeObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string & format)
+  {
+    typedef typename vobj::scalar_object sobj;
+    GridBase *grid = Umu._grid;
+
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64"));
+
+    int nd = grid->_ndimension;
+    for(int d=0;d<nd;d++){
+      assert(grid->CheckerBoarded(d) == 0);
+    }
+
+    std::vector<int> parallel(nd,1);
+    std::vector<int> ioproc  (nd);
+    std::vector<int> start(nd);
+    std::vector<int> range(nd);
+
+    uint64_t slice_vol = 1;
+
+    int IOnode = 1;
+    for(int d=0;d<grid->_ndimension;d++) {
+      if ( d==0 ) parallel[d] = 0;
+      if (parallel[d]) {
+	range[d] = grid->_ldimensions[d];
+	start[d] = grid->_processor_coor[d]*range[d];
+	ioproc[d]= grid->_processor_coor[d];
+      } else {
+	range[d] = grid->_gdimensions[d];
+	start[d] = 0;
+	ioproc[d]= 0;
+
+	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+      }
+
+      slice_vol = slice_vol * range[d];
+    }
+    
+    {
+      uint32_t tmp = IOnode;
+      grid->GlobalSum(tmp);
+      std::cout<< GridLogMessage<< "Parallel write I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
+      for(int d=0;d<grid->_ndimension;d++){
+	std::cout<< range[d];
+	if( d< grid->_ndimension-1 ) 
+	  std::cout<< " x ";
+      }
+      std::cout << std::endl;
+    }
+
+    int myrank = grid->ThisRank();
+    int iorank = grid->RankFromProcessorCoor(ioproc);
+
+    // Take into account block size of parallel file systems want about
+    // 4-16MB chunks.
+    // Ideally one reader/writer per xy plane and read these contiguously
+    // with comms from nominated I/O nodes.
+    std::ofstream fout;
+    if ( IOnode ) fout.open(file,std::ios::binary|std::ios::in|std::ios::out);
+
+    //////////////////////////////////////////////////////////
+    // Find the location of each site and send to primary node
+    // Take loop order from Chroma; defines loop order now that NERSC doc no longer
+    // available (how short sighted is that?)
+    //////////////////////////////////////////////////////////
+
+    uint32_t csum=0;
+    fobj fileObj;
+    sobj siteObj;
+
+
+      // need to implement these loops in Nd independent way with a lexico conversion
+    for(int tlex=0;tlex<slice_vol;tlex++){
+	
+      std::vector<int> tsite(nd); // temporary mixed up site
+      std::vector<int> gsite(nd);
+      std::vector<int> lsite(nd);
+
+      grid->CoorFromIndex(tsite,tlex,range);
+
+      for(int d=0;d<nd;d++){
+	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+	gsite[d] = tsite[d]+start[d];               // global site
+      }
+
+
+      /////////////////////////
+      // Get the rank of owner of data
+      /////////////////////////
+      int rank, o_idx,i_idx, g_idx;
+      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
+      grid->GlobalCoorToGlobalIndex(gsite,g_idx);
+
+      ////////////////////////////////
+      // iorank writes from the seek
+      ////////////////////////////////
+      if (myrank == iorank) {
+
+	if ( rank != myrank ) {
+	  grid->RecvFrom((void *)&siteObj,rank,sizeof(siteObj));
+	} else { 
+	  peekLocalSite(siteObj,Umu,lsite);
+	}
+	
+	munge(siteObj,fileObj,csum);
+
+	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+	
+	fout.seekp(offset+g_idx*sizeof(fileObj));
+	fout.write((char *)&fileObj,sizeof(fileObj));
+
+      } else { 
+	if ( myrank == rank ) {
+	  peekLocalSite(siteObj,Umu,lsite);
+	  grid->SendTo((void *)&siteObj,iorank,sizeof(siteObj));
+	} 
+      }
+      grid->Barrier(); // necessary// or every 16 packets to rate throttle??
+    }
+
+    grid->GlobalSum(csum);
+
+    return csum;
+  }
+
+};
+
+}
+
+#endif
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -7,57 +7,23 @@
 #include <fstream>
 #include <map>

-#ifdef HAVE_ENDIAN_H
-#include <endian.h>
-#endif
-
-
-#include <arpa/inet.h>
-
-// 64bit endian swap is a portability pain
-#ifndef __has_builtin         // Optional of course.
-#define __has_builtin(x) 0  // Compatibility with non-clang compilers.
-#endif
-
-#if HAVE_DECL_BE64TOH 
-#undef Grid_ntohll
-#define Grid_ntohll be64toh
-#endif
-
-#if HAVE_DECL_NTOHLL
-#undef  Grid_ntohll
-#define Grid_ntohll ntohll
-#endif
-
-#ifndef Grid_ntohll
-
-#if BYTE_ORDER == BIG_ENDIAN 
-
-#define Grid_ntohll(A) (A)
-
-#else 
-
-#if __has_builtin(__builtin_bswap64)
-#define Grid_ntohll(A) __builtin_bswap64(A)
-#else
-#error
-#endif
-
-#endif
-
-#endif
+#include <unistd.h>
+#include <sys/utsname.h>
+#include <pwd.h>

 namespace Grid {
+namespace QCD {

-  using namespace QCD;
+using namespace Grid;

 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
  template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, 4 >;
-typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
-typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
-typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
+
+  typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
+  typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
+  typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;

 ////////////////////////////////////////////////////////////////////////////////
 // header specification/interpretation
@@ -86,50 +52,173 @@ class NerscField {
 };


+//////////////////////////////////////////////////////////////////////
+// Bit and Physical Checksumming and QA of data
+//////////////////////////////////////////////////////////////////////
+
+// Copy the four entries of grid->_fdimensions into header.dimension and mark
+// every direction's boundary as "PERIODIC".  Only 4-dimensional grids are accepted.
+inline void NerscGrid(GridBase *grid,NerscField &header)
+{
+  assert(grid->_ndimension==4);
+  for(int mu=0;mu<4;mu++) {
+    header.dimension[mu] = grid->_fdimensions[mu];
+    header.boundary[mu]  = std::string("PERIODIC");
+  }
+}
+// Record the link trace and average plaquette measured on `data` in `header`.
+template<class GaugeField> inline void NerscStatistics(GaugeField & data,NerscField &header) {
+  typedef Grid::QCD::WilsonLoops<GaugeField> Observables;
+  header.link_trace = Observables::linkTrace(data);
+  header.plaquette  = Observables::avgPlaquette(data);
+}
+
+// Fill the provenance fields of `header`: creator, creation/archive date, creator hardware.
+inline void NerscMachineCharacteristics(NerscField &header)
+{
+  // Who: login name for the real uid; creator is left as-is if the lookup fails
+  struct passwd *pw = getpwuid (getuid());
+  if (pw) header.creator = std::string(pw->pw_name);
+
+  // When: local time formatted as "%c %Z".  strftime stands in for the disabled
+  // std::put_time call, whose absence had left both date fields empty.
+  std::time_t t = std::time(nullptr);
+  const std::tm *tm = std::localtime(&t);
+  char stamp[128];
+  // strftime leaves the buffer indeterminate when it returns 0, so reset it then
+  if ( !tm || std::strftime(stamp,sizeof(stamp),"%c %Z",tm) == 0 ) stamp[0] = '\0';
+  header.creation_date = std::string(stamp);
+  header.archive_date  = header.creation_date;
+
+  // What: "nodename-machine-sysname-release"; left as-is if uname() fails
+  struct utsname name;
+  if ( uname(&name) >= 0 )
+    header.creator_hardware = std::string(name.nodename)+"-"+name.machine+"-"+name.sysname+"-"+name.release;
+}
+//////////////////////////////////////////////////////////////////////
+// Utilities ; these are QCD aware
+//////////////////////////////////////////////////////////////////////
+    inline void NerscChecksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum) // thin forwarder; csum is handed on by reference
+    {
+      BinaryIO::Uint32Checksum(buf,buf_size_bytes,csum); // the checksum rule itself lives in BinaryIO (not shown here)
+    }
+    // Fill row 2 of each of the four link matrices with adj(row0 x row1).
+    inline void reconstruct3(LorentzColourMatrix & cm)
+    {
+      for(int mu=0;mu<4;mu++){
+	// cyclic cross product: component c is built from columns (c+1)%3 and (c+2)%3
+	for(int c=0;c<3;c++){
+	  const int a=(c+1)%3, b=(c+2)%3;
+	  cm(mu)()(2,c) = adj(cm(mu)()(0,a)*cm(mu)()(1,b)-cm(mu)()(0,b)*cm(mu)()(1,a));
+	}
+      }
+    }
+
+    // Read-side converter for the full-matrix layout: copies each link's colour matrix
+    // from `in` to `out`, then checksums `in`.  Bounds use Nd/Nc like NerscSimpleUnmunger.
+    template<class fobj,class sobj>
+    struct NerscSimpleMunger{
+      void operator() (fobj &in,sobj &out,uint32_t &csum){
+	for(int mu=0;mu<Nd;mu++){
+	for(int i=0;i<Nc;i++){
+	for(int j=0;j<Nc;j++){
+	  out(mu)()(i,j) = in(mu)()(i,j);
+	}}}
+	NerscChecksum((uint32_t *)&in,sizeof(in),csum);
+      }
+    };
+
+    // Write-side converter, full-matrix layout: copy every element, then checksum `out`.
+    template<class fobj,class sobj>
+    struct NerscSimpleUnmunger{
+      void operator() (sobj &in,fobj &out,uint32_t &csum){
+	for(int mu=0;mu<Nd;mu++)
+	  for(int row=0;row<Nc;row++)
+	    for(int col=0;col<Nc;col++)
+	      out(mu)()(row,col) = in(mu)()(row,col);
+	NerscChecksum(reinterpret_cast<uint32_t *>(&out),sizeof(out),csum);
+      }
+    };
+ 
+    // Read-side converter for the two-row layout: checksums the file object `in`,
+    // copies the two stored colour rows of each link and rebuilds the third.
+    template<class fobj,class sobj>
+    struct Nersc3x2munger{
+      void operator() (fobj &in,sobj &out,uint32_t &csum){
+	// the checksum covers `in`, before any reshaping into `out`
+	NerscChecksum(reinterpret_cast<uint32_t *>(&in),sizeof(in),csum);
+	for(int mu=0;mu<4;mu++)
+	  for(int row=0;row<2;row++)
+	    for(int col=0;col<3;col++)
+	      out(mu)()(row,col) = in(mu)(row)(col);
+	// row 2 is not carried by `in`; derive it from rows 0 and 1
+	reconstruct3(out);
+      }
+    };
+
+    // Write-side converter for the two-row layout: keeps colour rows 0 and 1 of
+    // each link and checksums the resulting file object.
+    template<class fobj,class sobj>
+    struct Nersc3x2unmunger{
+      void operator() (sobj &in,fobj &out,uint32_t &csum){
+	// Fill `out` first: the checksum below is computed over its contents.
+	for(int mu=0;mu<4;mu++){
+	  for(int row=0;row<2;row++){
+	    for(int col=0;col<3;col++){
+	      out(mu)(row)(col) = in(mu)()(row,col);
+	    }
+	  }
+	}
+
+	NerscChecksum(reinterpret_cast<uint32_t *>(&out),sizeof(out),csum);
+      }
+    };
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Write and read from fstream; comput header offset for payload
 ////////////////////////////////////////////////////////////////////////////////
-inline unsigned int writeNerscHeader(NerscField &field,std::string file)
-{
-  std::ofstream fout(file,std::ios::out);
+class NerscIO : public BinaryIO { 
+ public:

-  fout.seekp(0,std::ios::beg);
-  fout << "BEGIN_HEADER"      << std::endl;
-  fout << "HDR_VERSION = "    << field.hdr_version    << std::endl;
-  fout << "DATATYPE = "       << field.data_type      << std::endl;
-  fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;
+  static inline unsigned int writeHeader(NerscField &field,std::string file) // writes the text header to `file`, stores the end position in field.data_start and returns it
+  {
+    std::ofstream fout(file,std::ios::out); // NOTE(review): the stream state is never checked after opening
  
-  for(int i=0;i<4;i++){
-    fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
-  }
-  // just to keep the space and write it later
-  fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
-  fout << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl;
-  for(int i=0;i<4;i++){
-    fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
-  }
-  fout << "CHECKSUM = "<< std::hex << std::setw(16) << 0 << field.checksum << std::endl;
+    fout.seekp(0,std::ios::beg);
+    fout << "BEGIN_HEADER"      << std::endl;
+    fout << "HDR_VERSION = "    << field.hdr_version    << std::endl;
+    fout << "DATATYPE = "       << field.data_type      << std::endl;
+    fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;

-  fout << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;
-  fout << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;
-  fout << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;
-  fout << "CREATOR = "         << field.creator          << std::endl;
-  fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
-  fout << "CREATION_DATE = "   << field.creation_date    << std::endl;
-  fout << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;
-  fout << "FLOATING_POINT = "  << field.floating_point   << std::endl;
-  fout << "END_HEADER"         << std::endl;
-  field.data_start = fout.tellp();
-  return field.data_start;
+    for(int i=0;i<4;i++){
+      fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
+    }
+    // just to keep the space and write it later
+    fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
+    fout << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl;
+    for(int i=0;i<4;i++){
+      fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
+    }
+
+    fout << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::endl; // std::hex persists on the stream, hence the std::dec reset on the next line
+    fout << std::dec;
+
+    fout << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;
+    fout << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;
+    fout << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;
+    fout << "CREATOR = "         << field.creator          << std::endl;
+    fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
+    fout << "CREATION_DATE = "   << field.creation_date    << std::endl;
+    fout << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;
+    fout << "FLOATING_POINT = "  << field.floating_point   << std::endl;
+    fout << "END_HEADER"         << std::endl;
+    field.data_start = fout.tellp(); // stream position just past the END_HEADER line
+    return field.data_start;
 }

-// A little helper
-inline void removeWhitespace(std::string &key)
-{
-  key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
-}
 // for the header-reader
-inline int readNerscHeader(std::string file,GridBase *grid,  NerscField &field)
+static inline int readHeader(std::string file,GridBase *grid,  NerscField &field)
 {
  int offset=0;
  std::map<std::string,std::string> header;
@@ -163,7 +252,6 @@ inline int readNerscHeader(std::string file,GridBase *grid,  NerscField &field)
  //////////////////////////////////////////////////
  // chomp the values
  //////////////////////////////////////////////////
-
  field.hdr_version    = header["HDR_VERSION"];
  field.data_type      = header["DATATYPE"];
  field.storage_format = header["STORAGE_FORMAT"];
@@ -200,314 +288,21 @@ inline int readNerscHeader(std::string file,GridBase *grid,  NerscField &field)
 }


-//////////////////////////////////////////////////////////////////////
-// Utilities
-//////////////////////////////////////////////////////////////////////
-inline void reconstruct3(LorentzColourMatrix & cm)
-{
-  const int x=0;
-  const int y=1;
-  const int z=2;
-  for(int mu=0;mu<4;mu++){
-    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
-  }
-}


- void inline be32toh_v(void *file_object,uint32_t bytes)
- {
-   uint32_t * f = (uint32_t *)file_object;
-   for(int i=0;i*sizeof(uint32_t)<bytes;i++){  
-     f[i] = ntohl(f[i]);
-   }
- }
- void inline le32toh_v(void *file_object,uint32_t bytes)
- {
-   uint32_t *fp = (uint32_t *)file_object;
-
-   uint32_t f;
-
-   for(int i=0;i*sizeof(uint32_t)<bytes;i++){  
-     f = fp[i];
-     // got network order and the network to host
-     f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
-     fp[i] = ntohl(f);
-   }
- }
- void inline be64toh_v(void *file_object,uint32_t bytes)
- {
-   uint64_t * f = (uint64_t *)file_object;
-   for(int i=0;i*sizeof(uint64_t)<bytes;i++){  
-     f[i] = Grid_ntohll(f[i]);
-   }
- }
- void inline le64toh_v(void *file_object,uint32_t bytes)
- {
-   uint64_t *fp = (uint64_t *)file_object;
-   uint64_t f,g;
-
-   for(int i=0;i*sizeof(uint64_t)<bytes;i++){  
-     f = fp[i];
-     // got network order and the network to host
-     g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
-     g = g << 32;
-     f = f >> 32;
-     g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
-     fp[i] = ntohl(g);
-   }
- }
-
-inline void NerscChecksum(uint32_t *buf,uint32_t buf_size,uint32_t &csum)
-{
-  for(int i=0;i*sizeof(uint32_t)<buf_size;i++){
-    csum=csum+buf[i];
-  }
-}
-
-  template<class fobj,class sobj>
-  struct NerscSimpleMunger{
-    void operator() (fobj &in,sobj &out,uint32_t &csum){
-
-      for(int mu=0;mu<4;mu++){
-      for(int i=0;i<3;i++){
-      for(int j=0;j<3;j++){
-	out(mu)()(i,j) = in(mu)()(i,j);
-      }}}
-
-      NerscChecksum((uint32_t *)&in,sizeof(in),csum); 
-    };
-  };
-  template<class fobj,class sobj>
-  struct NerscSimpleUnmunger{
-    void operator() (sobj &in,fobj &out,uint32_t &csum){
-      for(int mu=0;mu<4;mu++){
-      for(int i=0;i<3;i++){
-      for(int j=0;j<3;j++){
-	out(mu)()(i,j) = in(mu)()(i,j);
-      }}}
-      NerscChecksum((uint32_t *)&out,sizeof(out),csum); 
-    };
-  };
- 
- template<class fobj,class sobj>
- struct Nersc3x2munger{
-   void operator() (fobj &in,sobj &out,uint32_t &csum){
-     
-     NerscChecksum((uint32_t *)&in,sizeof(in),csum); 
-
-     for(int mu=0;mu<4;mu++){
-       for(int i=0;i<2;i++){
-       for(int j=0;j<3;j++){
-	   out(mu)()(i,j) = in(mu)(i)(j);
-       }}
-     }
-     reconstruct3(out);
-   }
- };
-
- template<class fobj,class sobj>
- struct Nersc3x2unmunger{
-
-  void operator() (sobj &in,fobj &out,uint32_t &csum){
-
-    NerscChecksum((uint32_t *)&out,sizeof(out),csum); 
-
-    for(int mu=0;mu<4;mu++){
-      for(int i=0;i<2;i++){
-      for(int j=0;j<3;j++){
-	out(mu)(i)(j) = in(mu)()(i,j);
-      }}
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////
-// Template wizardry to map types to strings for NERSC in an extensible way
-////////////////////////////////////////////////////////////////////////////
- template<class vobj> struct NerscDataType {
-   static void DataType     (std::string &str) { str = std::string("4D_BINARY_UNKNOWN"); };
-   static void FloatingPoint(std::string &str) { str = std::string("IEEE64BIG"); };
- };
-
- template<> struct NerscDataType<iColourMatrix<ComplexD> > {
-   static void DataType     (std::string &str) { str = std::string("4D_SU3_GAUGE_3X3"); };
-   static void FloatingPoint(std::string &str) { str = std::string("IEEE64BIG");};
- };
-
- template<> struct NerscDataType<iColourMatrix<ComplexF> > {
-   static void DataType     (std::string &str) { str = std::string("4D_SU3_GAUGE_3X3"); };
-   static void FloatingPoint(std::string &str) { str = std::string("IEEE32BIG");};
- };
-
-//////////////////////////////////////////////////////////////////////
-// Bit and Physical Checksumming and QA of data
-//////////////////////////////////////////////////////////////////////
-/*
-template<class vobj> inline uint32_t NerscChecksum(Lattice<vobj> & data)
-{
-  uint32_t sum;
-  for(int ss=0;ss<data._grid->Osites();ss++){
-    uint32_t *iptr = (uint32_t *)& data._odata[0] ;
-    for(int i=0;i<sizeof(vobj);i+=sizeof(uint32_t)){
-      sum=sum+iptr[i];
-    }
-  }
-  data._grid->globalSum(sum);
-  return sum;
-}
-*/
-template<class vobj> inline void NerscPhysicalCharacteristics(Lattice<vobj> & data,NerscField &header)
-{
-  header.data_type      = NerscDataType<vobj>::DataType;
-  header.floating_point = NerscDataType<vobj>::FloatingPoint;
-  return;
-}
-
- template<> inline void NerscPhysicalCharacteristics(LatticeGaugeField & data,NerscField &header)
-{
-  NerscDataType<decltype(data._odata[0])>::DataType(header.data_type);
-  NerscDataType<decltype(data._odata[0])>::FloatingPoint(header.floating_point);
-  header.link_trace=1.0;
-  header.plaquette =1.0;
-}
-
-template<class vobj> inline void NerscStatisics(Lattice<vobj> & data,NerscField &header)
-{
-  assert(data._grid->_ndimension==4);
-
-  for(int d=0;d<4;d++)
-    header.dimension[d] = data._grid->_fdimensions[d];
-
-  // compute checksum and any physical properties contained for this type
-  //  header.checksum = NerscChecksum(data);
-
-  NerscPhysicalCharacteristics(data,header);
-}
-
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Now the meat: the object readers
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class vobj,class sobj,class fobj,class munger>
-inline void readNerscObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,std::string &format)
+
+template<class vsimd>
+static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file)
 {
+  typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+
  GridBase *grid = Umu._grid;
+  int offset = readHeader(file,Umu._grid,header);

-  int ieee32big = (format == std::string("IEEE32BIG"));
-  int ieee32    = (format == std::string("IEEE32"));
-  int ieee64big = (format == std::string("IEEE64BIG"));
-  int ieee64    = (format == std::string("IEEE64"));
-
-  // Find the location of each site and send to primary node
-  //  for(int site=0; site < Layout::vol(); ++site){
-  //     multi1d<int> coord = crtesn(site, Layout::lattSize());
-  //     for(int dd=0; dd<Nd; dd++){        /* dir */
-  //        cfg_in.readArray(su3_buffer, float_size, mat_size);
-  //
-  // Above from Chroma; defines loop order now that NERSC doc no longer
-  // available (how short sighted is that?)
-  {
-    std::ifstream fin(file,std::ios::binary|std::ios::in);
-    fin.seekg(offset);
-
-    Umu = zero;
-    uint32_t csum=0;
-    fobj file_object;
-    sobj munged;
-    
-    for(int t=0;t<grid->_fdimensions[3];t++){
-    for(int z=0;z<grid->_fdimensions[2];z++){
-    for(int y=0;y<grid->_fdimensions[1];y++){
-    for(int x=0;x<grid->_fdimensions[0];x++){
-
-      std::vector<int> site({x,y,z,t});
-
-      if ( grid->IsBoss() ) {
-	fin.read((char *)&file_object,sizeof(file_object));
-
-	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
-
-	munge(file_object,munged,csum);
-      }
-      // The boss who read the file has their value poked
-      pokeSite(munged,Umu,site);
-    }}}}
-  }
-}
-
-template<class vobj,class sobj,class fobj,class munger>
-inline void writeNerscObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,
-                       int sequence,double lt,double pl)
-{
-  GridBase *grid = Umu._grid;
-  NerscField header;
-  
-  //////////////////////////////////////////////////
-  // First write the header; this is in wrong place
-  //////////////////////////////////////////////////
-  assert(grid->_ndimension == 4);
-  for(int d=0;d<4;d++){
-    header.dimension[d]=grid->_fdimensions[d];
-    header.boundary [d]=std::string("PERIODIC");; 
-  }
-  header.hdr_version=std::string("WHATDAHECK");
-  //  header.storage_format=storage_format<vobj>::string; // use template specialisation
-  //  header.data_type=data_type<vobj>::string;
-  header.storage_format=std::string("debug");
-  header.data_type     =std::string("debug");
-
-  //FIXME; use template specialisation to fill these out
-  header.link_trace   =lt;
-  header.plaquette    =pl;
-  header.checksum     =0;
-
-  //
-  header.sequence_number =sequence;
-  header.ensemble_id     =std::string("UKQCD");
-  header.ensemble_label  =std::string("UKQCD");
-  header.creator         =std::string("Tadahito");
-  header.creator_hardware=std::string("BlueGene/Q");
-  header.creation_date   =std::string("AnnoDomini");
-  header.archive_date    =std::string("AnnoDomini");
-  header.floating_point  =std::string("IEEE64BIG");
-  //  header.data_start=;
-  //  unsigned int checksum;
-
-  //////////////////////////////////////////////////
-  // Now write the body
-  //////////////////////////////////////////////////
-  {
-    std::ofstream fout(file,std::ios::binary|std::ios::out);
-    fout.seekp(offset);
-
-    Umu = zero;
-    uint32_t csum=0;
-    fobj file_object;
-    sobj unmunged;
-    for(int t=0;t<grid->_fdimensions[3];t++){
-    for(int z=0;z<grid->_fdimensions[2];z++){
-    for(int y=0;y<grid->_fdimensions[1];y++){
-    for(int x=0;x<grid->_fdimensions[0];x++){
-      std::vector<int> site({x,y,z,t});
-      peekSite(unmunged,Umu,site);
-      munge(unmunged,file_object,csum);
-      // broadcast & insert
-      fout.write((char *)&file_object,sizeof(file_object));
-    }}}}
-  }
-}
-
-
-
-inline void readNerscConfiguration(LatticeGaugeField &Umu,NerscField& header,std::string file)
-{
-  GridBase *grid = Umu._grid;
-
-  int offset = readNerscHeader(file,Umu._grid,header);
+  NerscField clone(header);

  std::string format(header.floating_point);

@@ -516,48 +311,106 @@ inline void readNerscConfiguration(LatticeGaugeField &Umu,NerscField& header,std
  int ieee64big = (format == std::string("IEEE64BIG"));
  int ieee64    = (format == std::string("IEEE64"));

+  uint32_t csum;
  // depending on datatype, set up munger;
  // munger is a function of <floating point, Real, data_type>
  if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
    if ( ieee32 || ieee32big ) {
-      readNerscObject<vLorentzColourMatrix, LorentzColourMatrix, LorentzColour2x3F> 
-	(Umu,file,
-	 Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(),
-	 offset,format);
+      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+	(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
    }
    if ( ieee64 || ieee64big ) {
-      readNerscObject<vLorentzColourMatrix, LorentzColourMatrix, LorentzColour2x3D> 
-	(Umu,file,
-	 Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),
-	 offset,format);
+      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
    }
  } else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
    if ( ieee32 || ieee32big ) {
-      readNerscObject<vLorentzColourMatrix,LorentzColourMatrix,LorentzColourMatrixF>
+      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
    }
    if ( ieee64 || ieee64big ) {
-      readNerscObject<vLorentzColourMatrix,LorentzColourMatrix,LorentzColourMatrixD>
+      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
    }
  } else {
    assert(0);
  }

+  NerscStatistics<GaugeField>(Umu,clone);
+
+  assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+  assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
+  assert(csum == header.checksum );
+
+  std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
 }

-template<class vobj>
-inline void writeNerscConfiguration(Lattice<vobj> &Umu,NerscField &header,std::string file)
+// Write Umu to `file` in NERSC format: text header first, then a payload tagged IEEE64BIG.
+// two_row != 0 stores two colour rows per link (4D_SU3_GAUGE), otherwise the full matrix.
+template<class vsimd>
+static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,std::string file, int two_row,int bits32)
 {
-  GridBase &grid = Umu._grid;
+  typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;

-  NerscStatisics(Umu,header);
+  typedef iLorentzColourMatrix<vsimd> vobj;
+  typedef typename vobj::scalar_object sobj;

-  int offset = writeNerscHeader(header,file);
+  // Following should become arguments
+  NerscField header;
+  header.sequence_number = 1;
+  header.ensemble_id     = "UKQCD";
+  header.ensemble_label  = "DWF";

-  writeNerscObject(Umu,NerscSimpleMunger<vobj,vobj>(),offset);
-}
+  typedef LorentzColourMatrixD fobj3D;
+  typedef LorentzColour2x3D    fobj2D;
+  typedef LorentzColourMatrixF fobj3f;
+  typedef LorentzColour2x3F    fobj2f;
+  // NOTE(review): bits32 is never read and fobj3f/fobj2f are unused; output is always IEEE64BIG.
+  GridBase *grid = Umu._grid;
+  NerscGrid(grid,header);
+  NerscStatistics<GaugeField>(Umu,header);
+  NerscMachineCharacteristics(header);
+
+  uint32_t csum;
+  int offset;
+  if ( two_row ) { 
+    header.floating_point = std::string("IEEE64BIG");
+    header.data_type      = std::string("4D_SU3_GAUGE");
+    Nersc3x2unmunger<fobj2D,sobj> munge;
+    // header.checksum is printed by writeHeader, so it is computed before the header is written
+    BinaryIO::Uint32Checksum<vobj,fobj2D>(Umu, munge,header.checksum);
+    offset = writeHeader(header,file);
+    csum=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
+    // Self-test: repeat the write through the parallel path into file+"para" and compare results.
+    std::string file1 = file+"para";
+    int offset1 = writeHeader(header,file1);
+    uint32_t csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset1,header.floating_point);

    
-}
+    std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
+    std::cout << GridLogMessage <<std::hex<< " TESTING PARALLEL WRITE csums   " << csum1 << " "<< csum << std::endl;
+    std::cout << std::dec;
+
+    assert(offset1==offset);  
+    assert(csum1==csum);  
+
+  } else { 
+    header.floating_point = std::string("IEEE64BIG");
+    header.data_type      = std::string("4D_SU3_GAUGE_3X3");
+    NerscSimpleUnmunger<fobj3D,sobj> munge;
+    BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
+    offset = writeHeader(header,file);
+    csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
+  }
+
+  std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
+
+ }
+};
+
+}}
 #endif
--- a/lib/pugixml/README.md
+++ b/lib/pugixml/README.md
@@ -0,0 +1,44 @@
+pugixml [![Build Status](https://travis-ci.org/zeux/pugixml.svg?branch=master)](https://travis-ci.org/zeux/pugixml) [![Build status](https://ci.appveyor.com/api/projects/status/9hdks1doqvq8pwe7/branch/master?svg=true)](https://ci.appveyor.com/project/zeux/pugixml)
+=======
+
+pugixml is a C++ XML processing library, which consists of a DOM-like interface with rich traversal/modification
+capabilities, an extremely fast XML parser which constructs the DOM tree from an XML file/buffer, and an XPath 1.0
+implementation for complex data-driven tree queries. Full Unicode support is also available, with Unicode interface
+variants and conversions between different Unicode encodings (which happen automatically during parsing/saving).
+
+pugixml is used by a lot of projects, both open-source and proprietary, for its performance and easy-to-use interface.
+
+## Documentation
+
+Documentation for the current release of pugixml is available on-line as two separate documents:
+
+* [Quick-start guide](http://pugixml.org/docs/quickstart.html), that aims to provide enough information to start using the library;
+* [Complete reference manual](http://pugixml.org/docs/manual.html), that describes all features of the library in detail.
+
+You’re advised to start with the quick-start guide; however, many important library features are either not described in it at all or only mentioned briefly; if you require more information you should read the complete manual.
+
+## License
+This library is available to anybody free of charge, under the terms of MIT License:
+
+Copyright (c) 2006-2015 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
--- a/lib/pugixml/pugiconfig.hpp
+++ b/lib/pugixml/pugiconfig.hpp
@@ -0,0 +1,71 @@
+/**
+ * pugixml parser - version 1.6
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+
+// Uncomment this to enable long long support
+// #define PUGIXML_HAS_LONG_LONG
+
+#endif
+
+/**
+ * Copyright (c) 2006-2015 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
--- a/lib/pugixml/pugixml.cc
+++ b/lib/pugixml/pugixml.cc
--- a/lib/pugixml/pugixml.h
+++ b/lib/pugixml/pugixml.h
--- a/lib/pugixml/readme.txt
+++ b/lib/pugixml/readme.txt
@@ -0,0 +1,52 @@
+pugixml 1.6 - an XML processing library
+
+Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+Report bugs and download new versions at http://pugixml.org/
+
+This is the distribution of pugixml, which is a C++ XML processing library,
+which consists of a DOM-like interface with rich traversal/modification
+capabilities, an extremely fast XML parser which constructs the DOM tree from
+an XML file/buffer, and an XPath 1.0 implementation for complex data-driven
+tree queries. Full Unicode support is also available, with Unicode interface
+variants and conversions between different Unicode encodings (which happen
+automatically during parsing/saving).
+
+The distribution contains the following folders:
+
+	contrib/ - various contributions to pugixml
+
+	docs/ - documentation
+		docs/samples - pugixml usage examples
+		docs/quickstart.html - quick start guide
+		docs/manual.html - complete manual
+
+	scripts/ - project files for IDE/build systems
+
+	src/ - header and source files
+
+	readme.txt - this file.
+
+This library is distributed under the MIT License:
+
+Copyright (c) 2006-2015 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -19,15 +19,25 @@ namespace QCD {
    static const int Nd=4;
    static const int Nhs=2; // half spinor
    static const int Nds=8; // double stored gauge field
+    static const int Ngp=2; // gparity index range

    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
    //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 1;
-    static const int SpinIndex   = 2;
-    static const int LorentzIndex= 3;
+    static const int ColourIndex = 2;
+    static const int SpinIndex   = 1;
+    static const int LorentzIndex= 0;

+    // Useful trait: does a type's TensorLevel match the spinor index?
+    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
+
+    const int SpinorIndex = 2;
+    template<typename T> struct isSpinor {
+      static const bool value = (SpinorIndex==T::TensorLevel);
+    };
+    template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
+    template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;

    // ChrisK very keen to add extra space for Gparity doubling.
    //
@@ -49,6 +59,9 @@ namespace QCD {
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;

+    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;   // full spinor: Ns spin components (the half-spinor variant below uses Nhs)
+    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+
    // Spin matrix
    typedef iSpinMatrix<Complex  >          SpinMatrix;
    typedef iSpinMatrix<ComplexF >          SpinMatrixF;
@@ -140,7 +153,7 @@ namespace QCD {
    typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
    
    // singlets
-    typedef iSinglet<Complex >         TComplex;    // FIXME This is painful. Tensor singlet complex type.
+    typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
    typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
    typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tensor singlet complex type.

@@ -148,7 +161,7 @@ namespace QCD {
    typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
    typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure

-    typedef iSinglet<Real >            TReal;       // Shouldn't need these; can I make it work without?
+    typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
    typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
    typedef iSinglet<RealD>            TRealD;       // Shouldn't need these; can I make it work without?

@@ -237,6 +250,8 @@ namespace QCD {
    typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
    typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;

+    template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
+
    // Uhgg... typing this hurt  ;)
    // (my keyboard got burning hot when I typed this, must be the anti-Fermion)
    typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
@@ -252,47 +267,47 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////

    //spin
-    template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(peekIndex<SpinIndex>(rhs,0))
+    template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
    {
-      return peekIndex<SpinIndex>(rhs,i);
+      return PeekIndex<SpinIndex>(rhs,i);
    }
-    template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(peekIndex<SpinIndex>(rhs,0,0))
+    template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
    {
-      return peekIndex<SpinIndex>(rhs,i,j);
+      return PeekIndex<SpinIndex>(rhs,i,j);
    }
-    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(peekIndex<SpinIndex>(rhs,0))
+    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
    {
-      return peekIndex<SpinIndex>(rhs,i);
+      return PeekIndex<SpinIndex>(rhs,i);
    }
-    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(peekIndex<SpinIndex>(rhs,0,0))
+    template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
    {
-      return peekIndex<SpinIndex>(rhs,i,j);
+      return PeekIndex<SpinIndex>(rhs,i,j);
    }
    //colour
-    template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(peekIndex<ColourIndex>(rhs,0))
+    template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
    {
-      return peekIndex<ColourIndex>(rhs,i);
+      return PeekIndex<ColourIndex>(rhs,i);
    }
-    template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(peekIndex<ColourIndex>(rhs,0,0))
+    template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
    {
-      return peekIndex<ColourIndex>(rhs,i,j);
+      return PeekIndex<ColourIndex>(rhs,i,j);
    }
-    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(peekIndex<ColourIndex>(rhs,0))
+    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
    {
-      return peekIndex<ColourIndex>(rhs,i);
+      return PeekIndex<ColourIndex>(rhs,i);
    }
-    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(peekIndex<ColourIndex>(rhs,0,0))
+    template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
    {
-      return peekIndex<ColourIndex>(rhs,i,j);
+      return PeekIndex<ColourIndex>(rhs,i,j);
    }
    //lorentz
-    template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(peekIndex<LorentzIndex>(rhs,0))
+    template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
    {
-      return peekIndex<LorentzIndex>(rhs,i);
+      return PeekIndex<LorentzIndex>(rhs,i);
    }
-    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(peekIndex<LorentzIndex>(rhs,0))
+    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
    {
-      return peekIndex<LorentzIndex>(rhs,i);
+      return PeekIndex<LorentzIndex>(rhs,i);
    }

    //////////////////////////////////////////////
@@ -303,35 +318,35 @@ namespace QCD {
 		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
 		      int i)
    {
-      pokeIndex<ColourIndex>(lhs,rhs,i);
+      PokeIndex<ColourIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeColour(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
 		      int i,int j)
    {
-      pokeIndex<ColourIndex>(lhs,rhs,i,j);
+      PokeIndex<ColourIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
 		      int i)
    {
-      pokeIndex<SpinIndex>(lhs,rhs,i);
+      PokeIndex<SpinIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
 		      int i,int j)
    {
-      pokeIndex<SpinIndex>(lhs,rhs,i,j);
+      PokeIndex<SpinIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeLorentz(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
 		      int i)
    {
-      pokeIndex<LorentzIndex>(lhs,rhs,i);
+      PokeIndex<LorentzIndex>(lhs,rhs,i);
    }

    //////////////////////////////////////////////
@@ -411,6 +426,11 @@ namespace QCD {
 #include <qcd/utils/LinalgUtils.h>
 #include <qcd/utils/CovariantCshift.h>
 #include <qcd/utils/WilsonLoops.h>
+#include <qcd/utils/SUn.h>
 #include <qcd/action/Actions.h>
+#include <qcd/hmc/integrators/Integrator.h>
+#include <qcd/hmc/integrators/Integrator_algorithm.h>
+#include <qcd/hmc/HMC.h>
+

 #endif
--- a/lib/qcd/action/ActionBase.h
+++ b/lib/qcd/action/ActionBase.h
@@ -0,0 +1,58 @@
+#ifndef QCD_ACTION_BASE
+#define QCD_ACTION_BASE
+namespace Grid {
+namespace QCD{
+
+template<class GaugeField>
+class Action { 
+  // Abstract interface for one term of the action, templated on the gauge field type.
+ public:
+  // Open questions left by the author: boundary conditions? heatbath?
+  virtual void  refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions
+  virtual RealD S    (const GaugeField &U)                        = 0;  // evaluate the action
+  virtual void  deriv(const GaugeField &U,GaugeField & dSdU )     = 0;  // evaluate the action derivative; dSdU is the output argument
+  virtual ~Action() {};  // virtual so that derived actions can be destroyed through an Action pointer
+};
+
+// Could derive PseudoFermion action with a PF field, FermionField, and a Grid; implement refresh
+/*
+template<class GaugeField, class FermionField>
+class PseudoFermionAction : public Action<GaugeField> {
+ public:
+  FermionField Phi;
+  GridParallelRNG &pRNG;
+  GridBase &Grid;
+
+  PseudoFermionAction(GridBase &_Grid,GridParallelRNG &_pRNG) : Grid(_Grid), Phi(&_Grid), pRNG(_pRNG) {
+  };
+
+  virtual void refresh(const GaugeField &gauge) {
+    gaussian(Phi,pRNG);
+  };
+
+};
+*/
+
+template<class GaugeField> struct ActionLevel{
+public:
+  // A list of action pointers that share one integer multiplier.
+  typedef Action<GaugeField>*  ActPtr; // now force the same colours as the rest of the code
+  // Positive integer attached to this level; presumably a relative step count used by the integrator -- TODO confirm.
+  int multiplier;
+  // Raw pointers, stored as given; nothing in this struct deletes them.
+  std::vector<ActPtr> actions;
+  // mul must be > 0; this is checked with assert only, so the check vanishes when NDEBUG is defined.
+  ActionLevel(int mul = 1) : multiplier(mul) {
+    assert (mul > 0);
+  };
+  // Append one action to this level.
+  void push_back(ActPtr ptr){
+    actions.push_back(ptr);
+  }
+};
+
+template<class GaugeField> using ActionSet = std::vector<ActionLevel< GaugeField > >;
+
+
+}}
+#endif
--- a/lib/qcd/action/ActionParams.h
+++ b/lib/qcd/action/ActionParams.h
@@ -0,0 +1,29 @@
+#ifndef GRID_QCD_ACTION_PARAMS_H
+#define GRID_QCD_ACTION_PARAMS_H
+
+namespace Grid {
+namespace QCD {
+
+    // These can move into a params header and be given MacroMagic serialisation
+    struct GparityWilsonImplParams {
+      std::vector<int> twists; // presumably one entry per lattice direction selecting a G-parity twist -- TODO confirm
+    };
+
+    struct WilsonImplParams { }; // empty: this parameter struct declares no members
+
+    struct OneFlavourRationalParams { 
+      RealD  lo;        // presumably the lower bound of the approximation interval -- TODO confirm
+      RealD  hi;        // presumably the upper bound of the approximation interval -- TODO confirm
+      int MaxIter;   // iteration limit. Author's note: Vector?
+      RealD tolerance; // defaults to 1.0e-8 via the constructor. Author's note: Vector?
+      int    degree=10;    // in-class default; the constructor below always overwrites it
+      int precision=64;    // in-class default; the constructor below always overwrites it
+      // lo, hi and MaxIter must be supplied; tolerance, degree and precision have defaults. There is no default constructor.
+      OneFlavourRationalParams (RealD _lo,RealD _hi,int _maxit,RealD tol=1.0e-8,int _degree = 10,int _precision=64) :
+        lo(_lo), hi(_hi), MaxIter(_maxit), tolerance(tol), degree(_degree), precision(_precision)
+      {};
+    };
+
+}}
+
+#endif
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -6,68 +6,173 @@
 // are separating the concept of the operator from that of action.
 //
 // The FermAction contains methods to create 
-//
 // * Linear operators             (Hermitian and non-hermitian)  .. my LinearOperator
 // * System solvers               (Hermitian and non-hermitian)  .. my OperatorFunction
 // * MultiShift System solvers    (Hermitian and non-hermitian)  .. my OperatorFunction

-
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
-#include <qcd/action/fermion/FermionOperator.h>
+#include <qcd/action/ActionBase.h>
+#include <qcd/action/ActionParams.h>
+
+

 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
 #include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <qcd/action/fermion/FermionOperatorImpl.h>
+#include <qcd/action/fermion/FermionOperator.h>
 #include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions

+
 ////////////////////////////////////////////
-// 4D formulations
+// Gauge Actions
 ////////////////////////////////////////////
-#include <qcd/action/fermion/WilsonFermion.h>
+#include <qcd/action/gauge/WilsonGaugeAction.h>
+namespace Grid {
+namespace QCD {
+typedef WilsonGaugeAction<LatticeGaugeField>     WilsonGaugeActionR;
+typedef WilsonGaugeAction<LatticeGaugeFieldF>    WilsonGaugeActionF;
+typedef WilsonGaugeAction<LatticeGaugeFieldD>    WilsonGaugeActionD;
+}}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Explicit template instantiation is still required in the .cc files
+//
+// - CayleyFermion5D.cc
+// - PartialFractionFermion5D.cc
+// - WilsonFermion5D.cc
+// - WilsonKernelsHand.cc
+// - ContinuedFractionFermion5D.cc
+// - WilsonFermion.cc
+// - WilsonKernels.cc
+//
+// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
+// for EVERY .cc file. This define centralises the list and restores global push of impl cases
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define FermOpTemplateInstantiate(A) \
+  template class A<WilsonImplF>;		\
+  template class A<WilsonImplD>; 
+  //  template class A<GparityWilsonImplF>;   (Gparity not instantiated yet; these two lines sit outside the macro.
+  //  template class A<GparityWilsonImplD>;    No trailing backslash here: it would splice the next line into the comment.)
+
+////////////////////////////////////////////
+// Fermion operators / actions
+////////////////////////////////////////////
+
+#include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+
 //#include <qcd/action/fermion/CloverFermion.h>

-////////////////////////////////////////////
-// 5D formulations...
-////////////////////////////////////////////
-
-#include <qcd/action/fermion/WilsonFermion5D.h> // used by all 5d overlap types
-
-//////////
-// Cayley
-//////////
-#include <qcd/action/fermion/CayleyFermion5D.h>
-
+#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 #include <qcd/action/fermion/DomainWallFermion.h>
 #include <qcd/action/fermion/DomainWallFermion.h>
-
 #include <qcd/action/fermion/MobiusFermion.h>
 #include <qcd/action/fermion/ScaledShamirFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
-
 #include <qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>

-//////////////////////
-// Continued fraction
-//////////////////////
-#include <qcd/action/fermion/ContinuedFractionFermion5D.h>
+#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
 #include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>

-//////////////////////
-// Partial fraction
-//////////////////////
-#include <qcd/action/fermion/PartialFractionFermion5D.h>
+#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
 #include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// It is easier to maintain the following typedef list in one central place as more "impl" targets
+// are added (e.g. extension for gparity, half precision project in comms, etc.).
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// 4d Wilson and Cayley 5d
+namespace Grid {
+  namespace QCD {
+
+typedef WilsonFermion<WilsonImplR> WilsonFermionR;
+typedef WilsonFermion<WilsonImplF> WilsonFermionF;
+typedef WilsonFermion<WilsonImplD> WilsonFermionD;
+
+typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
+typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
+typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
+typedef MobiusFermion<WilsonImplR> MobiusFermionR;
+typedef MobiusFermion<WilsonImplF> MobiusFermionF;
+typedef MobiusFermion<WilsonImplD> MobiusFermionD;
+typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
+typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
+typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
+
+typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
+typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
+typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
+typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
+typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
+typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
+
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
+
+// Continued fraction
+typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
+
+// Partial fraction
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
+
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
+
+// Gparity cases; partial list until tested
+typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
+typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
+typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
+typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
+typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
+typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
+
+  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
 #include <qcd/action/fermion/g5HermitianLinop.h>

+////////////////////////////////////////
+// Pseudo fermion combinations for HMC
+////////////////////////////////////////
+#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+
+#include <qcd/action/pseudofermion/TwoFlavour.h>
+#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+
+// IroIro inserted a general "Nf" parameter; that could also be done here,
+// but it is not clear why one would, unless going into large-Nf BSM studies.
+// Even there, one does not want the explicit (2) in the power denominator
+// for an even number of flavours, so a further generalised interface
+// would be required, though that would be easy.
+#include <qcd/action/pseudofermion/OneFlavourRational.h>
+#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+
 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -2,27 +2,27 @@
 namespace Grid {
 namespace QCD {

- CayleyFermion5D::CayleyFermion5D(LatticeGaugeField &_Umu,
-				  GridCartesian         &FiveDimGrid,
-				  GridRedBlackCartesian &FiveDimRedBlackGrid,
-				  GridCartesian         &FourDimGrid,
-				  GridRedBlackCartesian &FourDimRedBlackGrid,
-				  RealD _mass,RealD _M5) :
-   WilsonFermion5D(_Umu,
+ template<class Impl>
+ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
+					GridCartesian         &FiveDimGrid,
+					GridRedBlackCartesian &FiveDimRedBlackGrid,
+					GridCartesian         &FourDimGrid,
+					GridRedBlackCartesian &FourDimRedBlackGrid,
+					RealD _mass,RealD _M5,const ImplParams &p) :
+   WilsonFermion5D<Impl>(_Umu,
 		   FiveDimGrid,
 		   FiveDimRedBlackGrid,
 		   FourDimGrid,
-		   FourDimRedBlackGrid,_M5),
+ 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
 {
 }

-  // override multiply
-  RealD CayleyFermion5D::M    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
  {
-    LatticeFermion Din(psi._grid);
-
    // Assemble Din
+    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	Din = bs psi[s] + cs[s] psi[s+1}
@@ -37,11 +37,57 @@ namespace QCD {
 	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
      }
    }
+  }
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)  // fills Din slice by slice from psi
+  {
+    int Ls=this->Ls;  // loop bound for the slice index s, taken from the base class
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) {  // first slice: the s-1 neighbour wraps to Ls-1 and picks up -mass
+	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);         // Din[s]  = bs[s] psi[s] + cs[s+1] psi[s+1]   (pplus variant)
+	axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1);   // Din[s] += -mass*cs[Ls-1] psi[Ls-1]          (pminus variant)
+      } else if ( s==(Ls-1)) {  // last slice: the s+1 neighbour wraps to 0 and picks up -mass
+	axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);       // Din[s]  = bs[s] psi[s] - mass*cs[0] psi[0]
+	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);           // Din[s] += cs[s-1] psi[s-1]
+      } else {  // interior slices: plain s+1 / s-1 neighbours
+	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);         // Din[s]  = bs[s] psi[s] + cs[s+1] psi[s+1]
+	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);           // Din[s] += cs[s-1] psi[s-1]
+      }
+    }
+  }

-    DW(Din,chi,DaggerNo);
+  // override multiply
+ template<class Impl>
+  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+  {
+    int Ls=this->Ls;
+
+    FermionField Din(psi._grid);
+
+    // Assemble Din
+    /*
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) {
+	//	Din = bs psi[s] + cs[s] psi[s+1}
+	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
+	//      Din+= -mass*cs[s] psi[s+1}
+	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
+      } else if ( s==(Ls-1)) { 
+	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
+	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
+      } else {
+	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
+	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
+      }
+    }
+    */
+    Meooe5D(psi,Din);
+
+    this->DW(Din,chi,DaggerNo);
    // ((b D_W + D_w hop terms +1) on s-diag
    axpby(chi,1.0,1.0,chi,psi); 

+    // Call Mooee??
    for(int s=0;s<Ls;s++){
      if ( s==0 ){
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
@@ -57,20 +103,26 @@ namespace QCD {
    return norm2(chi);
  }

-  RealD CayleyFermion5D::Mdag (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  {
    // Under adjoint
    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
    //D2- P+     D2+            P-D1-^dag D2+dag

-    LatticeFermion Din(psi._grid);
+    FermionField Din(psi._grid);
    // Apply Dw
-    DW(psi,Din,DaggerYes); 
+    this->DW(psi,Din,DaggerYes); 

+    Meooe5D(Din,chi);
+
+    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
+
      // Collect the terms in DW
      //	Chi = bs Din[s] + cs[s] Din[s+1}
      //    Chi+= -mass*cs[s] psi[s+1}
+      /*
      if ( s==0 ) {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
@@ -81,6 +133,10 @@ namespace QCD {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
      }
+      */
+
+      // FIXME just call MooeeDag??
+
      // Collect the terms indept of DW
      if ( s==0 ){
 	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
@@ -99,10 +155,17 @@ namespace QCD {
  }

  // half checkerboard operations
-  void CayleyFermion5D::Meooe       (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
  {
-    LatticeFermion tmp(psi._grid);
+    int Ls=this->Ls;
+
+    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
+    Meooe5D(psi,tmp); 
+
+#if 0
+    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	tmp = bs psi[s] + cs[s] psi[s+1}
@@ -117,24 +180,33 @@ namespace QCD {
 	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
      }
    }
+    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
+#endif
+
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
-      DhopEO(tmp,chi,DaggerNo);
+      this->DhopEO(tmp,chi,DaggerNo);
    } else {
-      DhopOE(tmp,chi,DaggerNo);
+      this->DhopOE(tmp,chi,DaggerNo);
    }
  }

-  void CayleyFermion5D::MeooeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+  template<class Impl>
+  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
  {
-    LatticeFermion tmp(psi._grid);
+    FermionField tmp(psi._grid);
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
-      DhopEO(psi,tmp,DaggerYes);
+      this->DhopEO(psi,tmp,DaggerYes);
    } else {
-      DhopOE(psi,tmp,DaggerYes);
+      this->DhopOE(psi,tmp,DaggerYes);
    }
+
+    Meooe5D(tmp,chi); 
+#if 0
+    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
    // Assemble the 5d matrix
+    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
@@ -147,10 +219,15 @@ namespace QCD {
 	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
      }
    }
+    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
+#endif
+
  }

-  void CayleyFermion5D::Mooee       (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    for (int s=0;s<Ls;s++){
      if ( s==0 ) {
 	axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
@@ -165,8 +242,10 @@ namespace QCD {
    }
  }

-  void  CayleyFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
-    LatticeFermion tmp(psi._grid);
+ template<class Impl>
+  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+    int Ls=this->Ls;
+    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
@@ -183,11 +262,13 @@ namespace QCD {
      }
    }
    // Apply 4d dslash fragment
-    DhopDir(tmp,chi,dir,disp);
+    this->DhopDir(tmp,chi,dir,disp);
  }

-  void CayleyFermion5D::MooeeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    for (int s=0;s<Ls;s++){
      // Assemble the 5d matrix
      if ( s==0 ) {
@@ -203,8 +284,10 @@ namespace QCD {
    }
  }

-  void CayleyFermion5D::MooeeInv    (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    // Apply (L^{\prime})^{-1}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
@@ -227,8 +310,10 @@ namespace QCD {
    }
  }

-  void CayleyFermion5D::MooeeInvDag (const LatticeFermion &psi, LatticeFermion &chi)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
  {
+    int Ls=this->Ls;
    // Apply (U^{\prime})^{-dagger}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
@@ -250,15 +335,65 @@ namespace QCD {
    }
  }

+  // force terms; five routines; default to Dhop on diagonal
+  template<class Impl>
+  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    FermionField Din(V._grid);
+    // Apply Meooe5D to one of the two fields (V if not daggered, U if daggered), then forward to DhopDeriv together with mat.
+    if ( dag == DaggerNo ) {
+      //      U d/du [D_w D5] V = U d/du DW D5 V
+      Meooe5D(V,Din);                     // Din = Meooe5D applied to V
+      this->DhopDeriv(mat,U,Din,dag);
+    } else {
+      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag V // implicit adj on U in call
+      Meooe5D(U,Din);                     // Din = Meooe5D applied to U; the adjoint is left to the call below
+      this->DhopDeriv(mat,Din,V,dag);
+    }
+  };
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    FermionField Din(V._grid);
+    // Same structure as MDeriv, but forwarding to DhopDerivOE.
+    if ( dag == DaggerNo ) {
+      //      U d/du [D_w D5] V = U d/du DW D5 V
+      Meooe5D(V,Din);                     // Din = Meooe5D applied to V
+      this->DhopDerivOE(mat,U,Din,dag);
+    } else {
+      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag V // implicit adj on U in call
+      Meooe5D(U,Din);                     // Din = Meooe5D applied to U; the adjoint is left to the call below
+      this->DhopDerivOE(mat,Din,V,dag);
+    }
+  };
+ template<class Impl>
+  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    FermionField Din(V._grid);
+    // Same structure as MDeriv, but forwarding to DhopDerivEO.
+    if ( dag == DaggerNo ) {
+      //      U d/du [D_w D5] V = U d/du DW D5 V
+      Meooe5D(V,Din);                     // Din = Meooe5D applied to V
+      this->DhopDerivEO(mat,U,Din,dag);
+    } else {
+      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag V // implicit adj on U in call
+      Meooe5D(U,Din);                     // Din = Meooe5D applied to U; the adjoint is left to the call below
+      this->DhopDerivEO(mat,Din,V,dag);
+    }
+  };
+  
  // Tanh
-  void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
    SetCoefficientsZolotarev(1.0,zdata,b,c);

  }
  //Zolo
-  void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+ template<class Impl>
+  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
+    int Ls=this->Ls;

    ///////////////////////////////////////////////////////////
    // The Cayley coeffs (unprec)
@@ -308,8 +443,8 @@ namespace QCD {
    ceo.resize(Ls);
    
    for(int i=0;i<Ls;i++){
-      bee[i]=as[i]*(bs[i]*(4.0-M5) +1.0);
-      cee[i]=as[i]*(1.0-cs[i]*(4.0-M5));
+      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
+      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
      beo[i]=as[i]*bs[i];
      ceo[i]=-as[i]*cs[i];
    }
@@ -362,6 +497,8 @@ namespace QCD {
    }
  }

+  FermOpTemplateInstantiate(CayleyFermion5D);
+
 }}


--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -5,25 +5,36 @@ namespace Grid {

  namespace QCD {

-    class CayleyFermion5D : public WilsonFermion5D
+    template<class Impl>
+    class CayleyFermion5D : public WilsonFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);

      // half checkerboard operations
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
      virtual void   Instantiatable(void)=0;

+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+      
+      void   Meooe5D       (const FermionField &in, FermionField &out);
+      void   MeooeDag5D    (const FermionField &in, FermionField &out);

      //    protected:
      RealD mass;
@@ -48,12 +59,12 @@ namespace Grid {
      std::vector<RealD> dee;    

      // Constructors
-      CayleyFermion5D(LatticeGaugeField &_Umu,
+      CayleyFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
-		      RealD _mass,RealD _M5);
+		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -3,20 +3,22 @@
 namespace Grid {
  namespace QCD {

-    void ContinuedFractionFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
    {
      SetCoefficientsZolotarev(1.0/scale,zdata);
    }
-    void ContinuedFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
    {
      // How to check Ls matches??
-      //      std::cout << Ls << " Ls"<<std::endl;
-      //      std::cout << zdata->n  << " - n"<<std::endl;
-      //      std::cout << zdata->da << " -da "<<std::endl;
-      //      std::cout << zdata->db << " -db"<<std::endl;
-      //      std::cout << zdata->dn << " -dn"<<std::endl;
-      //      std::cout << zdata->dd << " -dd"<<std::endl;
-
+      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+      int Ls = this->Ls;
      assert(zdata->db==Ls);// Beta has Ls coeffs

      R=(1+this->mass)/(1-this->mass);
@@ -39,7 +41,7 @@ namespace Grid {


      ZoloHiInv =1.0/zolo_hi;
-      dw_diag = (4.0-M5)*ZoloHiInv;
+      dw_diag = (4.0-this->M5)*ZoloHiInv;
    
      See.resize(Ls);
      Aee.resize(Ls);
@@ -55,17 +57,20 @@ namespace Grid {
 	See[s] = Aee[s] - 1.0/See[s-1];
      }
      for(int s=0;s<Ls;s++){
-	std::cout <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
+	std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
      }
    }



-    RealD  ContinuedFractionFermion5D::M           (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
    {
-      LatticeFermion D(psi._grid);
+      int Ls = this->Ls;

-      DW(psi,D,DaggerNo); 
+      FermionField D(psi._grid);
+
+      this->DW(psi,D,DaggerNo); 

      int sign=1;
      for(int s=0;s<Ls;s++){
@@ -83,15 +88,20 @@ namespace Grid {
      }
      return norm2(chi);
    }
-    RealD  ContinuedFractionFermion5D::Mdag        (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
    {
      // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
      // The rest of matrix is symmetric.
      // Can ignore "dag"
      return M(psi,chi);
    }
-    void  ContinuedFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
-      DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+    template<class Impl>
+    void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+      int Ls = this->Ls;
+
+      this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+
      int sign=1;
      for(int s=0;s<Ls;s++){
 	if ( s==(Ls-1) ){
@@ -102,13 +112,16 @@ namespace Grid {
 	sign=-sign; 
      }
    }
-    void   ContinuedFractionFermion5D::Meooe       (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
    {
+      int Ls = this->Ls;
+
      // Apply 4d dslash
      if ( psi.checkerboard == Odd ) {
-	DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
+	this->DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
      } else {
-	DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
+	this->DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
      }
      
      int sign=1;
@@ -121,12 +134,16 @@ namespace Grid {
 	sign=-sign; 
      }
    }
-    void   ContinuedFractionFermion5D::MeooeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
    {
-      Meooe(psi,chi);
+      this->Meooe(psi,chi);
    }
-    void   ContinuedFractionFermion5D::Mooee       (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
    {
+      int Ls = this->Ls;
+
      int sign=1;
      for(int s=0;s<Ls;s++){
 	if ( s==0 ) {
@@ -144,12 +161,16 @@ namespace Grid {
      }
    }

-    void   ContinuedFractionFermion5D::MooeeDag    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
    {
-      Mooee(psi,chi);
+      this->Mooee(psi,chi);
    }
-    void   ContinuedFractionFermion5D::MooeeInv    (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
    {
+      int Ls = this->Ls;
+
      // Apply Linv
      axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0); 
      for(int s=1;s<Ls;s++){
@@ -165,27 +186,88 @@ namespace Grid {
 	axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
      }
    }
-    void   ContinuedFractionFermion5D::MooeeInvDag (const LatticeFermion &psi, LatticeFermion &chi)
+    template<class Impl>
+    void   ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
    {
-      MooeeInv(psi,chi);
+      this->MooeeInv(psi,chi);
    }

+  // force terms; five routines; default to Dhop on diagonal
+    template<class Impl>
+   void ContinuedFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+    // D is filled from U one s-slice at a time and then passed to DhopDeriv as the left-hand field.
+    FermionField D(V._grid);
+    // Every call below passes 0.0 as the weight of its second field argument (presumably a*g5*x + b*y per slice -- confirm against ag5xpby_ssp).
+    int sign=1;
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){
+	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s); // last slice: coefficient has no cc[s] factor and no alternating sign
+      } else {
+	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+      }
+      sign=-sign; // flips on every slice: +1,-1,+1,...
+    }
+    this->DhopDeriv(mat,D,V,DaggerNo); // `dag` is never read in this routine; DaggerNo is always forwarded
+  };
+    template<class Impl>
+   void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+    // D is filled from U one s-slice at a time and then passed to DhopDerivOE as the left-hand field.
+    FermionField D(V._grid);
+    // Every call below passes 0.0 as the weight of its second field argument (presumably a*g5*x + b*y per slice -- confirm against ag5xpby_ssp).
+    int sign=1;
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){
+	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s); // last slice: coefficient has no cc[s] factor and no alternating sign
+      } else {
+	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+      }
+      sign=-sign; // flips on every slice: +1,-1,+1,...
+    }
+    this->DhopDerivOE(mat,D,V,DaggerNo); // `dag` is never read in this routine; DaggerNo is always forwarded
+  };
+  template<class Impl>
+  void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+    // D is filled from U one s-slice at a time and then passed to DhopDerivEO as the left-hand field.
+    FermionField D(V._grid);
+    // Every call below passes 0.0 as the weight of its second field argument (presumably a*g5*x + b*y per slice -- confirm against ag5xpby_ssp).
+    int sign=1;
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){
+	ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s); // last slice: coefficient has no cc[s] factor and no alternating sign
+      } else {
+	ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
+      }
+      sign=-sign; // flips on every slice: +1,-1,+1,...
+    }
+    this->DhopDerivEO(mat,D,V,DaggerNo); // `dag` is never read in this routine; DaggerNo is always forwarded
+  };
+    
    // Constructors
-    ContinuedFractionFermion5D::ContinuedFractionFermion5D(
-							   LatticeGaugeField &_Umu,
+    template<class Impl>
+    ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
+							   GaugeField &_Umu,
 							   GridCartesian         &FiveDimGrid,
 							   GridRedBlackCartesian &FiveDimRedBlackGrid,
 							   GridCartesian         &FourDimGrid,
 							   GridRedBlackCartesian &FourDimRedBlackGrid,
-							   RealD _mass,RealD M5) :
-      WilsonFermion5D(_Umu,
-		      FiveDimGrid, FiveDimRedBlackGrid,
-		      FourDimGrid, FourDimRedBlackGrid,M5),
+							   RealD _mass,RealD M5,const ImplParams &p) :
+      WilsonFermion5D<Impl>(_Umu,
+			    FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid,M5,p),
      mass(_mass)
    {
+      int Ls = this->Ls;
      assert((Ls&0x1)==1); // Odd Ls required
    }

+    FermOpTemplateInstantiate(ContinuedFractionFermion5D);
+
  }
 }

--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -5,35 +5,43 @@ namespace Grid {

  namespace QCD {

-    class ContinuedFractionFermion5D : public WilsonFermion5D
+    template<class Impl>
+    class ContinuedFractionFermion5D : public WilsonFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);

      // half checkerboard operaions
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+
+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

      //      virtual void   Instantiatable(void)=0;
      virtual void   Instantiatable(void) =0;

      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      // Constructors
-      ContinuedFractionFermion5D(LatticeGaugeField &_Umu,
+      ContinuedFractionFermion5D(GaugeField &_Umu,
 				 GridCartesian         &FiveDimGrid,
 				 GridRedBlackCartesian &FiveDimRedBlackGrid,
 				 GridCartesian         &FourDimGrid,
 				 GridRedBlackCartesian &FourDimRedBlackGrid,
-				 RealD _mass,RealD M5);
+				 RealD _mass,RealD M5,const ImplParams &p= ImplParams());

    protected:

--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@@ -7,24 +7,27 @@ namespace Grid {

  namespace QCD {

-    class DomainWallFermion : public CayleyFermion5D
+    template<class Impl>
+    class DomainWallFermion : public CayleyFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void) {};
      // Constructors
-      DomainWallFermion(LatticeGaugeField &_Umu,
+      DomainWallFermion(GaugeField &_Umu,
 			GridCartesian         &FiveDimGrid,
 			GridRedBlackCartesian &FiveDimRedBlackGrid,
 			GridCartesian         &FourDimGrid,
 			GridRedBlackCartesian &FourDimRedBlackGrid,
-			RealD _mass,RealD _M5) : 
+			RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) : 

-      CayleyFermion5D(_Umu,
-		      FiveDimGrid,
-		      FiveDimRedBlackGrid,
-		      FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)

      {
 	RealD eps = 1.0;
@@ -32,9 +35,9 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
 	
-	std::cout << "DomainWallFermion with Ls="<<Ls<<std::endl;
+	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsTanh(zdata,1.0,0.0);
+	this->SetCoefficientsTanh(zdata,1.0,0.0);

 	Approx::zolotarev_free(zdata);
      }
--- a/lib/qcd/action/fermion/FermionOperator.h
+++ b/lib/qcd/action/fermion/FermionOperator.h
@@ -5,16 +5,20 @@ namespace Grid {

  namespace QCD {

-    //////////////////////////////////////////////////////////////////////////////
-    // Four component fermions
-    // Should type template the vector and gauge types
-    // Think about multiple representations
-    //////////////////////////////////////////////////////////////////////////////
-    template<class FermionField,class GaugeField>
-    class FermionOperator : public CheckerBoardedSparseMatrixBase<FermionField>
+    ////////////////////////////////////////////////////////////////
+    // Allows selection between gauge representation rank, bc's, flavours etc.
+    // and single/double precision.
+    ////////////////////////////////////////////////////////////////
+    
+    template<class Impl>
+    class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
    {
    public:

+      INHERIT_IMPL_TYPES(Impl);
+
+      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
+
      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

@@ -28,6 +32,8 @@ namespace Grid {
      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;

      // half checkerboard operaions
+      virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
+
      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
      virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
@@ -39,13 +45,31 @@ namespace Grid {
      virtual void Dhop  (const FermionField &in, FermionField &out,int dag)=0;
      virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
      virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
+      virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D

-      virtual void  Mdiag(const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
-      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
-      virtual void  DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D
+      // force terms; five routines; default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDeriv(mat,U,V,dag);};
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivOE(mat,U,V,dag);};
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){DhopDerivEO(mat,U,V,dag);};
+      virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;}; // Clover can override these
+      virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){mat=zero;};
+
+      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
+
+
+      virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
+      virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+
+      ///////////////////////////////////////////////
+      // Updates gauge field during HMC
+      ///////////////////////////////////////////////
+      virtual void ImportGauge(const GaugeField & _U)=0;

    };

  }
 }
+
 #endif
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -0,0 +1,359 @@
+#ifndef  GRID_QCD_FERMION_OPERATOR_IMPL_H
+#define  GRID_QCD_FERMION_OPERATOR_IMPL_H
+
+namespace Grid {
+
+  namespace QCD {
+
+
+    //////////////////////////////////////////////
+    // Template parameter class constructs to package
+    // externally control Fermion implementations
+    // in orthogonal directions
+    //
+    // Ultimately need Impl to always define types where XXX is opaque
+    //
+    //    typedef typename XXX               Simd;
+    //    typedef typename XXX     GaugeLinkField;	
+    //    typedef typename XXX         GaugeField;
+    //    typedef typename XXX      GaugeActField;
+    //    typedef typename XXX       FermionField;
+    //    typedef typename XXX  DoubledGaugeField;
+    //    typedef typename XXX         SiteSpinor;
+    //    typedef typename XXX     SiteHalfSpinor;	
+    //    typedef typename XXX         Compressor;	
+    //
+    // and Methods:
+    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+    //
+    //
+    // To acquire the typedefs from "Base" (either a base class or template param) use:
+    //
+    // INHERIT_GIMPL_TYPES(Base)
+    // INHERIT_FIMPL_TYPES(Base)
+    // INHERIT_IMPL_TYPES(Base)
+    //
+    // The Fermion operators will do the following:
+    //
+    // struct MyOpParams { 
+    //   RealD mass;
+    // };
+    //
+    //
+    // template<class Impl>
+    // class MyOp : public Impl { 
+    // public:
+    //
+    //    INHERIT_IMPL_TYPES(Impl);
+    //
+    //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
+    //    {
+    //
+    //    };
+    //    
+    //  }
+    //////////////////////////////////////////////
+
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Implementation dependent gauge types
+    ////////////////////////////////////////////////////////////////////////
+
+#define INHERIT_IMPL_TYPES(Base) \
+    INHERIT_GIMPL_TYPES(Base)\
+    INHERIT_FIMPL_TYPES(Base)
+
+#define INHERIT_GIMPL_TYPES(GImpl) \
+    typedef typename GImpl::Simd                           Simd;\
+    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
+    typedef typename GImpl::GaugeField               GaugeField;	
+    
+    // Composition with smeared link, bc's etc.. probably need multiple inheritance
+    // Variable precision "S" and variable Nc
+    template<class S,int Nrepresentation=Nc>
+    class ImplGauge { 
+    public:
+      // S is re-exported as Simd; Nrepresentation is the dimension of the square matrices used below.
+      typedef S Simd;
+      // Per-site tensor layouts, templated on the element type.
+      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd  >;
+      // The same layouts instantiated on Simd: one matrix per site, and Nd matrices per site.
+      typedef iImplGaugeLink    <Simd>           SiteGaugeLink;
+      typedef iImplGaugeField   <Simd>           SiteGaugeField;
+      // Lattice-wide fields of those site objects.
+      typedef Lattice<SiteGaugeLink>                GaugeLinkField; // bit ugly naming; polarised gauge field, lorentz... all ugly
+      typedef Lattice<SiteGaugeField>                   GaugeField;
+
+    };
+
+    ////////////////////////////////////////////////////////////////////////
+    // Implementation dependent fermion types
+    ////////////////////////////////////////////////////////////////////////
+
+#define INHERIT_FIMPL_TYPES(Impl)\
+    typedef typename Impl::FermionField           FermionField;		\
+    typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
+    typedef typename Impl::SiteSpinor               SiteSpinor;		\
+    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
+    typedef typename Impl::Compressor               Compressor;		\
+    typedef typename Impl::StencilImpl              StencilImpl;	\
+    typedef typename Impl::ImplParams ImplParams;
+
+    ///////
+    // Single flavour four spinors with colour index
+    ///////
+    template<class S,int Nrepresentation=Nc>
+    class WilsonImpl :  public ImplGauge<S,Nrepresentation> { 
+    public:
+      // Gauge-side types (Simd, GaugeLinkField, GaugeField) are pulled in from the ImplGauge base.
+      typedef ImplGauge<S,Nrepresentation> Gimpl;
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+      // Per-site layouts: Ns- and Nhs-component vectors of Nrepresentation-vectors, and Nds link matrices per site.
+      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
+    
+      typedef iImplSpinor    <Simd>           SiteSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
+      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
+
+      typedef Lattice<SiteSpinor>                 FermionField;
+      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+      // Compressor, parameter struct and stencil type used with this implementation.
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+      typedef WilsonImplParams ImplParams;
+      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+      ImplParams Params; // copy of the parameters handed to the constructor
+      WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+      // phi = U(mu) * chi via mult(); SE and St are accepted but not used by this implementation.
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
+        mult(&phi(),&U(mu),&chi());
+      }
+      // Fills Uds from Umu: slot mu gets the link in direction mu, slot mu+4 the adjoint of that link shifted by -1 along mu.
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+      {
+        conformable(Uds._grid,GaugeGrid);
+        conformable(Umu._grid,GaugeGrid);
+        GaugeLinkField U(GaugeGrid);
+        for(int mu=0;mu<Nd;mu++){
+  	  U = PeekIndex<LorentzIndex>(Umu,mu);
+	  PokeIndex<LorentzIndex>(Uds,U,mu);
+	  U = adj(Cshift(U,mu,-1));
+	  PokeIndex<LorentzIndex>(Uds,U,mu+4); // NOTE(review): offset is the literal 4 while the loop bound is Nd; the two agree only if Nd==4
+	}
+      }
+      // Writes the spin-traced outer product of Btilde and A into component mu of mat.
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	GaugeLinkField link(mat._grid);
+	link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+	PokeIndex<LorentzIndex>(mat,link,mu);
+      }   
+      // Same as InsertForce4D, but sums the spin-traced outer products over the Ls slices belonging to each site of mat's grid.
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+
+	int Ls=Btilde._grid->_fdimensions[0]; // extent of dimension 0 of the fermion grid is taken as Ls
+
+	GaugeLinkField tmp(mat._grid);
+	tmp = zero;
+PARALLEL_FOR_LOOP
+	for(int sss=0;sss<tmp._grid->oSites();sss++){
+	  int sU=sss;
+	  for(int s=0;s<Ls;s++){
+	    int sF = s+Ls*sU; // fermion-field site index; assumes s runs fastest in the fermion grid's outer-site ordering -- TODO confirm
+	    tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
+	  }
+	}
+	PokeIndex<LorentzIndex>(mat,tmp,mu);
+	
+      }
+
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // Flavour doubled spinors; is Gparity the only? what about C*?
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    template<class S,int Nrepresentation>
+    class GparityWilsonImpl : public ImplGauge<S,Nrepresentation> { 
+    public:
+      // Gauge-side types (Simd, GaugeLinkField, GaugeField) are pulled in from the ImplGauge base.
+      typedef ImplGauge<S,Nrepresentation> Gimpl;
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+      // Same layouts as the single-flavour case, with an additional outermost index of extent Ngp.
+      template<typename vtype> using iImplSpinor             = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp >;
+      template<typename vtype> using iImplHalfSpinor         = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp >;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >, Ngp >;
+    
+      typedef iImplSpinor    <Simd>           SiteSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
+      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
+
+      typedef Lattice<SiteSpinor>                 FermionField;
+      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+
+      typedef GparityWilsonImplParams ImplParams;
+      ImplParams Params; // copy of the constructor argument; Params.twists[] is read per direction below
+      GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+      
+
+      // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
+        // phi(f) = U(f)(mu) * chi'(f) for f=0,1; chi' is chi with components 0 and 1 exchanged when SE->_around_the_world is set and direction mu%Nd is twisted.
+	typedef SiteHalfSpinor vobj;
+	typedef typename SiteHalfSpinor::scalar_object sobj;
+
+	vobj vtmp;
+	sobj stmp;
+	
+	GridBase *grid = St._grid;
+      
+	const int Nsimd = grid->Nsimd();
+	
+	int direction    = St._directions[mu];
+	int distance     = St._distances[mu];
+	int ptype        = St._permute_type[mu]; // NOTE(review): ptype is assigned but not read anywhere below
+	int sl           = St._grid->_simd_layout[direction];
+
+	// Fixme X.Y.Z.T hardcode in stencil
+	int mmu          = mu % Nd;
+
+	// assert our assumptions
+	assert((distance==1)||(distance==-1)); // nearest neighbour stencil hard code
+	assert((sl==1)||(sl==2));
+	
+	std::vector<int> icoor;
+      
+	if ( SE->_around_the_world && Params.twists[mmu] ) {
+
+	  if ( sl == 2 ) {
+	    // SIMD layout of 2 in this direction: only some lanes get the exchange, so work lane by lane on extracted scalars.
+	    std::vector<sobj> vals(Nsimd);
+
+	    extract(chi,vals);
+	    for(int s=0;s<Nsimd;s++){
+
+	      grid->iCoorFromIindex(icoor,s);
+	      
+	      assert((icoor[direction]==0)||(icoor[direction]==1));
+	      
+	      int permute_lane; // lanes with icoor==1 are exchanged for distance==+1, lanes with icoor==0 otherwise
+	      if ( distance == 1) {
+		permute_lane = icoor[direction]?1:0;
+	      } else {
+		permute_lane = icoor[direction]?0:1;
+	      }
+	      
+	      if ( permute_lane ) { 
+		stmp(0) = vals[s](1);
+		stmp(1) = vals[s](0);
+		vals[s] = stmp;
+	      }
+	    }
+	    merge(vtmp,vals);
+
+	  } else { 
+	    vtmp(0) = chi(1); // sl==1: exchange components 0 and 1 for the whole vector object
+	    vtmp(1) = chi(0);
+	  }
+	  mult(&phi(0),&U(0)(mu),&vtmp(0));
+	  mult(&phi(1),&U(1)(mu),&vtmp(1));
+	  
+	} else { 
+	  mult(&phi(0),&U(0)(mu),&chi(0)); // no exchange: each component is multiplied by its own link
+	  mult(&phi(1),&U(1)(mu),&chi(1));
+	}
+	
+      }
+      // Fills Uds from Umu: component 0 holds U, component 1 holds conjugate(U); slots mu+4 hold the shifted adjoints, with extra handling on boundary slices of twisted directions.
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+      {
+	
+	conformable(Uds._grid,GaugeGrid);
+	conformable(Umu._grid,GaugeGrid);
+	
+	GaugeLinkField Utmp(GaugeGrid);
+	GaugeLinkField U(GaugeGrid);
+	GaugeLinkField Uconj(GaugeGrid);
+	
+	Lattice<iScalar<vInteger> > coor(GaugeGrid);
+
+	
+	for(int mu=0;mu<Nd;mu++){
+	  
+	  LatticeCoordinate(coor,mu);
+	  
+	  U     = PeekIndex<LorentzIndex>(Umu,mu);
+	  Uconj = conjugate(U);
+
+	  // This phase could come from a simple bc 1,1,-1,1 ..
+	  int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
+	  if ( Params.twists[mu] ) { 
+	    Uconj = where(coor==neglink,-Uconj,Uconj); // negate the conjugate link where coordinate mu equals the last global index
+	  }
+
+	  
+PARALLEL_FOR_LOOP
+	  for(auto ss=U.begin();ss<U.end();ss++){
+	    Uds[ss](0)(mu) = U[ss]();
+	    Uds[ss](1)(mu) = Uconj[ss]();
+	  }
+	  
+	  U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
+	  Uconj = adj(Cshift(Uconj,mu,-1));
+	  
+	  Utmp = U;
+	  if ( Params.twists[mu] ) { 
+	    Utmp = where(coor==0,Uconj,Utmp); // at coordinate 0 of a twisted direction component 0 takes the Uconj-derived link
+	  }
+	  
+PARALLEL_FOR_LOOP
+	  for(auto ss=U.begin();ss<U.end();ss++){
+	    Uds[ss](0)(mu+4) = Utmp[ss](); // NOTE(review): offset is the literal 4 while the loop bound is Nd
+	  }
+	  
+	  Utmp = Uconj;
+	  if ( Params.twists[mu] ) { 
+	    Utmp = where(coor==0,U,Utmp); // and component 1 takes the U-derived link there
+	  }
+	  
+PARALLEL_FOR_LOOP
+	  for(auto ss=U.begin();ss<U.end();ss++){
+	    Uds[ss](1)(mu+4) = Utmp[ss]();
+	  }
+	  
+	}
+      }
+      // Force insertion is not implemented for this Impl: both routines below assert(0).
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	assert(0);
+	// Fixme
+	return;
+      }
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	assert(0);
+	// Fixme
+	return;
+      }
+    };
+
+    typedef WilsonImpl<vComplex ,Nc> WilsonImplR; // Real.. whichever prec
+    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
+    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
+
+    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
+    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
+    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
+
+  }
+}
+#endif
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@@ -7,35 +7,38 @@ namespace Grid {

  namespace QCD {

-    class MobiusFermion : public CayleyFermion5D
+    template<class Impl>
+    class MobiusFermion : public CayleyFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void) {};
      // Constructors
-      MobiusFermion(LatticeGaugeField &_Umu,
+      MobiusFermion(GaugeField &_Umu,
 		    GridCartesian         &FiveDimGrid,
 		    GridRedBlackCartesian &FiveDimRedBlackGrid,
 		    GridCartesian         &FourDimGrid,
 		    GridRedBlackCartesian &FourDimRedBlackGrid,
 		    RealD _mass,RealD _M5,
-		    RealD b, RealD c) : 
+		    RealD b, RealD c,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D(_Umu,
-		      FiveDimGrid,
-		      FiveDimRedBlackGrid,
-		      FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)

      {
 	RealD eps = 1.0;

-	std::cout << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<Ls<<" Tanh approx"<<std::endl;
+	std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
 	
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsTanh(zdata,b,c);
+	this->SetCoefficientsTanh(zdata,b,c);

 	Approx::zolotarev_free(zdata);
 
--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -7,26 +7,29 @@ namespace Grid {

  namespace QCD {

-    class MobiusZolotarevFermion : public CayleyFermion5D
+    template<class Impl>
+    class MobiusZolotarevFermion : public CayleyFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void) {};
      // Constructors
-       MobiusZolotarevFermion(LatticeGaugeField &_Umu,
+       MobiusZolotarevFermion(GaugeField &_Umu,
 			      GridCartesian         &FiveDimGrid,
 			      GridRedBlackCartesian &FiveDimRedBlackGrid,
 			      GridCartesian         &FourDimGrid,
 			      GridRedBlackCartesian &FourDimRedBlackGrid,
 			      RealD _mass,RealD _M5,
 			      RealD b, RealD c,
-			      RealD lo, RealD hi) : 
+			      RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      
-      CayleyFermion5D(_Umu,
-		      FiveDimGrid,
-		      FiveDimRedBlackGrid,
-		      FourDimGrid,
-		      FourDimRedBlackGrid,_mass,_M5)
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)

      {
 	RealD eps = lo/hi;
@@ -34,10 +37,10 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0);
 	assert(zdata->n==this->Ls);

-	std::cout << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
+	std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
 	
 	// Call base setter
-	this->CayleyFermion5D::SetCoefficientsZolotarev(hi,zdata,b,c);
+	this->SetCoefficientsZolotarev(hi,zdata,b,c);
 
 	Approx::zolotarev_free(zdata);
      }
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -7,25 +7,28 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonCayleyTanhFermion : public MobiusFermion
+    template<class Impl>
+    class OverlapWilsonCayleyTanhFermion : public MobiusFermion<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // Constructors
-    OverlapWilsonCayleyTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
 				   GridCartesian         &FiveDimGrid,
 				   GridRedBlackCartesian &FiveDimRedBlackGrid,
 				   GridCartesian         &FourDimGrid,
 				   GridRedBlackCartesian &FourDimRedBlackGrid,
 				   RealD _mass,RealD _M5,
-				   RealD scale) :
+				   RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      MobiusFermion(_Umu,
-		    FiveDimGrid,
-		    FiveDimRedBlackGrid,
-		    FourDimGrid,
-		    FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale)
+      MobiusFermion<Impl>(_Umu,
+			  FiveDimGrid,
+			  FiveDimRedBlackGrid,
+			  FourDimGrid,
+			  FourDimRedBlackGrid,_mass,_M5,0.5*scale,0.5*scale,p)
 	{
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@@ -7,25 +7,28 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion
+    template<class Impl>
+    class OverlapWilsonCayleyZolotarevFermion : public MobiusZolotarevFermion<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      // Constructors

-    OverlapWilsonCayleyZolotarevFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
 					GridCartesian         &FiveDimGrid,
 					GridRedBlackCartesian &FiveDimRedBlackGrid,
 					GridCartesian         &FourDimGrid,
 					GridRedBlackCartesian &FourDimRedBlackGrid,
 					RealD _mass,RealD _M5,
-					RealD lo, RealD hi) : 
+					RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      // b+c=1.0, b-c = 0 <=> b =c = 1/2
-      MobiusZolotarevFermion(_Umu,
-			     FiveDimGrid,
-			     FiveDimRedBlackGrid,
-			     FourDimGrid,
-			     FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi)
+      MobiusZolotarevFermion<Impl>(_Umu,
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,0.5,0.5,lo,hi,p)

      {}

--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -7,31 +7,34 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonContFracTanhFermion : public ContinuedFractionFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonContFracTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
 				     GridCartesian         &FiveDimGrid,
 				     GridRedBlackCartesian &FiveDimRedBlackGrid,
 				     GridCartesian         &FourDimGrid,
 				     GridRedBlackCartesian &FourDimRedBlackGrid,
 				     RealD _mass,RealD _M5,
-				     RealD scale) :
+				     RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D(_Umu,
-				 FiveDimGrid,
-				 FiveDimRedBlackGrid,
-				 FourDimGrid,
-				 FourDimRedBlackGrid,_mass,_M5)
+      ContinuedFractionFermion5D<Impl>(_Umu,
+				       FiveDimGrid,
+				       FiveDimRedBlackGrid,
+				       FourDimGrid,
+				       FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls-1;// Even rational order
+	  assert((this->Ls&0x1)==1); // Odd Ls required
+	  int nrational=this->Ls-1;// Even rational order
 	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  SetCoefficientsTanh(zdata,scale);
+	  this->SetCoefficientsTanh(zdata,scale);
 	  Approx::zolotarev_free(zdata);
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -7,34 +7,36 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonContFracZolotarevFermion : public ContinuedFractionFermion5D<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonContFracZolotarevFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
 					  GridCartesian         &FiveDimGrid,
 					  GridRedBlackCartesian &FiveDimRedBlackGrid,
 					  GridCartesian         &FourDimGrid,
 					  GridRedBlackCartesian &FourDimRedBlackGrid,
 					  RealD _mass,RealD _M5,
-					  RealD lo,RealD hi):
+					  RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      ContinuedFractionFermion5D(_Umu,
-				 FiveDimGrid,
-				 FiveDimRedBlackGrid,
-				 FourDimGrid,
-				 FourDimRedBlackGrid,_mass,_M5)
+      ContinuedFractionFermion5D<Impl>(_Umu,
+				       FiveDimGrid,
+				       FiveDimRedBlackGrid,
+				       FourDimGrid,
+				       FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=Ls;// Odd rational order
+	  int nrational=this->Ls;// Odd rational order
 	  RealD eps = lo/hi;

 	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  SetCoefficientsZolotarev(hi,zdata);
+	  this->SetCoefficientsZolotarev(hi,zdata);
 	  Approx::zolotarev_free(zdata);

 	}
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -7,31 +7,34 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonPartialFractionTanhFermion : public PartialFractionFermion5D<Impl>
    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonPartialFractionTanhFermion(LatticeGaugeField &_Umu,
+    OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
 					    GridCartesian         &FiveDimGrid,
 					    GridRedBlackCartesian &FiveDimRedBlackGrid,
 					    GridCartesian         &FourDimGrid,
 					    GridRedBlackCartesian &FourDimRedBlackGrid,
 					    RealD _mass,RealD _M5,
-					    RealD scale) :
+					    RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D(_Umu,
-			       FiveDimGrid,
-			       FiveDimRedBlackGrid,
-			       FourDimGrid,
-			       FourDimRedBlackGrid,_mass,_M5)
+      PartialFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
-	  int nrational=Ls-1;// Even rational order
+	  assert((this->Ls&0x1)==1); // Odd Ls required
+	  int nrational=this->Ls-1;// Even rational order
 	  Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
-	  SetCoefficientsTanh(zdata,scale);
+	  this->SetCoefficientsTanh(zdata,scale);
 	  Approx::zolotarev_free(zdata);
 	}
    };
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -7,34 +7,36 @@ namespace Grid {

  namespace QCD {

-    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D
+    template<class Impl>
+    class OverlapWilsonPartialFractionZolotarevFermion : public PartialFractionFermion5D<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      virtual void   Instantiatable(void){};
      // Constructors
-    OverlapWilsonPartialFractionZolotarevFermion(LatticeGaugeField &_Umu,
-					  GridCartesian         &FiveDimGrid,
-					  GridRedBlackCartesian &FiveDimRedBlackGrid,
-					  GridCartesian         &FourDimGrid,
-					  GridRedBlackCartesian &FourDimRedBlackGrid,
-					  RealD _mass,RealD _M5,
-					  RealD lo,RealD hi):
+    OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
+						 GridCartesian         &FiveDimGrid,
+						 GridRedBlackCartesian &FiveDimRedBlackGrid,
+						 GridCartesian         &FourDimGrid,
+						 GridRedBlackCartesian &FourDimRedBlackGrid,
+						 RealD _mass,RealD _M5,
+						 RealD lo,RealD hi,const ImplParams &p= ImplParams()):
      
      // b+c=scale, b-c = 0 <=> b =c = scale/2
-      PartialFractionFermion5D(_Umu,
-			       FiveDimGrid,
-			       FiveDimRedBlackGrid,
-			       FourDimGrid,
-			       FourDimRedBlackGrid,_mass,_M5)
+      PartialFractionFermion5D<Impl>(_Umu,
+				     FiveDimGrid,
+				     FiveDimRedBlackGrid,
+				     FourDimGrid,
+				     FourDimRedBlackGrid,_mass,_M5,p)
 	{
-	  assert((Ls&0x1)==1); // Odd Ls required
+	  assert((this->Ls&0x1)==1); // Odd Ls required

-	  int nrational=Ls;// Odd rational order
+	  int nrational=this->Ls;// Odd rational order
 	  RealD eps = lo/hi;

 	  Approx::zolotarev_data *zdata = Approx::zolotarev(eps,nrational,0);
-	  SetCoefficientsZolotarev(hi,zdata);
+	  this->SetCoefficientsZolotarev(hi,zdata);
 	  Approx::zolotarev_free(zdata);

 	}
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -2,12 +2,15 @@
 namespace Grid {
  namespace QCD {

-    void  PartialFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
      // this does both dag and undag but is trivial; make a common helper routing

      int sign = 1;
+      int Ls = this->Ls;

-      DhopDir(psi,chi,dir,disp);
+      this->DhopDir(psi,chi,dir,disp);

      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -18,15 +21,16 @@ namespace Grid {
      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);

    }
-    void   PartialFractionFermion5D::Meooe_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
    {
-      // this does both dag and undag but is trivial; make a common helper routing
+      int Ls = this->Ls;
      int sign = dag ? (-1) : 1;

      if ( psi.checkerboard == Odd ) {
-	DhopEO(psi,chi,DaggerNo);
+	this->DhopEO(psi,chi,DaggerNo);
      } else {
-	DhopOE(psi,chi,DaggerNo);
+	this->DhopOE(psi,chi,DaggerNo);
      }

      int nblock=(Ls-1)/2;
@@ -38,10 +42,12 @@ namespace Grid {
      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
    }

-    void   PartialFractionFermion5D::Mooee_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
    {
      // again dag and undag are trivially related
      int sign = dag ? (-1) : 1;
+      int Ls = this->Ls;
      
      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -69,11 +75,13 @@ namespace Grid {
      }
    }

-    void   PartialFractionFermion5D::MooeeInv_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
    {
      int sign = dag ? (-1) : 1;
+      int Ls = this->Ls;

-      LatticeFermion tmp(psi._grid);
+      FermionField tmp(psi._grid);
      
      ///////////////////////////////////////////////////////////////////////////////////////
      //Linv
@@ -129,10 +137,12 @@ namespace Grid {
      axpby_ssp  (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
    }

-    void   PartialFractionFermion5D::M_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
+    template<class Impl>
+    void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
    {
-      LatticeFermion D(psi._grid);
+      FermionField D(psi._grid);
  
+      int Ls = this->Ls;
      int sign = dag ? (-1) : 1;

      // For partial frac Hw case (b5=c5=1) chroma quirkily computes
@@ -186,7 +196,7 @@ namespace Grid {
      //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H
      //

-      DW(psi,D,DaggerNo); 
+      this->DW(psi,D,DaggerNo); 

      int nblock=(Ls-1)/2;
      for(int b=0;b<nblock;b++){
@@ -217,61 +227,127 @@ namespace Grid {

    }

-    RealD  PartialFractionFermion5D::M    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
    {
      M_internal(in,out,DaggerNo);
      return norm2(out);
    }
-    RealD  PartialFractionFermion5D::Mdag (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
    {
      M_internal(in,out,DaggerYes);
      return norm2(out);
    }

-    void PartialFractionFermion5D::Meooe       (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::Meooe       (const FermionField &in, FermionField &out)
    {
      Meooe_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MeooeDag    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MeooeDag    (const FermionField &in, FermionField &out)
    {
      Meooe_internal(in,out,DaggerYes);
    }
-    void PartialFractionFermion5D::Mooee       (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::Mooee       (const FermionField &in, FermionField &out)
    {
      Mooee_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MooeeDag    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeDag    (const FermionField &in, FermionField &out)
    {
      Mooee_internal(in,out,DaggerYes);
    }

-    void PartialFractionFermion5D::MooeeInv    (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeInv    (const FermionField &in, FermionField &out)
    {
      MooeeInv_internal(in,out,DaggerNo);
    }
-    void PartialFractionFermion5D::MooeeInvDag (const LatticeFermion &in, LatticeFermion &out)
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
    {
      MooeeInv_internal(in,out,DaggerYes);
    }

-    void  PartialFractionFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
+
+  // force terms; five routines (three are defined here); default to Dhop on diagonal
+    template<class Impl>
+   void PartialFractionFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+    // D: work field on V's grid, written by the ag5xpby_ssp calls below (presumably one s-slice per call -- TODO confirm).
+    FermionField D(V._grid);
+    // Index pairs (2b, 2b+1) are passed -scale and +scale; index Ls-1 is passed p[nblock]*scale/amax.
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
+      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
+    }
+    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+    // NOTE(review): `dag` is never read; DaggerNo is always passed on -- confirm this is intended.
+    this->DhopDeriv(mat,D,V,DaggerNo); 
+  };
+    template<class Impl>
+   void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+    // D: work field on V's grid, written by the ag5xpby_ssp calls below (presumably one s-slice per call -- TODO confirm).
+    FermionField D(V._grid);
+    // Index pairs (2b, 2b+1) are passed -scale and +scale; index Ls-1 is passed p[nblock]*scale/amax.
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
+      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
+    }
+    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+    // NOTE(review): `dag` is never read; DaggerNo is always passed on -- confirm this is intended.
+    this->DhopDerivOE(mat,D,V,DaggerNo); 
+  };
+    template<class Impl>
+   void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  {
+    int Ls = this->Ls;
+    // D: work field on V's grid, written by the ag5xpby_ssp calls below (presumably one s-slice per call -- TODO confirm).
+    FermionField D(V._grid);
+    // Index pairs (2b, 2b+1) are passed -scale and +scale; index Ls-1 is passed p[nblock]*scale/amax.
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(D,-scale,U,0.0,U,s,s); 
+      ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1); 
+    }
+    ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
+    // NOTE(review): `dag` is never read; DaggerNo is always passed on -- confirm this is intended.
+    this->DhopDerivEO(mat,D,V,DaggerNo); 
+  };
+
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
      SetCoefficientsZolotarev(1.0/scale,zdata);
    }
-    void  PartialFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
+    template<class Impl>
+    void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){

      // check on degree matching
-      //      std::cout << Ls << " Ls"<<std::endl;
-      //      std::cout << zdata->n  << " - n"<<std::endl;
-      //      std::cout << zdata->da << " -da "<<std::endl;
-      //      std::cout << zdata->db << " -db"<<std::endl;
-      //      std::cout << zdata->dn << " -dn"<<std::endl;
-      //      std::cout << zdata->dd << " -dd"<<std::endl;
+      //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+      //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+      int Ls = this->Ls;
+
      assert(Ls == (2*zdata->da -1) );

      // Part frac
      //      RealD R;
      R=(1+mass)/(1-mass);
-      dw_diag = (4.0-M5);
+      dw_diag = (4.0-this->M5);

      //      std::vector<RealD> p; 
      //      std::vector<RealD> q;
@@ -291,18 +367,22 @@ namespace Grid {
    }

      // Constructors
-    PartialFractionFermion5D::PartialFractionFermion5D(LatticeGaugeField &_Umu,
-						       GridCartesian         &FiveDimGrid,
-						       GridRedBlackCartesian &FiveDimRedBlackGrid,
-						       GridCartesian         &FourDimGrid,
-						       GridRedBlackCartesian &FourDimRedBlackGrid,
-						       RealD _mass,RealD M5) :
-      WilsonFermion5D(_Umu,
-		      FiveDimGrid, FiveDimRedBlackGrid,
-		      FourDimGrid, FourDimRedBlackGrid,M5),
+    template<class Impl>
+    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							     GridCartesian         &FiveDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
+							     RealD _mass,RealD M5,
+							     const ImplParams &p) :
+      WilsonFermion5D<Impl>(_Umu,
+			    FiveDimGrid, FiveDimRedBlackGrid,
+			    FourDimGrid, FourDimRedBlackGrid,M5,p),
      mass(_mass)

    {
+      int Ls = this->Ls;
+
      assert((Ls&0x1)==1); // Odd Ls required
      int nrational=Ls-1;

@@ -321,6 +401,8 @@ namespace Grid {

    }
 
+    FermOpTemplateInstantiate(PartialFractionFermion5D);
+
 }
 }

--- a/lib/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.h
@@ -5,41 +5,48 @@ namespace Grid {

  namespace QCD {

-    class PartialFractionFermion5D : public WilsonFermion5D
+    template<class Impl>
+    class PartialFractionFermion5D : public WilsonFermion5D<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      const int part_frac_chroma_convention=1;

-      void   Meooe_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void   Mooee_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void   MooeeInv_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
-      void   M_internal(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
+      void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
+      void   MooeeInv_internal(const FermionField &in, FermionField &out,int dag);
+      void   M_internal(const FermionField &in, FermionField &out,int dag);

      // override multiply
-      virtual RealD  M    (const LatticeFermion &in, LatticeFermion &out);
-      virtual RealD  Mdag (const LatticeFermion &in, LatticeFermion &out);
+      virtual RealD  M    (const FermionField &in, FermionField &out);
+      virtual RealD  Mdag (const FermionField &in, FermionField &out);

      // half checkerboard operaions
-      virtual void   Meooe       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
-      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
+      virtual void   Meooe       (const FermionField &in, FermionField &out);
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+      virtual void   Mooee       (const FermionField &in, FermionField &out);
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+
+      // force terms; five routines (three are declared here); default to Dhop on diagonal
+      virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

      virtual void   Instantiatable(void) =0; // ensure no make-eee

      // Efficient support for multigrid coarsening
-      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);

      // Constructors
-      PartialFractionFermion5D(LatticeGaugeField &_Umu,
-				    GridCartesian         &FiveDimGrid,
-				    GridRedBlackCartesian &FiveDimRedBlackGrid,
-				    GridCartesian         &FourDimGrid,
-				    GridRedBlackCartesian &FourDimRedBlackGrid,
-				    RealD _mass,RealD M5);
+      PartialFractionFermion5D(GaugeField &_Umu,
+			       GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       RealD _mass,RealD M5,const ImplParams &p= ImplParams());

    protected:

--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@@ -7,12 +7,14 @@ namespace Grid {

  namespace QCD {

-    class ScaledShamirFermion : public MobiusFermion
+    template<class Impl>
+    class ScaledShamirFermion : public MobiusFermion<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      // Constructors
-    ScaledShamirFermion(LatticeGaugeField &_Umu,
+    ScaledShamirFermion(GaugeField &_Umu,
 			GridCartesian         &FiveDimGrid,
 			GridRedBlackCartesian &FiveDimRedBlackGrid,
 			GridCartesian         &FourDimGrid,
@@ -21,7 +23,7 @@ namespace Grid {
 			RealD scale) :
      
      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
-      MobiusFermion(_Umu,
+      MobiusFermion<Impl>(_Umu,
 		    FiveDimGrid,
 		    FiveDimRedBlackGrid,
 		    FourDimGrid,
--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@@ -7,27 +7,29 @@ namespace Grid {

  namespace QCD {

-    class ShamirZolotarevFermion : public MobiusZolotarevFermion
+    template<class Impl>
+    class ShamirZolotarevFermion : public MobiusZolotarevFermion<Impl>
    {
    public:
+     INHERIT_IMPL_TYPES(Impl);

      // Constructors


-    ShamirZolotarevFermion(LatticeGaugeField &_Umu,
+    ShamirZolotarevFermion(GaugeField &_Umu,
 			   GridCartesian         &FiveDimGrid,
 			   GridRedBlackCartesian &FiveDimRedBlackGrid,
 			   GridCartesian         &FourDimGrid,
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD _M5,
-			   RealD lo, RealD hi) : 
+			   RealD lo, RealD hi,const ImplParams &p= ImplParams()) : 
      
      // b+c = 1; b-c = 1 => b=1, c=0
-      MobiusZolotarevFermion(_Umu,
-			     FiveDimGrid,
-			     FiveDimRedBlackGrid,
-			     FourDimGrid,
-			     FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi)
+      MobiusZolotarevFermion<Impl>(_Umu,
+				   FiveDimGrid,
+				   FiveDimRedBlackGrid,
+				   FourDimGrid,
+				   FourDimRedBlackGrid,_mass,_M5,1.0,0.0,lo,hi,p)
      
      {}

--- a/Show More
+++ b/Show More