
Merge pull request #2 from paboyle/master

Update from Peter
Antonin Portelli 2015-10-19 14:52:52 +01:00
commit d9f2e2e06a
81 changed files with 3649 additions and 1246 deletions

TODO

@@ -1,48 +1,48 @@
- PseudoFermions
RECENT
---------------
Done: Cayley, Partial , ContFrac force terms.
Done:
- TwoFlavour
- TwoFlavourEvenOdd
- TwoFlavourRatio
- TwoFlavourRatioEvenOdd
Done:
- OneFlavourRationalEvenOdd
- OneFlavourRationalRatioEvenOdd
- OneFlavourRationalRatio
- Clean up HMC -- DONE
- LorentzScalar<GaugeField> gets Gauge link type (cleaner). -- DONE
- Simplified the integrators a bit. -- DONE
- Multi-timescale looks broken and operating on single timescale for now. -- DONE
- pass GaugeField as template param. -- DONE
- Reunitarise -- DONE
- Force Gradient -- DONE
- Prefer "RefreshInternal" or such like to "init" in naming -- DONE
- Parallel io improvements -- DONE
- Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
TODO:
=> generalise to non-const EE
=> Test DWF HMC
=> Clean up HMC
- Fix a threading bug that has been introduced and prevents HMC running in hybrid OMP mode
---------------
Policies:
* Link smearing/boundary conds; Policy class based implementation ; framework more in place
* Support different boundary conditions (finite temp, chem. potential ... )
* Support different fermion representations?
- contained entirely within the integrator presently
=> Integrators
- Force Gradient
- Multi-timescale looks broken and operating on single timescale for now.
Fix/debug/rewrite this
- Sign of force term.
- Prefer "RefreshInternal" or such like to "init" in naming
- Rename "Ta" as too unclear
- Sign of force term.
- MacroMagic -> virtual reader class.
- Reversibility test.
- Link smearing/boundary conds; Policy class based implementation
- Rename "Ta" as too unclear
- Lanczos
- Rectangle gauge actions.
Iwasaki,
Symanzik,
... etc...
- Prepare multigrid for HMC.
Alternate setup schemes.
- Prepare multigrid for HMC. - Alternate setup schemes.
- RNG filling from sparser grid, lower dim grid.
- Support for ILDG --- ugly, not done
- Flavour matrices?
- FFTnD ?
================================================================
*** Hacks and bug fixes to clean up and Audits
* Hacks and bug fixes to clean up and Audits
================================================================
* Extract/merge/set cleanup ; too many variants; rationalise and call simpler ones
@@ -68,6 +68,8 @@ Insert/Extract
* Thread scaling tests Xeon, XeonPhi
Not sure of status of this -- reverify. Things are working nicely now though.
* Make the Tensor types and Complex etc... play more nicely.
- TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
QDP forces use of "toDouble" to get back to a non-tensor scalar. This role is presently taken by TensorRemove, but I
@@ -84,6 +86,37 @@ Insert/Extract
template specialize the scalar scalar scalar sum and SliceSum, on the basis of being
pure scalar.
======================================================================
======================================================================
======================================================================
======================================================================
Done: Cayley, Partial , ContFrac force terms.
DONE
- PseudoFermions
=> generalise to non-const EE ; likely defer (??) (NOT DONE)
Done:
- TwoFlavour
- TwoFlavourEvenOdd
- TwoFlavourRatio
- TwoFlavourRatioEvenOdd
Done:
- OneFlavourRationalEvenOdd
- OneFlavourRationalRatioEvenOdd
- OneFlavourRationalRatio
Done
=> Test DWF HMC
- Fix a threading bug that has been introduced and prevents HMC running in hybrid OMP mode
Done:
- RNG filling from sparser grid, lower dim grid.
DONE
- MacroMagic -> virtual reader class.
*** Expression template engine: -- DONE
[ -- Norm2(expression) problem: introduce norm2 unary op, or introduce automatic conversion from expression to Lattice<vobj>
@@ -103,18 +136,10 @@ Insert/Extract
* CovariantShift support -----Use a class to store gauge field? (parallel transport?)
* Parallel io improvements
- optional parallel MPI2 IO
- move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
* Support for ILDG
* Support different boundary conditions (finite temp, chem. potential ... )
* Support different fermion representations?
Actions -- coherent framework for implementing actions and their forces.
-- coherent framework for implementing actions and their forces.
Actions
DONE
* Fermion
- Wilson
- Clover
@@ -122,6 +147,7 @@ Actions -- coherent framework for implementing actions and their forces.
- Mobius
- z-Mobius
Algorithms (lots of reuse/port from BFM)
* LinearOperator
* LinearSolver
@@ -139,19 +165,11 @@ Algorithms (lots of reuse/port from BFM)
* Integrators, leapfrog, omelyan, force gradient etc...
* etc..
* Gauge
- Wilson, symanzik, iwasaki
* Flavour matrices?
Done
* Pauli, SU subgroup, etc..
* su3 exponentiation & log etc.. [Jamie's code?]
* TaProj
* FFTnD ?
======================================================================================================
FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
======================================================================================================
@@ -184,7 +202,6 @@ FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me argua
- lib/communicator
- lib/algorithms
- lib/qcd
future
- lib/io/ -- GridLog, GridIn, GridErr, GridDebug, GridMessage
- lib/qcd/actions
- lib/qcd/measurements

benchmarks/Benchmark_memory_asynch.cc Normal file

@@ -0,0 +1,84 @@
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Nvec=4;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t"<<"bytes/thread"<<"\t\t\t"<<"GB/s"<<"\t\t\t"<<"GB/s per thread"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
const int lmax = 16536*16;
for(int lat=4;lat<=lmax;lat*=2){
int Nloop=lmax*128*4/lat;
std::vector<int> latt_size ({2*mpi_layout[0],2*mpi_layout[1],4*mpi_layout[2],lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]*threads;
Vec tsum; tsum = zero;
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
std::vector<double> stop(threads);
Vector<Vec> sum(threads);
std::vector<LatticeVec> x(threads,&Grid);
for(int t=0;t<threads;t++){
random(pRNG,x[t]);
}
double start=usecond();
PARALLEL_FOR_LOOP
for(int t=0;t<threads;t++){
sum[t] = x[t]._odata[0];
for(int i=0;i<Nloop;i++){
for(auto ss=x[t].begin();ss<x[t].end();ss++){
sum[t] = sum[t]+x[t]._odata[ss];
}
}
stop[t]=usecond();
}
double max_stop=stop[0];
double min_stop=stop[0];
for(int t=0;t<threads;t++){
tsum+=sum[t];
if ( stop[t]<min_stop ) min_stop=stop[t];
if ( stop[t]>max_stop ) max_stop=stop[t];
}
double max_time = (max_stop-start)/Nloop*1000;
double min_time = (min_stop-start)/Nloop*1000;
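// Units: usecond() returns microseconds, so (max_stop-start)/Nloop*1000 above
// is nanoseconds per iteration; bytes (aggregated over all threads) divided
// by a time in ns is then bytes/ns = GB/s directly.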
double bytes=vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/threads<<"\t\t"<<bytes/max_time<<" - "<< bytes/min_time<<"\t\t"<<bytes/min_time/threads <<std::endl;
}
Grid_finalize();
}

benchmarks/Benchmark_memory_bandwidth.cc

@@ -10,144 +10,158 @@ int main (int argc, char ** argv)
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
int Nloop=1000;
Vec rn = zero;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking fused AXPY bandwidth ; sizeof(Real) "<<sizeof(Real)<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
uint64_t lmax=44;
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
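// NLOOP scales the iteration count inversely with the lattice volume so that
// every lattice size does roughly the same total work (~100*44^4 site
// updates) and small volumes are not under-sampled.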
for(int lat=4;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
//GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
uint64_t Nloop=NLOOP;
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
double a=2.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
// inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
axpy(z,a,x,y);
x._odata[0]=z._odata[0]; // serial loop dependence to prevent optimise
y._odata[4]=z._odata[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3*vol*Nvec*sizeof(Real);
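// Traffic model: two streams read (x,y) and one written (z) per site vector,
// hence the factor 3; the flop count is one multiply plus one add per element.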
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
for(int lat=4;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
//GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
y._odata[4]=z._odata[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SCALE bandwidth"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
for(int lat=4;lat<=lmax;lat+=4){
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
//GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
RealD a=2.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x;
x._odata[0]=z._odata[0]*2.0;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double bytes=2*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*1;// mul
std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking READ bandwidth"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
for(int lat=4;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
//GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
RealD a=2.0;
ComplexD nn;
Real nn;
double start=usecond();
for(int i=0;i<Nloop;i++){
nn=norm2(x);
vsplat(x._odata[0]._internal[0],nn);
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double bytes=vol*Nvec*sizeof(Real);
double flops=vol*Nvec*2;// mul,add
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
}

benchmarks/Makefile.am

@@ -1,5 +1,5 @@
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -10,6 +10,10 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
Benchmark_dwf_LDADD=-lGrid
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
Benchmark_memory_asynch_LDADD=-lGrid
Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
Benchmark_memory_bandwidth_LDADD=-lGrid

configure (vendored)

@@ -1384,9 +1384,9 @@ Optional Features:
--disable-dependency-tracking
speeds up one-time build
--disable-openmp do not use OpenMP
--enable-simd=SSE4|AVX|AVX2|AVX512|MIC
--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI
Select instructions to be SSE4.0, AVX 1.0, AVX
2.0+FMA, AVX 512, MIC
2.0+FMA, AVX 512, IMCI
--enable-precision=single|double
Select default word size of Real
--enable-comms=none|mpi Select communications
@@ -6359,105 +6359,15 @@ fi
done
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __gmpf_init in -lgmp" >&5
$as_echo_n "checking for __gmpf_init in -lgmp... " >&6; }
if ${ac_cv_lib_gmp___gmpf_init+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_check_lib_save_LIBS=$LIBS
LIBS="-lgmp $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
#ifdef __cplusplus
extern "C"
#endif
char __gmpf_init ();
int
main ()
{
return __gmpf_init ();
;
return 0;
}
_ACEOF
if ac_fn_cxx_try_link "$LINENO"; then :
ac_cv_lib_gmp___gmpf_init=yes
else
ac_cv_lib_gmp___gmpf_init=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_gmp___gmpf_init" >&5
$as_echo "$ac_cv_lib_gmp___gmpf_init" >&6; }
if test "x$ac_cv_lib_gmp___gmpf_init" = xyes; then :
cat >>confdefs.h <<_ACEOF
#define HAVE_LIBGMP 1
_ACEOF
LIBS="-lgmp $LIBS"
else
as_fn_error $? "GNU Multiple Precision GMP library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.gmplib.org" "$LINENO" 5
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpfr_init in -lmpfr" >&5
$as_echo_n "checking for mpfr_init in -lmpfr... " >&6; }
if ${ac_cv_lib_mpfr_mpfr_init+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_check_lib_save_LIBS=$LIBS
LIBS="-lmpfr $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
#ifdef __cplusplus
extern "C"
#endif
char mpfr_init ();
int
main ()
{
return mpfr_init ();
;
return 0;
}
_ACEOF
if ac_fn_cxx_try_link "$LINENO"; then :
ac_cv_lib_mpfr_mpfr_init=yes
else
ac_cv_lib_mpfr_mpfr_init=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_mpfr_mpfr_init" >&5
$as_echo "$ac_cv_lib_mpfr_mpfr_init" >&6; }
if test "x$ac_cv_lib_mpfr_mpfr_init" = xyes; then :
cat >>confdefs.h <<_ACEOF
#define HAVE_LIBMPFR 1
_ACEOF
LIBS="-lmpfr $LIBS"
else
as_fn_error $? "GNU Multiple Precision MPFR library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.mpfr.org/" "$LINENO" 5
fi
#AC_CHECK_LIB([gmp],[__gmpf_init],,
# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.gmplib.org)])
#AC_CHECK_LIB([mpfr],[mpfr_init],,
# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)])
# Check whether --enable-simd was given.
if test "${enable_simd+set}" = set; then :
@@ -6504,13 +6414,20 @@ $as_echo "#define AVX2 1" >>confdefs.h
$as_echo "$as_me: WARNING: Your processor does not support AVX2 instructions" >&2;}
fi
;;
AVX512|MIC)
echo Configuring for AVX512 and MIC
AVX512)
echo Configuring for AVX512
$as_echo "#define AVX512 1" >>confdefs.h
supported="cross compilation"
;;
IMCI)
echo Configuring for IMCI
$as_echo "#define IMCI 1" >>confdefs.h
supported="cross compilation"
;;
NEONv8)
echo Configuring for experimental ARMv8a support
@@ -6533,7 +6450,7 @@ esac
if test "${enable_precision+set}" = set; then :
enableval=$enable_precision; ac_PRECISION=${enable_precision}
else
ac_PRECISION=single
ac_PRECISION=double
fi
case ${ac_PRECISION} in

configure.ac

@@ -55,18 +55,18 @@ echo :::::::::::::::::::::::::::::::::::::::::::
AC_CHECK_FUNCS([gettimeofday])
AC_CHECK_LIB([gmp],[__gmpf_init],,
[AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.gmplib.org)])
#AC_CHECK_LIB([gmp],[__gmpf_init],,
# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.gmplib.org)])
AC_CHECK_LIB([mpfr],[mpfr_init],,
[AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.mpfr.org/)])
#AC_CHECK_LIB([mpfr],[mpfr_init],,
# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)])
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
supported=no
@@ -99,9 +99,14 @@ case ${ac_SIMD} in
AC_MSG_WARN([Your processor does not support AVX2 instructions])
fi
;;
AVX512|MIC)
echo Configuring for AVX512 and MIC
AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
AVX512)
echo Configuring for AVX512
AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
supported="cross compilation"
;;
IMCI)
echo Configuring for IMCI
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
supported="cross compilation"
;;
NEONv8)
@@ -118,7 +123,7 @@ case ${ac_SIMD} in
;;
esac
AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=single])
AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
case ${ac_PRECISION} in
single)
echo default precision is single

lib/Algorithms.h

@@ -17,6 +17,9 @@
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
// Lanczos support
//#include <algorithms/iterative/MatrixUtils.h>
//#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <algorithms/CoarsenedMatrix.h>

lib/AlignedAllocator.h

@@ -72,7 +72,6 @@ operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return t
template<typename _Tp> inline bool
operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
}; // namespace Grid
#endif

lib/Config.h.in

@@ -6,7 +6,7 @@
/* AVX2 Intrinsics */
#undef AVX2
/* AVX512 Intrinsics for Knights Corner */
/* AVX512 Intrinsics for Knights Landing */
#undef AVX512
/* EMPTY_SIMD only for DEBUGGING */
@@ -56,12 +56,6 @@
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `gmp' library (-lgmp). */
#undef HAVE_LIBGMP
/* Define to 1 if you have the `mpfr' library (-lmpfr). */
#undef HAVE_LIBMPFR
/* Define to 1 if you have the <malloc.h> header file. */
#undef HAVE_MALLOC_H
@@ -116,6 +110,9 @@
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* IMCI Intrinsics for Knights Corner */
#undef IMCI
/* NEON ARMv8 Experimental support */
#undef NEONv8

lib/Grid.h

@@ -30,7 +30,7 @@
///////////////////
// Grid headers
///////////////////
#include <MacroMagic.h>
#include <serialisation/Serialisation.h>
#include <Config.h>
#include <Timer.h>
#include <Log.h>
@@ -45,6 +45,7 @@
#include <Stencil.h>
#include <Algorithms.h>
#include <qcd/QCD.h>
#include <parallelIO/BinaryIO.h>
#include <parallelIO/NerscIO.h>
#include <Init.h>

View File

@@ -177,7 +177,8 @@ void Grid_init(int *argc,char ***argv)
Grid_quiesce_nodes();
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
WilsonFermionStatic::HandOptDslash=1;
QCD::WilsonFermionStatic::HandOptDslash=1;
QCD::WilsonFermion5DStatic::HandOptDslash=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;

lib/Make.inc

@@ -1,4 +1,4 @@
HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./MacroMagic.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/NerscIO.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h 
./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h
HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h 
./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h
CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/hmc/integrators/Integrator.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc

lib/Simd.h

@@ -13,6 +13,11 @@
typedef uint32_t Integer;
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
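// e.g. _MM_SELECT_FOUR_FOUR(3,2,1,0) == 0xE4, the identity permutation
// immediate familiar from SSE's _MM_SHUFFLE(3,2,1,0).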
namespace Grid {
typedef float RealF;

lib/algorithms/approx/Remez.h

@@ -17,7 +17,8 @@
#include <stddef.h>
#include <algorithms/approx/bigfloat.h>
//#include <algorithms/approx/bigfloat.h>
#include <algorithms/approx/bigfloat_double.h>
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
#define SUM_MAX 10 // Maximum number of terms in exponential

lib/algorithms/iterative/ImplicitlyRestartedLanczos.h Normal file

@@ -0,0 +1,388 @@
#ifndef GRID_IRL_H
#define GRID_IRL_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ImplicitlyRestartedLanczos {
public:
int Niter;
int Nk;
int Np;
RealD enorm;
RealD vthr;
LinearOperatorBase<Field> &_Linop;
OperatorFunction<Field> &_poly;
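// Porting note: the routines below still carry names from the code they were
// ported from (w, lmd, lme, opr_, sort_, Nk_, Np_, Niter_, enorm_, and some
// unqualified std identifiers); these must be mapped onto the members above
// (_Linop, _poly, Nk, Np, Niter, enorm) before this header compiles, which is
// why its include is still commented out in Algorithms.h.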
ImplicitlyRestartedLanczos(
LinearOperatorBase<Field> &Linop,
OperatorFunction<Field> & poly,
int _Nk,
int _Np,
RealD _enorm,
RealD _vthrs,
int _Niter) :
_Linop(Linop),
_poly(poly),
Nk(_Nk),
Np(_Np),
enorm(_enorm),
vthr(_vthrs)
{
vthr=_vthrs;
Niter=_Niter;
};
void step(Vector<RealD>& lmd,
Vector<RealD>& lme,
Vector<Field>& evec,
Field& f,int Nm,int k)
{
assert( k< Nm );
w = opr_->mult(evec[k]);
if(k==0){ // Initial step
RealD wnorm= w*w;
std::cout<<"wnorm="<<wnorm<<std::endl;
RealD alph = evec[k] * w;
w -= alph * evec[k];
lmd[k] = alph;
RealD beta = w * w;
beta = sqrt(beta);
RealD betar = 1.0/beta;
evec[k+1] = betar * w;
lme[k] = beta;
} else { // Iteration step
w -= lme[k-1] * evec[k-1];
RealD alph = evec[k] * w;
w -= alph * evec[k];
RealD beta = w * w;
beta = sqrt(beta);
RealD betar = 1.0/beta;
w *= betar;
lmd[k] = alph;
lme[k] = beta;
orthogonalize(w,evec,k);
if(k < Nm-1) evec[k+1] = w;
}
}
void qr_decomp(Vector<RealD>& lmd,
Vector<RealD>& lme,
int Nk,
int Nm,
Vector<RealD>& Qt,
RealD Dsh,
int kmin,
int kmax)
{
int k = kmin-1;
RealD x;
RealD Fden = 1.0/sqrt((lmd[k]-Dsh)*(lmd[k]-Dsh) +lme[k]*lme[k]);
RealD c = ( lmd[k] -Dsh) *Fden;
RealD s = -lme[k] *Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
x = -s*lme[k+1];
lme[k+1] = c*lme[k+1];
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt[i+Nm*k ];
RealD Qtmp2 = Qt[i+Nm*(k+1)];
Qt[i+Nm*k ] = c*Qtmp1 - s*Qtmp2;
Qt[i+Nm*(k+1)] = s*Qtmp1 + c*Qtmp2;
}
// Givens transformations
for(int k = kmin; k < kmax-1; ++k){
RealD Fden = 1.0/sqrt( x*x +lme[k-1]*lme[k-1]);
RealD c = lme[k-1]*Fden;
RealD s = - x*Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
lme[k-1] = c*lme[k-1] -s*x;
if(k != kmax-2){
x = -s*lme[k+1];
lme[k+1] = c*lme[k+1];
}
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt[i+Nm*k ];
RealD Qtmp2 = Qt[i+Nm*(k+1)];
Qt[i+Nm*k ] = c*Qtmp1 -s*Qtmp2;
Qt[i+Nm*(k+1)] = s*Qtmp1 +c*Qtmp2;
}
}
}
void diagonalize(Vector<RealD>& lmd,
Vector<RealD>& lme,
int Nm2,
int Nm,
Vector<RealD>& Qt)
{
int Niter = 100*Nm;
int kmin = 1;
int kmax = Nk;
// (this should be more sophisticated)
for(int iter=0; iter<Niter; ++iter){
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift)
// transformation
qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
// Convergence criterion (redefinition of kmin and kmax)
for(int j=kmax-1; j>= kmin; --j){
RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
if(fabs(lme[j-1])+dds > dds){
kmax = j+1;
goto continued;
}
}
Niter = iter;
return;
continued:
for(int j=0; j<kmax-1; ++j){
RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
if(fabs(lme[j])+dds > dds){
kmin = j+1;
break;
}
}
}
std::cout << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
abort();
}
void orthogonalize(Field& w,
const Vector<Field>& evec,
int k)
{
// Schmidt orthogonalization
size_t size = w.size();
assert(size%2 ==0);
std::slice re(0,size/2,2);
std::slice im(1,size/2,2);
for(int j=0; j<k; ++j){
RealD prdr = evec[j]*w;
RealD prdi = evec[j].im_prod(w);
valarray<RealD> evr(evec[j][re]);
valarray<RealD> evi(evec[j][im]);
w.add(re, -prdr*evr +prdi*evi);
w.add(im, -prdr*evi -prdi*evr);
}
}
void calc(Vector<RealD>& lmd,
Vector<Field>& evec,
const Field& b,
int& Nsbt,
int& Nconv)
{
const size_t fsize = evec[0].size();
Nconv = -1;
Nsbt = 0;
int Nm = Nk_+Np_;
std::cout << " -- Nk = " << Nk_ << " Np = "<< Np_ << endl;
std::cout << " -- Nm = " << Nm << endl;
std::cout << " -- size of lmd = " << lmd.size() << endl;
std::cout << " -- size of evec = " << evec.size() << endl;
assert(Nm < evec.size() && Nm < lmd.size());
vector<RealD> lme(Nm);
vector<RealD> lmd2(Nm);
vector<RealD> lme2(Nm);
vector<RealD> Qt(Nm*Nm);
vector<int> Iconv(Nm);
vector<Field> B(Nm);
for(int k=0; k<Nm; ++k) B[k].resize(fsize);
Field f(fsize);
Field v(fsize);
int k1 = 1;
int k2 = Nk_;
int kconv = 0;
int Kdis = 0;
int Kthrs = 0;
RealD beta_k;
// Set initial vector
evec[0] = 1.0;
RealD vnorm = evec[0]*evec[0];
evec[0] = 1.0/sqrt(vnorm);
// (uniform vector)
// Initial Nk steps
for(int k=0; k<k2; ++k) step(lmd,lme,evec,f,Nm,k);
// Restarting loop begins
for(int iter = 0; iter<Niter_; ++iter){
std::cout<<"\n iteration = "<< iter << endl;
int Nm2 = Nm - kconv;
for(int k=k2; k<Nm; ++k) step(lmd,lme,evec,f,Nm,k);
f *= lme[Nm-1];
// getting eigenvalues
for(int k=0; k<Nm2; ++k){
lmd2[k] = lmd[k+k1-1];
lme2[k] = lme[k+k1-1];
}
setUnit_Qt(Nm,Qt);
diagonalize(lmd2,lme2,Nm2,Nm,Qt);
// sorting
sort_->push(lmd2,Nm);
// Implicitly shifted QR transformations
setUnit_Qt(Nm,Qt);
for(int ip=k2; ip<Nm; ++ip)
qr_decomp(lmd,lme,Nm,Nm,Qt,lmd2[ip],k1,Nm);
for(int i=0; i<(Nk_+1); ++i) B[i] = 0.0;
for(int j=k1-1; j<k2+1; ++j){
for(int k=0; k<Nm; ++k){
B[j] += Qt[k+Nm*j] * evec[k];
}
}
for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
// Compressed vector f and beta(k2)
f *= Qt[Nm-1+Nm*(k2-1)];
f += lme[k2-1] * evec[k2];
beta_k = f * f;
beta_k = sqrt(beta_k);
std::cout<<" beta(k) = "<<beta_k<<endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
lme[k2-1] = beta_k;
// Convergence test
for(int k=0; k<Nm2; ++k){
lmd2[k] = lmd[k];
lme2[k] = lme[k];
}
setUnit_Qt(Nm,Qt);
diagonalize(lmd2,lme2,Nk_,Nm,Qt);
for(int k = 0; k<Nk_; ++k) B[k]=0.0;
for(int j = 0; j<Nk_; ++j){
for(int k = 0; k<Nk_; ++k){
B[j] += Qt[k+j*Nm] * evec[k];
}
}
Kdis = 0;
Kthrs = 0;
std::cout << setiosflags(ios_base::scientific);
for(int i=0; i<Nk_; ++i){
v = opr_->mult(B[i]);
//std::cout<<"vv="<<v*v<<std::endl;
RealD vnum = B[i]*v;
RealD vden = B[i]*B[i];
lmd2[i] = vnum/vden;
v -= lmd2[i]*B[i];
RealD vv = v*v;
std::cout << " [" << setw(3)<< setiosflags(ios_base::right) <<i<<"] ";
std::cout << setw(25)<< setiosflags(ios_base::left)<< lmd2[i];
std::cout <<" "<< setw(25)<< setiosflags(ios_base::right)<< vv<< endl;
if(vv<enorm_){
Iconv[Kdis] = i;
++Kdis;
if(sort_->saturated(lmd2[i],vthr)) ++Kthrs;
std::cout<<"Kthrs="<<Kthrs<<endl;
}
} // i-loop end
std::cout << resetiosflags(ios_base::scientific);
std::cout<<" #modes converged: "<<Kdis<<endl;
if(Kthrs > 0){
// (there is a converged eigenvalue larger than Vthrs.)
Nconv = iter;
goto converged;
}
} // end of iter loop
std::cout<<"\n NOT converged.\n";
abort();
converged:
// Sorting
lmd.clear();
evec.clear();
for(int i=0; i<Kdis; ++i){
lmd.push_back(lmd2[Iconv[i]]);
evec.push_back(B[Iconv[i]]);
}
sort_->push(lmd,evec,Kdis);
Nsbt = Kdis - Kthrs;
std::cout << "\n Converged\n Summary :\n";
std::cout << " -- Iterations = "<< Nconv << "\n";
std::cout << " -- beta(k) = "<< beta_k << "\n";
std::cout << " -- Kdis = "<< Kdis << "\n";
std::cout << " -- Nsbt = "<< Nsbt << "\n";
}
};
}
#endif
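Once the porting gaps flagged in the header are closed, the intended driving pattern is roughly the sketch below. Linop, Cheby, LatticeFermion and the source field src are assumptions standing in for whatever the caller has; none of them are defined by this commit.

// Hypothetical driver for ImplicitlyRestartedLanczos; all inputs are assumed.
ImplicitlyRestartedLanczos<LatticeFermion> IRL(Linop,Cheby,
                                               20,     // Nk: wanted eigenpairs
                                               10,     // Np: extra Krylov directions per restart
                                               1.0e-8, // enorm: residual tolerance
                                               10.0,   // vthr: eigenvalue threshold
                                               100);   // Niter: maximum restarts
Vector<RealD>          eval(31);  // calc() asserts size > Nk+Np
Vector<LatticeFermion> evec(31);  // in practice each element needs a grid
int Nsbt, Nconv;
IRL.calc(eval,evec,src,Nsbt,Nconv);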

lib/algorithms/iterative/MatrixUtils.h Normal file

@@ -0,0 +1,48 @@
#ifndef GRID_MATRIX_UTILS_H
#define GRID_MATRIX_UTILS_H
namespace Grid {
namespace MatrixUtils {
template<class T> inline void Size(Matrix<T>& A,int &N,int &M){
N=A.size(); assert(N>0);
M=A[0].size();
for(int i=0;i<N;i++){
assert(A[i].size()==M);
}
}
template<class T> inline void SizeSquare(Matrix<T>& A,int &N)
{
int M;
Size(A,N,M);
assert(N==M);
}
template<class T> inline void Fill(Matrix<T>& A,const T & val)
{
int N,M;
Size(A,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
A[i][j]=val;
}}
}
template<class T> inline void Diagonal(Matrix<T>& A,const T & val)
{
int N;
SizeSquare(A,N);
for(int i=0;i<N;i++){
A[i][i]=val;
}
}
template<class T> inline void Identity(Matrix<T>& A)
{
Fill(A,0.0);
Diagonal(A,1.0);
}
};
}
#endif
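A quick usage sketch of these helpers; Matrix<T> is the aligned vector-of-vectors alias added to lib/lattice/Lattice_base.h later in this commit.

Matrix<double> A(4,Vector<double>(4)); // 4x4, zero-initialised
MatrixUtils::Identity(A);              // Fill with 0, then 1 on the diagonal
int N;
MatrixUtils::SizeSquare(A,N);          // asserts the matrix is square; N == 4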

lib/communicator/Communicator_base.h

@@ -87,6 +87,14 @@ class CartesianCommunicator {
void *recv,
int recv_from_rank,
int bytes);
void RecvFrom(void *recv,
int recv_from_rank,
int bytes);
void SendTo(void *xmit,
int xmit_to_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,

lib/communicator/Communicator_mpi.cc

@@ -81,13 +81,30 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::RecvFrom(void *recv,
int from,
int bytes)
{
MPI_Status stat;
int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
assert(ierr==0);
}
void CartesianCommunicator::SendTo(void *xmit,
int dest,
int bytes)
{
int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator); // tag with our rank so the receiver knows who it came from
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
@@ -100,7 +117,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
list.push_back(xrq);
list.push_back(rrq);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{

lib/communicator/Communicator_none.cc

@@ -22,6 +22,20 @@ void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::RecvFrom(void *recv,
int recv_from_rank,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendTo(void *xmit,
int xmit_to_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,

lib/lattice/Lattice_ET.h

@@ -178,6 +178,7 @@ GridUnopClass(UnaryConj,conjugate(a));
GridUnopClass(UnaryTrace,trace(a));
GridUnopClass(UnaryTranspose,transpose(a));
GridUnopClass(UnaryTa,Ta(a));
GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
GridUnopClass(UnaryReal,real(a));
GridUnopClass(UnaryImag,imag(a));
GridUnopClass(UnaryToReal,toReal(a));
@@ -290,6 +291,7 @@ GRID_DEF_UNOP(conjugate,UnaryConj);
GRID_DEF_UNOP(trace,UnaryTrace);
GRID_DEF_UNOP(transpose,UnaryTranspose);
GRID_DEF_UNOP(Ta,UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
GRID_DEF_UNOP(real,UnaryReal);
GRID_DEF_UNOP(imag,UnaryImag);
GRID_DEF_UNOP(toReal,UnaryToReal);
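Each GridUnopClass/GRID_DEF_UNOP pair lifts a site-local function into the expression-template engine, so the new ProjectOnGroup composes lazily with other lattice expressions. A sketch of the resulting use, assuming a LatticeColourMatrix field Umu living on some GridCartesian:

LatticeColourMatrix Umu(&Grid);
Umu = ProjectOnGroup(Umu); // evaluated site by site, no intermediate lattice temporary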

lib/lattice/Lattice_base.h

@@ -29,6 +29,9 @@ extern int GridCshiftPermuteMap[4][16];
class LatticeBase {};
class LatticeExpressionBase {};
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Aligned allocator??
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
template <typename Op, typename T1>
class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
public:
@@ -59,7 +62,7 @@
GridBase *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
Vector<vobj> _odata;
// to pthread need a computable loop where loop induction is not required
int begin(void) { return 0;};

lib/parallelIO/BinaryIO.h Normal file

@@ -0,0 +1,513 @@
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif
#include <arpa/inet.h>
#include <algorithm>
// 64bit endian swap is a portability pain
#ifndef __has_builtin // Optional of course.
#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
#endif
#if HAVE_DECL_BE64TOH
#undef Grid_ntohll
#define Grid_ntohll be64toh
#endif
#if HAVE_DECL_NTOHLL
#undef Grid_ntohll
#define Grid_ntohll ntohll
#endif
#ifndef Grid_ntohll
#if BYTE_ORDER == BIG_ENDIAN
#define Grid_ntohll(A) (A)
#else
#if __has_builtin(__builtin_bswap64)
#define Grid_ntohll(A) __builtin_bswap64(A)
#else
#error "no 64-bit byte-swap primitive found for this platform"
#endif
#endif
#endif
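// Worked example of the fallback chain above: on a little-endian host
// Grid_ntohll(0x0102030405060708ULL) == 0x0807060504030201ULL (a byte swap),
// while on a big-endian host it is the identity.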
namespace Grid {
// A little helper
inline void removeWhitespace(std::string &key)
{
key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
}
class BinaryIO {
public:
// Network is big endian
static inline void htobe32_v(void *file_object,uint32_t bytes){ be32toh_v(file_object,bytes);}
static inline void htobe64_v(void *file_object,uint32_t bytes){ be64toh_v(file_object,bytes);}
static inline void htole32_v(void *file_object,uint32_t bytes){ le32toh_v(file_object,bytes);}
static inline void htole64_v(void *file_object,uint32_t bytes){ le64toh_v(file_object,bytes);}
static inline void be32toh_v(void *file_object,uint32_t bytes)
{
uint32_t * f = (uint32_t *)file_object;
for(int i=0;i*sizeof(uint32_t)<bytes;i++){
f[i] = ntohl(f[i]);
}
}
// LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint32_t bytes)
{
uint32_t *fp = (uint32_t *)file_object;
uint32_t f;
for(int i=0;i*sizeof(uint32_t)<bytes;i++){
f = fp[i];
// byte-swap into network (big-endian) order, then convert network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f);
}
}
// BE is same as network
static inline void be64toh_v(void *file_object,uint32_t bytes)
{
uint64_t * f = (uint64_t *)file_object;
for(int i=0;i*sizeof(uint64_t)<bytes;i++){
f[i] = Grid_ntohll(f[i]);
}
}
// LE must swap and switch;
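// The two 32-bit swaps below compose to a full 64-bit byte reversal; pushing
// that through Grid_ntohll is then the identity on little-endian hosts and a
// byte swap on big-endian hosts, i.e. exactly little-endian-file-to-host.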
static inline void le64toh_v(void *file_object,uint32_t bytes)
{
uint64_t *fp = (uint64_t *)file_object;
uint64_t f,g;
for(int i=0;i*sizeof(uint64_t)<bytes;i++){
f = fp[i];
// byte-swap into network (big-endian) order, then convert network to host
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
g = g << 32;
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g);
}
}
template<class vobj,class fobj,class munger> static inline void Uint32Checksum(Lattice<vobj> lat,munger munge,uint32_t &csum)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid ;
std::cout <<GridLogMessage<< "Uint32Checksum "<<norm2(lat)<<std::endl;
sobj siteObj;
fobj fileObj;
csum = 0;
std::vector<int> lcoor;
for(int l=0;l<grid->lSites();l++){
grid->CoorFromIndex(lcoor,l,grid->_ldimensions);
peekLocalSite(siteObj,lat,lcoor);
munge(siteObj,fileObj,csum);
}
grid->GlobalSum(csum);
}
static inline void Uint32Checksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum)
{
for(int i=0;i*sizeof(uint32_t)<buf_size_bytes;i++){
csum=csum+buf[i];
}
}
template<class vobj,class fobj,class munger>
static inline uint32_t readObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = Umu._grid;
std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
// Find the location of each site and send to primary node
// Take loop order from Chroma; this defines the loop order now that the NERSC
// doc is no longer available (how short-sighted is that?)
std::ifstream fin(file,std::ios::binary|std::ios::in);
fin.seekg(offset);
Umu = zero;
uint32_t csum=0;
fobj file_object;
sobj munged;
for(int t=0;t<grid->_fdimensions[3];t++){
for(int z=0;z<grid->_fdimensions[2];z++){
for(int y=0;y<grid->_fdimensions[1];y++){
for(int x=0;x<grid->_fdimensions[0];x++){
std::vector<int> site({x,y,z,t});
if ( grid->IsBoss() ) {
fin.read((char *)&file_object,sizeof(file_object));
if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
if(ieee64) le64toh_v((void *)&file_object,sizeof(file_object));
munge(file_object,munged,csum);
}
// The boss who read the file has their value poked
pokeSite(munged,Umu,site);
}}}}
return csum;
}
template<class vobj,class fobj,class munger>
static inline uint32_t writeObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string & format)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = Umu._grid;
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
//////////////////////////////////////////////////
// Serialise through node zero
//////////////////////////////////////////////////
std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
std::ofstream fout;
if ( grid->IsBoss() ) {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
fout.seekp(offset);
}
uint32_t csum=0;
fobj file_object;
sobj unmunged;
for(int t=0;t<grid->_fdimensions[3];t++){
for(int z=0;z<grid->_fdimensions[2];z++){
for(int y=0;y<grid->_fdimensions[1];y++){
for(int x=0;x<grid->_fdimensions[0];x++){
std::vector<int> site({x,y,z,t});
// peek & write
peekSite(unmunged,Umu,site);
munge(unmunged,file_object,csum);
if ( grid->IsBoss() ) {
if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
if(ieee32) htole32_v((void *)&file_object,sizeof(file_object));
if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));
fout.write((char *)&file_object,sizeof(file_object));
}
}}}}
return csum;
}
template<class vobj,class fobj,class munger>
static inline uint32_t readObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = Umu._grid;
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
// Take into account the block size of parallel file systems; we want about
// 4-16MB chunks.
// Ideally one reader/writer per xy plane and read these contiguously
// with comms from nominated I/O nodes.
std::ifstream fin;
int nd = grid->_ndimension;
std::vector<int> parallel(nd,1);
std::vector<int> ioproc (nd);
std::vector<int> start(nd);
std::vector<int> range(nd);
for(int d=0;d<nd;d++){
assert(grid->CheckerBoarded(d) == 0);
}
uint64_t slice_vol = 1;
int IOnode = 1;
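// Dimension 0 is forced serial; each parallel dimension is split across the
// processor grid, and a rank takes part in I/O only if its coordinate is zero
// in every non-parallel dimension (IOnode stays 1).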
for(int d=0;d<grid->_ndimension;d++) {
if ( d==0 ) parallel[d] = 0;
if (parallel[d]) {
range[d] = grid->_ldimensions[d];
start[d] = grid->_processor_coor[d]*range[d];
ioproc[d]= grid->_processor_coor[d];
} else {
range[d] = grid->_gdimensions[d];
start[d] = 0;
ioproc[d]= 0;
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
}
slice_vol = slice_vol * range[d];
}
{
uint32_t tmp = IOnode;
grid->GlobalSum(tmp);
std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
for(int d=0;d<grid->_ndimension;d++){
std::cout<< range[d];
if( d< grid->_ndimension-1 )
std::cout<< " x ";
}
std::cout << std::endl;
}
int myrank = grid->ThisRank();
int iorank = grid->RankFromProcessorCoor(ioproc);
if ( IOnode ) {
fin.open(file,std::ios::binary|std::ios::in);
}
//////////////////////////////////////////////////////////
// Find the location of each site and send to primary node
// Take loop order from Chroma; this defines the loop order now that the NERSC
// doc is no longer available (how short-sighted is that?)
//////////////////////////////////////////////////////////
Umu = zero;
uint32_t csum=0;
fobj fileObj;
sobj siteObj;
// need to implement these loops in an Nd-independent way with a lexicographic conversion
for(int tlex=0;tlex<slice_vol;tlex++){
std::vector<int> tsite(nd); // temporary mixed up site
std::vector<int> gsite(nd);
std::vector<int> lsite(nd);
std::vector<int> iosite(nd);
grid->CoorFromIndex(tsite,tlex,range);
for(int d=0;d<nd;d++){
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
gsite[d] = tsite[d]+start[d]; // global site
}
/////////////////////////
// Get the rank of owner of data
/////////////////////////
int rank, o_idx,i_idx, g_idx;
grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
grid->GlobalCoorToGlobalIndex(gsite,g_idx);
////////////////////////////////
// iorank reads from the seek
////////////////////////////////
if (myrank == iorank) {
fin.seekg(offset+g_idx*sizeof(fileObj));
fin.read((char *)&fileObj,sizeof(fileObj));
if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj));
munge(fileObj,siteObj,csum);
if ( rank != myrank ) {
grid->SendTo((void *)&siteObj,rank,sizeof(siteObj));
} else {
pokeLocalSite(siteObj,Umu,lsite);
}
} else {
if ( myrank == rank ) {
grid->RecvFrom((void *)&siteObj,iorank,sizeof(siteObj));
pokeLocalSite(siteObj,Umu,lsite);
}
}
grid->Barrier(); // necessary?
}
grid->GlobalSum(csum);
return csum;
}
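//////////////////////////////////////////////////////////
// Illustration (not part of this commit): the Nd-independent lexicographic
// conversion the loop above relies on. CoorFromIndex peels coordinates off
// with dimension 0 running fastest; a standalone sketch of the same rule:
//
//   void coorFromIndexSketch(std::vector<int> &coor,int index,
//                            const std::vector<int> &dims){
//     for(int d=0;d<(int)dims.size();d++){
//       coor[d] = index % dims[d];  // this dimension's coordinate
//       index   = index / dims[d];  // carry to the slower dimensions
//     }
//   }
//
// so tlex==1 advances x by one site, matching the x-fastest NERSC ordering.
//////////////////////////////////////////////////////////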
//////////////////////////////////////////////////////////
// Parallel writer
//////////////////////////////////////////////////////////
template<class vobj,class fobj,class munger>
static inline uint32_t writeObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string & format)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = Umu._grid;
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
int nd = grid->_ndimension;
for(int d=0;d<nd;d++){
assert(grid->CheckerBoarded(d) == 0);
}
std::vector<int> parallel(nd,1);
std::vector<int> ioproc (nd);
std::vector<int> start(nd);
std::vector<int> range(nd);
uint64_t slice_vol = 1;
int IOnode = 1;
for(int d=0;d<grid->_ndimension;d++) {
if ( d==0 ) parallel[d] = 0;
if (parallel[d]) {
range[d] = grid->_ldimensions[d];
start[d] = grid->_processor_coor[d]*range[d];
ioproc[d]= grid->_processor_coor[d];
} else {
range[d] = grid->_gdimensions[d];
start[d] = 0;
ioproc[d]= 0;
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
}
slice_vol = slice_vol * range[d];
}
{
uint32_t tmp = IOnode;
grid->GlobalSum(tmp);
std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
for(int d=0;d<grid->_ndimension;d++){
std::cout<< range[d];
if( d< grid->_ndimension-1 )
std::cout<< " x ";
}
std::cout << std::endl;
}
int myrank = grid->ThisRank();
int iorank = grid->RankFromProcessorCoor(ioproc);
// Take into account the block size of parallel file systems: want roughly
// 4-16MB chunks.
// Ideally one reader/writer per xy plane and read these contiguously
// with comms from nominated I/O nodes.
std::ofstream fout;
if ( IOnode ) fout.open(file,std::ios::binary|std::ios::in|std::ios::out);
//////////////////////////////////////////////////////////
// Find the location of each site and send to primary node
// Take loop order from Chroma; this defines the loop order now that the NERSC
// doc is no longer available (how short-sighted is that?)
//////////////////////////////////////////////////////////
uint32_t csum=0;
fobj fileObj;
sobj siteObj;
// need to implement these loops in an Nd-independent way with a lexicographic conversion
for(int tlex=0;tlex<slice_vol;tlex++){
std::vector<int> tsite(nd); // temporary mixed up site
std::vector<int> gsite(nd);
std::vector<int> lsite(nd);
std::vector<int> iosite(nd);
grid->CoorFromIndex(tsite,tlex,range);
for(int d=0;d<nd;d++){
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
gsite[d] = tsite[d]+start[d]; // global site
}
/////////////////////////
// Get the rank of owner of data
/////////////////////////
int rank, o_idx,i_idx, g_idx;
grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
grid->GlobalCoorToGlobalIndex(gsite,g_idx);
////////////////////////////////
// iorank writes from the seek
////////////////////////////////
if (myrank == iorank) {
if ( rank != myrank ) {
grid->RecvFrom((void *)&siteObj,rank,sizeof(siteObj));
} else {
peekLocalSite(siteObj,Umu,lsite);
}
munge(siteObj,fileObj,csum);
if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
if(ieee32) htole32_v((void *)&fileObj,sizeof(fileObj));
if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
if(ieee64) htole64_v((void *)&fileObj,sizeof(fileObj));
fout.seekp(offset+g_idx*sizeof(fileObj));
fout.write((char *)&fileObj,sizeof(fileObj));
} else {
if ( myrank == rank ) {
peekLocalSite(siteObj,Umu,lsite);
grid->SendTo((void *)&siteObj,iorank,sizeof(siteObj));
}
}
grid->Barrier(); // necessary? or barrier every 16 packets to rate-throttle?
}
grid->GlobalSum(csum);
return csum;
}
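//////////////////////////////////////////////////////////
// Usage sketch (illustrative; file name and munger names hypothetical):
// a parallel write followed by a parallel read-back should reproduce the
// same additive checksum, since the NERSC sum is order independent:
//
//   uint32_t wsum = writeObjectParallel<vobj,fobj>(Umu,"lat.bin",munge_out,offset,fmt);
//   uint32_t rsum = readObjectParallel <vobj,fobj>(Umu,"lat.bin",munge_in ,offset,fmt);
//   assert(wsum == rsum);
//////////////////////////////////////////////////////////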
};
}
#endif

View File

@ -7,57 +7,23 @@
#include <fstream>
#include <map>
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif
#include <arpa/inet.h>
// 64bit endian swap is a portability pain
#ifndef __has_builtin // Optional of course.
#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
#endif
#if HAVE_DECL_BE64TOH
#undef Grid_ntohll
#define Grid_ntohll be64toh
#endif
#if HAVE_DECL_NTOHLL
#undef Grid_ntohll
#define Grid_ntohll ntohll
#endif
#ifndef Grid_ntohll
#if BYTE_ORDER == BIG_ENDIAN
#define Grid_ntohll(A) (A)
#else
#if __has_builtin(__builtin_bswap64)
#define Grid_ntohll(A) __builtin_bswap64(A)
#else
#error
#endif
#endif
#endif
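// Illustration (not part of this commit): whichever branch defines it,
// Grid_ntohll must byte-reverse on little-endian hosts and be the identity
// on big-endian ones, so applying it twice is always the identity:
//
//   uint64_t x = 0x0102030405060708ULL;
//   assert( Grid_ntohll(Grid_ntohll(x)) == x );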
#include <unistd.h>
#include <sys/utsname.h>
#include <pwd.h>
namespace Grid {
namespace QCD {
using namespace QCD;
using namespace Grid;
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, 4 >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation
@ -86,50 +52,173 @@ class NerscField {
};
//////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
inline void NerscGrid(GridBase *grid,NerscField &header)
{
assert(grid->_ndimension==4);
for(int d=0;d<4;d++) {
header.dimension[d] = grid->_fdimensions[d];
}
for(int d=0;d<4;d++) {
header.boundary[d] = std::string("PERIODIC");
}
}
template<class GaugeField>
inline void NerscStatistics(GaugeField & data,NerscField &header)
{
header.link_trace=Grid::QCD::WilsonLoops<GaugeField>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<GaugeField>::avgPlaquette(data);
}
inline void NerscMachineCharacteristics(NerscField &header)
{
// Who
struct passwd *pw = getpwuid (getuid());
if (pw) header.creator = std::string(pw->pw_name);
// When
std::time_t t = std::time(nullptr);
std::tm tm = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
// What
struct utsname name; uname(&name);
header.creator_hardware = std::string(name.nodename)+"-";
header.creator_hardware+= std::string(name.machine)+"-";
header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
}
//////////////////////////////////////////////////////////////////////
// Utilities ; these are QCD aware
//////////////////////////////////////////////////////////////////////
inline void NerscChecksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum)
{
BinaryIO::Uint32Checksum(buf,buf_size_bytes,csum);
}
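// Illustration (not part of this commit): the NERSC checksum is a plain
// 32-bit wraparound sum over the payload words, e.g. per site:
//
//   uint32_t csum = 0;
//   NerscChecksum((uint32_t *)&fileObj, sizeof(fileObj), csum);
//
// Being order independent, it can be accumulated rank by rank and then
// combined with a GlobalSum, as the parallel reader and writer do.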
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
for(int mu=0;mu<4;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
}
}
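// Illustration (not part of this commit): the rule above is just
// row2 = conj(row0 x row1) for a matrix in SU(3); on plain std::complex
// entries the same reconstruction reads
//
//   u[2][c] = std::conj( u[0][(c+1)%3]*u[1][(c+2)%3]
//                      - u[0][(c+2)%3]*u[1][(c+1)%3] );   // c = 0,1,2
//
// which is why the two-row file format can drop the third row entirely.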
template<class fobj,class sobj>
struct NerscSimpleMunger{
void operator() (fobj &in,sobj &out,uint32_t &csum){
for(int mu=0;mu<4;mu++){
for(int i=0;i<3;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)()(i,j);
}}}
NerscChecksum((uint32_t *)&in,sizeof(in),csum);
};
};
template<class fobj,class sobj>
struct NerscSimpleUnmunger{
void operator() (sobj &in,fobj &out,uint32_t &csum){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<Nc;i++){
for(int j=0;j<Nc;j++){
out(mu)()(i,j) = in(mu)()(i,j);
}}}
NerscChecksum((uint32_t *)&out,sizeof(out),csum);
};
};
template<class fobj,class sobj>
struct Nersc3x2munger{
void operator() (fobj &in,sobj &out,uint32_t &csum){
NerscChecksum((uint32_t *)&in,sizeof(in),csum);
for(int mu=0;mu<4;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
reconstruct3(out);
}
};
template<class fobj,class sobj>
struct Nersc3x2unmunger{
void operator() (sobj &in,fobj &out,uint32_t &csum){
for(int mu=0;mu<4;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
NerscChecksum((uint32_t *)&out,sizeof(out),csum);
}
};
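// Illustration (not part of this commit): a munger is just a callable that
// maps file layout <-> site layout while folding the data into the checksum:
//
//   LorentzColour2x3F   fileObj;   // two-row compressed record from disk
//   LorentzColourMatrix siteObj;   // full 3x3 in-memory link field
//   uint32_t csum = 0;
//   Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>()(fileObj,siteObj,csum);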
////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; compute header offset for payload
////////////////////////////////////////////////////////////////////////////////
inline unsigned int writeNerscHeader(NerscField &field,std::string file)
{
std::ofstream fout(file,std::ios::out);
class NerscIO : public BinaryIO {
public:
static inline unsigned int writeHeader(NerscField &field,std::string file)
{
std::ofstream fout(file,std::ios::out);
fout.seekp(0,std::ios::beg);
fout << "BEGIN_HEADER" << std::endl;
fout << "HDR_VERSION = " << field.hdr_version << std::endl;
fout << "DATATYPE = " << field.data_type << std::endl;
fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;
for(int i=0;i<4;i++){
fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
}
// just to keep the space and write it later
fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
fout << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl;
for(int i=0;i<4;i++){
fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
}
fout << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;
fout << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;
fout << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;
fout << "CREATOR = " << field.creator << std::endl;
fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
fout << "CREATION_DATE = " << field.creation_date << std::endl;
fout << "ARCHIVE_DATE = " << field.archive_date << std::endl;
fout << "FLOATING_POINT = " << field.floating_point << std::endl;
fout << "END_HEADER" << std::endl;
field.data_start = fout.tellp();
return field.data_start;
fout << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::endl;
fout << std::dec;
fout << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;
fout << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;
fout << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;
fout << "CREATOR = " << field.creator << std::endl;
fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
fout << "CREATION_DATE = " << field.creation_date << std::endl;
fout << "ARCHIVE_DATE = " << field.archive_date << std::endl;
fout << "FLOATING_POINT = " << field.floating_point << std::endl;
fout << "END_HEADER" << std::endl;
field.data_start = fout.tellp();
return field.data_start;
}
// A little helper
inline void removeWhitespace(std::string &key)
{
key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
}
// for the header-reader
inline int readNerscHeader(std::string file,GridBase *grid, NerscField &field)
static inline int readHeader(std::string file,GridBase *grid, NerscField &field)
{
int offset=0;
std::map<std::string,std::string> header;
@ -163,7 +252,6 @@ inline int readNerscHeader(std::string file,GridBase *grid, NerscField &field)
//////////////////////////////////////////////////
// chomp the values
//////////////////////////////////////////////////
field.hdr_version = header["HDR_VERSION"];
field.data_type = header["DATATYPE"];
field.storage_format = header["STORAGE_FORMAT"];
@ -200,314 +288,21 @@ inline int readNerscHeader(std::string file,GridBase *grid, NerscField &field)
}
//////////////////////////////////////////////////////////////////////
// Utilities
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
for(int mu=0;mu<4;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
}
}
void inline be32toh_v(void *file_object,uint32_t bytes)
{
uint32_t * f = (uint32_t *)file_object;
for(int i=0;i*sizeof(uint32_t)<bytes;i++){
f[i] = ntohl(f[i]);
}
}
void inline le32toh_v(void *file_object,uint32_t bytes)
{
uint32_t *fp = (uint32_t *)file_object;
uint32_t f;
for(int i=0;i*sizeof(uint32_t)<bytes;i++){
f = fp[i];
// byte-swap to network (big-endian) order, then convert network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f);
}
}
void inline be64toh_v(void *file_object,uint32_t bytes)
{
uint64_t * f = (uint64_t *)file_object;
for(int i=0;i*sizeof(uint64_t)<bytes;i++){
f[i] = Grid_ntohll(f[i]);
}
}
void inline le64toh_v(void *file_object,uint32_t bytes)
{
uint64_t *fp = (uint64_t *)file_object;
uint64_t f,g;
for(int i=0;i*sizeof(uint64_t)<bytes;i++){
f = fp[i];
// byte-swap to network (big-endian) order, then convert network to host
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
g = g << 32;
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g); // 64-bit network-to-host (ntohl only handles 32 bits)
}
}
inline void NerscChecksum(uint32_t *buf,uint32_t buf_size,uint32_t &csum)
{
for(int i=0;i*sizeof(uint32_t)<buf_size;i++){
csum=csum+buf[i];
}
}
template<class fobj,class sobj>
struct NerscSimpleMunger{
void operator() (fobj &in,sobj &out,uint32_t &csum){
for(int mu=0;mu<4;mu++){
for(int i=0;i<3;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)()(i,j);
}}}
NerscChecksum((uint32_t *)&in,sizeof(in),csum);
};
};
template<class fobj,class sobj>
struct NerscSimpleUnmunger{
void operator() (sobj &in,fobj &out,uint32_t &csum){
for(int mu=0;mu<4;mu++){
for(int i=0;i<3;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)()(i,j);
}}}
NerscChecksum((uint32_t *)&out,sizeof(out),csum);
};
};
template<class fobj,class sobj>
struct Nersc3x2munger{
void operator() (fobj &in,sobj &out,uint32_t &csum){
NerscChecksum((uint32_t *)&in,sizeof(in),csum);
for(int mu=0;mu<4;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
reconstruct3(out);
}
};
template<class fobj,class sobj>
struct Nersc3x2unmunger{
void operator() (sobj &in,fobj &out,uint32_t &csum){
NerscChecksum((uint32_t *)&out,sizeof(out),csum);
for(int mu=0;mu<4;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
}
};
////////////////////////////////////////////////////////////////////////////
// Template wizardry to map types to strings for NERSC in an extensible way
////////////////////////////////////////////////////////////////////////////
template<class vobj> struct NerscDataType {
static void DataType (std::string &str) { str = std::string("4D_BINARY_UNKNOWN"); };
static void FloatingPoint(std::string &str) { str = std::string("IEEE64BIG"); };
};
template<> struct NerscDataType<iColourMatrix<ComplexD> > {
static void DataType (std::string &str) { str = std::string("4D_SU3_GAUGE_3X3"); };
static void FloatingPoint(std::string &str) { str = std::string("IEEE64BIG");};
};
template<> struct NerscDataType<iColourMatrix<ComplexF> > {
static void DataType (std::string &str) { str = std::string("4D_SU3_GAUGE_3X3"); };
static void FloatingPoint(std::string &str) { str = std::string("IEEE32BIG");};
};
//////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
/*
template<class vobj> inline uint32_t NerscChecksum(Lattice<vobj> & data)
{
uint32_t sum;
for(int ss=0;ss<data._grid->Osites();ss++){
uint32_t *iptr = (uint32_t *)& data._odata[0] ;
for(int i=0;i<sizeof(vobj);i+=sizeof(uint32_t)){
sum=sum+iptr[i];
}
}
data._grid->globalSum(sum);
return sum;
}
*/
template<class vobj> inline void NerscPhysicalCharacteristics(Lattice<vobj> & data,NerscField &header)
{
header.data_type = NerscDataType<vobj>::DataType;
header.floating_point = NerscDataType<vobj>::FloatingPoint;
return;
}
template<> inline void NerscPhysicalCharacteristics(LatticeGaugeField & data,NerscField &header)
{
NerscDataType<decltype(data._odata[0])>::DataType(header.data_type);
NerscDataType<decltype(data._odata[0])>::FloatingPoint(header.floating_point);
header.link_trace=1.0;
header.plaquette =1.0;
}
template<class vobj> inline void NerscStatisics(Lattice<vobj> & data,NerscField &header)
{
assert(data._grid->_ndimension==4);
for(int d=0;d<4;d++)
header.dimension[d] = data._grid->_fdimensions[d];
// compute checksum and any physical properties contained for this type
// header.checksum = NerscChecksum(data);
NerscPhysicalCharacteristics(data,header);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class sobj,class fobj,class munger>
inline void readNerscObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,std::string &format)
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu._grid;
int offset = readHeader(file,Umu._grid,header);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
// Find the location of each site and send to primary node
// for(int site=0; site < Layout::vol(); ++site){
// multi1d<int> coord = crtesn(site, Layout::lattSize());
// for(int dd=0; dd<Nd; dd++){ /* dir */
// cfg_in.readArray(su3_buffer, float_size, mat_size);
//
// Above from Chroma; this defines the loop order now that the NERSC doc is
// no longer available (how short-sighted is that?)
{
std::ifstream fin(file,std::ios::binary|std::ios::in);
fin.seekg(offset);
Umu = zero;
uint32_t csum=0;
fobj file_object;
sobj munged;
for(int t=0;t<grid->_fdimensions[3];t++){
for(int z=0;z<grid->_fdimensions[2];z++){
for(int y=0;y<grid->_fdimensions[1];y++){
for(int x=0;x<grid->_fdimensions[0];x++){
std::vector<int> site({x,y,z,t});
if ( grid->IsBoss() ) {
fin.read((char *)&file_object,sizeof(file_object));
if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
if(ieee64) le64toh_v((void *)&file_object,sizeof(file_object));
munge(file_object,munged,csum);
}
// The boss who read the file has their value poked
pokeSite(munged,Umu,site);
}}}}
}
}
template<class vobj,class sobj,class fobj,class munger>
inline void writeNerscObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,
int sequence,double lt,double pl)
{
GridBase *grid = Umu._grid;
NerscField header;
//////////////////////////////////////////////////
// First write the header; this is in the wrong place
//////////////////////////////////////////////////
assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){
header.dimension[d]=grid->_fdimensions[d];
header.boundary [d]=std::string("PERIODIC");
}
header.hdr_version=std::string("WHATDAHECK");
// header.storage_format=storage_format<vobj>::string; // use template specialisation
// header.data_type=data_type<vobj>::string;
header.storage_format=std::string("debug");
header.data_type =std::string("debug");
//FIXME; use template specialisation to fill these out
header.link_trace =lt;
header.plaquette =pl;
header.checksum =0;
//
header.sequence_number =sequence;
header.ensemble_id =std::string("UKQCD");
header.ensemble_label =std::string("UKQCD");
header.creator =std::string("Tadahito");
header.creator_hardware=std::string("BlueGene/Q");
header.creation_date =std::string("AnnoDomini");
header.archive_date =std::string("AnnoDomini");
header.floating_point =std::string("IEEE64BIG");
// header.data_start=;
// unsigned int checksum;
//////////////////////////////////////////////////
// Now write the body
//////////////////////////////////////////////////
{
std::ofstream fout(file,std::ios::binary|std::ios::out);
fout.seekp(offset);
Umu = zero;
uint32_t csum=0;
fobj file_object;
sobj unmunged;
for(int t=0;t<grid->_fdimensions[3];t++){
for(int z=0;z<grid->_fdimensions[2];z++){
for(int y=0;y<grid->_fdimensions[1];y++){
for(int x=0;x<grid->_fdimensions[0];x++){
std::vector<int> site({x,y,z,t});
peekSite(unmunged,Umu,site);
munge(unmunged,file_object,csum);
// broadcast & insert
fout.write((char *)&file_object,sizeof(file_object));
}}}}
}
}
inline void readNerscConfiguration(LatticeGaugeField &Umu,NerscField& header,std::string file)
{
GridBase *grid = Umu._grid;
int offset = readNerscHeader(file,Umu._grid,header);
NerscField clone(header);
std::string format(header.floating_point);
@ -516,48 +311,106 @@ inline void readNerscConfiguration(LatticeGaugeField &Umu,NerscField& header,std
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
uint32_t csum;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
readNerscObject<vLorentzColourMatrix, LorentzColourMatrix, LorentzColour2x3F>
(Umu,file,
Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(),
offset,format);
// csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
}
if ( ieee64 || ieee64big ) {
readNerscObject<vLorentzColourMatrix, LorentzColourMatrix, LorentzColour2x3D>
(Umu,file,
Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),
offset,format);
// csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
if ( ieee32 || ieee32big ) {
readNerscObject<vLorentzColourMatrix,LorentzColourMatrix,LorentzColourMatrixF>
// csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
}
if ( ieee64 || ieee64big ) {
readNerscObject<vLorentzColourMatrix,LorentzColourMatrix,LorentzColourMatrixD>
// csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
}
} else {
assert(0);
}
NerscStatistics<GaugeField>(Umu,clone);
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
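// Usage sketch (illustrative; file name hypothetical):
//
//   NerscField header;
//   NerscIO::readConfiguration(Umu,header,std::string("ckpoint_lat.1000"));
//
// On exit the plaquette, link trace and checksum have been re-derived from
// the data and asserted against the header values.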
template<class vobj>
inline void writeNerscConfiguration(Lattice<vobj> &Umu,NerscField &header,std::string file)
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,std::string file, int two_row,int bits32)
{
GridBase &grid = Umu._grid;
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
// Following should become arguments
NerscField header;
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
typedef LorentzColourMatrixF fobj3f;
typedef LorentzColour2x3F fobj2f;
GridBase *grid = Umu._grid;
NerscGrid(grid,header);
NerscStatistics<GaugeField>(Umu,header);
NerscMachineCharacteristics(header);
uint32_t csum;
int offset;
if ( two_row ) {
header.floating_point = std::string("IEEE64BIG");
header.data_type      = std::string("4D_SU3_GAUGE");
Nersc3x2unmunger<fobj2D,sobj> munge;
BinaryIO::Uint32Checksum<vobj,fobj2D>(Umu, munge,header.checksum);
offset = writeHeader(header,file);
csum=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
std::string file1 = file+"para";
int offset1 = writeHeader(header,file1);
int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
std::cout << GridLogMessage <<std::hex<< " TESTING PARALLEL WRITE csums " << csum1 << " "<< csum << std::endl;
std::cout << std::dec;
assert(offset1==offset);
assert(csum1==csum);
} else {
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3X3");
NerscSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
offset = writeHeader(header,file);
csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
}
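// Usage sketch (illustrative; file name hypothetical): two_row selects the
// compressed 4D_SU3_GAUGE format (and currently also triggers the parallel
// write cross-check); bits32 is accepted but not yet acted on here.
//
//   NerscIO::writeConfiguration(Umu,std::string("ckpoint_lat.1001"),1,0);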
};
}}
#endif

View File

@ -14,7 +14,7 @@
#ifndef SOURCE_PUGIXML_CPP
#define SOURCE_PUGIXML_CPP
#include "pugixml.hpp"
#include <pugixml/pugixml.h>
#include <stdlib.h>
#include <stdio.h>

View File

@ -17,7 +17,7 @@
#endif
// Include user configuration file (this can define various configuration macros)
#include "pugiconfig.hpp"
#include <pugixml/pugiconfig.hpp>
#ifndef HEADER_PUGIXML_HPP
#define HEADER_PUGIXML_HPP

View File

@ -62,7 +62,6 @@ namespace QCD {
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
// Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iSpinMatrix<ComplexF > SpinMatrixF;
@ -154,7 +153,7 @@ namespace QCD {
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
// singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tensor singlet complex type.
@ -162,7 +161,7 @@ namespace QCD {
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealD> TRealD; // Shouldn't need these; can I make it work without?
@ -251,6 +250,8 @@ namespace QCD {
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
// Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
typedef Lattice<vColourVector> LatticeStaggeredFermion;

View File

@ -7,16 +7,15 @@ template<class GaugeField>
class Action {
public:
virtual void init (const GaugeField &U, GridParallelRNG& pRNG) = 0; //
// Boundary conditions? // Heatbath?
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions
virtual RealD S (const GaugeField &U) = 0; // evaluate the action
virtual void deriv(const GaugeField &U,GaugeField & dSdU ) = 0; // evaluate the action derivative
virtual void refresh(const GaugeField & ) {}; // Default to no-op for actions with no internal fields
// Boundary conditions?
// Heatbath?
virtual ~Action() {};
};
// Could derive PseudoFermion action with a PF field, FermionField, and a Grid; implement refresh
/*
template<class GaugeField, class FermionField>
class PseudoFermionAction : public Action<GaugeField> {
public:
@ -32,5 +31,28 @@ class PseudoFermionAction : public Action<GaugeField> {
};
};
*/
template<class GaugeField> struct ActionLevel{
public:
typedef Action<GaugeField>* ActPtr; // now force the same colours as the rest of the code
int multiplier;
std::vector<ActPtr> actions;
ActionLevel(int mul = 1) : multiplier(mul) {
assert (mul > 0);
};
void push_back(ActPtr ptr){
actions.push_back(ptr);
}
};
template<class GaugeField> using ActionSet = std::vector<ActionLevel< GaugeField > >;
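// Illustration (not part of this commit): assembling a two-level action set,
// with hypothetical gauge and fermion Action<LatticeGaugeField> instances;
// the multiplier is how many times that level's steps nest per outer step.
//
//   ActionLevel<LatticeGaugeField> Level1(1), Level2(4);
//   Level1.push_back(&FermionAction);   // expensive force, outer level
//   Level2.push_back(&GaugeAction);     // cheap force, stepped 4x as often
//   ActionSet<LatticeGaugeField> TheAction;
//   TheAction.push_back(Level1);
//   TheAction.push_back(Level2);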
}}
#endif

View File

@ -16,10 +16,6 @@
#include <qcd/action/ActionBase.h>
#include <qcd/action/ActionParams.h>
////////////////////////////////////////////
// Gauge Actions
////////////////////////////////////////////
#include <qcd/action/gauge/WilsonGaugeAction.h>
////////////////////////////////////////////
@ -31,6 +27,17 @@
#include <qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
////////////////////////////////////////////
// Gauge Actions
////////////////////////////////////////////
#include <qcd/action/gauge/WilsonGaugeAction.h>
namespace Grid {
namespace QCD {
typedef WilsonGaugeAction<LatticeGaugeField> WilsonGaugeActionR;
typedef WilsonGaugeAction<LatticeGaugeFieldF> WilsonGaugeActionF;
typedef WilsonGaugeAction<LatticeGaugeFieldD> WilsonGaugeActionD;
}}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Explicit template instantiation is still required in the .cc files
//
@ -47,10 +54,10 @@
////////////////////////////////////////////////////////////////////////////////////////////////////
#define FermOpTemplateInstantiate(A) \
template class A<GparityWilsonImplF>; \
template class A<GparityWilsonImplD>; \
template class A<WilsonImplF>; \
template class A<WilsonImplD>;
// template class A<GparityWilsonImplF>; \
// template class A<GparityWilsonImplD>;
////////////////////////////////////////////
// Fermion operators / actions
@ -72,8 +79,8 @@
#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
#include <qcd/action/fermion/ContinuedFractionFermion5D.h> // Continued fraction
#include <qcd/action/fermion/OverlapWilsonContFracTanhFermion.h>
#include <qcd/action/fermion/OverlapWilsonContFracZolotarevFermion.h>
#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
#include <qcd/action/fermion/PartialFractionFermion5D.h> // Partial fraction
#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>

View File

@ -82,7 +82,8 @@ namespace QCD {
template<class Impl>
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
out = (4.0+mass)*in;
typename FermionField::scalar_type scal(4.0+mass);
out = scal*in;
}
template<class Impl>

View File

@ -28,18 +28,31 @@ namespace Grid {
void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
#define HANDOPT
#ifdef HANDOPT
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
#else
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out){
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out){
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif
WilsonKernels(const ImplParams &p= ImplParams()) : Base(p) {};

View File

@ -3,7 +3,7 @@
#define REGISTER
#define LOAD_CHIMU \
const vSpinColourVector & ref (in._odata[offset]); \
const SiteSpinor & ref (in._odata[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
Chimu_02=ref()(0)(2);\
@ -18,7 +18,7 @@
Chimu_32=ref()(3)(2);
#define LOAD_CHI\
const vHalfSpinColourVector &ref(buf[offset]); \
const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = ref()(0)(0);\
Chi_01 = ref()(0)(1);\
Chi_02 = ref()(0)(2);\
@ -56,13 +56,13 @@
UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;
#define PERMUTE\
permute(Chi_00,Chi_00,ptype);\
permute(Chi_01,Chi_01,ptype);\
permute(Chi_02,Chi_02,ptype);\
permute(Chi_10,Chi_10,ptype);\
permute(Chi_11,Chi_11,ptype);\
permute(Chi_12,Chi_12,ptype);
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10);\
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
@ -280,12 +280,16 @@
namespace Grid {
namespace QCD {
#if 0
template<class Simd>
void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
#ifdef HANDOPT
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
int ss,int sU,const FermionField &in, FermionField &out)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00; // 12 regs on knc
REGISTER Simd result_01;
REGISTER Simd result_02;
@ -339,20 +343,20 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset,local,perm, ptype;
int ss=sF;
// Xp
offset = st._offsets [Xp][ss];
local = st._is_local[Xp][ss];
perm = st._permute[Xp][ss];
ptype = st._permute_type[Xp];
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -364,16 +368,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
XP_RECON;
// Yp
offset = st._offsets [Yp][ss];
local = st._is_local[Yp][ss];
perm = st._permute[Yp][ss];
ptype = st._permute_type[Yp];
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -385,16 +389,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
// Zp
offset = st._offsets [Zp][ss];
local = st._is_local[Zp][ss];
perm = st._permute[Zp][ss];
ptype = st._permute_type[Zp];
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -405,16 +409,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
ZP_RECON_ACCUM;
// Tp
offset = st._offsets [Tp][ss];
local = st._is_local[Tp][ss];
perm = st._permute[Tp][ss];
ptype = st._permute_type[Tp];
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -425,16 +429,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
TP_RECON_ACCUM;
// Xm
offset = st._offsets [Xm][ss];
local = st._is_local[Xm][ss];
perm = st._permute[Xm][ss];
ptype = st._permute_type[Xm];
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -445,16 +449,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
XM_RECON_ACCUM;
// Ym
offset = st._offsets [Ym][ss];
local = st._is_local[Ym][ss];
perm = st._permute[Ym][ss];
ptype = st._permute_type[Ym];
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -465,16 +469,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
YM_RECON_ACCUM;
// Zm
offset = st._offsets [Zm][ss];
local = st._is_local[Zm][ss];
perm = st._permute[Zm][ss];
ptype = st._permute_type[Zm];
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -485,16 +489,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
ZM_RECON_ACCUM;
// Tm
offset = st._offsets [Tm][ss];
local = st._is_local[Tm][ss];
perm = st._permute[Tm][ss];
ptype = st._permute_type[Tm];
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -505,7 +509,7 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
TM_RECON_ACCUM;
{
vSpinColourVector & ref (out._odata[ss]);
SiteSpinor & ref (out._odata[ss]);
vstream(ref()(0)(0),result_00*(-0.5));
vstream(ref()(0)(1),result_01*(-0.5));
vstream(ref()(0)(2),result_02*(-0.5));
@ -521,11 +525,14 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSite(CartesianStencil &
}
}
template<class Simd>
void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00; // 12 regs on knc
REGISTER Simd result_01;
REGISTER Simd result_02;
@ -580,18 +587,19 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
int offset,local,perm, ptype;
StencilEntry *SE;
// Xp
offset = st._offsets [Xp][ss];
local = st._is_local[Xp][ss];
perm = st._permute[Xp][ss];
ptype = st._permute_type[Xp];
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -602,16 +610,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
XM_RECON;
// Yp
offset = st._offsets [Yp][ss];
local = st._is_local[Yp][ss];
perm = st._permute[Yp][ss];
ptype = st._permute_type[Yp];
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -623,16 +631,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
// Zp
offset = st._offsets [Zp][ss];
local = st._is_local[Zp][ss];
perm = st._permute[Zp][ss];
ptype = st._permute_type[Zp];
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -643,16 +651,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
ZM_RECON_ACCUM;
// Tp
offset = st._offsets [Tp][ss];
local = st._is_local[Tp][ss];
perm = st._permute[Tp][ss];
ptype = st._permute_type[Tp];
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TM_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -663,16 +671,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
TM_RECON_ACCUM;
// Xm
offset = st._offsets [Xm][ss];
local = st._is_local[Xm][ss];
perm = st._permute[Xm][ss];
ptype = st._permute_type[Xm];
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -684,16 +692,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
// Ym
offset = st._offsets [Ym][ss];
local = st._is_local[Ym][ss];
perm = st._permute[Ym][ss];
ptype = st._permute_type[Ym];
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -704,16 +712,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
YP_RECON_ACCUM;
// Zm
offset = st._offsets [Zm][ss];
local = st._is_local[Zm][ss];
perm = st._permute[Zm][ss];
ptype = st._permute_type[Zm];
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -724,16 +732,16 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
ZP_RECON_ACCUM;
// Tm
offset = st._offsets [Tm][ss];
local = st._is_local[Tm][ss];
perm = st._permute[Tm][ss];
ptype = st._permute_type[Tm];
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TP_PROJ;
if ( perm) {
PERMUTE;
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
@ -759,5 +767,6 @@ void WilsonKernels<WilsonImpl<Simd,3> >::DiracOptHandDhopSiteDag(CartesianStenci
vstream(ref()(3)(2),result_32*(-0.5));
}
}
FermOpTemplateInstantiate(WilsonKernels);
#endif
}}
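// Illustration (not part of this commit): the change above replaces four
// separate table lookups per direction with a single cached stencil entry:
//
//   StencilEntry *SE = st.GetEntry(ptype,dir,ss);
//   offset = SE->_offset;    // where to gather from
//   local  = SE->_is_local;  // local lattice vs. comms buffer
//   perm   = SE->_permute;   // whether SIMD lanes need rotating at the wrap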

View File

@ -7,35 +7,42 @@ namespace Grid{
////////////////////////////////////////////////////////////////////////
// Wilson Gauge Action .. should I template the Nc etc..
////////////////////////////////////////////////////////////////////////
template<class GaugeField, class MatrixField>
class WilsonGaugeAction : public Action<GaugeField> {
template<class GaugeField>
class WilsonGaugeAction : public Action<GaugeField> {
public:
typedef LorentzScalar<GaugeField> GaugeLinkField;
private:
RealD beta;
public:
WilsonGaugeAction(RealD b):beta(b){};
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {};
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
virtual RealD S(const GaugeField &U) {
RealD plaq = WilsonLoops<MatrixField,GaugeField>::avgPlaquette(U);
RealD plaq = WilsonLoops<GaugeField>::avgPlaquette(U);
std::cout<<GridLogMessage << "Plaq : "<<plaq << "\n";
RealD vol = U._grid->gSites();
RealD action=beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
std::cout << GridLogMessage << "WilsonGauge action "<<action<<std::endl;
return action;
};
virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
//not optimal implementation FIXME
//extend Ta to include Lorentz indexes
RealD factor = 0.5*beta/RealD(Nc);
MatrixField dSdU_mu(U._grid);
MatrixField Umu(U._grid);
GaugeLinkField Umu(U._grid);
GaugeLinkField dSdU_mu(U._grid);
for (int mu=0; mu < Nd; mu++){
Umu = PeekIndex<LorentzIndex>(U,mu);
// Staple in direction mu
WilsonLoops<MatrixField,GaugeField>::Staple(dSdU_mu,U,mu);
WilsonLoops<GaugeField>::Staple(dSdU_mu,U,mu);
dSdU_mu = Ta(Umu*adj(dSdU_mu))*factor;
pokeLorentz(dSdU, dSdU_mu, mu);
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
};
};
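// Worked check (illustrative): on a cold configuration U = 1 every plaquette
// is 1, so S = beta*(1-plaq)*Nd*(Nd-1)*vol*0.5 = 0; the staple is then a real
// multiple of the identity, and Ta() of that vanishes, so dSdU = 0 as it must.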

View File

@ -61,7 +61,7 @@ namespace Grid{
PowerNegQuarter.Init(remez,param.tolerance,true);
};
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
// = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}

View File

@ -61,7 +61,7 @@ namespace Grid{
PowerNegQuarter.Init(remez,param.tolerance,true);
};
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
//

View File

@ -57,7 +57,7 @@ namespace Grid{
PowerNegQuarter.Init(remez,param.tolerance,true);
};
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
// = e^{- phi^dag (MdagM)^-1/4 (MdagM)^-1/4 phi}

View File

@ -55,7 +55,7 @@ namespace Grid{
PowerNegQuarter.Init(remez,param.tolerance,true);
};
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
//

View File

@ -35,7 +35,7 @@ namespace Grid{
//////////////////////////////////////////////////////////////////////////////////////
// Push the gauge field in to the dops. Assume any BC's and smearing already applied
//////////////////////////////////////////////////////////////////////////////////////
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// P(phi) = e^{- phi^dag (MdagM)^-1 phi}
// Phi = Mdag eta

View File

@ -44,7 +44,7 @@ namespace Grid{
//////////////////////////////////////////////////////////////////////////////////////
// Push the gauge field in to the dops. Assume any BC's and smearing already applied
//////////////////////////////////////////////////////////////////////////////////////
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
// Phi = McpDag eta

View File

@ -40,7 +40,7 @@ namespace Grid{
conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
};
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
//

View File

@ -28,7 +28,7 @@ namespace Grid{
OperatorFunction<FermionField> & AS
) : NumOp(_NumOp), DenOp(_DenOp), DerivativeSolver(DS), ActionSolver(AS), Phi(_NumOp.FermionGrid()) {};
virtual void init(const GaugeField &U, GridParallelRNG& pRNG) {
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
//

View File

@ -8,8 +8,8 @@ namespace Grid{
////////////////////////////// Default values
Nsweeps = 200;
TotalSweeps = 220;
ThermalizationSteps = 20;
TotalSweeps = 240;
ThermalizationSteps = 40;
StartingConfig = 0;
SaveInterval = 1;
Filename_prefix = "Conf_";

View File

@ -15,6 +15,7 @@
namespace Grid{
namespace QCD{
struct HMCparameters{
Integer Nsweeps; /* @brief Number of sweeps in this run */
Integer TotalSweeps; /* @brief If provided, the total number of sweeps */
@ -26,14 +27,17 @@ namespace Grid{
HMCparameters();
};
template <class Algorithm>
class HybridMonteCarlo{
// template <class GaugeField, class Integrator, class Smearer, class Boundary>
template <class GaugeField, class IntegratorType>
class HybridMonteCarlo {
private:
const HMCparameters Params;
GridSerialRNG &sRNG; // Fixme: need a RNG management strategy.
GridParallelRNG &pRNG; // Fixme: need a RNG management strategy.
GridSerialRNG sRNG; // Fixme: need a RNG management strategy.
Integrator<Algorithm>& MD;
IntegratorType &TheIntegrator;
/////////////////////////////////////////////////////////
// Metropolis step
@ -63,16 +67,20 @@ namespace Grid{
/////////////////////////////////////////////////////////
// Evolution
/////////////////////////////////////////////////////////
RealD evolve_step(LatticeGaugeField& U){
MD.init(U); // set U and initialize P and phi's
RealD evolve_step(GaugeField& U){
TheIntegrator.refresh(U,pRNG); // set U and initialize P and phi's
RealD H0 = TheIntegrator.S(U); // initial state action
RealD H0 = MD.S(U); // initial state action
std::cout<<GridLogMessage<<"Total H before = "<< H0 << "\n";
MD.integrate(U);
TheIntegrator.integrate(U);
RealD H1 = MD.S(U); // updated state action
RealD H1 = TheIntegrator.S(U); // updated state action
std::cout<<GridLogMessage<<"Total H after = "<< H1 << "\n";
return (H1-H0);
}
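// Illustration (not part of this commit): the Metropolis step that consumes
// DeltaH = H1 - H0; accept with probability min(1, exp(-DeltaH)), e.g.
//
//   RealD rn; random(sRNG,rn);                       // uniform in [0,1)
//   bool accept = (DeltaH <= 0.0) || (rn <= std::exp(-DeltaH));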
@ -81,16 +89,18 @@ namespace Grid{
/////////////////////////////////////////
// Constructor
/////////////////////////////////////////
HybridMonteCarlo(HMCparameters Pms, Integrator<Algorithm>& MolDyn): Params(Pms),MD(MolDyn) {
//FIXME... initialize RNGs also with seed ; RNG management strategy
sRNG.SeedRandomDevice();
HybridMonteCarlo(HMCparameters Pms, IntegratorType &_Int, GridSerialRNG &_sRNG, GridParallelRNG &_pRNG ) :
Params(Pms),
TheIntegrator(_Int),
sRNG(_sRNG),
pRNG(_pRNG)
{
}
~HybridMonteCarlo(){};
void evolve(LatticeGaugeField& Uin){
void evolve(GaugeField& Uin){
Real DeltaH;
// Thermalizations
@ -102,9 +112,8 @@ namespace Grid{
}
// Actual updates (evolve a copy Ucopy then copy back eventually)
LatticeGaugeField Ucopy(Uin._grid);
for(int iter=Params.StartingConfig;
iter < Params.Nsweeps+Params.StartingConfig; ++iter){
GaugeField Ucopy(Uin._grid);
for(int iter=Params.StartingConfig; iter < Params.Nsweeps+Params.StartingConfig; ++iter){
std::cout<<GridLogMessage << "-- # Sweep = "<< iter << "\n";
Ucopy = Uin;

View File

@ -1,28 +0,0 @@
/*!
@file Integrator_base.cc
@brief utilities for MD including funcs to generate initial HMC momentum
*/
#include <Grid.h>
namespace Grid{
namespace QCD{
void MDutils::generate_momenta(LatticeLorentzColourMatrix& P,GridParallelRNG& pRNG){
// for future support of different groups
MDutils::generate_momenta_su3(P, pRNG);
}
void MDutils::generate_momenta_su3(LatticeLorentzColourMatrix& P,GridParallelRNG& pRNG){
LatticeColourMatrix Pmu(P._grid);
Pmu = zero;
for(int mu=0;mu<Nd;mu++){
SU3::GaussianLieAlgebraMatrix(pRNG, Pmu);
PokeIndex<LorentzIndex>(P, Pmu, mu);
}
}
}
}

View File

@ -10,143 +10,193 @@
#ifndef INTEGRATOR_INCLUDED
#define INTEGRATOR_INCLUDED
class Observer;
//class Observer;
#include <memory>
namespace Grid{
namespace QCD{
typedef Action<LatticeGaugeField>* ActPtr; // now force the same colours as the rest of the code
struct ActionLevel{
int multiplier;
public:
std::vector<ActPtr> actions;
explicit ActionLevel(int mul = 1):multiplier(mul){assert (mul > 0);};
void push_back(ActPtr ptr){
actions.push_back(ptr);
}
};
typedef std::vector<ActionLevel> ActionSet;
typedef std::vector<Observer*> ObserverList;
struct IntegratorParameters{
int Nexp;
int MDsteps; // number of outer steps
RealD trajL; // trajectory length
RealD stepsize;
IntegratorParameters(int Nexp_,
int MDsteps_,
RealD trajL_):
Nexp(Nexp_),MDsteps(MDsteps_),trajL(trajL_),stepsize(trajL/MDsteps){};
IntegratorParameters(int MDsteps_,
RealD trajL_=1.0,
int Nexp_=12):
Nexp(Nexp_),
MDsteps(MDsteps_),
trajL(trajL_),
stepsize(trajL/MDsteps)
{
// empty body constructor
};
};
namespace MDutils{
void generate_momenta(LatticeGaugeField&,GridParallelRNG&);
void generate_momenta_su3(LatticeGaugeField&,GridParallelRNG&);
}
/*! @brief Class for Molecular Dynamics management */
template< class IntegratorAlgorithm >
class Integrator{
private:
template<class GaugeField>
class Integrator {
protected:
typedef IntegratorParameters ParameterType;
IntegratorParameters Params;
const ActionSet as;
std::unique_ptr<LatticeGaugeField> P;
GridParallelRNG pRNG;
//ObserverList observers; // not yet
IntegratorAlgorithm TheIntegrator;
void register_observers();
void notify_observers();
const ActionSet<GaugeField> as;
void update_P(LatticeGaugeField&U, int level,double ep){
for(int a=0; a<as[level].actions.size(); ++a){
LatticeGaugeField force(U._grid);
as[level].actions.at(a)->deriv(U,force);
*P -= force*ep;
int levels; //
double t_U; // Track time passing on each level and for U and for P
std::vector<double> t_P; //
GaugeField P;
// Should match any legal (SU(n)) gauge field
// Need to use this template to match Ncol to pass to SU<N> class
template<int Ncol,class vec> void generate_momenta(Lattice< iVector< iScalar< iMatrix<vec,Ncol> >, Nd> > & P,GridParallelRNG& pRNG){
typedef Lattice< iScalar< iScalar< iMatrix<vec,Ncol> > > > GaugeLinkField;
GaugeLinkField Pmu(P._grid);
Pmu = zero;
for(int mu=0;mu<Nd;mu++){
SU<Ncol>::GaussianLieAlgebraMatrix(pRNG, Pmu);
PokeIndex<LorentzIndex>(P, Pmu, mu);
}
}
void update_U(LatticeGaugeField&U, double ep){
//ObserverList observers; // not yet
// typedef std::vector<Observer*> ObserverList;
// void register_observers();
// void notify_observers();
void update_P(GaugeField&U, int level,double ep){
t_P[level]+=ep;
update_P(P,U,level,ep);
std::cout<<GridLogMessage;
for(int l=0; l<level;++l) std::cout<<" ";
std::cout<<"["<<level<<"] P " << " dt "<< ep <<" : t_P "<< t_P[level] <<std::endl;
}
void update_P(GaugeField &Mom,GaugeField&U, int level,double ep){
for(int a=0; a<as[level].actions.size(); ++a){
GaugeField force(U._grid);
as[level].actions.at(a)->deriv(U,force);
Mom = Mom - force*ep;
}
}
void update_U(GaugeField&U, double ep){
update_U(P,U,ep);
t_U+=ep;
int fl = levels-1;
std::cout<<GridLogMessage<<" ";
for(int l=0; l<fl;++l) std::cout<<" ";
std::cout<<"["<<fl<<"] U " << " dt "<< ep <<" : t_U "<< t_U <<std::endl;
}
void update_U(GaugeField &Mom, GaugeField&U, double ep){
//rewrite exponential to deal automatically with the lorentz index?
LatticeColourMatrix Umu(U._grid);
LatticeColourMatrix Pmu(U._grid);
// GaugeLinkField Umu(U._grid);
// GaugeLinkField Pmu(U._grid);
for (int mu = 0; mu < Nd; mu++){
Umu=PeekIndex<LorentzIndex>(U, mu);
Pmu=PeekIndex<LorentzIndex>(*P, mu);
auto Umu=PeekIndex<LorentzIndex>(U, mu);
auto Pmu=PeekIndex<LorentzIndex>(Mom, mu);
Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
ProjectOnGroup(Umu);
PokeIndex<LorentzIndex>(U, Umu, mu);
}
}
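// The link update above applies expMat(Pmu, ep, Params.Nexp), presumably
// the exponential series truncated at Nexp terms (default 12), and then
// ProjectOnGroup to repair the residual deviation from the group manifold.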
friend void IntegratorAlgorithm::step (LatticeGaugeField& U,
int level, std::vector<int>& clock,
Integrator<IntegratorAlgorithm>* Integ);
virtual void step (GaugeField& U,int level, int first,int last)=0;
public:
Integrator(GridBase* grid, IntegratorParameters Par,
ActionSet& Aset):
Params(Par),as(Aset),P(new LatticeGaugeField(grid)),pRNG(grid){
pRNG.SeedRandomDevice();
Integrator(GridBase* grid,
IntegratorParameters Par,
ActionSet<GaugeField> & Aset):
Params(Par),
as(Aset),
P(grid),
levels(Aset.size())
{
t_P.resize(levels,0.0);
t_U=0.0;
};
~Integrator(){}
virtual ~Integrator(){}
//Initialization of momenta and actions
void init(LatticeGaugeField& U){
std::cout<<GridLogMessage<< "Integrator init\n";
MDutils::generate_momenta(*P,pRNG);
void refresh(GaugeField& U,GridParallelRNG &pRNG){
std::cout<<GridLogMessage<< "Integrator refresh\n";
generate_momenta(P,pRNG);
for(int level=0; level< as.size(); ++level){
for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
as[level].actions.at(actionID)->init(U, pRNG);
as[level].actions.at(actionID)->refresh(U, pRNG);
}
}
}
// Calculate action
RealD S(LatticeGaugeField& U){
LatticeComplex Hloc(U._grid);
Hloc = zero;
RealD S(GaugeField& U){
LatticeComplex Hloc(U._grid); Hloc = zero;
// Momenta
for (int mu=0; mu <Nd; mu++){
LatticeColourMatrix Pmu = peekLorentz(*P, mu);
auto Pmu = PeekIndex<LorentzIndex>(P, mu);
Hloc -= trace(Pmu*Pmu);
}
Complex Hsum = sum(Hloc);
RealD H = Hsum.real();
RealD Hterm;
std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";
// Actions
for(int level=0; level<as.size(); ++level)
for(int actionID=0; actionID<as[level].actions.size(); ++actionID)
H += as[level].actions.at(actionID)->S(U);
for(int level=0; level<as.size(); ++level){
for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
Hterm = as[level].actions.at(actionID)->S(U);
std::cout<<GridLogMessage << "Level "<<level<<" term "<<actionID<<" H = "<<Hterm<<std::endl;
H += Hterm;
}
}
std::cout<<GridLogMessage << "Total action H = "<< H << "\n";
return H;
}
void integrate(LatticeGaugeField& U){
std::vector<int> clock;
clock.resize(as.size(),0);
for(int step=0; step< Params.MDsteps; ++step) // MD step
TheIntegrator.step(U,0,clock, (this));
void integrate(GaugeField& U){
// reset the clocks
t_U=0;
for(int level=0; level<as.size(); ++level){
t_P[level]=0;
}
for(int step=0; step< Params.MDsteps; ++step){ // MD step
int first_step = (step==0);
int last_step = (step==Params.MDsteps-1);
this->step(U,0,first_step,last_step);
}
// Check the clocks all match on all levels
for(int level=0; level<as.size(); ++level){
assert(fabs(t_U - t_P[level])<1.0e-6); // must be the same
std::cout<<GridLogMessage<<" times["<<level<<"]= "<<t_P[level]<< " " << t_U <<std::endl;
}
// and that we indeed got to the end of the trajectory
assert(fabs(t_U-Params.trajL) < 1.0e-6);
}
};
}
}
#endif//INTEGRATOR_INCLUDED
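// A minimal usage sketch of the new interface. The gauge action type, the
// templated ActionLevel, and all parameter values below are hypothetical
// assumptions for illustration, not part of this change:
//
//   GridParallelRNG pRNG(UGrid);
//   pRNG.SeedRandomDevice();
//   WilsonGaugeAction<LatticeGaugeField> Waction(5.6); // hypothetical action
//   ActionLevel<LatticeGaugeField> Level1;
//   Level1.push_back(&Waction);
//   ActionSet<LatticeGaugeField> FullSet;
//   FullSet.push_back(Level1);
//   IntegratorParameters MDpar(20);                    // 20 MD steps, trajL=1.0
//   LeapFrog<LatticeGaugeField> MD(UGrid, MDpar, FullSet);
//   MD.refresh(U, pRNG);                               // draw Gaussian momenta
//   MD.integrate(U);                                   // one trajectory
//   RealD H = MD.S(U);                                 // Hamiltonian at trajectory end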

View File

@ -9,138 +9,236 @@
#ifndef INTEGRATOR_ALG_INCLUDED
#define INTEGRATOR_ALG_INCLUDED
namespace Grid{
namespace QCD{
/* PAB:
*
* Recursive leapfrog; explanation of nested stepping
*
* Nested 1:4; units in dt for top level integrator
*
* CHROMA IroIro
* 0 1 0
* P 1/2 P 1/2
* P 1/16 P1/16
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/16 P1/8
* P 1 P 1
* P 1/16 * skipped --- avoids re-evaluating the force
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/16 P1/8
* P 1 P 1
* P 1/16 * skipped
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/16 * skipped
* P 1 P 1
* P 1/16 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/8 P1/8
* U 1/8 U1/8
* P 1/16 P1/16
* P 1/2 P 1/2
*/
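// Worked example of the step-size recursion used below: eps at level l is
// the top-level stepsize divided by the product of the multipliers of
// levels 0..l. E.g. stepsize = 0.25 with multipliers {1,4} gives eps = 0.25
// at level 0 and eps = 0.0625 at level 1 in LeapFrog; MinimumNorm2 carries
// an extra factor 1/2 per level since it makes two U updates of 0.5*eps each.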
class MinimumNorm2{
const double lambda = 0.1931833275037836;
template<class GaugeField> class LeapFrog : public Integrator<GaugeField> {
public:
void step (LatticeLorentzColourMatrix& U,
int level, std::vector<int>& clock,
Integrator<MinimumNorm2>* Integ){
typedef LeapFrog<GaugeField> Algorithm;
LeapFrog(GridBase* grid,
IntegratorParameters Par,
ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
void step (GaugeField& U, int level,int _first, int _last){
int fl = this->as.size() -1;
// level : current level
// fl : final level
// eps : current step size
int fl = Integ->as.size() -1;
double eps = Integ->Params.stepsize;
for(int l=0; l<=level; ++l) eps/= 2.0*Integ->as[l].multiplier;
int fin = Integ->as[0].multiplier;
for(int l=1; l<=level; ++l) fin*= 2.0*Integ->as[l].multiplier;
fin = 3*Integ->Params.MDsteps*fin -1;
for(int e=0; e<Integ->as[level].multiplier; ++e){
if(clock[level] == 0){ // initial half step
Integ->update_P(U,level,lambda*eps);
++clock[level];
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
}
if(level == fl){ // lowest level
Integ->update_U(U,0.5*eps);
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"U "<< (clock[level]+1) <<std::endl;
}else{ // recursive function call
step(U,level+1,clock, Integ);
}
Integ->update_P(U,level,(1.0-2.0*lambda)*eps);
++clock[level];
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"P "<< (clock[level]) <<std::endl;
if(level == fl){ // lowest level
Integ->update_U(U,0.5*eps);
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"U "<< (clock[level]+1) <<std::endl;
}else{ // recursive function call
step(U,level+1,clock, Integ);
}
if(clock[level] == fin){ // final half step
Integ->update_P(U,level,lambda*eps);
++clock[level];
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
}else{ // bulk step
Integ->update_P(U,level,lambda*2.0*eps);
clock[level]+=2;
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"P "<< clock[level] <<std::endl;
}
}
}
};
class LeapFrog{
public:
void step (LatticeLorentzColourMatrix& U,
int level, std::vector<int>& clock,
Integrator<LeapFrog>* Integ){
// level : current level
// fl : final level
// eps : current step size
int fl = Integ->as.size() -1;
double eps = Integ->Params.stepsize;
// Get current level step size
for(int l=0; l<=level; ++l) eps/= Integ->as[l].multiplier;
RealD eps = this->Params.stepsize;
for(int l=0; l<=level; ++l) eps/= this->as[l].multiplier;
int fin = 1;
for(int l=0; l<=level; ++l) fin*= Integ->as[l].multiplier;
fin = 2*Integ->Params.MDsteps*fin - 1;
for(int e=0; e<Integ->as[level].multiplier; ++e){
if(clock[level] == 0){ // initial half step
Integ->update_P(U, level,eps/2.0);
++clock[level];
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
int multiplier = this->as[level].multiplier;
for(int e=0; e<multiplier; ++e){
int first_step = _first && (e==0);
int last_step = _last && (e==multiplier-1);
if(first_step){ // initial half step
this->update_P(U, level,eps/2.0);
}
if(level == fl){ // lowest level
Integ->update_U(U, eps);
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"U "<< 0.5*(clock[level]+1) <<std::endl;
this->update_U(U, eps);
}else{ // recursive function call
step(U, level+1,clock, Integ);
this->step(U, level+1,first_step,last_step);
}
if(clock[level] == fin){ // final half step
Integ->update_P(U, level,eps/2.0);
++clock[level];
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
}else{ // bulk step
Integ->update_P(U, level,eps);
clock[level]+=2;
for(int l=0; l<level;++l) std::cout<<GridLogMessage<<" ";
std::cout<<GridLogMessage<<"P "<< 0.5*clock[level] <<std::endl;
}
int mm = last_step ? 1 : 2;
this->update_P(U, level,mm*eps/2.0);
}
}
};
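// Per multiplier iteration the leapfrog above realises the familiar
// P(eps/2) U(eps) P(eps/2) pattern, except that on all but the final
// iteration the trailing half-step is fused with the next leading one into
// a single P(eps) update (mm==2); these are the "skipped" entries in the
// schedule comment above.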
template<class GaugeField> class MinimumNorm2 : public Integrator<GaugeField> {
private:
const RealD lambda = 0.1931833275037836;
public:
MinimumNorm2(GridBase* grid,
IntegratorParameters Par,
ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
void step (GaugeField& U, int level, int _first,int _last){
// level : current level
// fl : final level
// eps : current step size
int fl = this->as.size() -1;
RealD eps = this->Params.stepsize*2.0;
for(int l=0; l<=level; ++l) eps/= 2.0*this->as[l].multiplier;
// Nesting: 2xupdate_U of size eps/2
// Next level is eps/2/multiplier
int multiplier = this->as[level].multiplier;
for(int e=0; e<multiplier; ++e){ // steps per step
int first_step = _first && (e==0);
int last_step = _last && (e==multiplier-1);
if(first_step){ // initial half step
this->update_P(U,level,lambda*eps);
}
if(level == fl){ // lowest level
this->update_U(U,0.5*eps);
}else{ // recursive function call
this->step(U,level+1,first_step,0);
}
this->update_P(U,level,(1.0-2.0*lambda)*eps);
if(level == fl){ // lowest level
this->update_U(U,0.5*eps);
}else{ // recursive function call
this->step(U,level+1,0,last_step);
}
int mm = (last_step) ? 1 : 2;
this->update_P(U,level,lambda*eps*mm);
}
}
};
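// The scheme above is the second-order minimum-norm (Omelyan) integrator:
// per iteration P(lambda*eps) U(eps/2) P((1-2*lambda)*eps) U(eps/2)
// P(lambda*eps), with lambda = 0.1931833... chosen to minimise the leading
// error coefficient; adjacent lambda*eps momentum updates are again fused
// via mm==2.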
template<class GaugeField> class ForceGradient : public Integrator<GaugeField> {
private:
const RealD lambda = 1.0/6.0;
const RealD chi = 1.0/72.0;
const RealD xi = 0.0;
const RealD theta = 0.0;
public:
// Looks like dH scales as dt^4; tested on Wilson/Wilson at 2 levels.
ForceGradient(GridBase* grid,
IntegratorParameters Par,
ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
void FG_update_P(GaugeField&U, int level,double fg_dt,double ep){
GaugeField Ufg(U._grid);
GaugeField Pfg(U._grid);
Ufg = U;
Pfg = zero;
std::cout << GridLogMessage << "FG update "<<fg_dt<<" "<<ep<<std::endl;
// prepare_fg; no prediction/result cache for now
// could relax CG stopping conditions for the
// derivatives in the small step since the force gets multiplied by
// a tiny dt^2 term relative to main force.
//
// Presently 4 force evals, and should have 3, so 1.33x too expensive.
// could reduce this with sloppy CG to perhaps 1.15x too expensive
// even without prediction.
this->update_P(Pfg,Ufg,level,1.0);
this->update_U(Pfg,Ufg,fg_dt);
this->update_P(Ufg,level,ep);
}
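// Reading of the code above: Pfg picks up (minus) the force at U with unit
// step, Ufg is then displaced along that force by fg_dt, and the ep-sized
// momentum update is taken from the force at the displaced field. This
// folds the force-gradient correction into one extra force evaluation per
// half-step, as the cost comment above describes.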
void step (GaugeField& U, int level, int _first,int _last){
RealD eps = this->Params.stepsize*2.0;
for(int l=0; l<=level; ++l) eps/= 2.0*this->as[l].multiplier;
RealD Chi = chi*eps*eps*eps;
int fl = this->as.size() -1;
int multiplier = this->as[level].multiplier;
for(int e=0; e<multiplier; ++e){ // steps per step
int first_step = _first && (e==0);
int last_step = _last && (e==multiplier-1);
if(first_step){ // initial half step
this->update_P(U,level,lambda*eps);
}
if(level == fl){ // lowest level
this->update_U(U,0.5*eps);
}else{ // recursive function call
this->step(U,level+1,first_step,0);
}
this->FG_update_P(U,level,2*Chi/((1.0-2.0*lambda)*eps),(1.0-2.0*lambda)*eps);
if(level == fl){ // lowest level
this->update_U(U,0.5*eps);
}else{ // recursive function call
this->step(U,level+1,0,last_step);
}
int mm = (last_step) ? 1 : 2;
this->update_P(U,level,lambda*eps*mm);
}
}
};
}
}
#endif//INTEGRATOR_INCLUDED

View File

@ -524,16 +524,22 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
// reunitarise??
static void LieRandomize(GridParallelRNG &pRNG,LatticeMatrix &out,double scale=1.0){
GridBase *grid = out._grid;
LatticeComplex ca (grid);
LatticeMatrix lie(grid);
LatticeMatrix la (grid);
Complex ci(0.0,scale);
Complex cone(1.0,0.0);
Matrix ta;
lie=zero;
for(int a=0;a<generators();a++){
random(pRNG,ca); ca=real(ca)-0.5;
random(pRNG,ca);
ca = (ca+conjugate(ca))*0.5;
ca = ca - 0.5;
generator(a,ta);
la=ci*ca*ta;

View File

@ -4,9 +4,12 @@ namespace Grid {
namespace QCD {
// Common wilson loop observables
template<class GaugeMat,class GaugeLorentz>
template<class GaugeLorentz>
class WilsonLoops {
public:
typedef LorentzScalar<GaugeLorentz> GaugeMat;
//////////////////////////////////////////////////
// directed plaquette oriented in mu,nu plane
//////////////////////////////////////////////////
@ -68,6 +71,22 @@ public:
return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
}
static RealD linkTrace(const GaugeLorentz &Umu){
std::vector<GaugeMat> U(4,Umu._grid);
LatticeComplex Tr(Umu._grid); Tr=zero;
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Tr = Tr+trace(U[mu]);
}
TComplex Tp = sum(Tr);
Complex p = TensorRemove(Tp);
double vol = Umu._grid->gSites();
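// NB: the 4.0 and 3.0 below hard-code Nd=4, Nc=3 (same caveat as the FIXME
// in the plaquette normalisation above).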
return p.real()/vol/4.0/3.0;
};
//////////////////////////////////////////////////
// the sum over all staples on each site
//////////////////////////////////////////////////
@ -125,10 +144,10 @@ void siteRectangle(GaugeMat &plaq,const std::vector<GaugeMat> &U, const int mu,
};
typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> ColourWilsonLoops;
typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> U1WilsonLoops;
typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> SU2WilsonLoops;
typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> SU3WilsonLoops;
typedef WilsonLoops<LatticeGaugeField> ColourWilsonLoops;
typedef WilsonLoops<LatticeGaugeField> U1WilsonLoops;
typedef WilsonLoops<LatticeGaugeField> SU2WilsonLoops;
typedef WilsonLoops<LatticeGaugeField> SU3WilsonLoops;
}}

View File

@ -0,0 +1,49 @@
#ifndef GRID_SERIALISATION_ABSTRACT_READER_H
#define GRID_SERIALISATION_ABSTRACT_READER_H
namespace Grid {
class Writer {
public:
virtual ~Writer() {};
virtual void push(const std::string &s) = 0;
virtual void pop(void) =0;
virtual void write( const std::string& s,const std::string &output ) =0;
virtual void write( const std::string& s,const int16_t output ) =0;
virtual void write( const std::string& s,const uint16_t output ) =0;
virtual void write( const std::string& s,const int32_t output ) =0;
virtual void write( const std::string& s,const uint32_t output ) =0;
virtual void write( const std::string& s,const int64_t output ) =0;
virtual void write( const std::string& s,const uint64_t output ) =0;
virtual void write( const std::string& s,const float output ) =0;
virtual void write( const std::string& s,const double output ) =0;
virtual void write( const std::string& s,const bool output ) =0;
};
class Reader {
public:
virtual ~Reader() {};
virtual void read( const std::string& s,std::string &output ) =0;
virtual void push(const std::string &s) =0;
virtual void pop(void) = 0;
virtual void read( const std::string& s, int16_t &output ) =0;
virtual void read( const std::string& s, uint16_t &output ) =0;
virtual void read( const std::string& s, int32_t &output ) =0;
virtual void read( const std::string& s, uint32_t &output ) =0;
virtual void read( const std::string& s, int64_t &output ) =0;
virtual void read( const std::string& s, uint64_t &output ) =0;
virtual void read( const std::string& s, float &output ) =0;
virtual void read( const std::string& s, double &output ) =0;
virtual void read( const std::string& s, bool &output ) =0;
};
}
#endif

View File

@ -0,0 +1,109 @@
#ifndef GRID_SERIALISATION_BINARY_READER_H
#define GRID_SERIALISATION_BINARY_READER_H
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <math.h>
#include <vector>
#include <cassert>
namespace Grid {
class BinaryWriter : public Writer {
private:
std::ofstream file;
public:
BinaryWriter(const std::string &_file) : file(_file,std::ios::binary|std::ios::out) {}
~BinaryWriter() {}
// Binary is scopeless
void push(const std::string &s) {}
void pop(void) {}
void write(const std::string& s,const std::string &output) {
uint32_t sz = output.size();
write(s,sz);
const char * cstr = output.c_str();
for(int c=0;c<output.size();c++){
write(s,cstr[c]);
}
};
void write( const std::string& s,const char output ) { writeInternal(s,output); };
void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private:
template<class T> void writeInternal( const std::string& s,const T output ){
// FIXME --- htons, htonl, hton64 etc..
file.write((char *)&output,sizeof(T));
}
};
class BinaryReader : public Reader{
private:
std::ifstream file;
public:
BinaryReader(const std::string &_file) : file(_file,std::ios::binary|std::ios::in) {}
~BinaryReader() {}
// Binary is scopeless
void push(const std::string &s) { }
void pop(void) { }
void read( const std::string& s,std::string &output ) {
output.clear();
uint32_t sz;
file.read((char *)&sz,sizeof(sz));
for(int c=0;c<sz;c++){
char ch;
file.read(&ch,sizeof(ch));
output.push_back(ch);
}
};
void read( const std::string& s, int16_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint16_t &output ) { readInternal(s,output); };
void read( const std::string& s, int32_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint32_t &output ) { readInternal(s,output); };
void read( const std::string& s, int64_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint64_t &output ) { readInternal(s,output); };
void read( const std::string& s, float &output ) { readInternal(s,output); };
void read( const std::string& s, double &output ) { readInternal(s,output); };
void read( const std::string& s, bool &output ) { readInternal(s,output); };
private:
template<class T> void readInternal( const std::string& path, T &output ){
file.read((char *)&output,sizeof(output)); // byte order??
}
};
}
#endif

View File

@ -1,6 +1,47 @@
#ifndef GRID_MACRO_MAGIC_H
#define GRID_MACRO_MAGIC_H
///////////////////////////////////////////
// Strong credit to :
//
// http://jhnet.co.uk/articles/cpp_magic
//
//
// "The C Pre-Processor (CPP) is the somewhat basic macro system used by the C
// programming language to implement features such as #include and #define
// which allow very simple text-substitutions to be carried out at compile time.
// In this article we abuse the humble #define to implement if-statements and iteration.
//
// Before we begin, a disclaimer: these tricks, while perfectly valid C, should not be
// considered good development practice and should almost certainly not be used for "real work".
// That said it can totally be used for fun home-automation projects...
//
// https://github.com/18sg/uSHET/blob/master/lib/cpp_magic.h
//
// The cpp_magic.h license (prior to my modifications):
/*
Copyright (c) 2014 Thomas Nixon, Jonathan Heathcote
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
///////////////////////////////////////////
#define strong_inline __attribute__((always_inline)) inline
#ifndef MAX
@ -63,9 +104,15 @@
#define _GRID_MACRO_MAP() GRID_MACRO_MAP
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// These are the Grid extensions to serialisation (beyond popping first AND second)
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define GRID_MACRO_MEMBER(A,B) A B;
#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
#define GRID_MACRO_READ_MEMBER(A,B) read(RD,#B,obj. B);
#define GRID_MACRO_WRITE_MEMBER(A,B) write(WR,#B,obj. B);
#define GRID_DECL_CLASS_MEMBERS(cname,...) \
\
@ -73,11 +120,26 @@
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \
\
\
friend std::ostream & operator << (std::ostream &os, const cname &obj ) { \
friend void write(Writer &WR,const std::string &s, const cname &obj){ \
push(WR,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
pop(WR);\
} \
\
\
friend void read(Reader &RD,const std::string &s, cname &obj){ \
push(RD,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
pop(RD);\
} \
\
\
friend std::ostream & operator << (std::ostream &os, const cname &obj ) { \
os<<"class "<<#cname<<" {"<<std::endl;\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__)) \
os<<"}"; \
return os;\
};
#endif
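// A minimal sketch of the intended use of GRID_DECL_CLASS_MEMBERS; the class
// and member names here are hypothetical:
//
//   class MyParams {
//   public:
//     GRID_DECL_CLASS_MEMBERS(MyParams,
//                             int,    Ls,
//                             double, mass);
//   };
//
// The macro expands to the member declarations themselves plus friend
// read/write functions and operator<<, so a MyParams can be round-tripped
// through any Reader/Writer pair implementing the interface in BaseIO.h.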

View File

@ -0,0 +1,103 @@
#ifndef GRID_SERIALISATION_READER_H
#define GRID_SERIALISATION_READER_H
#include <serialisation/MacroMagic.h>
#include <serialisation/BaseIO.h>
#include <stdint.h>
namespace Grid {
// Generic reader writer interface
inline void push(Writer & WR,const std::string &s) { WR.push(s);}
inline void push(Writer & WR,const char *s) { WR.push(std::string(s));}
inline void pop (Writer & WR) { WR.pop();}
// inline void write(Writer& wr, const std::string& s,const char * output ) { wr.write(s,std::string(output)); };
inline void write(Writer& wr, const std::string& s,const std::string &output) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const int16_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const uint16_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const int32_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const uint32_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const int64_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const uint64_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const float output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const double output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const bool output ) { wr.write(s,output); };
inline void push(Reader & WR,const std::string &s) { WR.push(s);}
inline void push(Reader & WR,const char *s) { WR.push(std::string(s));}
inline void pop (Reader & WR) { WR.pop();}
inline void read(Reader& rd, const std::string& s,std::string &output) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, int16_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, uint16_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, int32_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, uint32_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, int64_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, uint64_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, float &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, double &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, bool &output ) { rd.read(s,output); };
template<class T> void write(Writer& wr, const std::string& s,const std::vector<T> &output ) {
push(wr,s);
uint64_t sz =output.size();
write(wr,"N",sz);
for(int i=0;i<output.size();i++){
std::ostringstream oss; oss << "elem" << i;
write(wr,oss.str(),output[i]);
}
pop(wr);
};
template<class T>
void read(Reader& rd, const std::string& s,std::vector<T> &output ) {
push(rd,s);
uint64_t sz; read(rd,"N",sz);
// the vector length is stored under "N"
T tmp;
output.resize(0);
for(int i=0;i<sz;i++){
std::ostringstream oss; oss << "elem" << i;
read(rd,oss.str(),tmp);
output.push_back(tmp);
}
pop(rd);
};
template < class T >
inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
{
os << "[";
for (typename std::vector<T>::const_iterator ii = v.begin(); ii != v.end(); ++ii)
{
os << " " << *ii;
}
os << " ]";
return os;
}
}
//////////////////////////////////////////
// Todo:
//////////////////////////////////////////
#include <serialisation/BinaryIO.h>
#include <serialisation/TextIO.h>
//#include <serialisation/JsonIO.h>
//#include <serialisation/YamlIO.h>
#include <serialisation/XmlIO.h>
//////////////////////////////////////////
// Select the default serialiser use ifdef's
//////////////////////////////////////////
namespace Grid {
typedef XMLReader DefaultReader;
typedef XMLWriter DefaultWriter;
}
#endif
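// Round-trip sketch using the generic free functions above; the file name is
// hypothetical:
//
//   {
//     XMLWriter WR("io.xml");   // or BinaryWriter / TextWriter
//     std::vector<double> v(3,1.0);
//     write(WR,"v",v);
//   }
//   {
//     XMLReader RD("io.xml");
//     std::vector<double> v;
//     read(RD,"v",v);           // restores the three elements
//   }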

148
lib/serialisation/TextIO.h Normal file
View File

@ -0,0 +1,148 @@
#ifndef GRID_SERIALISATION_TEXT_READER_H
#define GRID_SERIALISATION_TEXT_READER_H
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <math.h>
#include <vector>
#include <cassert>
namespace Grid {
class TextWriter : public Writer {
private:
std::ofstream file;
int level;
void indent(void) {
for(int i=0;i<level;i++){
file <<"\t";
}
}
public:
TextWriter(const std::string &_file) : file(_file,std::ios::out) {
level=0;
}
~TextWriter() { }
void push(const std::string &s)
{
// std::string tmp = s;
// write(s,tmp);
level++;
}
void pop(void) {
level--;
}
void write( const std::string& s,const std::string &output ) {
indent();
file<<output<<std::endl;
};
void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private:
template<class T> void writeInternal( const std::string& s,const T output ){
indent();
file << std::boolalpha << output<<std::endl;
}
};
class TextReader : public Reader {
private:
std::ifstream file;
int level;
public:
TextReader(const std::string &_file) : file(_file,std::ios::in) { level = 0;};
~TextReader() { }
void read( const std::string& s,std::string &output ) {
char c='a';
for(int i=0;i<level;i++){
file.get(c);
if ( c != '\t' )
std::cout << "mismatch on tab "<<c<<" level "<< level<< " i "<< i<<std::endl;
}
output.clear();
std::getline(file,output);
};
void push(const std::string &s) {
// std::string tmp; read(s,tmp);
level++;
}
void pop(void) { level--; }
template<class T>
void read( const std::string& s, std::vector<T> &output ) {
push(s);
uint64_t n; read("N",n);
// the vector length is stored under "N"
T tmp;
output.resize(0);
for(int i=0;i<n;i++){
std::ostringstream oss; oss << "elem" << i;
read(*this,oss.str(),tmp);
output.push_back(tmp);
}
pop();
};
void read( const std::string& s, int16_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint16_t &output ) { readInternal(s,output); };
void read( const std::string& s, int32_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint32_t &output ) { readInternal(s,output); };
void read( const std::string& s, int64_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint64_t &output ) { readInternal(s,output); };
void read( const std::string& s, float &output ) { readInternal(s,output); };
void read( const std::string& s, double &output ) { readInternal(s,output); };
void read( const std::string& s, bool &output ) { readInternal(s,output); };
private:
template<class T> void readInternal( const std::string& path, T &output ){
std::string asString;
read(path,asString);
convert(asString,output);
}
template<class T> void convert(const std::string &asString,T &output)
{
std::istringstream is(asString); is.exceptions(std::ios::failbit);
try {
is >> std::boolalpha >> output;
} catch(std::istringstream::failure e) {
std::cerr << "XML read failure on "<<" "<<asString<<" "<<typeid(T).name()<<std::endl;
}
assert( is.tellg()==-1);
}
};
}
#endif

146
lib/serialisation/XmlIO.h Normal file
View File

@ -0,0 +1,146 @@
#ifndef GRID_SERIALISATION_XML_READER_H
#define GRID_SERIALISATION_XML_READER_H
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <math.h>
#include <vector>
#include <cassert>
#include "pugixml/pugixml.h"
namespace Grid {
class XMLWriter : public Writer {
private:
pugi::xml_document doc;
pugi::xml_node node;
std::string file;
public:
XMLWriter(const std::string &_file) : file(_file)
{
node=doc.append_child();
node.set_name("document");
}
~XMLWriter()
{
// simple_walker walker;
// doc.traverse(walker);
doc.save_file(file.c_str()," ");
}
void push(const std::string &s)
{
node = node.append_child(s.c_str());
}
void pop(void) {
node = node.parent();
}
void write( const std::string& s,const std::string &output ) {
pugi::xml_node leaf=node.append_child(s.c_str());
leaf.append_child(pugi::node_pcdata).set_value(output.c_str());
};
void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private:
template<class T> void writeInternal( const std::string& s,const T output ){
std::ostringstream os;
os << std::boolalpha << output;
write(s,os.str());
}
};
class XMLReader : public Reader {
private:
pugi::xml_document doc;
pugi::xml_node node;
public:
XMLReader(const std::string &_file)
{
pugi::xml_parse_result result = doc.load_file(_file.c_str());
if ( !result ) {
std::cout << "XML error description: " << result.description() << "\n";
std::cout << "XML error offset: " << result.offset << "\n";
}
assert(result);
// simple_walker walker;
// doc.traverse(walker);
node= doc.child("document");
}
~XMLReader() { }
void read( const std::string& s,std::string &output ) {
output=node.child(s.c_str()).first_child().value();
};
void push(const std::string &s)
{
node = node.child(s.c_str());
}
void pop(void) {
node = node.parent();
}
void read( const std::string& s, int16_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint16_t &output ) { readInternal(s,output); };
void read( const std::string& s, int32_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint32_t &output ) { readInternal(s,output); };
void read( const std::string& s, int64_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint64_t &output ) { readInternal(s,output); };
void read( const std::string& s, float &output ) { readInternal(s,output); };
void read( const std::string& s, double &output ) { readInternal(s,output); };
void read( const std::string& s, bool &output ) { readInternal(s,output); };
private:
template<class T> void readInternal( const std::string& path, T &output ){
std::string asString;
read(path,asString);
convert(asString,output);
}
template<class T> void convert(const std::string &asString,T &output)
{
std::istringstream is(asString); is.exceptions(std::ios::failbit);
try {
is >> std::boolalpha >> output;
} catch(std::istringstream::failure e) {
std::cerr << "XML read failure on "<<" "<<asString<<" "<<typeid(T).name()<<std::endl;
}
assert( is.tellg()==-1);
}
};
}
#endif

View File

@ -183,11 +183,11 @@ namespace Optimization {
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
}
@ -270,7 +270,7 @@ namespace Optimization {
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
@ -282,7 +282,7 @@ namespace Optimization {
struct TimesI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
}
//Complex double
@ -296,27 +296,44 @@ namespace Optimization {
// Some Template specialization
//////////////////////////////////////////////
template < typename vtype >
void permute(vtype &a,vtype b, int perm) {
uconv<vtype> conv;
conv.v = b;
switch (perm){
// 8x32 bits=>3 permutes
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
default: assert(0); break;
}
a = conv.v;
}
struct Permute{
static inline __m256 Permute0(__m256 in){
return _mm256_permute2f128_ps(in,in,0x01);
};
static inline __m256 Permute1(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m256 Permute2(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m256 Permute3(__m256 in){
return in;
};
static inline __m256d Permute0(__m256d in){
return _mm256_permute2f128_pd(in,in,0x01);
};
static inline __m256d Permute1(__m256d in){
return _mm256_shuffle_pd(in,in,0x5);
};
static inline __m256d Permute2(__m256d in){
return in;
};
static inline __m256d Permute3(__m256d in){
return in;
};
};
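// Permute0..Permute3 swap lanes at decreasing distance: for single precision
// Permute0 exchanges the two 128-bit halves, Permute1 the 64-bit pairs
// within each half, Permute2 adjacent 32-bit elements, and Permute3 is the
// identity. The Reduce specialisations below chain them as a log2(N)
// butterfly reduction.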
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad complex single
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
v1= _mm256_add_ps(v1,in);
v2=Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v = v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
@ -326,11 +343,11 @@ namespace Optimization {
template<>
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; octo-double
v1 = Optimization::Permute::Permute0(in); // avx 256; octo-float
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
Optimization::permute(v2,v1,2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v=v1;
return conv.f[0];
@ -341,7 +358,7 @@ namespace Optimization {
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
__m256d v1;
Optimization::permute(v1,in,0); // sse 128; paired complex single
v1 = Optimization::Permute::Permute0(in); // avx 256; pair of complex double
v1 = _mm256_add_pd(v1,in);
u256d conv; conv.v = v1;
return Grid::ComplexD(conv.f[0],conv.f[1]);
@ -351,9 +368,9 @@ namespace Optimization {
template<>
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
__m256d v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad double
v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
v1 = _mm256_add_pd(v1,in);
Optimization::permute(v2,v1,1);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_pd(v1,v2);
u256d conv; conv.v = v1;
return conv.f[0];
@ -387,13 +404,6 @@ namespace Grid {
_mm_prefetch(ptr,_MM_HINT_T0);
}
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;

View File

@ -149,49 +149,33 @@ namespace Optimization {
}
};
// Note, we can beat the shuf overhead in chain with two temporaries
// Ar Ai , Br Bi, Ai Ar // one shuf
//tmpr Ar Br, Ai Bi // Mul/Mac/Mac
//tmpi Br Ai, Bi Ar // Mul/Mac/Mac
// add tmpi,shuf(tmpi)
// sub tmpr,shuf(tmpi)
// shuf(tmpr,tmpi). // Could drop/trade for write mask
// Gives
// 2mul,4 mac +add+sub = 8 flop type insns
// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_ps(ymm0,imag,ymm1);
// dup, dup, perm, mul, madd
__m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar
__m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai
a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_pd(ymm0,imag,ymm1);
__m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
__m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
return _mm512_fmaddsub_pd( a_real, b, a_imag );
}
};
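// Correctness of the fmaddsub form above: with a=(ar,ai) and b=(br,bi)
// interleaved, a_real*b = (ar*br, ar*bi) and a_imag*perm(b) = (ai*bi, ai*br);
// fmaddsub subtracts in the even (real) lanes and adds in the odd (imag)
// lanes, giving (ar*br - ai*bi, ar*bi + ai*br) as required.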
@ -227,27 +211,25 @@ namespace Optimization {
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // 0xB1 == CDAB swap of re/im pairs
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
}
return _mm512_shuffle_pd(tmp,tmp,0x55);
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
@ -255,6 +237,36 @@ namespace Optimization {
// Gpermute utilities; consider coalescing into 1 Gpermute
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute3(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute0(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute2(__m512d in){
return _mm512_shuffle_pd(in,in,0x55);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
//////////////////////////////////////////////
@ -314,25 +326,6 @@ namespace Grid {
}
// Gpermute utilities; consider coalescing into 1 Gpermute
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
union {
__m512 f;
decltype(VectorSIMD::v) v;
} conv;
conv.v = b.v;
switch(perm){
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
y.v=conv.v;
};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;

365
lib/simd/Grid_imci.h Normal file
View File

@ -0,0 +1,365 @@
//----------------------------------------------------------------------
/*! @file Grid_imci.h
@brief Optimization library for the IMCI instruction set (KNC),
using intrinsics
*/
// Time-stamp: <2015-06-09 14:27:28 neo>
//----------------------------------------------------------------------
#include <immintrin.h>
//#ifndef KNC_ONLY_STORES
//#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
//#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
//#endif
namespace Optimization {
struct Vsplat{
//Complex float
inline __m512 operator()(float a, float b){
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
}
// Real float
inline __m512 operator()(float a){
return _mm512_set1_ps(a);
}
//Complex double
inline __m512d operator()(double a, double b){
return _mm512_set_pd(b,a,b,a,b,a,b,a);
}
//Real double
inline __m512d operator()(double a){
return _mm512_set1_pd(a);
}
//Integer
inline __m512i operator()(Integer a){
return _mm512_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m512 a, float* F){
_mm512_store_ps(F,a);
}
//Double
inline void operator()(__m512d a, double* D){
_mm512_store_pd(D,a);
}
//Integer
inline void operator()(__m512i a, Integer* I){
_mm512_store_si512((__m512i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_storenrngo_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_storenrngo_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m512 operator()(Grid::ComplexF *a){
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m512d operator()(Grid::ComplexD *a){
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m512 operator()(float *a){
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m512d operator()(double *a){
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Integer
inline __m512i operator()(Integer *a){
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_add_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_add_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_sub_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_sub_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_sub_epi32(a,b);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_ps(ymm0,imag,ymm1);
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_pd(ymm0,imag,ymm1);
}
};
struct Mult{
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_mul_pd(a,b);
}
// Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_mullo_epi32(a,b);
}
};
struct Conj{
// Complex single
inline __m512 operator()(__m512 in){
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
}
// Complex double
inline __m512d operator()(__m512d in){
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
}
};
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
};
static inline __m512 Permute3(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute0(__m512d in){// Hack: no intrinsic for 256-bit swaps of __m512d
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
};
static inline __m512d Permute2(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
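// The masks 0x5555/0xAAAA select the even/odd lanes of the 16-float
// register, i.e. the real and imaginary parts of the 8 packed complex
// numbers.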
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
return _mm512_reduce_add_ps(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
return _mm512_reduce_add_pd(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
// prefetch
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

View File

@ -151,10 +151,10 @@ namespace Optimization {
// Complex float
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_ps(ymm0,ymm1);
}
@ -201,7 +201,7 @@ namespace Optimization {
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
@ -215,7 +215,7 @@ namespace Optimization {
struct TimesI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
}
//Complex double
@ -225,27 +225,45 @@ namespace Optimization {
}
};
struct Permute{
static inline __m128 Permute0(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m128 Permute1(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m128 Permute2(__m128 in){
return in;
};
static inline __m128 Permute3(__m128 in){
return in;
};
static inline __m128d Permute0(__m128d in){
return _mm_shuffle_pd(in,in,0x1);
};
static inline __m128d Permute1(__m128d in){
return in;
};
static inline __m128d Permute2(__m128d in){
return in;
};
static inline __m128d Permute3(__m128d in){
return in;
};
};
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
uconv<vtype> conv;
conv.v = b;
switch(perm){
case 3: break; //empty for SSE4
case 2: break; //empty for SSE4
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
a=conv.v;
};
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
__m128 v1; // two complex
Optimization::permute(v1,in,0);
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
@ -254,9 +272,9 @@ namespace Optimization {
template<>
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
__m128 v1,v2; // quad single
Optimization::permute(v1,in,0);
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v2= Optimization::Permute::Permute1(v1);
v1 = _mm_add_ps(v1,v2);
u128f conv; conv.v=v1;
return conv.f[0];
@ -274,7 +292,7 @@ namespace Optimization {
template<>
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
__m128d v1;
Optimization::permute(v1,in,0); // avx 256; quad double
v1 = Optimization::Permute::Permute0(in);
v1 = _mm_add_pd(v1,in);
u128d conv; conv.v = v1;
return conv.f[0];
@ -302,14 +320,6 @@ namespace Grid {
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;

@ -19,6 +19,9 @@
#if defined AVX512
#include "Grid_avx512.h"
#endif
#if defined IMCI
#include "Grid_imci.h"
#endif
#if defined QPX
#include "Grid_qpx.h"
#endif
@ -248,30 +251,43 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute0(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute0(b.v);
}
friend inline void permute1(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute1(b.v);
}
friend inline void permute2(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute2(b.v);
}
friend inline void permute3(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute3(b.v);
}
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
Gpermute<Grid_simd>(y,b,perm);
if (perm==3) permute3(y,b);
else if (perm==2) permute2(y,b);
else if (perm==1) permute1(y,b);
else if (perm==0) permute0(y,b);
}
};// end of Grid_simd class definition
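With the runtime switch gone, the integer perm now simply selects one of the four static Permute entry points, so a new architecture only has to supply those four functions. A usage sketch (assuming Grid_simd<float,__m128>, i.e. four float lanes):

    Grid_simd<float,__m128> y, b;   // b assumed filled elsewhere
    permute(y, b, 0);   // dispatches to Permute0: swap 64-bit halves
    permute(y, b, 3);   // Permute3: identity at SSE width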
///////////////////////
// Splat
///////////////////////
// this is only for the complex version
template <class S, class V, IfComplex<S> =0, class ABtype>
inline void vsplat(Grid_simd<S,V> &ret,ABtype a, ABtype b){
ret.v = binary<V>(a, b, VsplatSIMD());
}
// overload if complex
template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) {
Real a = real(c);
Real b = imag(c);
vsplat(ret,a,b);
vsplat(ret,real(c),imag(c));
}
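A usage sketch of the complex splat path (vComplexF/ComplexF as Grid's usual typedefs is an assumption here):

    vComplexF z;
    vsplat(z, ComplexF(1.0, 2.0));  // every complex lane becomes (1.0, 2.0),
                                    // via the (real,imag) overload above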
// if S is real, fill every lane with a; if S is complex, a fills the real part (two-argument overload above)
@ -290,8 +306,8 @@ namespace Grid {
template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}
// if not complex overload here
template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,1.0); }
template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,0.0); }
template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
// For integral types
template <class S,class V,IfInteger<S> = 0 > inline void vone(Grid_simd<S,V> &ret) {vsplat(ret,1); }
@ -304,13 +320,18 @@ namespace Grid {
///////////////////////
// Vstream
///////////////////////
template <class S,class V, IfNotInteger<S> = 0 >
inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
binary<void>((Real*)&out.v, in.v, VstreamSIMD());
}
template <class S,class V, IfReal<S> = 0 >
inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
binary<void>((S *)&out.v, in.v, VstreamSIMD());
}
template <class S,class V, IfComplex<S> = 0 >
inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
typedef typename S::value_type T;
binary<void>((T *)&out.v, in.v, VstreamSIMD());
}
template <class S,class V, IfInteger<S> = 0 >
inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
out=in;
}
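Splitting vstream into real and complex overloads fixes the pointer type handed to the store: the old single template cast &out.v to Real*, which is wrong whenever S is a complex type or of non-default precision, while the complex overload now casts through S::value_type. A minimal sketch (assuming vRealD is Grid's double SIMD typedef and VstreamSIMD maps to a streaming store):

    vRealD a, b;
    vone(a);          // a = 1.0 in every lane
    vstream(b, a);    // non-temporal copy of the SIMD payload into b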

@ -23,6 +23,7 @@ template<class vtype> class iScalar
public:
vtype _internal;
typedef vtype element;
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
@ -124,6 +125,7 @@ template<class vtype,int N> class iVector
public:
vtype _internal[N];
typedef vtype element;
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
@ -175,6 +177,7 @@ public:
permute(out._internal[i],in._internal[i],permutetype);
}
}
// Unary negation
friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
iVector<vtype,N> ret;
@ -219,6 +222,7 @@ template<class vtype,int N> class iMatrix
public:
vtype _internal[N][N];
typedef vtype element;
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
@ -287,12 +291,15 @@ public:
vstream(out._internal[i][j],in._internal[i][j]);
}}
}
friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
permute(out._internal[i][j],in._internal[i][j],permutetype);
}}
}
// Unary negation
friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
iMatrix<vtype,N> ret;
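Note the pattern here: permute, like vstream above it, recurses through the tensor nesting, iMatrix -> iVector -> iScalar, until it reaches the Grid_simd leaf, which dispatches to the Optimization::Permute functions. A sketch of the call chain for a matrix (names illustrative):

    // permute(iMatrix&, iMatrix&, type)
    //   -> permute(vtype&, vtype&, type) for each of the N*N entries
    //     -> ... -> permute(Grid_simd&, Grid_simd&, type)
    //       -> Optimization::Permute::PermuteN(register)

Adding the missing iMatrix overload closes a gap in that chain.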

@ -35,19 +35,22 @@ icpc-avx-openmp-mpi)
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
icpc-avx-openmp)
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
icpc-avx2)
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-avx512)
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-mic)
CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-mic-avx512)
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-sse)
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-avx)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none

@ -1,6 +1,35 @@
#!/bin/bash
LOG=$1
SWEEPS=$2
grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10} END { print S/NR} '
grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$10)} END { print S/NR} '
SWEEPS=`grep dH $LOG | wc -l`
SWEEPS=`expr $SWEEPS - 80`
echo
echo $SWEEPS thermalised sweeps
echo
plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10} END { print S/NR} ' `
plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
echo "Plaquette: $plaq (${plaqe})"
echo
dHv=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt(SS/NR) } ' `
edH=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$10)} END { print S/NR} '`
echo "<e-dH>: $edH"
echo "<rms dH>: $dHv"
TRAJ=`grep Acc $LOG | wc -l`
ACC=`grep Acc $LOG | grep ACCE | wc -l`
PACC=`expr 100 \* ${ACC} / ${TRAJ} `
echo
echo "Acceptance $PACC % $ACC / $TRAJ "
grep Plaq $LOG | awk '{ print $10 }' | uniq > plaq.dat
grep dH $LOG | awk '{ print $10 }' > dH.dat
echo set yrange [-0.1:0.6] > plot.gnu
echo set terminal 'pdf' >> plot.gnu
echo "set output 'plaq.${LOG}.pdf'" >> plot.gnu
echo "plot 'plaq.dat' w l, 'dH.dat' w l " >> plot.gnu
echo
gnuplot plot.gnu >& gnu.errs
open plaq.${LOG}.pdf
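For reference, the awk one-liners implement the naive estimators: the plaquette error is sqrt((<P^2> - <P>^2)/N) with no autocorrelation correction, <rms dH> is sqrt(<dH^2>), and <exp(-dH)> should be consistent with 1 for a correct, reversible integrator (the Creutz identity). A C++ sketch of the same mean/error computation (illustrative, assuming the samples are already in memory):

    #include <cmath>
    #include <vector>
    // Mean and naive standard error, as in the Plaq awk line above.
    void mean_err(const std::vector<double> &x, double &mu, double &err) {
      double S = 0, SS = 0;
      for (double v : x) { S += v; SS += v*v; }
      const double N = double(x.size());
      mu  = S/N;
      err = std::sqrt((SS/N - mu*mu)/N);  // ignores autocorrelations
    }

The hard-coded "expr $SWEEPS - 80" discards the first 80 sweeps as thermalisation, so logs shorter than that will produce a negative count.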

@ -10,7 +10,7 @@ shift
while (( "$#" )); do
echo $1
sed -e "s:${WAS}:${IS}:g" -i .bak $1
sed -e "s@${WAS}@${IS}@g" -i .bak $1
shift

@ -1,5 +1,5 @@
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_gparity Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@ -86,12 +86,16 @@ Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
Test_dwf_hdcr_LDADD=-lGrid
#Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
#Test_dwf_lanczos_LDADD=-lGrid
Test_gamma_SOURCES=Test_gamma.cc
Test_gamma_LDADD=-lGrid
Test_gparity_SOURCES=Test_gparity.cc
Test_gparity_LDADD=-lGrid
#Test_gparity_SOURCES=Test_gparity.cc
#Test_gparity_LDADD=-lGrid
Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
@ -174,6 +178,10 @@ Test_rng_fixed_SOURCES=Test_rng_fixed.cc
Test_rng_fixed_LDADD=-lGrid
Test_serialisation_SOURCES=Test_serialisation.cc
Test_serialisation_LDADD=-lGrid
Test_simd_SOURCES=Test_simd.cc
Test_simd_LDADD=-lGrid

@ -50,7 +50,7 @@ int main (int argc, char ** argv)
NerscField header;
std::string file("./ckpoint_lat.4000");
readNerscConfiguration(Umu,header,file);
NerscIO::readConfiguration(Umu,header,file);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);

@ -42,7 +42,7 @@ int main (int argc, char ** argv)
NerscField header;
std::string file("./ckpoint_lat.400");
readNerscConfiguration(Umu,header,file);
NerscIO::readConfiguration(Umu,header,file);
// SU3::ColdConfiguration(RNG4,Umu);
// SU3::TepidConfiguration(RNG4,Umu);

@ -388,7 +388,7 @@ int main (int argc, char ** argv)
NerscField header;
std::string file("./ckpoint_lat.4000");
readNerscConfiguration(Umu,header,file);
NerscIO::readConfiguration(Umu,header,file);
// SU3::ColdConfiguration(RNG4,Umu);
// SU3::TepidConfiguration(RNG4,Umu);

tests/Test_dwf_lanczos.cc Normal file
@ -0,0 +1,57 @@
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeFermion src(FGrid); gaussian(RNG5,src);
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass=0.1;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
const int Nk = 10;
const int Np = 1;
RealD enorm = 1.0;
RealD vthrs = 1;
const int Nit= 1000;
// Note: PolyX, Nsbt and Nconv are never declared in this file, so the
// test cannot compile as committed; consistently, Test_dwf_lanczos is
// left commented out in tests/Makefile.am above.
ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,
Nk,Np,enorm,vthrs,Nit);
std::vector<RealD> eval(Nk);
std::vector<LatticeFermion> evec(Nk,FGrid);
IRL.calc(eval,evec,
src,
Nsbt,
Nconv);
Grid_finalize();
}

@ -17,7 +17,9 @@ int main (int argc, char ** argv)
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridSerialRNG sRNG;
GridParallelRNG pRNG(UGrid);
sRNG.SeedRandomDevice();
pRNG.SeedRandomDevice();
LatticeLorentzColourMatrix U(UGrid);
@ -25,7 +27,7 @@ int main (int argc, char ** argv)
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=0.04;
Real pv =1.0;
@ -38,21 +40,20 @@ int main (int argc, char ** argv)
TwoFlavourEvenOddRatioPseudoFermionAction<WilsonImplR> Nf2(NumOp, DenOp,CG,CG);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&Nf2);
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
// typedef LeapFrog IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(UGrid,MDpar, FullSet);
typedef MinimumNorm2<LatticeGaugeField> IntegratorType;// change here to change the algorithm
IntegratorParameters MDpar(20);
IntegratorType MDynamics(UGrid,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorType> HMC(HMCpar, MDynamics,sRNG,pRNG);
// Run it
HMC.evolve(U);
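Taken together, the changes in this test sketch the reworked HMC driver API: actions, levels, integrators and the HMC wrapper are all templated on the gauge field, IntegratorParameters takes a single argument (apparently the MD step count), and the two RNGs are passed in rather than created internally. A condensed skeleton, read off the diff above and not checked against the headers:

    GridSerialRNG   sRNG;            // accept/reject step
    GridParallelRNG pRNG(UGrid);     // momentum and pseudofermion refresh
    WilsonGaugeActionR Waction(5.6);
    ActionLevel<LatticeGaugeField> Level1;
    Level1.push_back(&Waction);
    ActionSet<LatticeGaugeField> FullSet;
    FullSet.push_back(Level1);
    typedef MinimumNorm2<LatticeGaugeField> IntegratorType;
    IntegratorParameters MDpar(20);  // MD steps per trajectory (assumed)
    IntegratorType MDynamics(UGrid, MDpar, FullSet);
    HMCparameters HMCpar;
    HybridMonteCarlo<LatticeGaugeField,IntegratorType> HMC(HMCpar, MDynamics, sRNG, pRNG);
    HMC.evolve(U);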

@ -15,40 +15,50 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
WilsonFermionR FermOp(U,Fine,RBFine,mass);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
ConjugateGradient<LatticeFermion> CGmd(1.0e-6,10000);
TwoFlavourEvenOddPseudoFermionAction<WilsonImplR> WilsonNf2(FermOp,CG,CG);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1(1);
ActionLevel<LatticeGaugeField> Level2(4);
Level1.push_back(&WilsonNf2);
Level1.push_back(&Waction);
ActionSet FullSet;
Level2.push_back(&Waction);
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
FullSet.push_back(Level2);
// Create integrator
// typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef LeapFrog IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,40,1.0);
std::vector<int> rel ={1};
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
//typedef LeapFrog<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
//typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
typedef ForceGradient<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(16);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics,sRNG,pRNG);
HMC.evolve(U);
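This test is also the first in the batch to split actions across two timescales: the integer passed to ActionLevel appears to set that level's sub-step multiplier, so the cheap gauge force on Level2 is integrated four times for every fermion-force step on Level1, the usual Sexton-Weingarten arrangement (a reading of the diff; the multiplier semantics are an assumption):

    ActionLevel<LatticeGaugeField> Level1(1);  // expensive fermion force
    ActionLevel<LatticeGaugeField> Level2(4);  // cheap gauge force, 4 sub-steps
    Level1.push_back(&WilsonNf2);
    Level2.push_back(&Waction);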

@ -15,14 +15,21 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
Real pv =0.0;
@ -34,22 +41,22 @@ int main (int argc, char ** argv)
TwoFlavourEvenOddRatioPseudoFermionAction<WilsonImplR> WilsonNf2(NumOp, DenOp,CG,CG);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&WilsonNf2);
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
// typedef LeapFrog IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics,sRNG,pRNG);
HMC.evolve(U);

@ -15,40 +15,49 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
WilsonFermionR FermOp(U,Fine,RBFine,mass);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
ConjugateGradient<LatticeFermion> CGmd(1.0e-6,10000);
TwoFlavourPseudoFermionAction<WilsonImplR> WilsonNf2(FermOp,CG,CG);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1(1);
ActionLevel<LatticeGaugeField> Level2(4);
Level1.push_back(&WilsonNf2);
Level1.push_back(&Waction);
ActionSet FullSet;
// Level1.push_back(&Waction);
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// FullSet.push_back(Level2);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
// typedef LeapFrog IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics,sRNG,pRNG);
HMC.evolve(U);

@ -22,30 +22,37 @@ int main (int argc, char ** argv)
double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeGaugeField U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeGaugeField, LatticeColourMatrix> Waction(6.0);
WilsonGaugeActionR Waction(6.0);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to modify the algorithm
IntegratorParameters MDpar(12,5,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to modify the algorithm
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics, sRNG, pRNG);
HMC.evolve(U);

@ -15,14 +15,21 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
Real pv =0.0;
@ -34,22 +41,22 @@ int main (int argc, char ** argv)
TwoFlavourRatioPseudoFermionAction<WilsonImplR> WilsonNf2(NumOp,FermOp,CG,CG);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&WilsonNf2);
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
// typedef LeapFrog IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics, sRNG, pRNG);
HMC.evolve(U);

@ -17,7 +17,8 @@ public:
pRNG.SeedFixedIntegers(seeds);
random(pRNG,sqrtscale);
sqrtscale = real(sqrtscale)*3.0+0.5;// force real pos def
sqrtscale = 0.5*(sqrtscale + conjugate(sqrtscale));
sqrtscale = sqrtscale*3.0+0.5;// force real pos def
scale = sqrtscale *sqrtscale; //scale should be bounded by 12.25
//
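For reference, the replacement relies on the lane-wise identity 0.5*(z + conjugate(z)) = Re(z), which projects every component onto its real part while staying inside the lattice type; the subsequent *3.0 + 0.5 then shifts it positive. Assuming random() fills entries in [0,1], sqrtscale lands in [0.5,3.5], which is where the "bounded by 12.25" (= 3.5^2) in the comment comes from.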

@ -28,7 +28,7 @@ int main (int argc, char ** argv)
NerscField header;
std::string file("./ckpoint_lat.4000");
readNerscConfiguration(Umu,header,file);
NerscIO::readConfiguration(Umu,header,file);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@ -89,6 +89,13 @@ int main (int argc, char ** argv)
TComplex TcP = sum(cPlaq);
Complex ll= TensorRemove(TcP);
std::cout<<GridLogMessage << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;
std::string clone2x3("./ckpoint_clone2x3.4000");
std::string clone3x3("./ckpoint_clone3x3.4000");
int precision32 = 0;
NerscIO::writeConfiguration(Umu,clone3x3,0,precision32);
NerscIO::writeConfiguration(Umu,clone2x3,1,precision32);
Grid_finalize();
}
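Read off the file names, the integer after the path apparently selects the gauge-link storage format (0 = full 3x3 matrices, 1 = two-row compression), with precision32 choosing 32-bit output; both writes clone the same configuration for an I/O round-trip check. This is an inference from the test itself, not from the NerscIO header.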

@ -73,8 +73,10 @@ int main (int argc, char ** argv)
}
PokeIndex<LorentzIndex>(Umu,link,mu);
//reunitarise link;
ProjectOnGroup(Umu);
}
}

@ -15,14 +15,22 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
WilsonFermionR FermOp(U,Fine,RBFine,mass);
@ -33,23 +41,23 @@ int main (int argc, char ** argv)
OneFlavourEvenOddRationalPseudoFermionAction<WilsonImplR> WilsonNf1b(FermOp,Params);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&WilsonNf1a);
Level1.push_back(&WilsonNf1b);
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics, sRNG, pRNG);
HMC.evolve(U);
}

@ -15,14 +15,22 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
Real pv =0.0;
@ -35,23 +43,23 @@ int main (int argc, char ** argv)
OneFlavourEvenOddRatioRationalPseudoFermionAction<WilsonImplR> WilsonNf1b(NumOp,DenOp,Params);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&WilsonNf1a);
Level1.push_back(&WilsonNf1b);
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
// typedef LeapFrog IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics, sRNG, pRNG);
HMC.evolve(U);

@ -15,14 +15,22 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
WilsonFermionR FermOp(U,Fine,RBFine,mass);
@ -33,23 +41,23 @@ int main (int argc, char ** argv)
OneFlavourRationalPseudoFermionAction<WilsonImplR> WilsonNf1b(FermOp,Params);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&WilsonNf1a);
Level1.push_back(&WilsonNf1b);
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics, sRNG, pRNG);
HMC.evolve(U);

@ -15,14 +15,22 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({6,7,8,80});
GridParallelRNG pRNG(&Fine);
pRNG.SeedRandomDevice();
pRNG.SeedFixedIntegers(seeds);
std::vector<int> seedsS({1,2,3,4});
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(seedsS);
LatticeLorentzColourMatrix U(&Fine);
SU3::HotConfiguration(pRNG, U);
// simplify template declaration? Strip the lorentz from the second template
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(5.6);
WilsonGaugeActionR Waction(5.6);
Real mass=-0.77;
Real pv =0.0;
@ -35,23 +43,23 @@ int main (int argc, char ** argv)
OneFlavourRatioRationalPseudoFermionAction<WilsonImplR> WilsonNf1b(NumOp,DenOp,Params);
//Collect actions
ActionLevel Level1;
ActionLevel<LatticeGaugeField> Level1;
Level1.push_back(&WilsonNf1a);
Level1.push_back(&WilsonNf1b);
Level1.push_back(&Waction);
ActionSet FullSet;
ActionSet<LatticeGaugeField> FullSet;
FullSet.push_back(Level1);
// Create integrator
typedef MinimumNorm2 IntegratorAlgorithm;// change here to change the algorithm
typedef MinimumNorm2<LatticeGaugeField> IntegratorAlgorithm;// change here to change the algorithm
// typedef LeapFrog IntegratorAlgorithm;// change here to change the algorithm
IntegratorParameters MDpar(12,20,1.0);
Integrator<IntegratorAlgorithm> MDynamics(&Fine,MDpar, FullSet);
IntegratorParameters MDpar(20);
IntegratorAlgorithm MDynamics(&Fine,MDpar, FullSet);
// Create HMC
HMCparameters HMCpar;
HybridMonteCarlo<IntegratorAlgorithm> HMC(HMCpar, MDynamics);
HybridMonteCarlo<LatticeGaugeField,IntegratorAlgorithm> HMC(HMCpar, MDynamics, sRNG, pRNG);
HMC.evolve(U);

@ -0,0 +1,93 @@
#include <Grid.h>
namespace Grid {
class myclass {
public:
GRID_DECL_CLASS_MEMBERS(myclass,
int, x,
double, y,
bool , b,
std::string, name,
std::vector<double>, array,
std::vector<std::vector<double> >, twodimarray,
);
myclass(){}
myclass(int i) : array(4,5.1), twodimarray(3,std::vector<double>(2,1.23456)) {
x=i;
y=2*i;
b=true;
name="bother said pooh";
}
};
}
int16_t i16 = 1;
uint16_t u16 = 2;
int32_t i32 = 3;
uint32_t u32 = 4;
int64_t i64 = 5;
uint64_t u64 = 6;
float f = M_PI;
double d = 2*M_PI;
bool b = false;
using namespace Grid;
int main(int argc,char **argv)
{
{
XMLWriter WR("bother.xml");
push(WR,"BasicTypes");
write(WR,std::string("i16"),i16);
write(WR,"u16",u16);
write(WR,"i32",i32);
write(WR,"u32",u32);
write(WR,"i64",i64);
write(WR,"u64",u64);
write(WR,"f",f);
write(WR,"d",d);
write(WR,"b",b);
pop(WR);
myclass obj(1234); // non-trivial constructor
write(WR,"obj",obj);
};
XMLReader RD("bother.xml");
myclass copy1;
myclass copy2;
myclass copy3;
read(RD,"obj",copy1);
std::cout << "Loaded " << copy1<<std::endl;
{
BinaryWriter BWR("bother.bin");
write(BWR,"discard",copy1 );
}
{
BinaryReader BRD("bother.bin");
read (BRD,"discard",copy2 );
std::cout<<copy2<<std::endl;
}
{
TextWriter TWR("bother.txt");
write(TWR,"discard",copy1 );
}
{
TextReader TRD("bother.txt");
read (TRD,"discard",copy3 );
std::cout<<copy3<<std::endl;
}
}
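The point of the macro is visible from the round trips above: GRID_DECL_CLASS_MEMBERS both declares the listed members and, apparently, generates the read/write overloads and the operator<< used here, so a single declaration drives XML, binary and text serialisation identically.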