Merge pull request #226 from nils-asmussen/fix/Gauss

Fix compiling of MSource::Gauss for single precision
2025-06-14 05:07:05 +01:00 · 2019-08-14 17:50:38 +01:00 · 2019-08-12 14:57:11 +01:00 · 2019-08-08 12:29:55 +02:00 · 2019-07-30 22:51:23 +01:00 · 2019-07-30 22:51:04 +01:00
357 changed files with 18481 additions and 4936 deletions
--- a/.gitignore
+++ b/.gitignore
@ -114,3 +114,4 @@ gh-pages/
 #####################
 Grid/qcd/spin/gamma-gen/*.h
 Grid/qcd/spin/gamma-gen/*.cc
+Grid/util/Version.h
--- a/5
+++ b/5
@ -0,0 +1,5 @@
+Version : 0.8.0
+
+- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
+- MPI and MPI3 comms optimisations for KNL and OPA finished
+- Half precision comms
--- a/Grid/Grid.h
+++ b/Grid/Grid.h
@ -42,6 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
 #include <Grid/qcd/utils/GaugeFix.h>
+#include <Grid/qcd/utils/CovariantSmearing.h>
 #include <Grid/qcd/smearing/Smearing.h>
 #include <Grid/parallelIO/MetaData.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@ -48,14 +48,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
+#include <Grid/algorithms/iterative/MinimalResidual.h>
+#include <Grid/algorithms/iterative/GeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/PowerMethod.h>
+
 #include <Grid/algorithms/CoarsenedMatrix.h>
 #include <Grid/algorithms/FFT.h>

-// EigCg
-// Pcg
-// Hdcg
-// GCR
-// etc..
-
 #endif
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@ -211,6 +211,7 @@ namespace Grid {

      for(int b=0;b<nn;b++){
 	
+	subspace[b] = zero;
 	gaussian(RNG,noise);
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;
@ -295,13 +296,58 @@ namespace Grid {
      return norm2(out);
    };

-    RealD Mdag (const CoarseVector &in, CoarseVector &out){ 
-      return M(in,out);
+    // Adjoint application assembled from G5C and M: G5C, then M, then G5C again
+    // (presumably G5C(dst, src) -- TODO confirm argument order).
+    // Returns norm2 of the vector left in out.
+    //
+    // This variant corresponds to Galerkin coarsening. The Petrov-Galerkin
+    // variant would simply be:
+    //   return M(in,out);
+    RealD Mdag (const CoarseVector &in, CoarseVector &out){
+      CoarseVector g5in(Grid());
+
+      G5C(g5in, in);
+      M(g5in, out);   // value returned by M is not used here
+      G5C(out, out);  // same object passed for both arguments
+
+      return norm2(out);
+    };

-    // Defer support for further coarsening for now
-    void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
-    void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};
+    // Apply a single stencil point, selected by (dir, disp), to `in` and store
+    // the result in `out`.
+    //   dir == 0 and disp == 0  -> point 8
+    //   anything else           -> point (4*dir + 1 - disp)/2
+    void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
+
+      conformable(_grid,in._grid);
+      conformable(in._grid,out._grid);
+
+      SimpleCompressor<siteVector> compressor;
+      Stencil.HaloExchange(in,compressor);
+
+      const int point = (dir == 0 && disp == 0) ? 8 : (4 * dir + 1 - disp) / 2;
+
+      parallel_for(int site=0;site<Grid()->oSites();site++){
+        int ptype;
+        StencilEntry *entry = Stencil.GetEntry(ptype,point,site);
+
+        // Fetch the neighbour value: from the comms buffer when the entry is
+        // not local, otherwise from `in` (permuted first if the entry asks for it).
+        siteVector nbr;
+        if(!entry->_is_local) {
+          nbr = Stencil.CommBuf()[entry->_offset];
+        } else if(entry->_permute) {
+          permute(nbr,in._odata[entry->_offset],ptype);
+        } else {
+          nbr = in._odata[entry->_offset];
+        }
+
+        siteVector res = zero;
+        res = res + A[point]._odata[site]*nbr;
+
+        vstream(out._odata[site],res);
+      }
+    };
+
+    // Diagonal term: forwards to Mdir with dir = disp = 0, which selects the
+    // self coupling (= last) point of the stencil.
+    void Mdiag(const CoarseVector &in, CoarseVector &out){
+      const int selfDir  = 0;
+      const int selfDisp = 0;
+      Mdir(in, out, selfDir, selfDisp);
+    };

    CoarsenedMatrix(GridCartesian &CoarseGrid) 	: 

@ -417,7 +463,7 @@ namespace Grid {
      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
      //      ForceHermitian();
-      AssertHermitian();
+      // AssertHermitian();
      // ForceDiagonal();
    }
    void ForceDiagonal(void) {
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -178,7 +178,7 @@ namespace Grid {
    //////////////////////////////////////////////////////////

    template<class Field>
-      class SchurOperatorBase :  public LinearOperatorBase<Field> {
+    class SchurOperatorBase :  public LinearOperatorBase<Field> {
    public:
      virtual  RealD Mpc      (const Field &in, Field &out) =0;
      virtual  RealD MpcDag   (const Field &in, Field &out) =0;
@ -211,10 +211,9 @@ namespace Grid {
      }
    };
    template<class Matrix,class Field>
-      class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
-    protected:
-      Matrix &_Mat;
+    class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
    public:
+      Matrix &_Mat;
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
      virtual  RealD Mpc      (const Field &in, Field &out) {
      Field tmp(in._grid);
@ -380,6 +379,12 @@ namespace Grid {
    template<class Field> class OperatorFunction {
    public:
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+      virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
+	assert(in.size()==out.size());
+	for(int k=0;k<in.size();k++){
+	  (*this)(Linop,in[k],out[k]);
+	}
+      };
    };

    template<class Field> class LinearFunction {
@ -421,7 +426,7 @@ namespace Grid {
  // Hermitian operator Linear function and operator function
  ////////////////////////////////////////////////////////////////////////////////////////////
    template<class Field>
-      class HermOpOperatorFunction : public OperatorFunction<Field> {
+    class HermOpOperatorFunction : public OperatorFunction<Field> {
      void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 	Linop.HermOp(in,out);
      };
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@ -55,6 +55,14 @@ namespace Grid {
    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
    public:
      virtual GridBase *RedBlackGrid(void)=0;
+
+      //////////////////////////////////////////////////////////////////////
+      // Query the even even properties to make algorithmic decisions
+      //////////////////////////////////////////////////////////////////////
+      virtual RealD  Mass(void)        { return 0.0; };
+      virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
+      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+
       // half checkerboard operations
      virtual  void Meooe    (const Field &in, Field &out)=0;
      virtual  void Mooee    (const Field &in, Field &out)=0;
--- a/Grid/algorithms/iterative/BlockConjugateGradient.h
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@ -33,7 +33,7 @@ directory

 namespace Grid {

-enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };

 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
@ -42,7 +42,6 @@ template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
 public:

-
  typedef typename Field::scalar_type scomplex;

  int blockDim ;
@ -54,21 +53,15 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer PrintInterval; //GridLogMessages or Iterative
  
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
+    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
  {};

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-void ThinQRfact (Eigen::MatrixXcd &m_rr,
-		 Eigen::MatrixXcd &C,
-		 Eigen::MatrixXcd &Cinv,
-		 Field & Q,
-		 const Field & R)
-{
-  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  //Dimensions
  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
@ -85,22 +78,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  // Cdag C = Rdag R ; passes.
  // QdagQ  = 1      ; passes
  ////////////////////////////////////////////////////////////////////////////////////////////////////
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+		 Eigen::MatrixXcd &C,
+		 Eigen::MatrixXcd &Cinv,
+		 Field & Q,
+		 const Field & R)
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  sliceInnerProductMatrix(m_rr,R,R,Orthog);

  // Force manifest hermitian to avoid rounding related
  m_rr = 0.5*(m_rr+m_rr.adjoint());

-#if 0
-  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
-  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
-  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
-  auto  D_ldlt = m_rr.ldlt().vectorD(); 
-  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
-#endif
-
-  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
-  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
+
  C    = L.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -112,6 +103,25 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceMulMatrix(Q,Cinv,R,Orthog);
 }
+// Thin QR factorisation for a block held as std::vector<Field>; see the
+// comments above for the dimensions and identities involved.
+//   m_rr : filled by InnerProductMatrix(m_rr,R,R), then made manifestly hermitian
+//   C    : adjoint of the lower Cholesky (llt) factor of m_rr
+//   Cinv : inverse of C
+//   Q    : filled by MulMatrix(Q,Cinv,R)
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+		 Eigen::MatrixXcd &C,
+		 Eigen::MatrixXcd &Cinv,
+		 std::vector<Field> & Q,
+		 const std::vector<Field> & R)
+{
+  InnerProductMatrix(m_rr,R,R);
+
+  // Average with the adjoint so that m_rr is hermitian before factorising
+  m_rr = 0.5*(m_rr+m_rr.adjoint());
+
+  Eigen::MatrixXcd cholLower = m_rr.llt().matrixL();
+
+  C    = cholLower.adjoint();
+  Cinv = C.inverse();
+
+  MulMatrix(Q,Cinv,R);
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Call one of several implementations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -119,14 +129,20 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 {
  if ( CGtype == BlockCGrQ ) {
    BlockCGrQsolve(Linop,Src,Psi);
-  } else if (CGtype == BlockCG ) {
-    BlockCGsolve(Linop,Src,Psi);
  } else if (CGtype == CGmultiRHS ) {
    CGmultiRHSsolve(Linop,Src,Psi);
  } else {
    assert(0);
  }
 }
+// Entry point for a block given as a vector of fields. Only CGtype ==
+// BlockCGrQVec is dispatched from here; every other value hits assert(0).
+virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) 
+{
+  if ( CGtype != BlockCGrQVec ) {
+    assert(0);
+    return;
+  }
+  BlockCGrQsolveVec(Linop,Src,Psi);
+}

 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQ implementation:
@ -139,7 +155,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = B._grid->_fdimensions[Orthog];
-
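+/* FAKE - FIXME(review): debugging override; the next line replaces the Nblock just read from the grid with a hard-coded 8 */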
+  Nblock=8;
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

  X.checkerboard = B.checkerboard;
@ -202,15 +219,10 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;

  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
-
  Linop.HermOp(X, AD);
  tmp = B - AD;  
-  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
+
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
-  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
-  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
-  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
-  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;

  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@ -232,14 +244,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
-    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;

    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
-    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
    
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
@ -257,6 +267,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    
    //7. D  = Q + D S^dag
    m_tmp = m_S.adjoint();
+
    sliceMaddTimer.Start();
    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
    sliceMaddTimer.Stop();
@ -317,152 +328,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
-// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
-//////////////////////////////////////////////////////////////////////////
-void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-{
-  int Orthog = blockDim; // First dimension is block dim; this is an assumption
-  Nblock = Src._grid->_fdimensions[Orthog];
-
-  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-  Psi.checkerboard = Src.checkerboard;
-  conformable(Psi, Src);
-
-  Field P(Src);
-  Field AP(Src);
-  Field R(Src);
-  
-  Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  // Initial residual computation & set up
-  std::vector<RealD> residuals(Nblock);
-  std::vector<RealD> ssq(Nblock);
-
-  sliceNorm(ssq,Src,Orthog);
-  RealD sssum=0;
-  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-  sliceNorm(residuals,Src,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  sliceNorm(residuals,Psi,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  // Initial search dir is guess
-  Linop.HermOp(Psi, AP);
-  
-
-  /************************************************************************
-   * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
-   ************************************************************************
-   * O'Leary : R = B - A X
-   * O'Leary : P = M R ; preconditioner M = 1
-   * O'Leary : alpha = PAP^{-1} RMR
-   * O'Leary : beta  = RMR^{-1}_old RMR_new
-   * O'Leary : X=X+Palpha
-   * O'Leary : R_new=R_old-AP alpha
-   * O'Leary : P=MR_new+P beta
-   */
-
-  R = Src - AP;  
-  P = R;
-  sliceInnerProductMatrix(m_rr,R,R,Orthog);
-
-  GridStopWatch sliceInnerTimer;
-  GridStopWatch sliceMaddTimer;
-  GridStopWatch MatrixTimer;
-  GridStopWatch SolverTimer;
-  SolverTimer.Start();
-
-  int k;
-  for (k = 1; k <= MaxIterations; k++) {
-
-    RealD rrsum=0;
-    for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
-
-    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
-
-    MatrixTimer.Start();
-    Linop.HermOp(P, AP);
-    MatrixTimer.Stop();
-
-    // Alpha
-    sliceInnerTimer.Start();
-    sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
-    sliceInnerTimer.Stop();
-    m_pAp_inv = m_pAp.inverse();
-    m_alpha   = m_pAp_inv * m_rr ;
-
-    // Psi, R update
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
-    sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
-    sliceMaddTimer.Stop();
-
-    // Beta
-    m_rr_inv = m_rr.inverse();
-    sliceInnerTimer.Start();
-    sliceInnerProductMatrix(m_rr,R,R,Orthog);
-    sliceInnerTimer.Stop();
-    m_beta = m_rr_inv *m_rr;
-
-    // Search update
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(AP,m_beta,P,R,Orthog);
-    sliceMaddTimer.Stop();
-    P= AP;
-
-    /*********************
-     * convergence monitor
-     *********************
-     */
-    RealD max_resid=0;
-    RealD rr;
-    for(int b=0;b<Nblock;b++){
-      rr = real(m_rr(b,b))/ssq[b];
-      if ( rr > max_resid ) max_resid = rr;
-    }
-    
-    if ( max_resid < Tolerance*Tolerance ) { 
-
-      SolverTimer.Stop();
-
-      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
-      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
-		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
-      }
-      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-      Linop.HermOp(Psi, AP);
-      AP = AP-Src;
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
-
-      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-	    
-      IterationsToComplete = k;
-      return;
-    }
-
-  }
-  std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
-
-  if (ErrorOnNoConverge) assert(0);
-  IterationsToComplete = k;
-}
-//////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
 // Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
@ -600,6 +465,233 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
  IterationsToComplete = k;
 }

+// Fill m(i,j) = innerProduct(X[i],Y[j]) for all i,j in [0,Nblock).
+// The loop bounds come from the member Nblock, not from the vector sizes,
+// and m is not resized here.
+void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
+  for(int row=0;row<Nblock;row++){
+    for(int col=0;col<Nblock;col++){
+      m(row,col) = innerProduct(X[row],Y[col]);
+    }
+  }
+}
+// AP[b] = Y[b] + sum_bp (scale * m(bp,b)) * X[bp]   for b in [0,Nblock).
+// The result is assembled in a temporary block and copied out at the end,
+// so AP may alias X or Y.
+// Could be made cache friendly with the site loop outermost (parallel_for).
+void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
+  std::vector<Field> result(Nblock,X[0]);
+  for(int col=0;col<Nblock;col++){
+    result[col] = Y[col];
+    for(int row=0;row<Nblock;row++){
+      result[col] = result[col] + (scale*m(row,col))*X[row];
+    }
+  }
+  for(int col=0;col<Nblock;col++){
+    AP[col] = result[col];
+  }
+}
+// AP[b] = sum_bp m(bp,b) * X[bp]   for b in [0,Nblock).
+// AP is written in place (no temporary), so AP must not alias X.
+// Could be made cache friendly with the site loop outermost (parallel_for).
+void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
+  for(int col=0;col<Nblock;col++){
+    AP[col] = zero;
+    for(int row=0;row<Nblock;row++){
+      AP[col] += (m(row,col))*X[row];
+    }
+  }
+}
+// Sum of norm2(P[b]) over the first Nblock entries of P.
+double normv(const std::vector<Field> &P){
+  double total = 0.0;
+  for(int blk=0;blk<Nblock;blk++) {
+    total += norm2(P[blk]);
+  }
+  return total;
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQvec implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+//
+// Nblock is taken from B.size(); X must have the same size.
+// B[0] is copied to size the work vectors, so B must not be empty.
+// On convergence IterationsToComplete is set and the function returns;
+// otherwise it asserts when ErrorOnNoConverge is set.
+////////////////////////////////////////////////////////////////////////////
+void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) 
+{
+  Nblock = B.size();
+  assert(Nblock == X.size());
+
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
+
+  for(int b=0;b<Nblock;b++){ 
+    X[b].checkerboard = B[b].checkerboard;
+    conformable(X[b], B[b]);
+    conformable(X[b], X[0]); 
+  }
+
+  // Template field used only to construct the work vectors below
+  Field Fake(B[0]);
+
+  std::vector<Field> tmp(Nblock,Fake);
+  std::vector<Field>   Q(Nblock,Fake);
+  std::vector<Field>   D(Nblock,Fake);
+  std::vector<Field>   Z(Nblock,Fake);
+  std::vector<Field>  AD(Nblock,Fake);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  // NOTE(review): m_tmp1 is not referenced again in this function
+  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  // residuals[] is only used for the NaN checks on B and X below
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  for(int b=0;b<Nblock;b++) {
+    Linop.HermOp(X[b], AD[b]);
+    tmp[b] = B[b] - AD[b];  
+  }
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+
+  for(int b=0;b<Nblock;b++) D[b]=Q[b];
+
+  std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    InnerProductMatrix(m_DZ,D,Z);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+    
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    MaddMatrix(X,m_tmp, D,X);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    MaddMatrix(tmp,m_M,Z,Q,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+    sliceMaddTimer.Start();
+    MaddMatrix(D,m_tmp,D,Q);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    // The diagonal of C^dag C is used as the per-block residual estimate
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    // NOTE(review): rr divides by ssq[b] = norm2(B[b]), which is zero for a zero right-hand side
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
+
+    // Converged once the largest per-block ratio drops below Tolerance^2
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      // Recompute A X - B explicitly to report the true residual
+      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
+      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+	    
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+
+
+
 };

 }
--- a/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
@ -0,0 +1,244 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+// Restarted GMRES solver for LinOp.Op(psi) = src.
+//
+// As operator() itself warns, this implementation currently does not differ
+// from regular GMRES.
+//
+// Each restart cycle (outerLoopBody) performs up to RestartLength Arnoldi
+// steps (arnoldiStep), applies Givens rotations after every step (qrUpdate)
+// and finally adds the correction to psi (computeSolution).
+//
+// Storage convention: H(iter, i) holds the coefficient belonging to basis
+// vector v[i] that was produced in Arnoldi step `iter`.
+template<class Field>
+class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance;
+
+  Integer MaxIterations;
+  Integer RestartLength;
+  Integer MaxNumberOfRestarts;
+  Integer IterationCount; // Number of iterations the CAGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  Eigen::MatrixXcd H;
+
+  std::vector<std::complex<double>> y;
+  std::vector<std::complex<double>> gamma;
+  std::vector<std::complex<double>> c;
+  std::vector<std::complex<double>> s;
+
+  // tol            : relative target; convergence is tested against tol^2 * norm2(src)
+  // maxit          : total iteration budget across all restarts
+  // restart_length : Arnoldi steps per restart cycle; must be non-zero because
+  //                  MaxNumberOfRestarts = ceil(maxit / restart_length)
+  // err_on_no_conv : assert when the budget is exhausted without convergence
+  //
+  // The initialisers are listed in member declaration order, which is the
+  // order in which they are actually executed.
+  CommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
+                                                  Integer maxit,
+                                                  Integer restart_length,
+                                                  bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv)
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.) {};
+
+  // Solve for psi, starting from the guess already stored in psi.
+  // Runs up to MaxNumberOfRestarts restart cycles and returns as soon as one
+  // of them reports a squared residual <= Tolerance^2 * norm2(src).
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+
+    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+
+        // Recompute the residual explicitly for the report
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  // One restart cycle. Computes r = src - LinOp.Op(psi), then iterates
+  // Arnoldi + Givens updates until RestartLength steps are done, the global
+  // iteration budget is reached, or the squared residual estimate drops to
+  // rsq; psi is updated before returning. Returns the squared residual.
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // this should probably be made a class member so that it is only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+
+    RealD rnorm2 = norm2(r);
+
+    // The incoming psi already meets the target: leave it untouched. This
+    // also keeps the normalisation below from dividing by a zero norm (exact
+    // guess), which would otherwise fill v[0] and then psi with NaNs.
+    if (rnorm2 <= rsq) {
+      LinalgTimer.Stop();
+      return rnorm2;
+    }
+
+    gamma[0] = sqrt(rnorm2);
+
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, w, i);
+
+      qrUpdate(i);
+
+      // |gamma[i+1]|^2 is used as the squared residual estimate
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(v, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  // One Arnoldi step: w = LinOp.Op(v[iter]), orthogonalised against
+  // v[0..iter] one vector at a time, with the coefficients stored in
+  // H(iter, 0..iter). The remaining norm goes to H(iter, iter+1) and the
+  // normalised w becomes v[iter+1].
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
+
+    MatrixTimer.Start();
+    LinOp.Op(v[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  // Apply the previously stored Givens rotations (c[i], s[i]) to the new
+  // entries H(iter, .), compute the rotation that zeroes H(iter, iter+1),
+  // and apply it to gamma.
+  void qrUpdate(int iter) {
+
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  // Back-substitute for y[0..iter] using the rotated H and gamma, then add
+  // sum_i v[i] * y[i] to psi.
+  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
+
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + v[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@ -89,6 +89,8 @@ class ConjugateGradient : public OperatorFunction<Field> {

    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
+      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
+      IterationsToComplete = 0;	
      return;
    }

@ -104,7 +106,7 @@ class ConjugateGradient : public OperatorFunction<Field> {

    SolverTimer.Start();
    int k;
-    for (k = 1; k <= MaxIterations*1000; k++) {
+    for (k = 1; k <= MaxIterations; k++) {
      c = cp;

      MatrixTimer.Start();
@ -133,7 +135,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
      LinalgTimer.Stop();

      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-                << " residual " << cp << " target " << rsq << std::endl;
+                << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;

      // Stopping condition
      if (cp <= rsq) {
@ -165,8 +167,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
        return;
      }
    }
-    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
-              << std::endl;
+    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -30,8 +30,11 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>

 namespace Grid {

+
  //Mixed precision restarted defect correction CG
-  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+  template<class FieldD,class FieldF, 
+    typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:                                                
    RealD   Tolerance;
@ -50,7 +53,12 @@ namespace Grid {
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
    
-    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
+    MixedPrecisionConjugateGradient(RealD tol, 
+				    Integer maxinnerit, 
+				    Integer maxouterit, 
+				    GridBase* _sp_grid, 
+				    LinearOperatorBase<FieldF> &_Linop_f, 
+				    LinearOperatorBase<FieldD> &_Linop_d) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.), guesser(NULL){ };
@ -149,6 +157,8 @@ namespace Grid {
    }
  };

+
+
 }

 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@ -35,7 +35,11 @@ class ZeroGuesser: public LinearFunction<Field> {
 public:
  virtual void operator()(const Field &src, Field &guess) { guess = zero; };
 };
-
+// Guesser whose call operator has an empty body, so whatever the caller put
+// into `guess` is left untouched.
+template<class Field>
+class DoNothingGuesser: public LinearFunction<Field> {
+public:
+  // Intentionally empty: neither src nor guess is read or written.
+  virtual void operator()(const Field &src, Field &guess)
+  {
+  }
+};
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
--- a/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
@ -0,0 +1,256 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+// Restarted flexible GMRES solver.
+//
+// Each restart cycle (outerLoopBody) runs up to RestartLength Arnoldi steps.
+// In every step the current basis vector v[i] is passed through the
+// preconditioner to give z[i], the operator is applied to z[i], and the result
+// is orthogonalised against the existing basis. Givens rotations (qrUpdate)
+// reduce the small system held in H so that computeSolution can update psi by
+// back substitution, using the z vectors.
+//
+// As the warning printed by operator() states, this class currently does not
+// differ from regular FGMRES.
+template<class Field>
+class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance; // relative tolerance; the squared target is Tolerance^2 * norm2(src)
+
+  Integer MaxIterations;       // upper bound on the total number of Arnoldi steps
+  Integer RestartLength;       // Arnoldi steps per restart cycle
+  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
+  Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch PrecTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  // H(j, i) holds the coefficient for basis vector i produced in Arnoldi step j
+  // (first index = step); it is overwritten in place by qrUpdate.
+  Eigen::MatrixXcd H;
+
+  std::vector<std::complex<double>> y;     // coefficients computed by computeSolution
+  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
+  std::vector<std::complex<double>> c;     // stored Givens rotations, one (c, s) pair per step
+  std::vector<std::complex<double>> s;
+
+  LinearFunction<Field> &Preconditioner; // held by reference, must outlive this object
+
+  // tol            : relative tolerance
+  // maxit          : upper bound on the total number of Arnoldi steps
+  // Prec           : preconditioner applied to every basis vector
+  // restart_length : Arnoldi steps per restart cycle; it is used as a divisor
+  //                  below, so it must be non-zero
+  // err_on_no_conv : assert(0) in operator() when the tolerance is not reached
+  //
+  // The initialisers are listed in member declaration order, which is the
+  // order in which they are executed. IterationCount starts at zero so that it
+  // never holds an indeterminate value before the first solve.
+  FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
+                                                          Integer maxit,
+                                                          LinearFunction<Field> &Prec,
+                                                          Integer restart_length,
+                                                          bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv)
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , IterationCount(0)
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.)
+      , Preconditioner(Prec) {}
+
+  // Runs restart cycles until the squared residual estimate drops to
+  // Tolerance^2 * norm2(src) or MaxNumberOfRestarts cycles have been done.
+  //   LinOp : operator, applied through LinOp.Op()
+  //   src   : right-hand side
+  //   psi   : initial guess on entry, updated in place
+  // On convergence the true residual is recomputed from LinOp.Op(psi) and src
+  // and logged together with the timers. Otherwise a message is logged and,
+  // if ErrorOnNoConverge is set, assert(0) is hit.
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+
+    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    PrecTimer.Reset();
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  // One restart cycle: computes r = src - LinOp.Op(psi), builds the basis
+  // starting from r, and updates psi once the cycle ends (restart length
+  // reached, MaxIterations reached, or estimate <= rsq).
+  // Returns the squared residual estimate; if the starting residual already
+  // satisfies norm2(r) <= rsq, psi is left untouched and norm2(r) is returned.
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // these should probably be made class members so that they are only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+
+    cp = norm2(r);
+
+    // The guess is already good enough. This also covers an exactly zero
+    // residual, for which the normalisation of v[0] below would divide by zero
+    // and fill the basis (and then psi) with NaN.
+    if (cp <= rsq) {
+      LinalgTimer.Stop();
+      return cp;
+    }
+
+    gamma[0] = sqrt(cp);
+
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, z, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(z, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  // One Arnoldi step: z[iter] = Preconditioner(v[iter]), w = LinOp.Op(z[iter]),
+  // then the components of w along v[0..iter] are removed one at a time (each
+  // inner product uses the already updated w). The coefficients go into
+  // H(iter, 0..iter), the norm of the remainder into H(iter, iter + 1), and
+  // the normalised remainder becomes v[iter + 1].
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
+
+    PrecTimer.Start();
+    Preconditioner(v[iter], z[iter]);
+    PrecTimer.Stop();
+
+    MatrixTimer.Start();
+    LinOp.Op(z[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  // Applies the stored rotations 0..iter-1 to the new entries H(iter, *),
+  // computes a new rotation (c[iter], s[iter]) that sets H(iter, iter + 1) to
+  // zero, and applies it to gamma.
+  void qrUpdate(int iter) {
+
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      // tmp keeps the rotated H(iter, i + 1) while H(iter, i) is overwritten
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    // gamma[iter + 1] first: it needs the old gamma[iter]
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  // Back substitution for y[0..iter] with gamma as right-hand side, followed
+  // by psi = psi + sum_i z[i] * y[i].
+  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
+
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + z[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
@ -0,0 +1,254 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+// Restarted flexible GMRES solver.
+//
+// Each restart cycle (outerLoopBody) runs up to RestartLength Arnoldi steps.
+// In every step the current basis vector v[i] is passed through the
+// preconditioner to give z[i], the operator is applied to z[i], and the result
+// is orthogonalised against the existing basis. Givens rotations (qrUpdate)
+// reduce the small system held in H so that computeSolution can update psi by
+// back substitution, using the z vectors.
+template<class Field>
+class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance; // relative tolerance; the squared target is Tolerance^2 * norm2(src)
+
+  Integer MaxIterations;       // upper bound on the total number of Arnoldi steps
+  Integer RestartLength;       // Arnoldi steps per restart cycle
+  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
+  Integer IterationCount; // Number of iterations the FGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch PrecTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  // H(j, i) holds the coefficient for basis vector i produced in Arnoldi step j
+  // (first index = step); it is overwritten in place by qrUpdate.
+  Eigen::MatrixXcd H;
+
+  std::vector<std::complex<double>> y;     // coefficients computed by computeSolution
+  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
+  std::vector<std::complex<double>> c;     // stored Givens rotations, one (c, s) pair per step
+  std::vector<std::complex<double>> s;
+
+  LinearFunction<Field> &Preconditioner; // held by reference, must outlive this object
+
+  // tol            : relative tolerance
+  // maxit          : upper bound on the total number of Arnoldi steps
+  // Prec           : preconditioner applied to every basis vector
+  // restart_length : Arnoldi steps per restart cycle; it is used as a divisor
+  //                  below, so it must be non-zero
+  // err_on_no_conv : assert(0) in operator() when the tolerance is not reached
+  //
+  // The initialisers are listed in member declaration order, which is the
+  // order in which they are executed. IterationCount starts at zero so that it
+  // never holds an indeterminate value before the first solve.
+  FlexibleGeneralisedMinimalResidual(RealD   tol,
+                                     Integer maxit,
+                                     LinearFunction<Field> &Prec,
+                                     Integer restart_length,
+                                     bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv)
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , IterationCount(0)
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.)
+      , Preconditioner(Prec) {}
+
+  // Runs restart cycles until the squared residual estimate drops to
+  // Tolerance^2 * norm2(src) or MaxNumberOfRestarts cycles have been done.
+  //   LinOp : operator, applied through LinOp.Op()
+  //   src   : right-hand side
+  //   psi   : initial guess on entry, updated in place
+  // On convergence the true residual is recomputed from LinOp.Op(psi) and src
+  // and logged together with the timers. Otherwise a message is logged and,
+  // if ErrorOnNoConverge is set, assert(0) is hit.
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    PrecTimer.Reset();
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  // One restart cycle: computes r = src - LinOp.Op(psi), builds the basis
+  // starting from r, and updates psi once the cycle ends (restart length
+  // reached, MaxIterations reached, or estimate <= rsq).
+  // Returns the squared residual estimate; if the starting residual already
+  // satisfies norm2(r) <= rsq, psi is left untouched and norm2(r) is returned.
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // these should probably be made class members so that they are only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+
+    cp = norm2(r);
+
+    // The guess is already good enough. This also covers an exactly zero
+    // residual, for which the normalisation of v[0] below would divide by zero
+    // and fill the basis (and then psi) with NaN.
+    if (cp <= rsq) {
+      LinalgTimer.Stop();
+      return cp;
+    }
+
+    gamma[0] = sqrt(cp);
+
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, z, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(z, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  // One Arnoldi step: z[iter] = Preconditioner(v[iter]), w = LinOp.Op(z[iter]),
+  // then the components of w along v[0..iter] are removed one at a time (each
+  // inner product uses the already updated w). The coefficients go into
+  // H(iter, 0..iter), the norm of the remainder into H(iter, iter + 1), and
+  // the normalised remainder becomes v[iter + 1].
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
+
+    PrecTimer.Start();
+    Preconditioner(v[iter], z[iter]);
+    PrecTimer.Stop();
+
+    MatrixTimer.Start();
+    LinOp.Op(z[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  // Applies the stored rotations 0..iter-1 to the new entries H(iter, *),
+  // computes a new rotation (c[iter], s[iter]) that sets H(iter, iter + 1) to
+  // zero, and applies it to gamma.
+  void qrUpdate(int iter) {
+
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      // tmp keeps the rotated H(iter, i + 1) while H(iter, i) is overwritten
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    // gamma[iter + 1] first: it needs the old gamma[iter]
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  // Back substitution for y[0..iter] with gamma as right-hand side, followed
+  // by psi = psi + sum_i z[i] * y[i].
+  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
+
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + z[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
@ -0,0 +1,242 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/GeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+// Restarted GMRES solver (no preconditioner).
+//
+// Each restart cycle (outerLoopBody) runs up to RestartLength Arnoldi steps:
+// the operator is applied to the current basis vector v[i] and the result is
+// orthogonalised against the existing basis. Givens rotations (qrUpdate)
+// reduce the small system held in H so that computeSolution can update psi by
+// back substitution, using the v vectors.
+template<class Field>
+class GeneralisedMinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance; // relative tolerance; the squared target is Tolerance^2 * norm2(src)
+
+  Integer MaxIterations;       // upper bound on the total number of Arnoldi steps
+  Integer RestartLength;       // Arnoldi steps per restart cycle
+  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
+  Integer IterationCount; // Number of iterations the GMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  // H(j, i) holds the coefficient for basis vector i produced in Arnoldi step j
+  // (first index = step); it is overwritten in place by qrUpdate.
+  Eigen::MatrixXcd H;
+
+  std::vector<std::complex<double>> y;     // coefficients computed by computeSolution
+  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
+  std::vector<std::complex<double>> c;     // stored Givens rotations, one (c, s) pair per step
+  std::vector<std::complex<double>> s;
+
+  // tol            : relative tolerance
+  // maxit          : upper bound on the total number of Arnoldi steps
+  // restart_length : Arnoldi steps per restart cycle; it is used as a divisor
+  //                  below, so it must be non-zero
+  // err_on_no_conv : assert(0) in operator() when the tolerance is not reached
+  //
+  // The initialisers are listed in member declaration order, which is the
+  // order in which they are executed. IterationCount starts at zero so that it
+  // never holds an indeterminate value before the first solve.
+  GeneralisedMinimalResidual(RealD   tol,
+                             Integer maxit,
+                             Integer restart_length,
+                             bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv)
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , IterationCount(0)
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.) {}
+
+  // Runs restart cycles until the squared residual estimate drops to
+  // Tolerance^2 * norm2(src) or MaxNumberOfRestarts cycles have been done.
+  //   LinOp : operator, applied through LinOp.Op()
+  //   src   : right-hand side
+  //   psi   : initial guess on entry, updated in place
+  // On convergence the true residual is recomputed from LinOp.Op(psi) and src
+  // and logged together with the timers. Otherwise a message is logged and,
+  // if ErrorOnNoConverge is set, assert(0) is hit.
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "GeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "GMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  // One restart cycle: computes r = src - LinOp.Op(psi), builds the basis
+  // starting from r, and updates psi once the cycle ends (restart length
+  // reached, MaxIterations reached, or estimate <= rsq).
+  // Returns the squared residual estimate; if the starting residual already
+  // satisfies norm2(r) <= rsq, psi is left untouched and norm2(r) is returned.
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // this should probably be made a class member so that it is only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+
+    cp = norm2(r);
+
+    // The guess is already good enough. This also covers an exactly zero
+    // residual, for which the normalisation of v[0] below would divide by zero
+    // and fill the basis (and then psi) with NaN.
+    if (cp <= rsq) {
+      LinalgTimer.Stop();
+      return cp;
+    }
+
+    gamma[0] = sqrt(cp);
+
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(v, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  // One Arnoldi step: w = LinOp.Op(v[iter]), then the components of w along
+  // v[0..iter] are removed one at a time (each inner product uses the already
+  // updated w). The coefficients go into H(iter, 0..iter), the norm of the
+  // remainder into H(iter, iter + 1), and the normalised remainder becomes
+  // v[iter + 1].
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
+
+    MatrixTimer.Start();
+    LinOp.Op(v[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  // Applies the stored rotations 0..iter-1 to the new entries H(iter, *),
+  // computes a new rotation (c[iter], s[iter]) that sets H(iter, iter + 1) to
+  // zero, and applies it to gamma.
+  void qrUpdate(int iter) {
+
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      // tmp keeps the rotated H(iter, i + 1) while H(iter, i) is overwritten
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    // gamma[iter + 1] first: it needs the old gamma[iter]
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  // Back substitution for y[0..iter] with gamma as right-hand side, followed
+  // by psi = psi + sum_i v[i] * y[i].
+  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
+
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + v[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/MinimalResidual.h
+++ b/Grid/algorithms/iterative/MinimalResidual.h
@ -0,0 +1,156 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/MinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MINIMAL_RESIDUAL_H
+#define GRID_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+// Minimal residual (MR) iteration for Linop * psi = src.
+//
+// Starting from the caller-supplied psi, each iteration computes
+//   a   = overRelaxParam * <Mr, r> / |Mr|^2,   Mr = Linop * r
+//   psi = psi + a * r
+//   r   = r   - a * Mr
+// and stops once |r|^2 <= Tolerance^2 * |src|^2 or MaxIterations is reached.
+template<class Field> class MinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
+                          // Defaults true.
+  RealD   Tolerance;
+  Integer MaxIterations;
+  RealD   overRelaxParam;
+  Integer IterationsToComplete; // Number of iterations the MR took to finish.
+                                // Filled in upon completion
+
+  // tol            : relative residual target (|r| / |src|)
+  // maxit          : maximum number of MR iterations
+  // ovrelparam     : factor multiplied onto the step length a
+  // err_on_no_conv : assert if the solver fails to converge
+  //
+  // The initialiser list follows the member declaration order.
+  MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
+    : ErrorOnNoConverge(err_on_no_conv)
+    , Tolerance(tol)
+    , MaxIterations(maxit)
+    , overRelaxParam(ovrelparam)
+    , IterationsToComplete(0) {}
+
+  // Solves Linop * psi = src. psi is used as the initial guess and is
+  // overwritten with the result. IterationsToComplete is set on every exit.
+  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    Complex a, c;
+    Real    d = 0; // not computed before the first iteration; initialised so the
+                   // "mp" log line below does not read an indeterminate value
+
+    Field Mr(src);
+    Field r(src);
+
+    // Initial residual computation & set up
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    Linop.Op(psi, Mr);
+
+    r = src - Mr;
+
+    RealD cp = norm2(r);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "MinimalResidual:   src " << ssq << std::endl;
+    std::cout << GridLogIterative << "MinimalResidual:    mp " << d << std::endl;
+    std::cout << GridLogIterative << "MinimalResidual:  cp,r " << cp << std::endl;
+
+    // The initial guess already satisfies the tolerance: nothing to do
+    if (cp <= rsq) {
+      IterationsToComplete = 0;
+      return;
+    }
+
+    std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
+
+    GridStopWatch LinalgTimer;
+    GridStopWatch MatrixTimer;
+    GridStopWatch SolverTimer;
+
+    SolverTimer.Start();
+    int k;
+    for (k = 1; k <= MaxIterations; k++) {
+
+      MatrixTimer.Start();
+      Linop.Op(r, Mr);
+      MatrixTimer.Stop();
+
+      LinalgTimer.Start();
+
+      c = innerProduct(Mr, r);
+
+      d = norm2(Mr);
+
+      // Step length minimising |r - a * Mr|, scaled by the over-relaxation factor
+      a = c / d;
+
+      a = a * overRelaxParam;
+
+      psi = psi + r * a;
+
+      r = r - Mr * a;
+
+      cp = norm2(r);
+
+      LinalgTimer.Stop();
+
+      std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
+                << " residual " << cp << " target " << rsq << std::endl;
+      std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
+
+      // Stopping condition
+      if (cp <= rsq) {
+        SolverTimer.Stop();
+
+        // Recompute the residual from psi to report the true (not the
+        // recursively updated) residual
+        Linop.Op(psi, Mr);
+        r = src - Mr;
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "MinimalResidual Converged on iteration " << k
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "MR Time elapsed: Total   " << SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MR Time elapsed: Matrix  " << MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MR Time elapsed: Linalg  " << LinalgTimer.Elapsed() << std::endl;
+
+        if (ErrorOnNoConverge)
+          assert(true_residual / Tolerance < 10000.0);
+
+        IterationsToComplete = k;
+
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "MinimalResidual did NOT converge"
+              << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+
+    // k is MaxIterations + 1 here, i.e. one past the last iteration performed
+    IterationsToComplete = k;
+  }
+};
+} // namespace Grid
+#endif
--- a/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
@ -0,0 +1,273 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+// Restarted flexible GMRES in which the outer Krylov iteration works on FieldD
+// while the preconditioner is applied on FieldF. The enable_if constraints
+// require getPrecision<FieldD>::value == 2 and getPrecision<FieldF>::value == 1.
+//
+// Each restart cycle (outerLoopBody) builds up to RestartLength Krylov vectors
+// v[i] and preconditioned vectors z[i]; the solution update is formed from z.
+template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
+class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance;
+
+  Integer MaxIterations;
+  Integer RestartLength;
+  Integer MaxNumberOfRestarts;
+  Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch PrecTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+  GridStopWatch ChangePrecTimer;
+
+  Eigen::MatrixXcd H; // indexed as H(iteration, row): row `iter` holds the
+                      // coefficients produced in Arnoldi step `iter`
+
+  std::vector<std::complex<double>> y;     // solution of the small triangular system
+  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
+  std::vector<std::complex<double>> c;     // Givens rotation coefficients
+  std::vector<std::complex<double>> s;     // Givens rotation coefficients
+
+  GridBase* SinglePrecGrid;
+
+  LinearFunction<FieldF> &Preconditioner;
+
+  // tol            : relative residual target (|r| / |src|)
+  // maxit          : maximum total number of inner iterations
+  // sp_grid        : grid on which the FieldF temporaries are allocated
+  // Prec           : preconditioner acting on FieldF (held by reference)
+  // restart_length : number of iterations per restart cycle
+  // err_on_no_conv : assert if the solver fails to converge
+  //
+  // The initialiser list follows the member declaration order, which is the
+  // order in which the members are actually initialised.
+  MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD   tol,
+                                                   Integer maxit,
+                                                   GridBase * sp_grid,
+                                                   LinearFunction<FieldF> &Prec,
+                                                   Integer restart_length,
+                                                   bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv)
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , IterationCount(0)
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.)
+      , SinglePrecGrid(sp_grid)
+      , Preconditioner(Prec) {}
+
+  // Solves LinOp * psi = src. psi is used as the initial guess and is
+  // overwritten with the result. Runs at most MaxNumberOfRestarts restart
+  // cycles and stops as soon as a cycle reports |r|^2 <= Tolerance^2 * |src|^2.
+  void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    FieldD r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "MPFGMRES:   src " << ssq   << std::endl;
+
+    PrecTimer.Reset();
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+    ChangePrecTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+
+        // r = LinOp * psi - src; only its norm is used below
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "MPFGMRES: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total      " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon     " <<         PrecTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix     " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg     " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR         " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol    " << CompSolutionTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " <<   ChangePrecTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  // One restart cycle. Computes the residual of the current psi, runs up to
+  // RestartLength Arnoldi/QR steps, updates psi and returns the squared
+  // residual estimate. If the residual of the incoming psi already meets rsq,
+  // psi is left untouched and that squared residual is returned.
+  RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
+
+    RealD cp = 0;
+
+    FieldD w(src._grid);
+    FieldD r(src._grid);
+
+    // these should probably be made class members so that they are only allocated once, not in every restart
+    std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+    std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+
+    // Already converged (this includes an exactly vanishing residual): return
+    // before normalising r, since 1 / gamma[0] would otherwise divide by zero
+    // and fill the Krylov basis, and eventually psi, with NaNs.
+    cp = norm2(r);
+    if (cp <= rsq) {
+      LinalgTimer.Stop();
+      return cp;
+    }
+
+    gamma[0] = sqrt(cp);
+
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, z, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      // Leave the cycle at the end of the restart window, when the global
+      // iteration budget is used up, or on convergence
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(z, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  // One flexible Arnoldi step: z[iter] = Preconditioner(v[iter]) evaluated on
+  // FieldF, w = LinOp * z[iter], then w is orthogonalised against v[0..iter]
+  // (coefficients into H(iter, 0..iter)) and normalised into v[iter + 1].
+  void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
+
+    FieldF v_f(SinglePrecGrid);
+    FieldF z_f(SinglePrecGrid);
+
+    // z[iter] is converted as well so that z_f holds defined values when it is
+    // handed to the preconditioner
+    ChangePrecTimer.Start();
+    precisionChange(v_f, v[iter]);
+    precisionChange(z_f, z[iter]);
+    ChangePrecTimer.Stop();
+
+    PrecTimer.Start();
+    Preconditioner(v_f, z_f);
+    PrecTimer.Stop();
+
+    ChangePrecTimer.Start();
+    precisionChange(z[iter], z_f);
+    ChangePrecTimer.Stop();
+
+    MatrixTimer.Start();
+    LinOp.Op(z[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  // Applies the stored Givens rotations to row `iter` of H, then builds and
+  // applies a new rotation that zeroes H(iter, iter + 1), updating gamma too.
+  void qrUpdate(int iter) {
+
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  // Back-substitutes for y[0..iter] using gamma and H, then adds
+  // sum_i z[i] * y[i] to psi.
+  void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
+
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + z[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@ -0,0 +1,45 @@
+#pragma once
+namespace Grid {
+// Power iteration that estimates the largest eigenvalue of the operator
+// applied through HermOp.HermOp().
+template<class Field> class PowerMethod
+{
+ public:
+
+  // Rescales v to unit norm in place and returns the norm it had on entry.
+  template<typename T>  static RealD normalise(T& v)
+  {
+    RealD nn = norm2(v);
+    nn = sqrt(nn);
+    v = v * (1.0/nn);
+    return nn;
+  }
+
+  // Runs at most 50 iterations starting from src and returns the current
+  // estimate <v, A v> / <v, v> once it changes by less than 1% relative to the
+  // previous one, or after the last iteration. src itself is not modified.
+  RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
+  {
+    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
+    RealD evalMaxApprox = 0.0;
+    auto src_n = src;
+    auto tmp = src;
+    const int MaxIterEst = 50;
+
+    for (int i=0;i<MaxIterEst;i++) {
+
+      normalise(src_n);
+      HermOp.HermOp(src_n,tmp);
+      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+      RealD vden = norm2(src_n);
+      RealD na = vnum/vden;
+
+      // Stop on a relative change below 1% or when the iteration budget is spent
+      if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==MaxIterEst-1) ) {
+        evalMaxApprox = na;
+        return evalMaxApprox;
+      }
+      evalMaxApprox = na;
+      std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+      src_n = tmp;
+    }
+    assert(0); // never reached: the last iteration always returns
+    return 0;
+  }
+};
+}
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@ -139,8 +139,11 @@ namespace Grid {
      MatTimer.Start();
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
      MatTimer.Stop();
+
+      LinalgTimer.Start();
      r=src-Az;
-      
+      LinalgTimer.Stop();
+
      /////////////////////
      // p = Prec(r)
      /////////////////////
@ -152,8 +155,10 @@ namespace Grid {
      Linop.HermOp(z,tmp); 
      MatTimer.Stop();

+      LinalgTimer.Start();
      ttmp=tmp;
      tmp=tmp-r;
+      LinalgTimer.Stop();

      /*
      std::cout<<GridLogMessage<<r<<std::endl;
@ -166,12 +171,14 @@ namespace Grid {
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
      MatTimer.Stop();

+      LinalgTimer.Start();
      //p[0],q[0],qq[0] 
      p[0]= z;
      q[0]= Az;
      qq[0]= zAAz;

      cp =norm2(r);
+      LinalgTimer.Stop();

      for(int k=0;k<nstep;k++){

@ -181,12 +188,14 @@ namespace Grid {
 	int peri_k = k %mmax;
 	int peri_kp= kp%mmax;

+        LinalgTimer.Start();
 	rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
 	a = rq/qq[peri_k];

 	axpy(psi,a,p[peri_k],psi);         

-	cp = axpy_norm(r,-a,q[peri_k],r);  
+	cp = axpy_norm(r,-a,q[peri_k],r);
+        LinalgTimer.Stop();

 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
@ -202,6 +211,8 @@ namespace Grid {
 	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 	Linop.HermOp(z,tmp);
 	MatTimer.Stop();
+
+        LinalgTimer.Start();
        tmp=tmp-r;
 	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 

@ -219,9 +230,9 @@ namespace Grid {

 	}
 	qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
-
-
+        LinalgTimer.Stop();
      }
+
      assert(0); // never reached
      return cp;
    }
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@ -87,228 +87,25 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {

  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
+  // Use base class to share code
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Now make the norm reflect extra factor of Mee
-  template<class Field> class SchurRedBlackStaggeredSolve {
-  private:
-    OperatorFunction<Field> & _HermitianRBSolver;
-    int CBfactorise;
-    bool subGuess;
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
-    { 
-      CBfactorise=0;
-      subtractGuess(initSubGuess);
-    };
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-      
-      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
-    
-      /////////////////////////////////////////////////////
-      // src_o = (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      //src_o = tmp;     assert(src_o.checkerboard ==Odd);
-      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
-      guess(src_o, sol_o);
-      Mtmp = sol_o;
-      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
-      // Fionn A2A boolean behavioural control
-      if (subGuess)        sol_o = sol_o-Mtmp;
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
-
-      // Verify the unprec residual
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-        std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
-
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackDiagMooeeSolve {
-  private:
+  template<class Field> class SchurRedBlackBase {
+  protected:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
+    bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
  public:

-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false)  :  _HermitianRBSolver(HermitianRBSolver) 
-  { 
-    CBfactorise=cb;
-    subtractGuess(initSubGuess);
-  };
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-    
-      /////////////////////////////////////////////////////
-      // src_o = Mdag * (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-      guess(src_o,sol_o);
-      Mtmp = sol_o;
-      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      // Fionn A2A boolean behavioural control
-      if (subGuess)        sol_o = sol_o-Mtmp;
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-
-      // Verify the unprec residual
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-
-        std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackDiagTwoSolve {
-  private:
-    OperatorFunction<Field> & _HermitianRBSolver;
-    int CBfactorise;
-    bool subGuess;
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
+    SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+        const bool _solnAsInitGuess = false)  :
+    _HermitianRBSolver(HermitianRBSolver),
+    useSolnAsInitGuess(_solnAsInitGuess)
    { 
      CBfactorise = 0;
      subtractGuess(initSubGuess);
@ -322,12 +119,90 @@ namespace Grid {
      return subGuess;
    }

-    template<class Matrix>
+    /////////////////////////////////////////////////////////////
+    // Shared code
+    /////////////////////////////////////////////////////////////
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
-    template<class Matrix,class Guesser>
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out) 
+    {
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+
+    template<class Guesser>
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+      int nblock = in.size();
+
+      std::vector<Field> src_o(nblock,grid);
+      std::vector<Field> sol_o(nblock,grid);
+      
+      std::vector<Field> guess_save;
+
+      Field resid(fgrid);
+      Field tmp(grid);
+
+      ////////////////////////////////////////////////
+      // Prepare RedBlack source
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++){
+	RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
+      }
+      ////////////////////////////////////////////////
+      // Make the guesses
+      ////////////////////////////////////////////////
+      if ( subGuess ) guess_save.resize(nblock,grid);
+
+      for(int b=0;b<nblock;b++){
+        if(useSolnAsInitGuess) {
+          pickCheckerboard(Odd, sol_o[b], out[b]);
+        } else {
+          guess(src_o[b],sol_o[b]); 
+        }
+
+	if ( subGuess ) { 
+	  guess_save[b] = sol_o[b];
+	}
+      }
+      //////////////////////////////////////////////////////////////
+      // Call the block solver
+      //////////////////////////////////////////////////////////////
+      std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // A2A boolean behavioural control & reconstruct other checkerboard
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++) {
+
+	if (subGuess)   sol_o[b] = sol_o[b] - guess_save[b];
+
+	///////// Needs even source //////////////
+	pickCheckerboard(Even,tmp,in[b]);
+	RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
+
+	/////////////////////////////////////////////////
+	// Check unprec residual if possible
+	/////////////////////////////////////////////////
+	if ( ! subGuess ) {
+	  _Matrix.M(out[b],resid); 
+	  resid = resid-in[b];
+	  RealD ns = norm2(in[b]);
+	  RealD nr = norm2(resid);
+	
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+	} else {
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
+	}
+
+      }
+    }
+    template<class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
@ -335,52 +210,42 @@ namespace Grid {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();

-      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
      Field resid(fgrid);
+      Field src_o(grid);
+      Field src_e(grid);
+      Field sol_o(grid);

-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-    
-      /////////////////////////////////////////////////////
-      // src_o = Mdag * (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+      ////////////////////////////////////////////////
+      // RedBlack source
+      ////////////////////////////////////////////////
+      RedBlackSource(_Matrix,in,src_e,src_o);

-      // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
+      ////////////////////////////////
+      // Construct the guess
+      ////////////////////////////////
+      if(useSolnAsInitGuess) {
+        pickCheckerboard(Odd, sol_o, out);
+      } else {
+        guess(src_o,sol_o);
+      }
+
+      Field  guess_save(grid);
+      guess_save = sol_o;

      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      guess(src_o,tmp);
-      Mtmp = tmp;
-      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
      // Fionn A2A boolean behavioural control
-      if (subGuess)      tmp = tmp-Mtmp;
-      _Matrix.MooeeInv(tmp,sol_o);       assert(  sol_o.checkerboard   ==Odd);
+      ////////////////////////////////////////////////
+      if (subGuess)      sol_o= sol_o-guess_save;

      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      // RedBlack solution needs the even source
      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
+      RedBlackSolution(_Matrix,sol_o,src_e,out);

      // Verify the unprec residual
      if ( ! subGuess ) {
@ -389,68 +254,185 @@ namespace Grid {
        RealD ns = norm2(in);
        RealD nr = norm2(resid);

-        std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
+        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
+        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
    }     
+    
+    /////////////////////////////////////////////////////////////
+    // Override in derived. 
+    /////////////////////////////////////////////////////////////
+    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)=0;
+
  };
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackDiagTwoMixed {
-  private:
-    LinearFunction<Field> & _HermitianRBSolver;
-    int CBfactorise;
-    bool subGuess;
+
+  // Red-black solver for staggered matrices; the checkerboarded solve is driven through SchurStaggeredOperator.
+  template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+        const bool _solnAsInitGuess = false)
+      :    SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess)
+    {
+    }
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation: source construction, solution reconstruction, and the two solve entry points
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+      // Scratch fields living on the red-black grid
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
+
+      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+      Field   src_e(grid);
+
+      src_e = src_e_c; // Mutable copy: src_e_c is const but the even source is updated in place below
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
+      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
+
+      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); // operator handed to the wrapped solver
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); // same operator, multi-RHS solver call
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    }
+  };
+  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal has Mooee on it.
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+        const bool _solnAsInitGuess = false)
+      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}
+
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation: source construction, solution reconstruction, and the two solve entry points
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+      // Scratch fields living on the red-black grid
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
+
+      // get the right MpcDag
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);
+
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field  sol_e(grid);
+      Field  src_e_i(grid);
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...  (src_e is const, so the difference goes into src_e_i)
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.checkerboard   ==Even);
+      src_e_i = src_e-tmp;               assert(  src_e_i.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.checkerboard ==Even);
+
+      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); // operator handed to the wrapped solver
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); // same operator, multi-RHS solver call
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    }
+  };
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, right preconditioned by Mee^inv
+  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi = ( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = eta
+  //=> psi = MeeInv phi
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;

    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
-    { 
-      CBfactorise=0;
-      subtractGuess(initSubGuess);
-    };
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
+  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+      const bool _solnAsInitGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};

-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();

      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
+      
      Field   tmp(grid);
      Field  Mtmp(grid);
-      Field resid(fgrid);

-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
    
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
@ -461,43 +443,44 @@ namespace Grid {

      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
+    }

-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-//      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      guess(src_o,tmp);
-      Mtmp = tmp;
-      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      // Fionn A2A boolean behavioural control
-      if (subGuess)      tmp = tmp-Mtmp;
-      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   sol_o_i(grid);
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+      ////////////////////////////////////////////////
+      // MooeeInv due to preconditioning
+      ////////////////////////////////////////////////
+      _Matrix.MooeeInv(sol_o,tmp);
+      sol_o_i = tmp;

      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
+      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.checkerboard   ==Even);
+      tmp = src_e-tmp;               assert(  src_e.checkerboard ==Even);
+      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.checkerboard ==Even);
     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
+      setCheckerboard(sol,sol_e);    assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.checkerboard ==Odd );
+    };

-      // Verify the unprec residual
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-
-        std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
  };
-
 }
 #endif
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -50,15 +50,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
      assert(0);
  }

-  Grid_quiesce_nodes();
-
  // Never clean up as done once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);

+  Grid_quiesce_nodes();
  GlobalSharedMemory::Init(communicator_world);
  GlobalSharedMemory::SharedMemoryAllocate(
 		   GlobalSharedMemory::MAX_MPI_SHM_BYTES,
 		   GlobalSharedMemory::Hugepages);
+  Grid_unquiesce_nodes();
 }

 ///////////////////////////////////////////////////////////////////////////
@ -107,8 +107,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 //////////////////////////////////
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)    
 {
-  _ndimension = processors.size();
-
+  _ndimension = processors.size();  assert(_ndimension>=1);
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  std::vector<int> parent_processor_coor(_ndimension,0);
  std::vector<int> parent_processors    (_ndimension,1);
@ -124,10 +123,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ; 
-  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
-  //  std::cout << " Parent size  "<<Nparent <<std::endl;

  int childsize=1;
  for(int d=0;d<processors.size();d++) {
@ -136,8 +133,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);

-  //  std::cout << " child size  "<<childsize <<std::endl;
-
  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // coor of split within parent
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -52,7 +52,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _processors = processors;
-  _ndimension = processors.size();
+  _ndimension = processors.size();  assert(_ndimension>=1);
  _processor_coor.resize(_ndimension);
  
  // Require 1^N processor grid for fake
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@ -103,6 +103,8 @@ class GlobalSharedMemory {
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
  ///////////////////////////////////////////////////
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -132,7 +132,22 @@ int Log2Size(int TwoToPower,int MAXLOG2)
 }
 void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
-#ifdef HYPERCUBE
+  //////////////////////////////////////////////////////////////////////////////
+  // Look and see if it looks like an HPE 8600 based on hostname conventions
+  //////////////////////////////////////////////////////////////////////////////
+  const int namelen = _POSIX_HOST_NAME_MAX;
+  char name[namelen+1] = {0}; // one extra zeroed byte: gethostname may leave a truncated name unterminated
+  int R;
+  int I;
+  int N;
+  gethostname(name,namelen);
+  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
+  // All three fields of r<R>i<I>n<N> parsed => hypercube layout; otherwise the shared-memory layout
+  if(nscan==3) OptimalCommunicatorHypercube(processors,optimal_comm);
+  else         OptimalCommunicatorSharedMemory(processors,optimal_comm);
+}
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
   ////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
@ -253,7 +268,9 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
-#else 
+}
+void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
@ -306,7 +323,6 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
-#endif
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // SHMGET
@ -337,7 +353,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
        int errsv = errno;
        printf("Errno %d\n",errsv);
        printf("key   %d\n",key);
-        printf("size  %lld\n",size);
+        printf("size  %ld\n",size);
        printf("flags %d\n",flags);
        perror("shmget");
        exit(1);
@ -413,7 +429,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@ -455,7 +471,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@ -499,7 +515,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      
-      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
 	perror("failed mmap");     
 	assert(0);    
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -85,7 +85,7 @@ class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, publ

 void inline conformable(GridBase *lhs,GridBase *rhs)
 {
-  assert(lhs == rhs);
+  assert((lhs == rhs) && " conformable check pointers mismatch ");
 }

 template<class vobj>
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@ -392,14 +392,10 @@ namespace Grid {

    void SeedUniqueString(const std::string &s){
      std::vector<int> seeds;
-      std::stringstream sha;
      seeds = GridChecksum::sha256_seeds(s);
-      for(int i=0;i<seeds.size();i++) { 
-        sha << std::hex << seeds[i];
-      }
      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" 
                << s << "'" << std::endl;
-      std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
      SeedFixedIntegers(seeds);
    }
    void SeedFixedIntegers(const std::vector<int> &seeds){
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -464,8 +464,10 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
  assert(orthog>=0);

  for(int d=0;d<nh;d++){
-    assert(lg->_processors[d]  == hg->_processors[d]);
-    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    if ( d!=orthog ) {
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    }
  }

  // the above should guarantee that the operations are local
@ -485,7 +487,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int


 template<class vobj>
-void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  typedef typename vobj::scalar_object sobj;

@ -499,8 +501,10 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
  assert(orthog>=0);

  for(int d=0;d<nh;d++){
-    assert(lg->_processors[d]  == hg->_processors[d]);
-    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    if ( d!=orthog ) {
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    }
  }

  // the above should guarantee that the operations are local
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@ -59,6 +59,7 @@ void GridLogTimestamp(int on){
 }

 Colours GridLogColours(0);
+GridLogger GridLogMG     (1, "MG"    , GridLogColours, "NORMAL");
 GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
 GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
@ -76,19 +77,18 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
-  GridLogIntegrator.Active(0);
+  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);

  for (int i = 0; i < logstreams.size(); i++) {
-    if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
-    if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
-    if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
-    if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
-    if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
-    if (logstreams[i] == std::string("Performance"))
-      GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
-    if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
+    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
+    if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
+    if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
+    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
+    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
+    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
+    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }

--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@ -146,9 +146,11 @@ public:
      if ( log.timestamp ) {
 	log.StopWatch->Stop();
 	GridTime now = log.StopWatch->Elapsed();
+	
 	if ( log.timing_mode==1 ) log.StopWatch->Reset();
 	log.StopWatch->Start();
-	stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
+	stream << log.evidence()
+	       << now	       << log.background() << " : " ;
      }
      stream << log.colour();
      return stream;
@ -167,6 +169,7 @@ public:

 void GridLogConfigure(std::vector<std::string> &logstreams);

+extern GridLogger GridLogMG;
 extern GridLogger GridLogIRL;
 extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
--- a/Grid/parallelIO/BinaryIO.cc
+++ b/Grid/parallelIO/BinaryIO.cc
@ -0,0 +1,3 @@
+#include <Grid/GridCore.h>
+
+int Grid::BinaryIO::latticeWriteMaxRetry = -1;
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
+  static int latticeWriteMaxRetry;

  /////////////////////////////////////////////////////////////////////////////
  // more byte manipulation helpers
@ -209,10 +210,10 @@ PARALLEL_CRITICAL
  static inline void le32toh_v(void *file_object,uint64_t bytes)
  {
    uint32_t *fp = (uint32_t *)file_object;
-    uint32_t f;

    uint64_t count = bytes/sizeof(uint32_t);
    parallel_for(uint64_t i=0;i<count;i++){  
+      uint32_t f;
      f = fp[i];
      // got network order and the network to host
      f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
@ -234,10 +235,9 @@ PARALLEL_CRITICAL
  static inline void le64toh_v(void *file_object,uint64_t bytes)
  {
    uint64_t *fp = (uint64_t *)file_object;
-    uint64_t f,g;
-    
    uint64_t count = bytes/sizeof(uint64_t);
    parallel_for(uint64_t i=0;i<count;i++){  
+      uint64_t f,g;
      f = fp[i];
      // got network order and the network to host
      g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
@ -348,7 +348,8 @@ PARALLEL_CRITICAL
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
    int ieee64    = (format == std::string("IEEE64"));
-
+    assert(ieee64||ieee32||ieee64big||ieee32big);   // format must be one of the recognised IEEE variants
+    assert((ieee64+ieee32+ieee64big+ieee32big)==1); // ...and exactly one of them
    //////////////////////////////////////////////////////////////////////////////
    // Do the I/O
    //////////////////////////////////////////////////////////////////////////////
@ -370,7 +371,7 @@ PARALLEL_CRITICAL
 #endif
      } else {
 	std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
        std::ifstream fin;
 	fin.open(file, std::ios::binary | std::ios::in);
        if (control & BINARYIO_MASTER_APPEND)
@ -582,7 +583,9 @@ PARALLEL_CRITICAL
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
    GridBase *grid = Umu._grid;
-    uint64_t lsites = grid->lSites();
+    uint64_t lsites = grid->lSites(), offsetCopy = offset;
+    int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
+    bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);

    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@ -597,9 +600,36 @@ PARALLEL_CRITICAL

    grid->Barrier();
    timer.Stop();
+    while (attemptsLeft >= 0)
+    {
+      grid->Barrier();
+      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+	             nersc_csum,scidac_csuma,scidac_csumb);
+      if (checkWrite)
+      {
+        std::vector<fobj> ckiodata(lsites);
+        uint32_t          cknersc_csum, ckscidac_csuma, ckscidac_csumb;
+        uint64_t          ckoffset = offsetCopy;

-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
-	     nersc_csum,scidac_csuma,scidac_csumb);
+        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
+        grid->Barrier();
+        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
+        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
+        {
+          std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
+          offset = offsetCopy;
+          parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
+        }
+        else
+        {
+          std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
+          break;
+        }
+      }
+      attemptsLeft--;
+    }
+    if (checkWrite && (attemptsLeft < 0)) std::cout << GridLogWarning << "writeLatticeObject: read test checksum still failing after all write attempts, file may be corrupted" << std::endl; // a successful read-back breaks out before the decrement, so attemptsLeft < 0 here means every verified attempt failed

    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
  }
@ -725,5 +755,6 @@ PARALLEL_CRITICAL
    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
  }
 };
+
 }
 #endif
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@ -46,6 +46,12 @@ extern "C" {
 namespace Grid {
 namespace QCD {

+#define GRID_FIELD_NORM "FieldNormMetaData" // LIME record type and XML tag used for the field-norm metadata
+#define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \
+(0.5*fabs((FieldNormMetaData_).norm2 - (n2ck))/((FieldNormMetaData_).norm2 + (n2ck)))
+#define GRID_FIELD_NORM_CHECK(FieldNormMetaData_, n2ck) \
+assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
+
  /////////////////////////////////
  // Encode word types as strings
  /////////////////////////////////
@ -205,6 +211,7 @@ class GridLimeReader : public BinaryIO {
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
+    FieldNormMetaData  FieldNormMetaData_;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;

    std::string format = getFormatString<vobj>();
@ -233,20 +240,52 @@ class GridLimeReader : public BinaryIO {
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
 	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-
+	std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
+	std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
 	// Insist checksum is next record
 	/////////////////////////////////////////////
-	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
-
+	readScidacChecksum(scidacChecksum_,FieldNormMetaData_);
 	/////////////////////////////////////////////
 	// Verify checksums
 	/////////////////////////////////////////////
+	if(FieldNormMetaData_.norm2 != 0.0){ 
+	  RealD n2ck = norm2(field);
+	  std::cout << GridLogMessage << "Field norm: metadata= " << FieldNormMetaData_.norm2 
+              << " / field= " << n2ck << " / rdiff= " << GRID_FIELD_NORM_CALC(FieldNormMetaData_,n2ck) << std::endl;
+	  GRID_FIELD_NORM_CHECK(FieldNormMetaData_,n2ck);
+	}
 	assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
+
+	// find out if next field is a GridFieldNorm
 	return;
      }
    }
  }
+  void readScidacChecksum(scidacChecksum     &scidacChecksum_,
+			  FieldNormMetaData  &FieldNormMetaData_)
+  {
+    FieldNormMetaData_.norm2 =0.0;
+    std::string scidac_str(SCIDAC_CHECKSUM);
+    std::string field_norm_str(GRID_FIELD_NORM);
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
+      std::vector<char> xmlc(nbytes+1,'\0');
+      limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
+      std::string xmlstring = std::string(&xmlc[0]);
+      XmlReader RD(xmlstring, true, "");
+      if ( !strncmp(limeReaderType(LimeR), field_norm_str.c_str(),strlen(field_norm_str.c_str()) )  ) {
+	//	std::cout << "FieldNormMetaData "<<xmlstring<<std::endl;
+	read(RD,field_norm_str,FieldNormMetaData_);
+      }
+      if ( !strncmp(limeReaderType(LimeR), scidac_str.c_str(),strlen(scidac_str.c_str()) )  ) {
+	//	std::cout << SCIDAC_CHECKSUM << " " <<xmlstring<<std::endl;
+	read(RD,std::string("scidacChecksum"),scidacChecksum_);
+	return;
+      }      
+    }
+    assert(0);
+  }
  ////////////////////////////////////////////
  // Read a generic serialisable object
  ////////////////////////////////////////////
@ -265,7 +304,7 @@ class GridLimeReader : public BinaryIO {
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;

-   xmlstring = std::string(&xmlc[0]);
+	xmlstring = std::string(&xmlc[0]);
 	return;
      }

@ -279,8 +318,8 @@ class GridLimeReader : public BinaryIO {
    std::string xmlstring;

    readLimeObject(xmlstring, record_name);
-	  XmlReader RD(xmlstring, true, "");
-	  read(RD,object_name,object);
+    XmlReader RD(xmlstring, true, "");
+    read(RD,object_name,object);
  }
 };

@ -389,6 +428,8 @@ class GridLimeWriter : public BinaryIO
    GridBase *grid = field._grid;
    assert(boss_node == field._grid->IsBoss() );

+    FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
+
    ////////////////////////////////////////////
    // Create record header
    ////////////////////////////////////////////
@ -447,6 +488,7 @@ class GridLimeWriter : public BinaryIO
    checksum.suma= streama.str();
    checksum.sumb= streamb.str();
    if ( boss_node ) { 
+      writeLimeObject(0,0,FNMD,std::string(GRID_FIELD_NORM),std::string(GRID_FIELD_NORM));
      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
    }
  }
@ -624,6 +666,12 @@ class IldgWriter : public ScidacWriter {
    assert(header.nd==4);
    assert(header.nd==header.dimension.size());

+    //////////////////////////////////////////////////////////////////////////////
+    // Field norm tests
+    //////////////////////////////////////////////////////////////////////////////
+    FieldNormMetaData FieldNormMetaData_;
+    FieldNormMetaData_.norm2 = norm2(Umu);
+
    //////////////////////////////////////////////////////////////////////////////
    // Fill the USQCD info field
    //////////////////////////////////////////////////////////////////////////////
@ -632,11 +680,12 @@ class IldgWriter : public ScidacWriter {
    info.plaq   = header.plaquette;
    info.linktr = header.link_trace;

-    std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
+    //    std::cout << GridLogMessage << " Writing config; IldgIO n2 "<< FieldNormMetaData_.norm2<<std::endl;
    //////////////////////////////////////////////
    // Fill the Lime file record by record
    //////////////////////////////////////////////
    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
+    writeLimeObject(0,0,FieldNormMetaData_,FieldNormMetaData_.SerialisableClassName(),std::string(GRID_FIELD_NORM));
    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
    writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
    writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
@ -679,6 +728,7 @@ class IldgReader : public GridLimeReader {
    std::string    ildgLFN_       ;
    scidacChecksum scidacChecksum_; 
    usqcdInfo      usqcdInfo_     ;
+    FieldNormMetaData FieldNormMetaData_;

    // track what we read from file
    int found_ildgFormat    =0;
@ -687,7 +737,7 @@ class IldgReader : public GridLimeReader {
    int found_usqcdInfo     =0;
    int found_ildgBinary =0;
    int found_FieldMetaData =0;
-
+    int found_FieldNormMetaData =0;
    uint32_t nersc_csum;
    uint32_t scidac_csuma;
    uint32_t scidac_csumb;
@ -721,7 +771,7 @@ class IldgReader : public GridLimeReader {
 	//////////////////////////////////
 	// ILDG format record

-  std::string xmlstring(&xmlc[0]);
+	std::string xmlstring(&xmlc[0]);
 	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) { 

 	  XmlReader RD(xmlstring, true, "");
@ -774,11 +824,17 @@ class IldgReader : public GridLimeReader {
 	  found_scidacChecksum = 1;
 	}

+	if ( !strncmp(limeReaderType(LimeR), GRID_FIELD_NORM,strlen(GRID_FIELD_NORM)) ) { 
+	  XmlReader RD(xmlstring, true, "");
+	  read(RD,GRID_FIELD_NORM,FieldNormMetaData_);
+	  found_FieldNormMetaData = 1;
+	}
+
      } else {  
 	/////////////////////////////////
 	// Binary data
 	/////////////////////////////////
-	std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
+	//	std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
 	uint64_t offset= ftello(File);
 	if ( format == std::string("IEEE64BIG") ) {
 	  GaugeSimpleMunger<dobj, sobj> munge;
@ -845,6 +901,13 @@ class IldgReader : public GridLimeReader {
    ////////////////////////////////////////////////////////////
    // Really really want to mandate a scidac checksum
    ////////////////////////////////////////////////////////////
+    if ( found_FieldNormMetaData ) { 
+      RealD nn = norm2(Umu);
+      GRID_FIELD_NORM_CHECK(FieldNormMetaData_,nn);
+      std::cout << GridLogMessage<<"FieldNormMetaData matches " << std::endl;
+    }  else { 
+      std::cout << GridLogWarning<<"FieldNormMetaData not found. " << std::endl;
+    }
    if ( found_scidacChecksum ) {
      FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
      FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@ -56,6 +56,10 @@ namespace Grid {
  ////////////////////////////////////////////////////////////////////////////////
  // header specification/interpretation
  ////////////////////////////////////////////////////////////////////////////////
+    class FieldNormMetaData : Serializable {
+    public:
+      GRID_SERIALIZABLE_CLASS_MEMBERS(FieldNormMetaData, double, norm2);
+    };
    class FieldMetaData : Serializable {
    public:

--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@ -49,21 +49,39 @@ inline double usecond(void) {

 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
-typedef  std::chrono::milliseconds          GridMillisecs;
-typedef  std::chrono::microseconds          GridTime;
-typedef  std::chrono::microseconds          GridUsecs;

-inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
+typedef  std::chrono::seconds               GridSecs;
+typedef  std::chrono::milliseconds          GridMillisecs;
+typedef  std::chrono::microseconds          GridUsecs;
+typedef  std::chrono::microseconds          GridTime;
+
+inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 {
-  stream << time.count()<<" ms";
+  stream << time.count()<<" s";
  return stream;
 }
-inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
+inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
 {
-  stream << time.count()<<" usec";
+  GridSecs second(1);
+  auto     secs       = now/second ; 
+  auto     subseconds = now%second ;
+  auto     fill       = stream.fill();
+  stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
+  stream.fill(fill);
  return stream;
 }
- 
+inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
+{
+  GridSecs second(1);
+  auto     seconds    = now/second ; 
+  auto     subseconds = now%second ;
+  auto     fill       = stream.fill();
+  stream << seconds<<"."<<std::setw(6)<<std::setfill('0')<<subseconds.count()<<" s";
+  stream.fill(fill);
+  return stream;
+}
+
+
 class GridStopWatch {
 private:
  bool running;
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@ -90,17 +90,20 @@ namespace QCD {
    // That probably makes for GridRedBlack4dCartesian grid.

    // s,sp,c,spc,lc
-    template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
-    template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
-    template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
-    template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-    template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
-    template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
-    template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
-    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
-    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+
+    template<typename vtype> using iSinglet                     = iScalar<iScalar<iScalar<vtype> > >;
+    template<typename vtype> using iSpinMatrix                  = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+    template<typename vtype> using iColourMatrix                = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
+    template<typename vtype> using iSpinColourMatrix            = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
+    template<typename vtype> using iLorentzColourMatrix         = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+    template<typename vtype> using iDoubleStoredColourMatrix    = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
+    template<typename vtype> using iSpinVector                  = iScalar<iVector<iScalar<vtype>, Ns> >;
+    template<typename vtype> using iColourVector                = iScalar<iScalar<iVector<vtype, Nc> > >;
+    template<typename vtype> using iSpinColourVector            = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
+    template<typename vtype> using iHalfSpinVector              = iScalar<iVector<iScalar<vtype>, Nhs> >;
+    template<typename vtype> using iHalfSpinColourVector        = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
+

    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
@ -127,10 +130,28 @@ namespace QCD {
    typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
    typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
    typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
-
+    
    typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
    typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
    typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
+    
+    // SpinColourSpinColour matrix
+    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+
+    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+
+    // SpinColourSpinColour matrix
+    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+
+    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;

    // LorentzColour
    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
@ -229,6 +250,9 @@ namespace QCD {
    typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
    typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;

+    typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
+    typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
+    typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;

    typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
    typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@ -44,12 +44,15 @@ namespace QCD {
  
  struct WilsonImplParams {
    bool overlapCommsCompute;
+    std::vector<Real> twist_n_2pi_L;
    std::vector<Complex> boundary_phases;
    WilsonImplParams() : overlapCommsCompute(false) {
      boundary_phases.resize(Nd, 1.0);
+      twist_n_2pi_L.resize(Nd, 0.0);
    };
-    WilsonImplParams(const std::vector<Complex> phi)
-      : boundary_phases(phi), overlapCommsCompute(false) {}
+    WilsonImplParams(const std::vector<Complex> phi) : overlapCommsCompute(false), boundary_phases(phi) { // initialisers listed in member declaration order
+      twist_n_2pi_L.resize(Nd, 0.0); // no twist by default
+    }
  };

  struct StaggeredImplParams {
@ -63,7 +66,8 @@ namespace QCD {
 				    int,   MaxIter, 
 				    RealD, tolerance, 
 				    int,   degree, 
-				    int,   precision);
+				    int,   precision,
+				    int,   BoundsCheckFreq);
    
    // MaxIter and tolerance, vectors??
    
@ -73,13 +77,15 @@ namespace QCD {
 				int _maxit     = 1000,
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
-				int _precision = 64)
+				int _precision = 64,
+				int _BoundsCheckFreq=20)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
 	degree(_degree),
-	precision(_precision){};
+        precision(_precision),
+        BoundsCheckFreq(_BoundsCheckFreq){};
  };
  
  
--- a/Grid/qcd/action/fermion/CayleyFermion5D.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc
@ -68,6 +68,26 @@ void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &so
  ExtractSlice(exported4d, tmp, 0, 0);
 }
 template<class Impl>  
+void CayleyFermion5D<Impl>::P(const FermionField &psi, FermionField &chi)
+{
+  int Ls= this->Ls;
+  chi=zero;
+  for(int s=0;s<Ls;s++){
+    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
+    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s+1)%Ls);
+  }
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::Pdag(const FermionField &psi, FermionField &chi)
+{
+  int Ls= this->Ls;
+  chi=zero;
+  for(int s=0;s<Ls;s++){
+    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
+    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s-1+Ls)%Ls);
+  }
+}
+template<class Impl>  
 void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solution5d,FermionField &exported4d)
 {
  int Ls = this->Ls;
@ -465,9 +485,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    
  double bpc = b+c;
  double bmc = b-c;
+  _b = b;
+  _c = c;
+  _gamma  = gamma; // Save the parameters so we can change mass later.
+  _zolo_hi= zolo_hi;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
-    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
    assert(omega[i]!=Coeff_t(0.0));
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -93,6 +93,17 @@ namespace Grid {
      virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
      virtual void ImportUnphysicalFermion(const FermionField &solution5d, FermionField &exported4d);

+      ///////////////////////////////////////////////////////////////
+      // Support for MADWF tricks
+      ///////////////////////////////////////////////////////////////
+      RealD Mass(void) { return mass; };
+      void  SetMass(RealD _mass) { 
+	mass=_mass; 
+	SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
+      } ;
+      void  P(const FermionField &psi, FermionField &chi);
+      void  Pdag(const FermionField &psi, FermionField &chi);
+
      /////////////////////////////////////////////////////
      // Instantiate different versions depending on Impl
      /////////////////////////////////////////////////////
@ -139,6 +150,12 @@ namespace Grid {
      //    protected:
      RealD mass;

+      // Save arguments to SetCoefficientsInternal
+      std::vector<Coeff_t> _gamma;
+      RealD                _zolo_hi;
+      RealD                _b;
+      RealD                _c;
+
      // Cayley form Moebius (tanh and zolotarev)
      std::vector<Coeff_t> omega;
      std::vector<Coeff_t> bs;    // S dependent coeffs
--- a/Grid/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@ -43,7 +43,7 @@ namespace Grid {
     INHERIT_IMPL_TYPES(Impl);
    public:

-      void FreePropagator(const FermionField &in,FermionField &out,RealD mass, std::vector<double> twist, bool fiveD) {
+      void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist, bool fiveD) {
 	FermionField in_k(in._grid);
 	FermionField prop_k(in._grid);

@ -53,17 +53,22 @@ namespace Grid {
 	ComplexField coor(in._grid);
 	ComplexField ph(in._grid);  ph = zero;
 	FermionField in_buf(in._grid); in_buf = zero;
-	Complex ci(0.0,1.0);
+	Scalar ci(0.0,1.0);
 	assert(twist.size() == Nd);//check that twist is Nd
+	assert(boundary.size() == Nd);//check that boundary conditions is Nd
 	int shift = 0;
 	if(fiveD) shift = 1;
 	for(unsigned int nu = 0; nu < Nd; nu++)
 	{
 	  // Shift coordinate lattice index by 1 to account for 5th dimension.
          LatticeCoordinate(coor, nu + shift);
-	  ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu+shift])));
+	  double boundary_phase = ::atan2(imag(boundary[nu]),real(boundary[nu])); // arg(boundary): same as acos(real) for angles in [0,pi], but keeps the sign for negative angles
+	  ph = ph + boundary_phase*coor*((1./(in._grid->_fdimensions[nu+shift])));
+	  //momenta for propagator shifted by twist+boundary
+	  twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
 	}
-	in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in;
+	in_buf = exp(ci*ph*(-1.0))*in;
+

 	if(fiveD){//FFT only on temporal and spatial dimensions
          std::vector<int> mask(Nd+1,1); mask[0] = 0;
@ -76,25 +81,28 @@ namespace Grid {
          this->MomentumSpacePropagatorHt(prop_k,in_k,mass,twist);
 	  theFFT.FFT_all_dim(out,prop_k,FFT::backward);
        }
-
 	//phase for boundary condition
-	out = out * exp((Real)(2.0*M_PI)*ci*ph);
+	out = out * exp(ci*ph);
      };

-      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
        bool fiveD = true; //5d propagator by default
-        FreePropagator(in,out,mass,twist,fiveD);
+	FreePropagator(in,out,mass,boundary,twist,fiveD);
      };

      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass, bool fiveD) {
 	std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-        FreePropagator(in,out,mass,twist,fiveD);
+	std::vector<Complex> boundary;
+	for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
+	FreePropagator(in,out,mass,boundary,twist,fiveD);
      };

      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
        bool fiveD = true; //5d propagator by default
-	std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-        FreePropagator(in,out,mass,twist,fiveD);
+	std::vector<double> twist(Nd,0.0); //default: twist angle 0
+	std::vector<Complex> boundary;
+	for(int i=0;i<Nd;i++) boundary.push_back(1); //default: periodic boundary conditions
+	FreePropagator(in,out,mass,boundary,twist,fiveD);
      };

      virtual void   Instantiatable(void) {};
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@ -80,12 +80,24 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ///////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/g5HermitianLinop.h>

+///////////////////////////////////////////////////////////////////////////////
+// Fourier accelerated Pauli Villars inverse support
+///////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/WilsonTMFermion5D.h>   
+
+////////////////////////////////////////////////////////////////////////////////
+// Move this group to a DWF specific tools/algorithms subdir? 
+////////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/FourierAcceleratedPV.h>
+#include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
+#include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
+#include <Grid/qcd/action/fermion/MADWF.h>
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
 // are added, (e.g. extension for gparity, half precision project in comms etc..)
 ////////////////////////////////////////////////////////////////////////////////////////////////////

-
 // Cayley 5d
 namespace Grid {
  namespace QCD {
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@ -64,11 +64,6 @@ namespace Grid {
      virtual RealD  M    (const FermionField &in, FermionField &out)=0;
      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;

-      // Query the even even properties to make algorithmic decisions
-      virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
-      virtual int    isTrivialEE(void) { return 0; };
-      virtual RealD  Mass(void) {return 0.0;};
-
      // half checkerboard operaions
      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
@ -101,7 +96,7 @@ namespace Grid {

      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

-      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {
+      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
 	FFT theFFT((GridCartesian *) in._grid);

 	FermionField in_k(in._grid);
@ -111,26 +106,33 @@ namespace Grid {
 	ComplexField coor(in._grid);
 	ComplexField ph(in._grid);  ph = zero;
 	FermionField in_buf(in._grid); in_buf = zero;
-	Complex ci(0.0,1.0);
+	Scalar ci(0.0,1.0);
 	assert(twist.size() == Nd);//check that twist is Nd
+	assert(boundary.size() == Nd);//check that boundary conditions is Nd
 	for(unsigned int nu = 0; nu < Nd; nu++)
 	{
          LatticeCoordinate(coor, nu);
-	  ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu])));
+	  double boundary_phase = ::atan2(imag(boundary[nu]),real(boundary[nu])); // arg(boundary): same as acos(real) for angles in [0,pi], but keeps the sign for negative angles
+	  ph = ph + boundary_phase*coor*((1./(in._grid->_fdimensions[nu])));
+	  //momenta for propagator shifted by twist+boundary
+	  twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
 	}
-	in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in;
+	in_buf = exp(ci*ph*(-1.0))*in;

 	theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
        this->MomentumSpacePropagator(prop_k,in_k,mass,twist);
 	theFFT.FFT_all_dim(out,prop_k,FFT::backward);

 	//phase for boundary condition
-	out = out * exp((Real)(2.0*M_PI)*ci*ph);
+	out = out * exp(ci*ph);

      };
+
      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+		std::vector<Complex> boundary;
+		for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
 		std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-	        FreePropagator(in,out,mass,twist);
+	        FreePropagator(in,out,mass,boundary,twist);
      };

      ///////////////////////////////////////////////
@ -141,6 +143,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////////////
      // Conserved currents, either contract at sink or insert sequentially.
      //////////////////////////////////////////////////////////////////////
+
      virtual void ContractConservedCurrent(PropagatorField &q_in_1,
                                            PropagatorField &q_in_2,
                                            PropagatorField &q_out,
@ -153,6 +156,12 @@ namespace Grid {
                                       unsigned int tmin, 
                                       unsigned int tmax,
                                       ComplexField &lattice_cmplx)=0;
+
+      // Only reimplemented in Wilson5D 
+      // Default to just a zero correlation function
+      virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=zero; };
+      virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=zero; };
+
      ///////////////////////////////////////////////
      // Physical field import/export
      ///////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@ -141,6 +141,7 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////
  
 #define INHERIT_FIMPL_TYPES(Impl)\
+  typedef Impl Impl_t;							\
  typedef typename Impl::FermionField           FermionField;		\
  typedef typename Impl::PropagatorField     PropagatorField;		\
  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
@ -239,16 +240,30 @@ namespace QCD {
      GaugeLinkField tmp(GaugeGrid);

      Lattice<iScalar<vInteger> > coor(GaugeGrid);
+      ////////////////////////////////////////////////////
+      // apply any boundary phase or twists
+      ////////////////////////////////////////////////////
      for (int mu = 0; mu < Nd; mu++) {

-	      auto pha = Params.boundary_phases[mu];
-	      scalar_type phase( real(pha),imag(pha) );
+	////////// boundary phase /////////////
+	auto pha = Params.boundary_phases[mu];
+	scalar_type phase( real(pha),imag(pha) );

-        int Lmu = GaugeGrid->GlobalDimensions()[mu] - 1;
+	int L   = GaugeGrid->GlobalDimensions()[mu];
+        int Lmu = L - 1;

        LatticeCoordinate(coor, mu);

        U = PeekIndex<LorentzIndex>(Umu, mu);
+
+	// apply any twists
+	RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
+	if ( theta != 0.0) { 
+	  scalar_type twphase(::cos(theta),::sin(theta));
+	  U = twphase*U;
+	  std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
+	}
+
        tmp = where(coor == Lmu, phase * U, U);
        PokeIndex<LorentzIndex>(Uds, tmp, mu);

--- a/Grid/qcd/action/fermion/FourierAcceleratedPV.h
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@ -0,0 +1,237 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/FourierAcceleratedPV.h
+
+    Copyright (C) 2015
+
+Author: Christoph Lehner (lifted with permission by Peter Boyle, brought back to Grid)
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+namespace Grid {
+namespace QCD {
+
+  // Read the b and c coefficients of operator m as single real numbers.
+  //
+  // m.bs[0] and m.cs[0] are taken as reference values and logged. The routine
+  // then asserts that every other entry of m.bs / m.cs equals its reference and
+  // that both references have zero imaginary part, and finally returns the real
+  // parts through _b and _c.
+  template<typename M>
+    void get_real_const_bc(M& m, RealD& _b, RealD& _c) {
+    ComplexD b0, c0;
+    b0 = m.bs[0];
+    c0 = m.cs[0];
+    std::cout << GridLogMessage << "b=" << b0 << ", c=" << c0 << std::endl;
+
+    // Every remaining entry must carry the same coefficients as entry 0.
+    size_t idx = 1;
+    while (idx < m.bs.size()) {
+      assert(m.bs[idx] == b0);
+      assert(m.cs[idx] == c0);
+      ++idx;
+    }
+
+    // Only purely real coefficients can be handed back as RealD.
+    assert(b0.imag() == 0.0);
+    assert(c0.imag() == 0.0);
+    _b = b0.real();
+    _c = c0.real();
+  }
+
+
+// Fourier-accelerated Pauli-Villars inverse.
+//
+// pvInv() rotates the source along dimension 0 (the s direction) with a phase
+// twist plus FFT (rotatePV), solves groups of 2*group_in_s slices with a
+// WilsonTMFermion5D operator whose per-slice mass / twisted mass are computed
+// from b, c and M5 of the wrapped operator, removes the (pA + pB*G5) factor
+// slice by slice, and rotates the result back.
+//
+// Template parameters:
+//   Vi : fermion field type (also the ConjugateGradient field type)
+//   M  : 5d operator type; must provide bs, cs, M5, Impl_t, FermionGrid(),
+//        GaugeGrid() and GaugeRedBlackGrid()
+//   G  : gauge field type
+template<typename Vi, typename M, typename G>
+class FourierAcceleratedPV {
+ public:
+
+  ConjugateGradient<Vi> &cg;       // inner solver handed to the Schur red-black solve
+  M& dwfPV;                        // operator supplying b, c, M5 and the grids
+  G& Umu;                          // gauge field used to build the twisted-mass operator
+  // NOTE(review): grid5D / gridRB5D are obtained from SpaceTimeGrid in the
+  // constructor and are never released by this class -- confirm ownership.
+  GridCartesian* grid5D;           // 5d grid with 2*group_in_s slices
+  GridRedBlackCartesian* gridRB5D; // red-black counterpart of grid5D
+  int group_in_s;                  // number of (s, Ls-s-1) pairs solved together
+
+  // The initializer list follows the member declaration order, which is the
+  // order in which C++ actually initialises the members.
+  FourierAcceleratedPV(M& _dwfPV, G& _Umu, ConjugateGradient<Vi> &_cg, int _group_in_s = 2) 
+   : cg(_cg), dwfPV(_dwfPV), Umu(_Umu), group_in_s(_group_in_s) 
+  {
+    // Ls must split into whole groups of 2*group_in_s slices.
+    assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
+    grid5D = QCD::SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu._grid);
+    gridRB5D = QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu._grid);
+  }
+
+  // Rotate a field along dimension 0.
+  //   forward == false : scale slice s by exp(-i*pi*s/Ls), then FFT::forward along dim 0
+  //   forward == true  : FFT::backward along dim 0, then scale slice s by exp(+i*pi*s/Ls)
+  // The time spent in the phase loop and in the FFT is logged.
+  void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
+
+    GridStopWatch gsw1, gsw2; // gsw1: phase loop, gsw2: FFT
+
+    typedef typename Vi::scalar_type Coeff_t;
+    int Ls = dst._grid->_fdimensions[0];
+
+    Vi _tmp(dst._grid);
+    double phase = M_PI / (double)Ls;
+    Coeff_t bzero(0.0,0.0);
+
+    FFT theFFT((GridCartesian*)dst._grid);
+
+    if (!forward) {
+      gsw1.Start();
+      for (int s=0;s<Ls;s++) {
+	Coeff_t a(::cos(phase*s),-::sin(phase*s));
+	axpby_ssp(_tmp,a,_src,bzero,_src,s,s);
+      }
+      gsw1.Stop();
+
+      gsw2.Start();
+      theFFT.FFT_dim(dst,_tmp,0,FFT::forward);
+      gsw2.Stop();
+
+    } else {
+
+      gsw2.Start();
+      theFFT.FFT_dim(_tmp,_src,0,FFT::backward);
+      gsw2.Stop();
+
+      gsw1.Start();
+      for (int s=0;s<Ls;s++) {
+	Coeff_t a(::cos(phase*s),::sin(phase*s));
+	axpby_ssp(dst,a,_tmp,bzero,_tmp,s,s);
+      }
+      gsw1.Stop();
+    }
+
+    std::cout << GridLogMessage << "Timing rotatePV: " << gsw1.Elapsed() << ", " << gsw2.Elapsed() << std::endl;
+
+  }
+
+  // Apply the Fourier-accelerated inverse to _src and store the result in _dst.
+  // Requires real, s-independent b and c (enforced by get_real_const_bc).
+  void pvInv(const Vi& _src, Vi& _dst) const {
+
+    std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;
+
+    typedef typename Vi::scalar_type Coeff_t;
+    int Ls = _dst._grid->_fdimensions[0];
+
+    GridStopWatch gswT; // total time
+    gswT.Start();
+
+    RealD b,c;
+    get_real_const_bc(dwfPV,b,c);
+    RealD M5 = dwfPV.M5;
+    
+    // U(true) Rightinv TMinv U(false) = Minv
+
+    Vi _src_diag(_dst._grid);
+    Vi _src_diag_slice(dwfPV.GaugeGrid());
+    Vi _dst_diag_slice(dwfPV.GaugeGrid());
+    Vi _src_diag_slices(grid5D);
+    Vi _dst_diag_slices(grid5D);
+    Vi _dst_diag(_dst._grid);
+
+    rotatePV(_src,_src_diag,false);
+
+    // now do TM solves
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    GridStopWatch gswA, gswB; // gswA: operator setup, gswB: loop over s groups
+
+    gswA.Start();
+
+    typedef typename M::Impl_t Impl;
+    //WilsonTMFermion<Impl> tm(x.Umu,*x.UGridF,*x.UrbGridF,0.0,0.0,solver_outer.parent.par.wparams_f);
+    std::vector<RealD> vmass(grid5D->_fdimensions[0],0.0);
+    std::vector<RealD> vmu(grid5D->_fdimensions[0],0.0);
+
+    WilsonTMFermion5D<Impl> tm(Umu,*grid5D,*gridRB5D,
+			   *(GridCartesian*)dwfPV.GaugeGrid(),
+			   *(GridRedBlackCartesian*)dwfPV.GaugeRedBlackGrid(),
+			   vmass,vmu);
+    
+    //SchurRedBlackDiagTwoSolve<Vi> sol(cg);
+    SchurRedBlackDiagMooeeSolve<Vi> sol(cg); // same performance as DiagTwo
+    gswA.Stop();
+
+    gswB.Start();
+
+    for (int sgroup=0;sgroup<Ls/2/group_in_s;sgroup++) {
+
+      // Per-slice mass and twisted mass for this group; slots 2*sidx and
+      // 2*sidx+1 hold the pair (s, Ls-s-1) with opposite sign of mu.
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+
+	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
+	RealD cosp = ::cos(phase);
+	RealD sinp = ::sin(phase);
+	RealD denom = b*b + c*c + 2.0*b*c*cosp;
+	RealD mass = -(b*b*M5 + c*(1.0 - cosp + c*M5) + b*(-1.0 + cosp + 2.0*c*cosp*M5))/denom;
+	RealD mu = (b+c)*sinp/denom;
+
+	vmass[2*sidx + 0] = mass;
+	vmass[2*sidx + 1] = mass;
+	vmu[2*sidx + 0] = mu;
+	vmu[2*sidx + 1] = -mu;
+
+      }
+
+      tm.update(vmass,vmu);
+
+      // Gather the source slices s and Ls-s-1 of this group into the small 5d field.
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	int sprime = Ls-s-1;
+
+	ExtractSlice(_src_diag_slice,_src_diag,s,0);
+	InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 0,0);
+
+	ExtractSlice(_src_diag_slice,_src_diag,sprime,0);
+	InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 1,0);
+
+      }
+
+      GridStopWatch gsw;
+      gsw.Start();
+      _dst_diag_slices = zero; // zero guess
+      sol(tm,_src_diag_slices,_dst_diag_slices);
+      gsw.Stop();
+      std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
+
+      // Scatter the solutions back, removing the (pA +/- pB*G5) factor per slice.
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	int sprime = Ls-s-1;
+
+	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
+	RealD cosp = ::cos(phase);
+	RealD sinp = ::sin(phase);
+
+	// now rotate with inverse of
+	Coeff_t pA = b + c*cosp;
+	Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
+	Coeff_t pABden = pA*pA - pB*pB;
+	// (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
+      
+	ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 0,0);
+	_dst_diag_slice = (pA/pABden) * _dst_diag_slice - (pB/pABden) * (G5 * _dst_diag_slice);
+	InsertSlice(_dst_diag_slice,_dst_diag,s,0);
+	
+	ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 1,0);
+	_dst_diag_slice = (pA/pABden) * _dst_diag_slice + (pB/pABden) * (G5 * _dst_diag_slice);
+	InsertSlice(_dst_diag_slice,_dst_diag,sprime,0);
+      }
+    }
+    gswB.Stop();
+
+    rotatePV(_dst_diag,_dst,true);
+
+    gswT.Stop();
+    std::cout << GridLogMessage << "PV completed in " << gswT.Elapsed() << " (Setup: " << gswA.Elapsed() << ", s-loop: " << gswB.Elapsed() << ")" << std::endl;
+  }
+
+};
+}}
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {
 namespace QCD {
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@ -0,0 +1,193 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/MADWF.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+namespace Grid {
+namespace QCD {
+
+template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0> // presumably enabled only when Fieldi and Fieldo differ -- confirm IfNotSame
+inline void convert(const Fieldi &from,Fieldo &to) 
+{
+  precisionChange(to,from); // note the argument order: destination first, source second
+}
+template <class Fieldi, class Fieldo,IfSame<Fieldi,Fieldo> X=0> // presumably enabled only when Fieldi and Fieldo are the same type -- confirm IfSame
+inline void convert(const Fieldi &from,Fieldo &to) 
+{
+  to=from; // plain assignment
+}
+
+// MADWF-like iterative solver.
+//
+// Each iteration applies the Pauli-Villars solver on the outer operator (Mato),
+// solves an inner system on Mati with the Schur solver and guesser, propagates
+// the surface solution back to the outer system, adds the correction to sol5
+// and recomputes the defect b - Mato*sol5. Iteration stops once
+// sqrt(|defect|^2/|b|^2) drops below the target residual; if maxiter iterations
+// are exhausted the routine asserts.
+//
+// Template parameters:
+//   Matrixo / Matrixi : outer / inner operator types (each provides FermionField)
+//   PVinverter        : callable (Matrix&, const Field &src, Field &sol)
+//   SchurSolver       : callable (Matrix&, const Field &src, Field &sol, Guesser&)
+//   Guesser           : guesser object passed through to the inner Schur solve
+template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser> 
+class MADWF 
+{
+ private:
+  typedef typename Matrixo::FermionField FermionFieldo;
+  typedef typename Matrixi::FermionField FermionFieldi;
+
+  PVinverter  & PauliVillarsSolvero;// For the outer field
+  SchurSolver & SchurSolveri;       // For the inner approx field
+  Guesser     & Guesseri;           // To deflate the inner approx solves
+
+  Matrixo & Mato;                   // Action object for outer
+  Matrixi & Mati;                   // Action object for inner
+
+  RealD target_resid;               // stop once the relative residual is below this
+  int   maxiter;                    // maximum number of outer iterations
+ public:
+
+  // The initializer list follows the member declaration order, which is the
+  // order in which C++ actually initialises the members.
+  MADWF(Matrixo &_Mato,
+	Matrixi &_Mati, 
+	PVinverter &_PauliVillarsSolvero, 
+	SchurSolver &_SchurSolveri,
+	Guesser & _Guesseri,
+	RealD resid,
+	int _maxiter) :
+
+  PauliVillarsSolvero(_PauliVillarsSolvero),
+    SchurSolveri(_SchurSolveri),
+    Guesseri(_Guesseri),
+    Mato(_Mato),Mati(_Mati),
+    target_resid(resid),
+    maxiter(_maxiter)
+  {   
+  };
+
+  // Solve for the 5d solution sol5 given the 4d physical source src4.
+  // sol5 is overwritten (it starts from zero).
+  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
+  {
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+
+    FermionFieldi    c0i(Mati.GaugeGrid()); // 4d 
+    FermionFieldi    y0i(Mati.GaugeGrid()); // 4d
+    FermionFieldo    c0 (Mato.GaugeGrid()); // 4d 
+    FermionFieldo    y0 (Mato.GaugeGrid()); // 4d
+
+    FermionFieldo    A(Mato.FermionGrid()); // Temporary outer
+    FermionFieldo    B(Mato.FermionGrid()); // Temporary outer
+    FermionFieldo    b(Mato.FermionGrid()); // 5d source
+
+    FermionFieldo    c(Mato.FermionGrid()); // PVinv source; reused so store
+    FermionFieldo    defect(Mato.FermionGrid()); // current defect b - M sol5
+
+    FermionFieldi   ci(Mati.FermionGrid()); 
+    FermionFieldi   yi(Mati.FermionGrid()); 
+    FermionFieldi   xi(Mati.FermionGrid()); 
+    FermionFieldi srci(Mati.FermionGrid()); 
+    FermionFieldi   Ai(Mati.FermionGrid()); 
+
+    // Inner mass, saved so it can be restored after the mass=1 application below.
+    RealD m=Mati.Mass();
+
+    ///////////////////////////////////////
+    //Import source, include Dminus factors
+    ///////////////////////////////////////
+    Mato.ImportPhysicalFermionSource(src4,b); 
+    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
+    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;
+
+    defect = b;
+    sol5=zero;
+    for (int i=0;i<maxiter;i++) {
+
+      ///////////////////////////////////////
+      // Set up c0 from current defect
+      ///////////////////////////////////////
+      PauliVillarsSolvero(Mato,defect,A);
+      Mato.Pdag(A,c);
+      ExtractSlice(c0, c, 0 , 0);
+
+      ////////////////////////////////////////////////
+      // Solve the inner system with surface term c0
+      ////////////////////////////////////////////////
+      ci = zero;  
+      convert(c0,c0i); // Possible precision change
+      InsertSlice(c0i,ci,0, 0);
+
+      // Dwm P y = Dwm x = D(1) P (c0,0,0,0)^T
+      Mati.P(ci,Ai);
+      Mati.SetMass(1.0);      Mati.M(Ai,srci);      Mati.SetMass(m);
+      SchurSolveri(Mati,srci,xi,Guesseri); 
+      Mati.Pdag(xi,yi);
+      ExtractSlice(y0i, yi, 0 , 0);
+      convert(y0i,y0); // Possible precision change
+
+      //////////////////////////////////////
+      // Propagate solution back to outer system
+      // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
+      //////////////////////////////////////
+      c0 = - y0;
+      InsertSlice(c0, c, 0   , 0);
+
+      /////////////////////////////
+      // Reconstruct the bulk solution Pdag PV^-1 Dm P 
+      /////////////////////////////
+      Mato.P(c,B);
+      Mato.M(B,A);
+      PauliVillarsSolvero(Mato,A,B);
+      Mato.Pdag(B,A);
+
+      //////////////////////////////
+      // Reinsert surface prop
+      //////////////////////////////
+      InsertSlice(y0,A,0,0);
+
+      //////////////////////////////
+      // Convert from y back to x 
+      //////////////////////////////
+      Mato.P(A,B);
+
+      //         sol5' = sol5 + M^-1 defect
+      //               = sol5 + M^-1 src - M^-1 M sol5  ...
+      sol5 = sol5 + B;
+      std::cout << GridLogMessage << "***************************************" <<std::endl;
+      std::cout << GridLogMessage << " Sol5 update "<<std::endl;
+      std::cout << GridLogMessage << "***************************************" <<std::endl;
+      std::cout << GridLogMessage << " Sol5 now "<<norm2(sol5)<<std::endl;
+      std::cout << GridLogMessage << " delta    "<<norm2(B)<<std::endl;
+
+      // New defect  = b - M sol5
+      Mato.M(sol5,A);
+      defect = b - A;
+
+      std::cout << GridLogMessage << " defect   "<<norm2(defect)<<std::endl;
+
+      // Relative residual of the outer system after this iteration.
+      double resid = ::sqrt(norm2(defect) / norm2(b));
+      std::cout << GridLogMessage << "Residual " << i << ": " << resid  << std::endl;
+      std::cout << GridLogMessage << "***************************************" <<std::endl;
+
+      if (resid < target_resid) {
+	return;
+      }
+    }
+
+    // Failing to converge within maxiter iterations is treated as fatal.
+    std::cout << GridLogMessage << "MADWF : Exceeded maxiter "<<std::endl;
+    assert(0);
+
+  }
+
+};
+
+}}
--- a/Grid/qcd/action/fermion/PauliVillarsInverters.h
+++ b/Grid/qcd/action/fermion/PauliVillarsInverters.h
@ -0,0 +1,95 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PauliVillarsInverters.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+namespace Grid {
+namespace QCD {
+
+// Unpreconditioned Pauli-Villars inverse.
+//
+// With the operator mass temporarily set to 1.0, forms Mdag*src and hands it to
+// the conjugate gradient on the MdagM operator, writing the result to sol.
+// The original mass is restored before returning.
+template<class Field>
+class PauliVillarsSolverUnprec
+{
+ public:
+  ConjugateGradient<Field> & CG;
+
+  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
+
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    const RealD saved_mass = _Matrix.Mass();
+
+    MdagMLinearOperator<Matrix,Field> MdagM(_Matrix);
+    Field MdagSrc(_Matrix.FermionGrid());
+
+    // Everything between the two SetMass calls runs at mass 1.0.
+    _Matrix.SetMass(1.0);
+    _Matrix.Mdag(src,MdagSrc);
+    CG(MdagM,MdagSrc,sol);
+    _Matrix.SetMass(saved_mass);
+  };
+};
+
+// Red-black preconditioned Pauli-Villars inverse.
+//
+// Runs the supplied Schur solver on the operator with its mass temporarily set
+// to 1.0, writing the result to sol, and restores the original mass afterwards.
+template<class Field,class SchurSolverType>
+class PauliVillarsSolverRBprec
+{
+ public:
+  SchurSolverType & SchurSolver;
+  PauliVillarsSolverRBprec( SchurSolverType &_SchurSolver) : SchurSolver(_SchurSolver){};
+
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    // Save the mass so it can be put back after the solve.
+    RealD m = _Matrix.Mass();
+
+    // No temporary field is needed here: the Schur solver works directly on src / sol.
+    _Matrix.SetMass(1.0);
+    SchurSolver(_Matrix,src,sol);
+    _Matrix.SetMass(m);
+  };
+};
+
+// Pauli-Villars inverse via FourierAcceleratedPV.
+//
+// Holds references to the gauge field and a conjugate gradient; every call
+// builds a FourierAcceleratedPV for the given operator and runs its pvInv().
+template<class Field,class GaugeField>
+class PauliVillarsSolverFourierAccel
+{
+ public:
+  GaugeField      & Umu;
+  ConjugateGradient<Field> & CG;
+
+  PauliVillarsSolverFourierAccel(GaugeField &_Umu,ConjugateGradient<Field> &_CG)
+    : Umu(_Umu), CG(_CG)
+  {
+  };
+
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    // The accelerator is parameterised on the operator's own gauge field type.
+    typedef FourierAcceleratedPV<Field, Matrix, typename Matrix::GaugeField > Accelerator;
+    Accelerator accel(_Matrix,Umu,CG) ;
+    accel.pvInv(src,sol);
+  };
+};
+
+
+}
+}
--- a/Grid/qcd/action/fermion/Reconstruct5Dprop.h
+++ b/Grid/qcd/action/fermion/Reconstruct5Dprop.h
@ -0,0 +1,135 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/Reconstruct5Dprop.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+namespace Grid {
+namespace QCD {
+
+// Reconstruct a 5d solution from a 4d physical solution and its 4d source.
+//
+// Template parameters:
+//   Field      : fermion field type
+//   PVinverter : callable (Matrix&, const Field &src, Field &sol) applying the
+//                Pauli-Villars inverse
+template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
+ private:
+  PVinverter & PauliVillarsSolver;
+ public:
+
+ /////////////////////////////////////////////////////
+ // First cut works, 10 Oct 2018.
+ //
+ // Must form a plan to get this into production for Zmobius acceleration
+ // of the Mobius exact AMA corrections.
+ //
+ // TODO : understand absence of contact term in eqns in Hantao's thesis
+ //        sol4 is contact term subtracted, but thesis & Brower's paper suggests not.
+ //
+ // Step 1: Localise PV inverse in a routine. [DONE]
+ // Step 2: Schur based PV inverse            [DONE]
+ // Step 3: Fourier accelerated PV inverse    [DONE]
+ //
+ /////////////////////////////////////////////////////
+ 
+  Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver) 
+    : PauliVillarsSolver(_PauliVillarsSolver) 
+  { 
+  };
+
+
+  // Apply the operator with its mass temporarily set to 1.0: sol = M(mass=1) src.
+  template<class Matrix>
+  void PV(Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    RealD m = _Matrix.Mass();
+    _Matrix.SetMass(1.0);
+    _Matrix.M(src,sol);
+    _Matrix.SetMass(m);
+  }
+
+  // Apply the adjoint with the mass temporarily set to 1.0: sol = Mdag(mass=1) src.
+  template<class Matrix>
+  void PVdag(Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    RealD m = _Matrix.Mass();
+    _Matrix.SetMass(1.0);
+    _Matrix.Mdag(src,sol);
+    _Matrix.SetMass(m);
+  }
+
+  // Build the 5d field sol5 from the 4d solution sol4 and the 4d source src4.
+  template<class Matrix>
+  void operator() (Matrix & _Matrix,const Field &sol4,const Field &src4, Field &sol5){
+
+    int Ls =  _Matrix.Ls;
+
+    Field psi4(_Matrix.GaugeGrid());   // 4d scratch slice
+    Field psi(_Matrix.FermionGrid());  // 5d vector [-sol4,c2,c3... cL]
+    Field A  (_Matrix.FermionGrid());
+    Field B  (_Matrix.FermionGrid());
+    Field c  (_Matrix.FermionGrid());
+
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+    std::cout << GridLogMessage<< " Reconstruct5Dprop: c.f. MADWF algorithm         " << std::endl;
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+
+    ///////////////////////////////////////
+    //Import source, include Dminus factors
+    ///////////////////////////////////////
+    _Matrix.ImportPhysicalFermionSource(src4,B); 
+
+    ///////////////////////////////////////
+    // Set up c from src4
+    ///////////////////////////////////////
+    PauliVillarsSolver(_Matrix,B,A);
+    _Matrix.Pdag(A,c);
+
+    //////////////////////////////////////
+    // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
+    //////////////////////////////////////
+    psi4 = - sol4;
+    InsertSlice(psi4, psi, 0   , 0);
+    // Slices 1..Ls-1 are copied unchanged from c.
+    for (int s=1;s<Ls;s++) {
+      ExtractSlice(psi4,c,s,0);
+      InsertSlice(psi4,psi,s,0);
+    }
+
+    /////////////////////////////
+    // Pdag PV^-1 Dm P 
+    /////////////////////////////
+    _Matrix.P(psi,B);
+    _Matrix.M(B,A);
+    PauliVillarsSolver(_Matrix,A,B);
+    _Matrix.Pdag(B,A);
+
+    //////////////////////////////
+    // Reinsert surface prop
+    //////////////////////////////
+    InsertSlice(sol4,A,0,0);
+
+    //////////////////////////////
+    // Convert from y back to x 
+    //////////////////////////////
+    _Matrix.P(A,sol5);
+    
+  }
+};
+
+}
+}
--- a/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
@ -26,11 +26,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 #ifdef AVX512
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
+#include <Grid/simd/Intel512common.h>
+#include <Grid/simd/Intel512avx.h>
 #endif

 // Interleave operations from two directions
@ -679,7 +679,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  gauge3 =(uint64_t)&UU._odata[sU]( T ); 
  
  // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeField &U, DoubledGaugeField &UUU,
 								    SiteSpinor *buf, int LLs, int sU, 
@ -732,7 +732,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
   
 }

-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeField &U, DoubledGaugeField &UUU,
 								    SiteSpinor *buf, int LLs, int sU, 
@ -816,7 +816,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl

  // This is the single precision 5th direction vectorised kernel

-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeField &U, DoubledGaugeField &UUU,
 							       SiteSpinor *buf, int LLs, int sU, 
@ -884,7 +884,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 #endif
 }

-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeField &U, DoubledGaugeField &UUU,
 							       SiteSpinor *buf, int LLs, int sU, 
--- a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>


 #define LOAD_CHI(b)		\
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@ -67,6 +67,7 @@ public:
 public:
  typedef WilsonFermion<Impl> WilsonBase;

+  virtual int    ConstEE(void)     { return 0; }; // NOTE(review): presumably flags that the even-even term is not constant for the clover operator -- confirm against the base-class contract
  virtual void Instantiatable(void){};
  // Constructors
  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
--- a/Grid/qcd/action/fermion/WilsonFermion5D.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.cc
@ -939,6 +939,75 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    merge(qSiteRev, qSiteVec); \
 }

+//          psi = chiralProjectPlus(Result_s[Ls/2-1]);
+//          psi+= chiralProjectMinus(Result_s[Ls/2]);
+//         PJ5q+=localInnerProduct(psi,psi);
+
+// Site-wise in + Gamma5*in on the grid of `in`; the result is returned by value.
+// Note that no factor 1/2 is applied.
+template<class vobj> 
+Lattice<vobj> spProj5p(const Lattice<vobj> & in)
+{
+  Lattice<vobj> result(in._grid);
+  Gamma gamma5(Gamma::Algebra::Gamma5);
+  GridBase *latt=in._grid;
+  parallel_for(int site=0;site<latt->oSites();site++){
+    result._odata[site] = in._odata[site] + gamma5*in._odata[site];
+  }
+  return result;
+}
+// Site-wise in - Gamma5*in on the grid of `in`; the result is returned by value.
+// Note that no factor 1/2 is applied.
+template<class vobj> 
+Lattice<vobj> spProj5m(const Lattice<vobj> & in)
+{
+  Lattice<vobj> result(in._grid);
+  Gamma gamma5(Gamma::Algebra::Gamma5);
+  GridBase *latt=in._grid;
+  parallel_for(int site=0;site<latt->oSites();site++){
+    result._odata[site] = in._odata[site] - gamma5*in._odata[site];
+  }
+  return result;
+}
+
+// J5q contraction for a 5d fermion field.
+//
+// Takes the s-slices Ls/2 and Ls/2-1 of q_in, applies (1+Gamma5) to the former
+// and (1-Gamma5) to the latter, sums them and stores the local inner product of
+// the sum with itself in J5q. q_in must live on the fermion grid and J5q on the
+// gauge grid.
+//
+// NOTE(review): the commented sketch preceding spProj5p in this file pairs the
+// plus projection with slice Ls/2-1 and the minus projection with slice Ls/2,
+// the opposite of what is done here, and spProj5p/spProj5m carry no factor 1/2
+// -- confirm the intended convention and normalisation.
+template <class Impl>
+void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
+{
+  conformable(GaugeGrid(), J5q._grid);
+  conformable(q_in._grid, FermionGrid());
+
+  // 4d fields
+  int Ls = this->Ls;
+  FermionField p_plus (GaugeGrid());
+  FermionField p_minus(GaugeGrid());
+  FermionField p(GaugeGrid());
+
+  ExtractSlice(p_plus , q_in, Ls/2   , 0);
+  ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
+  p_plus = spProj5p(p_plus );
+  p_minus= spProj5m(p_minus);
+  p=p_plus+p_minus;
+  J5q = localInnerProduct(p,p);
+}
+
+// J5q contraction for a 5d propagator field.
+//
+// Takes the s-slices Ls/2 and Ls/2-1 of q_in, applies (1+Gamma5) to the former
+// and (1-Gamma5) to the latter, sums them and stores the local inner product of
+// the sum with itself in J5q. q_in must live on the fermion grid and J5q on the
+// gauge grid.
+//
+// NOTE(review): the commented sketch preceding spProj5p in this file pairs the
+// plus projection with slice Ls/2-1 and the minus projection with slice Ls/2,
+// the opposite of what is done here, and spProj5p/spProj5m carry no factor 1/2
+// -- confirm the intended convention and normalisation.
+template <class Impl>
+void WilsonFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
+{
+  conformable(GaugeGrid(), J5q._grid);
+  conformable(q_in._grid, FermionGrid());
+
+  // 4d fields
+  int Ls = this->Ls;
+  PropagatorField p_plus (GaugeGrid());
+  PropagatorField p_minus(GaugeGrid());
+  PropagatorField p(GaugeGrid());
+
+  ExtractSlice(p_plus , q_in, Ls/2   , 0);
+  ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
+  p_plus = spProj5p(p_plus );
+  p_minus= spProj5m(p_minus);
+  p=p_plus+p_minus;
+  J5q = localInnerProduct(p,p);
+}
+
 template <class Impl>
 void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                     PropagatorField &q_in_2,
@ -949,6 +1018,7 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
    conformable(q_in_1._grid, FermionGrid());
    conformable(q_in_1._grid, q_in_2._grid);
    conformable(_FourDimGrid, q_out._grid);
+
    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
    unsigned int LLs = q_in_1._grid->_rdimensions[0];
    q_out = zero;
@ -995,7 +1065,6 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 }


-
 template <class Impl>
 void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                                PropagatorField &q_out,
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -230,6 +230,10 @@ namespace QCD {
                             unsigned int tmin, 
                             unsigned int tmax,
 			     ComplexField &lattice_cmplx);
+    // J5q contraction built from the s-slices Ls/2 and Ls/2-1 of q_in (defined in WilsonFermion5D.cc)
+    void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
+    void ContractJ5q(FermionField &q_in,ComplexField &J5q);
+
  };

 }}
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@ -81,8 +81,8 @@ WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,Doubl
  assert(0);
 }

-#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
-#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h>
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h>

 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@ -0,0 +1,155 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion5D.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once 
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonFermion.h>
+
+
+namespace Grid {
+
+  namespace QCD {
+    
+    // Wilson twisted-mass operator on a 5D grid with one (mass, mu) pair per slice s.
+    // M(in) = Dhop(in) + (4 + mass[s])*in + i*mu[s]*g5*in, presuming that
+    // axpbg5y_ssp(z,a,x,b,y,s,sp) sets z_s = a*x_s + b*g5*y_sp (TODO confirm).
+    template<class Impl>
+      class WilsonTMFermion5D : public WilsonFermion5D<Impl>
+      {
+      public:
+	INHERIT_IMPL_TYPES(Impl);
+	virtual void   Instantiatable(void) {};
+	// Constructor: forwards the grids, the literal 4.0 and p to WilsonFermion5D, then stores _mass/_mu via update()
+        WilsonTMFermion5D(GaugeField &_Umu,
+			  GridCartesian         &Fgrid,
+			  GridRedBlackCartesian &Frbgrid, 
+			  GridCartesian         &Ugrid,
+			  GridRedBlackCartesian &Urbgrid, 
+			  const std::vector<RealD> _mass,
+			  const std::vector<RealD> _mu,
+			  const ImplParams &p= ImplParams()
+			  ) :
+	WilsonFermion5D<Impl>(_Umu,
+			      Fgrid,
+			      Frbgrid,
+			      Ugrid,
+			      Urbgrid,
+			      4.0,p)
+	
+	  {
+	    update(_mass,_mu);
+	  }
+	// Checkerboard-changing hop: odd input goes through DhopEO, anything else through DhopOE (no dagger)
+	virtual void Meooe(const FermionField &in, FermionField &out) {
+	  if (in.checkerboard == Odd) {
+	    this->DhopEO(in, out, DaggerNo);
+	  } else {
+	    this->DhopOE(in, out, DaggerNo);
+	  }
+	}
+	// Same dispatch as Meooe, with DaggerYes
+	virtual void MeooeDag(const FermionField &in, FermionField &out) {
+	  if (in.checkerboard == Odd) {
+	    this->DhopEO(in, out, DaggerYes);
+	  } else {
+	    this->DhopOE(in, out, DaggerYes);
+	  }
+	}	
+	// Site-diagonal term, applied slice by slice with a = 4 + mass[s] and b = i*mu[s]
+	// allow override for twisted mass and clover
+	virtual void Mooee(const FermionField &in, FermionField &out) {
+	  out.checkerboard = in.checkerboard;
+	  //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    ComplexD a = 4.0+this->mass[s];
+	    ComplexD b(0.0,this->mu[s]);
+	    axpbg5y_ssp(out,a,in,b,in,s,s);
+	  }
+	}
+	// As Mooee but with the sign of the twisted term flipped (b = -i*mu[s])
+	virtual void MooeeDag(const FermionField &in, FermionField &out) {
+	  out.checkerboard = in.checkerboard;
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    ComplexD a = 4.0+this->mass[s];
+	    ComplexD b(0.0,-this->mu[s]);
+	    axpbg5y_ssp(out,a,in,b,in,s,s);
+	  }
+	}
+	virtual void MooeeInv(const FermionField &in, FermionField &out) {
+	  out.checkerboard = in.checkerboard; // same convention as Mooee/MooeeDag
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    RealD tm   = this->mu[s];
+	    RealD mtil = 4.0+this->mass[s];
+	    RealD sq   = mtil*mtil+tm*tm;
+	    ComplexD a    = mtil/sq;     // (mtil - i*tm*g5)/(mtil^2 + tm^2) is the inverse
+	    ComplexD b(0.0, -tm /sq);    // of mtil + i*tm*g5, using g5*g5 = 1
+	    axpbg5y_ssp(out,a,in,b,in,s,s);
+	  }
+	}
+	virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
+	  out.checkerboard = in.checkerboard; // same convention as Mooee/MooeeDag
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    RealD tm   = this->mu[s];
+	    RealD mtil = 4.0+this->mass[s];
+	    RealD sq   = mtil*mtil+tm*tm;
+	    ComplexD a    = mtil/sq;     // (mtil + i*tm*g5)/(mtil^2 + tm^2) is the inverse
+	    ComplexD b(0.0,tm /sq);      // of mtil - i*tm*g5, using g5*g5 = 1
+	    axpbg5y_ssp(out,a,in,b,in,s,s);
+	  }
+	}
+	// Full operator: Dhop(in) into out, diagonal term into tmp, then returns axpy_norm(out, 1.0, tmp, out)
+	virtual RealD M(const FermionField &in, FermionField &out) {
+	  out.checkerboard = in.checkerboard;
+	  this->Dhop(in, out, DaggerNo);
+	  FermionField tmp(out._grid);
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    ComplexD a = 4.0+this->mass[s];
+	    ComplexD b(0.0,this->mu[s]);
+	    axpbg5y_ssp(tmp,a,in,b,in,s,s);
+	  }
+	  return axpy_norm(out, 1.0, tmp, out);
+	}
+	
+	// needed for fast PV: swaps in new per-slice values; both vectors must match _fdimensions[0] of the fermion grid
+	void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
+	  assert(_mass.size() == _mu.size());
+	  assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
+	  this->mass = _mass;
+	  this->mu = _mu;
+	}
+	
+      private:
+	std::vector<RealD> mu;
+	std::vector<RealD> mass;
+	
+      };
+   
+    typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
+    typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 
+
+}}
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@ -29,6 +29,14 @@ directory
 #ifndef GRID_GAUGE_IMPL_TYPES_H
 #define GRID_GAUGE_IMPL_TYPES_H

+#define CPS_MD_TIME
+
+#ifdef CPS_MD_TIME
+#define HMC_MOMENTUM_DENOMINATOR (2.0)
+#else
+#define HMC_MOMENTUM_DENOMINATOR (1.0)
+#endif
+
 namespace Grid {
 namespace QCD {

@ -38,6 +46,7 @@ namespace QCD {

 #define INHERIT_GIMPL_TYPES(GImpl)                  \
  typedef typename GImpl::Simd Simd;                \
+  typedef typename GImpl::Scalar Scalar;	    \
  typedef typename GImpl::LinkField GaugeLinkField; \
  typedef typename GImpl::Field GaugeField;         \
  typedef typename GImpl::ComplexField ComplexField;\
@ -55,7 +64,8 @@ namespace QCD {
 template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
 public:
  typedef S Simd;
-
+  typedef typename Simd::scalar_type scalar_type;
+  typedef scalar_type Scalar;
  template <typename vtype> using iImplScalar     = iScalar<iScalar<iScalar<vtype> > >;
  template <typename vtype> using iImplGaugeLink  = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
  template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
@ -87,12 +97,32 @@ public:
  ///////////////////////////////////////////////////////////
  // Move these to another class
  // HMC auxiliary functions
-  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) {
-    // specific for SU gauge fields
+  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) 
+  {
+    // Fills P direction by direction with GaussianFundamentalLieAlgebraMatrix draws,
+    // rescaled as derived below (after Zbigniew Sroczynski's thesis):
+    // P(p) =  N \Prod_{x\mu}e^-{1/2 Tr (p^2_mux)}
+    // 
+    // p_x,mu = c_x,mu,a T_a
+    //
+    // Tr p^2 =  sum_a,x,mu 1/2 (c_x,mu,a)^2
+    //
+    // Which implies P(p) =  N \Prod_{x,\mu,a} e^-{1/4 c_xmua^2  }
+    //
+    //                    =  N \Prod_{x,\mu,a} e^-{1/2 (c_xmua/sqrt{2})^2  }
+    // 
+    // Expect c' = cxmua/sqrt(2) to be a unit variance gaussian.
+    //
+    // Expect cxmua to have standard deviation sqrt(2), i.e. variance 2.
+    //
+    // Must scale the momentum by sqrt(2) to invoke CPS and UKQCD conventions
+    // (the factor is sqrt(HMC_MOMENTUM_DENOMINATOR): 2.0 with CPS_MD_TIME defined, 1.0 otherwise)
     LinkField Pmu(P._grid);
-    Pmu = zero;
+    Pmu = Zero();
     for (int mu = 0; mu < Nd; mu++) {
       SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
+      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
+      Pmu = Pmu*scale;
       PokeIndex<LorentzIndex>(P, Pmu, mu);
     }
   }
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@ -4,9 +4,11 @@
 
 Source file: ./lib/qcd/action/gauge/Photon.h
 
- Copyright (C) 2015
+Copyright (C) 2015-2018
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: James Harrison <J.Harrison@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -30,11 +32,13 @@

 namespace Grid{
 namespace QCD{
+
  template <class S>
-  class QedGimpl
+  class QedGImpl
  {
  public:
    typedef S Simd;
+    typedef typename Simd::scalar_type Scalar;
    
    template <typename vtype>
    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
@ -43,27 +47,27 @@ namespace QCD{
    
    typedef iImplGaugeLink<Simd>  SiteLink;
    typedef iImplGaugeField<Simd> SiteField;
-    typedef SiteField             SiteComplex;
+    typedef SiteLink              SiteComplex;
    
    typedef Lattice<SiteLink>  LinkField;
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
  };
  
-  typedef QedGimpl<vComplex> QedGimplR;
+  typedef QedGImpl<vComplex> QedGImplR;
  
-  template<class Gimpl>
+  template <class GImpl>
  class Photon
  {
  public:
-    INHERIT_GIMPL_TYPES(Gimpl);
+    INHERIT_GIMPL_TYPES(GImpl);
+    typedef typename SiteGaugeLink::scalar_object ScalarSite;
+    typedef typename ScalarSite::scalar_type      ScalarComplex;
    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
-    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
  public:
-    Photon(Gauge gauge, ZmScheme zmScheme);
-    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);
-    Photon(Gauge gauge, ZmScheme zmScheme, Real G0);
-    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
+    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvement);
+    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme);
    virtual ~Photon(void) = default;
    void FreePropagator(const GaugeField &in, GaugeField &out);
    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
@ -73,345 +77,255 @@ namespace QCD{
                         const GaugeLinkField &weight);
    void UnitField(GaugeField &out);
  private:
-    void infVolPropagator(GaugeLinkField &out);
-    void invKHatSquared(GaugeLinkField &out);
+    void makeSpatialNorm(LatticeInteger &spNrm);
+    void makeKHat(std::vector<GaugeLinkField> &khat);
+    void makeInvKHatSquared(GaugeLinkField &out);
    void zmSub(GaugeLinkField &out);
+    void transverseProjectSpatial(GaugeField &out);
+    void gaugeTransform(GaugeField &out);
  private:
-    Gauge    gauge_;
-    ZmScheme zmScheme_;
-    std::vector<Real>  improvement_;
-    Real     G0_;
+    GridBase          *grid_;
+    Gauge             gauge_;
+    ZmScheme          zmScheme_;
+    std::vector<Real> improvement_;
  };

-  typedef Photon<QedGimplR>  PhotonR;
+  typedef Photon<QedGImplR>  PhotonR;
  
-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()),
-    G0_(0.15493339023106021408483720810737508876916113364521)
-  {}
-
-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
+  template<class GImpl>
+  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme,
                        std::vector<Real> improvements)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements),
-    G0_(0.15493339023106021408483720810737508876916113364521)
+  : grid_(grid), gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements)
  {}

-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()), G0_(G0)
+  template<class GImpl>
+  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme)
+  : Photon(grid, gauge, zmScheme, std::vector<Real>())
  {}

-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
-                        std::vector<Real> improvements, Real G0)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements), G0_(G0)
-  {}
-
-  template<class Gimpl>
-  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
+  template<class GImpl>
+  void Photon<GImpl>::FreePropagator(const GaugeField &in, GaugeField &out) // out = FFT^-1[ FFT[in]*momProp ]
   {
-    FFT theFFT(in._grid);
+    FFT        theFFT(dynamic_cast<GridCartesian *>(grid_));
+    GaugeField in_k(grid_);
+    GaugeField prop_k(grid_);
     
-    GaugeField in_k(in._grid);
-    GaugeField prop_k(in._grid);
-    
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    MomentumSpacePropagator(prop_k,in_k);
-    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    theFFT.FFT_all_dim(in_k, in, FFT::forward);      // in_k = forward transform of in
+    MomentumSpacePropagator(in_k, prop_k);           // signature is (in, out): prop_k = in_k*momProp
+    theFFT.FFT_all_dim(out, prop_k, FFT::backward);  // transform the product back into out
   }

-  template<class Gimpl>
-  void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
+  template<class GImpl>
+  void Photon<GImpl>::makeSpatialNorm(LatticeInteger &spNrm)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    LatticeReal        xmu(grid);
-    GaugeLinkField     one(grid);
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    std::vector<int>   x0(nd,0);
-    TComplex           Tone  = Complex(1.0,0.0);
-    TComplex           Tzero = Complex(G0_,0.0);
-    FFT                fft(grid);
+    LatticeInteger   coor(grid_);
+    std::vector<int> l = grid_->FullDimensions();
+
+    spNrm = zero;
+    for(int mu = 0; mu < grid_->Nd() - 1; mu++)
+    {
+      LatticeCoordinate(coor, mu);
+      coor  = where(coor < Integer(l[mu]/2), coor, coor - Integer(l[mu]));
+      spNrm = spNrm + coor*coor;
+    }
+  }
+
+  template<class GImpl>
+  void Photon<GImpl>::makeKHat(std::vector<GaugeLinkField> &khat)
+  {
+    const unsigned int nd = grid_->Nd();
+    std::vector<int>   l  = grid_->FullDimensions();
+    Complex            ci(0., 1.);
+    // Resizes khat to nd fields on grid_ and sets khat[mu] = exp(i*pi*n/L)*2*sin(pi*n/L), with L = l[mu]
+    khat.resize(nd, grid_);
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    {
+      Real piL = M_PI/l[mu];
+      // n is whatever LatticeCoordinate stores in khat[mu] (presumably the site coordinate along mu)
+      LatticeCoordinate(khat[mu], mu);
+      khat[mu] = exp(piL*ci*khat[mu])*2.*sin(piL*khat[mu]);
+    }
+  }
+
+  template<class GImpl>
+  void Photon<GImpl>::makeInvKHatSquared(GaugeLinkField &out)
+  {
+    std::vector<GaugeLinkField> khat;
+    GaugeLinkField              lone(grid_);
+    const unsigned int          nd = grid_->Nd();
+    std::vector<int>            zm(nd, 0);
+    ScalarSite                  one = ScalarComplex(1., 0.), z = ScalarComplex(0., 0.);
    
-    one = Complex(1.0,0.0);
    out = zero;
+    makeKHat(khat);
    for(int mu = 0; mu < nd; mu++)
    {
-      LatticeCoordinate(xmu,mu);
-      Real lo2 = l[mu]/2.0;
-      xmu = where(xmu < lo2, xmu, xmu-double(l[mu]));
-      out = out + toComplex(4*M_PI*M_PI*xmu*xmu);
+      out = out + khat[mu]*conjugate(khat[mu]);
    }
-    pokeSite(Tone, out, x0);
-    out = one/out;
-    pokeSite(Tzero, out, x0);
-    fft.FFT_all_dim(out, out, FFT::forward);
+    lone = ScalarComplex(1., 0.);
+    pokeSite(one, out, zm);
+    out = lone/out;
+    pokeSite(z, out, zm);
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
+  template<class GImpl>
+  void Photon<GImpl>::zmSub(GaugeLinkField &out)
  {
-    GridBase           *grid = out._grid;
-    GaugeLinkField     kmu(grid), one(grid);
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    std::vector<int>   zm(nd,0);
-    TComplex           Tone = Complex(1.0,0.0);
-    TComplex           Tzero= Complex(0.0,0.0);
-    
-    one = Complex(1.0,0.0);
-    out = zero;
-    for(int mu = 0; mu < nd; mu++)
-    {
-      Real twoPiL = M_PI*2./l[mu];
-      
-      LatticeCoordinate(kmu,mu);
-      kmu = 2.*sin(.5*twoPiL*kmu);
-      out = out + kmu*kmu;
-    }
-    pokeSite(Tone, out, zm);
-    out = one/out;
-    pokeSite(Tzero, out, zm);
-  }
-  
-  template<class Gimpl>
-  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
-  {
-    GridBase           *grid = out._grid;
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    
    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      {
-        std::vector<int> zm(nd,0);
-        TComplex         Tzero = Complex(0.0,0.0);
-        
-        pokeSite(Tzero, out, zm);
+        std::vector<int> zm(grid_->Nd(), 0);
+        ScalarSite       z = ScalarComplex(0., 0.);
        
+        pokeSite(z, out, zm);
        break;
      }
      case ZmScheme::qedL:
      {
-        LatticeInteger spNrm(grid), coor(grid);
-        GaugeLinkField z(grid);
-        
-        spNrm = zero;
-        for(int d = 0; d < grid->_ndimension - 1; d++)
-        {
-          LatticeCoordinate(coor,d);
-          coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
-          spNrm = spNrm + coor*coor;
-        }
-        out = where(spNrm == Integer(0), 0.*out, out);
+        LatticeInteger spNrm(grid_);

-        // IR improvement
+        makeSpatialNorm(spNrm);
+        out = where(spNrm == Integer(0), 0.*out, out);
        for(int i = 0; i < improvement_.size(); i++)
        {
-          Real f = sqrt(improvement_[i]+1);
-          out = where(spNrm == Integer(i+1), f*out, out);
+          Real f = sqrt(improvement_[i] + 1);
+          out = where(spNrm == Integer(i + 1), f*out, out);
        }
+        break;
      }
      default:
+        assert(0);
        break;
    }
  }

-  template<class Gimpl>
-  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
-                                               GaugeField &out)
+  template<class GImpl>
+  void Photon<GImpl>::transverseProjectSpatial(GaugeField &out)
  {
-  GridBase           *grid = out._grid;
-    LatticeComplex     momProp(grid);
-    
-    switch (zmScheme_)
+    const unsigned int          nd = grid_->Nd();
+    GaugeLinkField              invKHat(grid_), cst(grid_), spdiv(grid_);
+    LatticeInteger              spNrm(grid_);
+    std::vector<GaugeLinkField> khat, a(nd, grid_), aProj(nd, grid_);
+
+    invKHat = zero;
+    makeSpatialNorm(spNrm);
+    makeKHat(khat);
+    for (unsigned int mu = 0; mu < nd; ++mu)
    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
+      a[mu] = peekLorentz(out, mu);
+      if (mu < nd - 1)
      {
-        invKHatSquared(momProp);
-        zmSub(momProp);
-        break;
+        invKHat += khat[mu]*conjugate(khat[mu]);
      }
-      case ZmScheme::qedInf:
-      {
-        infVolPropagator(momProp);
+    }
+    cst     = ScalarComplex(1., 0.);
+    invKHat = where(spNrm == Integer(0), cst, invKHat);
+    invKHat = cst/invKHat;
+    cst     = zero;
+    invKHat = where(spNrm == Integer(0), cst, invKHat);
+    spdiv   = zero;
+    for (unsigned int nu = 0; nu < nd - 1; ++nu)
+    {
+      spdiv += conjugate(khat[nu])*a[nu];
+    }
+    spdiv *= invKHat;
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    {
+      aProj[mu] = a[mu] - khat[mu]*spdiv;
+      pokeLorentz(out, aProj[mu], mu);
+    }
+  }
+
+  template<class GImpl>
+  void Photon<GImpl>::gaugeTransform(GaugeField &out)
+  {
+    switch (gauge_)
+    {
+      case Gauge::feynman:
+        break;
+      case Gauge::coulomb:
+        transverseProjectSpatial(out);
+        break;
+      case Gauge::landau:
+        assert(0);
        break;
-      }
      default:
+        assert(0);
        break;
    }
+  }
+
+  template<class GImpl>
+  void Photon<GImpl>::MomentumSpacePropagator(const GaugeField &in,
+                                              GaugeField &out)
+  {
+    LatticeComplex momProp(grid_);
+    
+    makeInvKHatSquared(momProp);
+    zmSub(momProp);
    
    out = in*momProp;
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
+  template<class GImpl>
+  void Photon<GImpl>::StochasticWeight(GaugeLinkField &weight)
  {
-    auto               *grid     = dynamic_cast<GridCartesian *>(weight._grid);
-    const unsigned int nd        = grid->_ndimension;
-    std::vector<int>   latt_size = grid->_fdimensions;
-    
-    switch (zmScheme_)
+    const unsigned int nd  = grid_->Nd();
+    std::vector<int>   l   = grid_->FullDimensions();
+    Integer            vol = 1;
+
+    for(unsigned int mu = 0; mu < nd; mu++)
    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
-      {
-        Integer vol = 1;
-        for(int d = 0; d < nd; d++)
-        {
-          vol = vol * latt_size[d];
-        }
-        invKHatSquared(weight);
-        weight = sqrt(vol)*sqrt(weight);
-        zmSub(weight);
-        break;
-      }
-      case ZmScheme::qedInf:
-      {
-        infVolPropagator(weight);
-        weight = sqrt(real(weight));
-        break;
-      }
-      default:
-        break;
+      vol = vol*l[mu];
    }
+    makeInvKHatSquared(weight);
+    weight = sqrt(vol)*sqrt(weight);
+    zmSub(weight);
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  template<class GImpl>
+  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
  {
-    auto           *grid = dynamic_cast<GridCartesian *>(out._grid);
-    GaugeLinkField weight(grid);
+    GaugeLinkField weight(grid_);
    
    StochasticWeight(weight);
    StochasticField(out, rng, weight);
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+  template<class GImpl>
+  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
                                      const GaugeLinkField &weight)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    const unsigned int nd = grid->_ndimension;
-    GaugeLinkField     r(grid);
-    GaugeField         aTilde(grid);
-    FFT                fft(grid);
+    const unsigned int nd = grid_->Nd();
+    GaugeLinkField     r(grid_);
+    GaugeField         aTilde(grid_);
+    FFT                fft(dynamic_cast<GridCartesian *>(grid_));
    
-    switch (zmScheme_)
+    for(unsigned int mu = 0; mu < nd; mu++)
    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
-      {
-        for(int mu = 0; mu < nd; mu++)
-        {
-          gaussian(rng, r);
-          r = weight*r;
-          pokeLorentz(aTilde, r, mu);
-        }
-        break;
-      }
-      case ZmScheme::qedInf:
-      {
-        Complex                    shift(1., 1.); // This needs to be a GaugeLink element?
-        for(int mu = 0; mu < nd; mu++)
-        {
-          bernoulli(rng, r);
-          r = weight*(2.*r - shift);
-          pokeLorentz(aTilde, r, mu);
-        }
-        break;
-      }
-      default:
-        break;
+      gaussian(rng, r);
+      r = weight*r;
+      pokeLorentz(aTilde, r, mu);
    }
-
+    gaugeTransform(aTilde);
    fft.FFT_all_dim(out, aTilde, FFT::backward);
-    
    out = real(out);
  }

-  template<class Gimpl>
-  void Photon<Gimpl>::UnitField(GaugeField &out)
+  template<class GImpl>
+  void Photon<GImpl>::UnitField(GaugeField &out)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    const unsigned int nd = grid->_ndimension;
-    GaugeLinkField     r(grid);
+    const unsigned int nd = grid_->Nd();
+    GaugeLinkField     r(grid_);
    
-    r = Complex(1.0,0.0);
-
-    for(int mu = 0; mu < nd; mu++)
+    r = ScalarComplex(1., 0.);
+    for(unsigned int mu = 0; mu < nd; mu++)
    {
      pokeLorentz(out, r, mu);
    }
-    
    out = real(out);
  }
-//  template<class Gimpl>
-//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
-//                                                            const GaugeField &in)
-//  {
-//    
-//    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
-//    
-//    GridBase *grid = out._grid;
-//    LatticeInteger     coor(grid);
-//    GaugeField zz(grid); zz=zero;
-//    
-//    // xyzt
-//    for(int d = 0; d < grid->_ndimension-1;d++){
-//      LatticeCoordinate(coor,d);
-//      out = where(coor==Integer(0),zz,out);
-//    }
-//  }
-//  
-//  template<class Gimpl>
-//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
-//                                                             const GaugeField &in)
-//  {
-//    
-//    // what type LatticeComplex
-//    GridBase *grid = out._grid;
-//    int nd = grid->_ndimension;
-//    
-//    typedef typename GaugeField::vector_type vector_type;
-//    typedef typename GaugeField::scalar_type ScalComplex;
-//    typedef Lattice<iSinglet<vector_type> > LatComplex;
-//    
-//    std::vector<int> latt_size   = grid->_fdimensions;
-//    
-//    LatComplex denom(grid); denom= zero;
-//    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
-//    LatComplex   kmu(grid);
-//    
-//    ScalComplex ci(0.0,1.0);
-//    // momphase = n * 2pi / L
-//    for(int mu=0;mu<Nd;mu++) {
-//      
-//      LatticeCoordinate(kmu,mu);
-//      
-//      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-//      
-//      kmu = TwoPiL * kmu ;
-//      
-//      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
-//    }
-//    std::vector<int> zero_mode(nd,0);
-//    TComplexD Tone = ComplexD(1.0,0.0);
-//    TComplexD Tzero= ComplexD(0.0,0.0);
-//    
-//    pokeSite(Tone,denom,zero_mode);
-//    
-//    denom= one/denom;
-//    
-//    pokeSite(Tzero,denom,zero_mode);
-//    
-//    out = zero;
-//    out = in*denom;
-//  };
  
 }}
 #endif
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@ -75,7 +75,7 @@ namespace Grid{
      virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
 	//extend Ta to include Lorentz indexes
 	RealD factor_p = c_plaq/RealD(Nc)*0.5;
-	RealD factor_r =   c_rect/RealD(Nc)*0.5;
+	RealD factor_r = c_rect/RealD(Nc)*0.5;

 	GridBase *grid = Umu._grid;

--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@ -0,0 +1,53 @@
+#pragma once
+
+namespace Grid{
+  namespace QCD{
+
+    template<class Field>
+    void HighBoundCheck(LinearOperatorBase<Field> &HermOp, 
+			Field &Phi,
+			RealD hi)
+    {
+      // Eigenvalue bound check at high end: run PowerMethod on (HermOp, Phi), log the result, assert it is below hi (assert vanishes under NDEBUG)
+      PowerMethod<Field> power_method;
+      auto lambda_max = power_method(HermOp,Phi);
+      std::cout << GridLogMessage << "Pseudofermion action lamda_max "<<lambda_max<<"( bound "<<hi<<")"<<std::endl;
+      assert( (lambda_max < hi) && " High Bounds Check on operator failed" );
+    }
+      
+    template<class Field> void InverseSqrtBoundsCheck(int MaxIter,double tol,
+						       LinearOperatorBase<Field> &HermOp,
+						       Field &GaussNoise,
+						       MultiShiftFunction &PowerNegHalf) 
+    {
+      GridBase *FermionGrid = GaussNoise._grid;
+      // Round-trip test: solve with PowerNegHalf twice, apply HermOp once, compare the result with GaussNoise
+      Field X(FermionGrid);
+      Field Y(FermionGrid);
+      Field Z(FermionGrid);
+
+      X=GaussNoise;
+      RealD Nx = norm2(X);
+      // Two chained multishift solves X -> Y -> Z (printed below as (MdagM^-1/2)^2 noise)
+      ConjugateGradientMultiShift<Field> msCG(MaxIter,PowerNegHalf);
+      msCG(HermOp,X,Y);
+      msCG(HermOp,Y,Z);
+
+      RealD Nz = norm2(Z);
+      // Y is reused here: it now receives HermOp applied to Z
+      HermOp.HermOp(Z,Y);
+      RealD Ny = norm2(Y);
+      // X becomes the difference noise - Y; note the local name Nd hides any Nd visible from an enclosing scope
+      X=X-Y;
+      RealD Nd = norm2(X);
+      std::cout << "************************* "<<std::endl;
+      std::cout << " noise                         = "<<Nx<<std::endl;
+      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
+      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
+      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << "************************* "<<std::endl;
+      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck "); // relative residual vs tol; compiled out under NDEBUG
+    }
+
+  }
+}
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@ -58,13 +58,30 @@ namespace QCD{
      bool use_heatbath_forecasting;
      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> Solver;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHB;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverL;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverR;
+      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverL;
+      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverR;
      FermionField Phi; // the pseudofermion field for this trajectory

    public:
-      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, AbstractEOFAFermion<Impl>& _Rop,
-        OperatorFunction<FermionField>& S, Params& p, bool use_fc=false) : Lop(_Lop), Rop(_Rop), Solver(S),
-        Phi(_Lop.FermionGrid()), param(p), use_heatbath_forecasting(use_fc)
+
+      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
+					      AbstractEOFAFermion<Impl>& _Rop,
+					      OperatorFunction<FermionField>& HeatbathCG, 
+					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
+					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
+					      Params& p, 
+					      bool use_fc=false) : 
+        Lop(_Lop), 
+	Rop(_Rop), 
+	SolverHB(HeatbathCG,false,true),
+	SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), 
+	DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), 
+	Phi(_Lop.FermionGrid()), 
+	param(p), 
+        use_heatbath_forecasting(use_fc)
      {
        AlgRemez remez(param.lo, param.hi, param.precision);

@ -98,6 +115,9 @@ namespace QCD{
      // We generate a Gaussian noise vector \eta, and then compute
      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
      // using a rational approximation to the inverse square root
+      //
+      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
+      //
      virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
      {
        Lop.ImportGauge(U);
@ -118,7 +138,6 @@ namespace QCD{
        RealD scale = std::sqrt(0.5);
        gaussian(pRNG,eta);
        eta = eta * scale;
-        printf("Heatbath source vector: <\\eta|\\eta> = %1.15e\n", norm2(eta));

        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
@ -139,11 +158,11 @@ namespace QCD{
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            Solver(Lop, CG_src, CG_soln);
+            SolverHB(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = zero; // Just use zero as the initial guess
-            Solver(Lop, CG_src, CG_soln);
+            SolverHB(Lop, CG_src, CG_soln);
          }
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
@ -166,11 +185,11 @@ namespace QCD{
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            Solver(Rop, CG_src, CG_soln);
+            SolverHB(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = zero;
-            Solver(Rop, CG_src, CG_soln);
+            SolverHB(Rop, CG_src, CG_soln);
          }
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
@ -182,8 +201,47 @@ namespace QCD{
        // Reset shift coefficients for energy and force evals
        Lop.RefreshShiftCoefficients(0.0);
        Rop.RefreshShiftCoefficients(-1.0);
+
+	// Bounds check
+	RealD EtaDagEta = norm2(eta);
+	//	RealD PhiDagMPhi= norm2(eta);
+
      };

+      void Meofa(const GaugeField& U,const FermionField &phi, FermionField & Mphi) 
+      {
+#if 0
+        Lop.ImportGauge(U);
+        Rop.ImportGauge(U);
+
+        FermionField spProj_Phi(Lop.FermionGrid());
+	FermionField mPhi(Lop.FermionGrid());
+        std::vector<FermionField> tmp(2, Lop.FermionGrid());
+	mPhi = phi;
+	
+        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
+        spProj(Phi, spProj_Phi, -1, Lop.Ls);
+        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
+        G5R5(tmp[1], tmp[0]);
+        tmp[0] = zero;
+        SolverL(Lop, tmp[1], tmp[0]);
+        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
+        Lop.Omega(tmp[1], tmp[0], -1, 1);
+	mPhi = mPhi -  Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
+
+        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
+        spProj(Phi, spProj_Phi, 1, Rop.Ls);
+        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
+        G5R5(tmp[1], tmp[0]);
+        tmp[0] = zero;
+        SolverR(Rop, tmp[1], tmp[0]);
+        Rop.Dtilde(tmp[0], tmp[1]);
+        Rop.Omega(tmp[1], tmp[0], 1, 1);
+        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
+#endif
+      }
+
      // EOFA action: see Eqn. (10) of arXiv:1706.05843
      virtual RealD S(const GaugeField& U)
      {
@ -201,7 +259,7 @@ namespace QCD{
        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = zero;
-        Solver(Lop, tmp[1], tmp[0]);
+        SolverL(Lop, tmp[1], tmp[0]);
        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
        Lop.Omega(tmp[1], tmp[0], -1, 1);
        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
@ -212,7 +270,7 @@ namespace QCD{
        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = zero;
-        Solver(Rop, tmp[1], tmp[0]);
+        SolverR(Rop, tmp[1], tmp[0]);
        Rop.Dtilde(tmp[0], tmp[1]);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
@ -234,17 +292,22 @@ namespace QCD{

        GaugeField force(Lop.GaugeGrid());

+	/////////////////////////////////////////////
+	// PAB: 
+	//   Optional single precision derivative ?
+	/////////////////////////////////////////////
+
        // LH: dSdU = k \chi_{L}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{L}
        //     \chi_{L} = H(mf)^{-1} \Omega_{-} P_{-} \Phi
        spProj(Phi, spProj_Phi, -1, Lop.Ls);
        Lop.Omega(spProj_Phi, Omega_spProj_Phi, -1, 0);
        G5R5(CG_src, Omega_spProj_Phi);
        spProj_Phi = zero;
-        Solver(Lop, CG_src, spProj_Phi);
+        DerivativeSolverL(Lop, CG_src, spProj_Phi);
        Lop.Dtilde(spProj_Phi, Chi);
        G5R5(g5_R5_Chi, Chi);
        Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
-        dSdU = Lop.k * force;
+        dSdU = -Lop.k * force;

        // RH: dSdU = dSdU - k \chi_{R}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{}
        //     \chi_{R} = ( H(mb) - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \Phi
@ -252,11 +315,11 @@ namespace QCD{
        Rop.Omega(spProj_Phi, Omega_spProj_Phi, 1, 0);
        G5R5(CG_src, Omega_spProj_Phi);
        spProj_Phi = zero;
-        Solver(Rop, CG_src, spProj_Phi);
+        DerivativeSolverR(Rop, CG_src, spProj_Phi);
        Rop.Dtilde(spProj_Phi, Chi);
        G5R5(g5_R5_Chi, Chi);
        Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
-        dSdU = dSdU - Rop.k * force;
+        dSdU = dSdU + Rop.k * force;
      };
  };
 }}
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@ -157,6 +157,13 @@ class OneFlavourEvenOddRationalPseudoFermionAction

    msCG(Mpc, PhiOdd, Y);

+    if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+      FermionField gauss(FermOp.FermionRedBlackGrid());
+      gauss = PhiOdd;
+      HighBoundCheck(Mpc,gauss,param.hi);
+      InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,Mpc,gauss,PowerNegHalf);
+    }
+
    RealD action = norm2(Y);
    std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 "
                                   "solve or -1/2 solve faster??? "
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@ -170,6 +170,14 @@ namespace Grid{
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
 	msCG_M(MdagM,X,Y);

+	// Randomly apply rational bounds checks.
+	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	  FermionField gauss(NumOp.FermionRedBlackGrid());
+	  gauss = PhiOdd;
+	  HighBoundCheck(MdagM,gauss,param.hi);
+	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
+	}
+
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
 	RealD action = norm2(Y);

--- a/Grid/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRational.h
@ -143,6 +143,14 @@ namespace Grid{

 	msCG(MdagMOp,Phi,Y);

+	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	  FermionField gauss(FermOp.FermionGrid());
+	  gauss = Phi;
+	  HighBoundCheck(MdagMOp,gauss,param.hi);
+	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagMOp,gauss,PowerNegHalf);
+	}
+
+
 	RealD action = norm2(Y);
 	std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
 	return action;
--- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@ -156,6 +156,14 @@ namespace Grid{
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
 	msCG_M(MdagM,X,Y);

+	// Randomly apply rational bounds checks.
+	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	  FermionField gauss(NumOp.FermionGrid());
+	  gauss = Phi;
+	  HighBoundCheck(MdagM,gauss,param.hi);
+	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
+	}
+
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
 	RealD action = norm2(Y);

--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@ -29,6 +29,9 @@ directory
 #ifndef QCD_PSEUDOFERMION_AGGREGATE_H
 #define QCD_PSEUDOFERMION_AGGREGATE_H

+// Rational functions
+#include <Grid/qcd/action/pseudofermion/Bounds.h>
+
 #include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
 #include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
 #include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
--- a/Grid/qcd/action/pseudofermion/TwoFlavour.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavour.h
@ -85,21 +85,20 @@ class TwoFlavourPseudoFermionAction : public Action<typename Impl::GaugeField> {
    // and must multiply by 0.707....
    //
    // Chroma has this scale factor: two_flavor_monomial_w.h
+    // CPS uses this factor
    // IroIro: does not use this scale. It is absorbed by a change of vars
    //         in the Phi integral, and thus is only an irrelevant prefactor for
    //         the partition function.
    //

-    RealD scale = std::sqrt(0.5);
+    const RealD scale = std::sqrt(0.5);

    FermionField eta(FermOp.FermionGrid());

-    gaussian(pRNG, eta);
+    gaussian(pRNG, eta); eta = scale *eta;

    FermOp.ImportGauge(U);
    FermOp.Mdag(eta, Phi);
-
-    Phi = Phi * scale;
  };

  //////////////////////////////////////////////////////
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@ -46,6 +46,7 @@ namespace Grid{

      OperatorFunction<FermionField> &DerivativeSolver;
      OperatorFunction<FermionField> &ActionSolver;
+      OperatorFunction<FermionField> &HeatbathSolver;

      FermionField PhiOdd;   // the pseudo fermion field for this trajectory
      FermionField PhiEven;  // the pseudo fermion field for this trajectory
@ -54,11 +55,18 @@ namespace Grid{
      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
                                                FermionOperator<Impl>  &_DenOp, 
                                                OperatorFunction<FermionField> & DS,
-                                                OperatorFunction<FermionField> & AS) :
+                                                OperatorFunction<FermionField> & AS ) : 
+      TwoFlavourEvenOddRatioPseudoFermionAction(_NumOp,_DenOp, DS,AS,AS) {};
+
+      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+                                                FermionOperator<Impl>  &_DenOp, 
+                                                OperatorFunction<FermionField> & DS,
+                                                OperatorFunction<FermionField> & AS, OperatorFunction<FermionField> & HS) :
      NumOp(_NumOp), 
      DenOp(_DenOp), 
      DerivativeSolver(DS), 
      ActionSolver(AS),
+      HeatbathSolver(HS),
      PhiEven(_NumOp.FermionRedBlackGrid()),
      PhiOdd(_NumOp.FermionRedBlackGrid()) 
        {
@ -111,7 +119,7 @@ namespace Grid{
        // Odd det factors
        Mpc.MpcDag(etaOdd,PhiOdd);
        tmp=zero;
-        ActionSolver(Vpc,PhiOdd,tmp);
+        HeatbathSolver(Vpc,PhiOdd,tmp);
        Vpc.Mpc(tmp,PhiOdd);            

        // Even det factors
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@ -54,8 +54,8 @@ public:

  template <class ReaderClass, typename std::enable_if<isReader<ReaderClass>::value, int >::type = 0 >
  IntegratorParameters(ReaderClass & Reader){
-    std::cout << "Reading integrator\n";
-        read(Reader, "Integrator", *this);
+    std::cout << GridLogMessage << "Reading integrator\n";
+    read(Reader, "Integrator", *this);
  }

  void print_parameters() const {
@ -88,8 +88,7 @@ class Integrator {
    t_P[level] += ep;
    update_P(P, U, level, ep);

-    std::cout << GridLogIntegrator << "[" << level << "] P "
-              << " dt " << ep << " : t_P " << t_P[level] << std::endl;
+    std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
  }

  // to be used by the actionlevel class to iterate
@ -105,7 +104,7 @@ class Integrator {
        GF force = Rep.RtoFundamentalProject(forceR);  // Ta for the fundamental rep
        Real force_abs = std::sqrt(norm2(force)/(U._grid->gSites()));
        std::cout << GridLogIntegrator << "Hirep Force average: " << force_abs << std::endl;
-        Mom -= force * ep ;
+	Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      }
    }
  } update_P_hireps{};
@ -129,11 +128,11 @@ class Integrator {
      double end_force = usecond();
      Real force_abs = std::sqrt(norm2(force)/U._grid->gSites());
      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
-      Mom -= force * ep; 
+      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
+      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
    }

    // Force from the other representations
@ -238,8 +237,7 @@ class Integrator {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
-        Field& Us =
-            Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        as[level].actions.at(actionID)->refresh(Us, pRNG);
      }

@ -252,13 +250,11 @@ class Integrator {
  // over the representations
  struct _S {
    template <class FieldType, class Repr>
-    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep,
-                    int level, RealD& H) {
+    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
      
      for (int a = 0; a < repr_set.size(); ++a) {
        RealD Hterm = repr_set.at(a)->S(Rep.U);
-        std::cout << GridLogMessage << "S Level " << level << " term " << a
-                  << " H Hirep = " << Hterm << std::endl;
+        std::cout << GridLogMessage << "S Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
        H += Hterm;

      }
@ -268,20 +264,21 @@ class Integrator {
  // Calculate action
  RealD S(Field& U) {  // here also U not used

-    RealD H = - FieldImplementation::FieldSquareNorm(P); // - trace (P*P)
+    std::cout << GridLogIntegrator << "Integrator action\n";
+
+    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
+
    RealD Hterm;
-    std::cout << GridLogMessage << "Momentum action H_p = " << H << "\n";

    // Actions
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
-        Field& Us =
-            Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
        Hterm = as[level].actions.at(actionID)->S(Us);
-        std::cout << GridLogMessage << "S Level " << level << " term "
-                  << actionID << " H = " << Hterm << std::endl;
+        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
      as[level].apply(S_hireps, Representations, level, H);
@ -306,8 +303,7 @@ class Integrator {
    // Check the clocks all match on all levels
    for (int level = 0; level < as.size(); ++level) {
      assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same
-      std::cout << GridLogIntegrator << " times[" << level
-                << "]= " << t_P[level] << " " << t_U << std::endl;
+      std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
    }

    // and that we indeed got to the end of the trajectory
--- a/Grid/qcd/hmc/integrators/Integrator_algorithm.h
+++ b/Grid/qcd/hmc/integrators/Integrator_algorithm.h
@ -231,8 +231,7 @@ class ForceGradient : public Integrator<FieldImplementation, SmearingPolicy,
    Field Pfg(U._grid);
    Ufg = U;
    Pfg = zero;
-    std::cout << GridLogIntegrator << "FG update " << fg_dt << " " << ep
-              << std::endl;
+    std::cout << GridLogIntegrator << "FG update " << fg_dt << " " << ep << std::endl;
    // prepare_fg; no prediction/result cache for now
    // could relax CG stopping conditions for the
    // derivatives in the small step since the force gets multiplied by
@ -271,8 +270,7 @@ class ForceGradient : public Integrator<FieldImplementation, SmearingPolicy,
        this->step(U, level + 1, first_step, 0);
      }

-      this->FG_update_P(U, level, 2 * Chi / ((1.0 - 2.0 * lambda) * eps),
-                        (1.0 - 2.0 * lambda) * eps);
+      this->FG_update_P(U, level, 2 * Chi / ((1.0 - 2.0 * lambda) * eps), (1.0 - 2.0 * lambda) * eps);

      if (level == fl) {  // lowest level
        this->update_U(U, 0.5 * eps);
--- a/Grid/qcd/spin/Gamma.cc
+++ b/Grid/qcd/spin/Gamma.cc
@ -11,6 +11,24 @@ const std::array<const Gamma, 4> Gamma::gmu = {{
  Gamma(Gamma::Algebra::GammaZ),
  Gamma(Gamma::Algebra::GammaT)}};

+const std::array<const Gamma, 16> Gamma::gall = {{
+  Gamma(Gamma::Algebra::Identity),
+  Gamma(Gamma::Algebra::Gamma5),
+  Gamma(Gamma::Algebra::GammaX),
+  Gamma(Gamma::Algebra::GammaY),
+  Gamma(Gamma::Algebra::GammaZ),
+  Gamma(Gamma::Algebra::GammaT),
+  Gamma(Gamma::Algebra::GammaXGamma5),
+  Gamma(Gamma::Algebra::GammaYGamma5),
+  Gamma(Gamma::Algebra::GammaZGamma5),
+  Gamma(Gamma::Algebra::GammaTGamma5),
+  Gamma(Gamma::Algebra::SigmaXT),      
+  Gamma(Gamma::Algebra::SigmaXY),      
+  Gamma(Gamma::Algebra::SigmaXZ),      
+  Gamma(Gamma::Algebra::SigmaYT),
+  Gamma(Gamma::Algebra::SigmaYZ),
+  Gamma(Gamma::Algebra::SigmaZT)}};
+
 const std::array<const char *, Gamma::nGamma> Gamma::name = {{
  "-Gamma5      ",
  "Gamma5       ",
--- a/Grid/qcd/spin/Gamma.h
+++ b/Grid/qcd/spin/Gamma.h
@ -48,6 +48,7 @@ class Gamma {
    static const std::array<std::array<Algebra, nGamma>, nGamma> mul;
    static const std::array<Algebra, nGamma>                     adj;
    static const std::array<const Gamma, 4>                      gmu;
+    static const std::array<const Gamma, 16>                     gall;
    Algebra                                                      g;
  public:
    Gamma(Algebra initg): g(initg) {}  
--- a/Grid/qcd/spin/gamma-gen/gamma-gen.nb
+++ b/Grid/qcd/spin/gamma-gen/gamma-gen.nb
@ -10,10 +10,10 @@
 NotebookFileLineBreakTest
 NotebookFileLineBreakTest
 NotebookDataPosition[       158,          7]
-NotebookDataLength[     75090,       1956]
-NotebookOptionsPosition[     69536,       1867]
-NotebookOutlinePosition[     69898,       1883]
-CellTagsIndexPosition[     69855,       1880]
+NotebookDataLength[     67118,       1714]
+NotebookOptionsPosition[     63485,       1652]
+NotebookOutlinePosition[     63842,       1668]
+CellTagsIndexPosition[     63799,       1665]
 WindowFrame->Normal*)

 (* Beginning of Notebook Content *)
@ -76,234 +76,6 @@ Cell[BoxData["\<\"/Users/antonin/Development/Grid/lib/qcd/spin/gamma-gen\"\>"]\

 Cell[CellGroupData[{

-Cell[BoxData[
- RowBox[{"FactorInteger", "[", "3152", "]"}]], "Input",
- CellChangeTimes->{{3.7432347536316767`*^9, 3.7432347764739027`*^9}, {
-  3.743234833567358*^9, 
-  3.743234862146022*^9}},ExpressionUUID->"d1a0fd03-85e1-43af-ba80-\
-3ca4235675d8"],
-
-Cell[BoxData[
- RowBox[{"{", 
-  RowBox[{
-   RowBox[{"{", 
-    RowBox[{"2", ",", "4"}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{"197", ",", "1"}], "}"}]}], "}"}]], "Output",
- CellChangeTimes->{{3.743234836792224*^9, 
-  3.743234862493619*^9}},ExpressionUUID->"16d3f953-4b24-4ed2-ae62-\
-306dcab66ca7"]
-}, Open  ]],
-
-Cell[CellGroupData[{
-
-Cell[BoxData[
- RowBox[{"sol", "=", 
-  RowBox[{"Solve", "[", 
-   RowBox[{
-    RowBox[{
-     RowBox[{
-      SuperscriptBox["x", "2"], "+", 
-      SuperscriptBox["y", "2"], "+", 
-      SuperscriptBox["z", "2"]}], "\[Equal]", "2"}], ",", 
-    RowBox[{"{", 
-     RowBox[{"x", ",", "y", ",", "z"}], "}"}], ",", "Integers"}], 
-   "]"}]}]], "Input",
- CellChangeTimes->{{3.743235304127721*^9, 
-  3.7432353087929983`*^9}},ExpressionUUID->"f0fa2a5c-3d81-4d75-a447-\
-50c7ca3459ff"],
-
-Cell[BoxData[
- RowBox[{"{", 
-  RowBox[{
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"y", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"z", "\[Rule]", "0"}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"y", "\[Rule]", "0"}], ",", 
-     RowBox[{"z", "\[Rule]", 
-      RowBox[{"-", "1"}]}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"y", "\[Rule]", "0"}], ",", 
-     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"y", "\[Rule]", "1"}], ",", 
-     RowBox[{"z", "\[Rule]", "0"}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "0"}], ",", 
-     RowBox[{"y", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"z", "\[Rule]", 
-      RowBox[{"-", "1"}]}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "0"}], ",", 
-     RowBox[{"y", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "0"}], ",", 
-     RowBox[{"y", "\[Rule]", "1"}], ",", 
-     RowBox[{"z", "\[Rule]", 
-      RowBox[{"-", "1"}]}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "0"}], ",", 
-     RowBox[{"y", "\[Rule]", "1"}], ",", 
-     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "1"}], ",", 
-     RowBox[{"y", "\[Rule]", 
-      RowBox[{"-", "1"}]}], ",", 
-     RowBox[{"z", "\[Rule]", "0"}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "1"}], ",", 
-     RowBox[{"y", "\[Rule]", "0"}], ",", 
-     RowBox[{"z", "\[Rule]", 
-      RowBox[{"-", "1"}]}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "1"}], ",", 
-     RowBox[{"y", "\[Rule]", "0"}], ",", 
-     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
-   RowBox[{"{", 
-    RowBox[{
-     RowBox[{"x", "\[Rule]", "1"}], ",", 
-     RowBox[{"y", "\[Rule]", "1"}], ",", 
-     RowBox[{"z", "\[Rule]", "0"}]}], "}"}]}], "}"}]], "Output",
- CellChangeTimes->{{3.743235305220907*^9, 
-  3.743235309139554*^9}},ExpressionUUID->"d9825c95-24bb-442a-8734-\
-4c0f47e99dfc"]
-}, Open  ]],
-
-Cell[BoxData[
- RowBox[{
-  RowBox[{"xmlElem", "[", "x_", "]"}], ":=", 
-  RowBox[{"Print", "[", 
-   RowBox[{"\"\<<elem>\>\"", "<>", 
-    RowBox[{"ToString", "[", 
-     RowBox[{"x", "[", 
-      RowBox[{"[", "1", "]"}], "]"}], "]"}], "<>", "\"\< \>\"", "<>", 
-    RowBox[{"ToString", "[", 
-     RowBox[{"x", "[", 
-      RowBox[{"[", "2", "]"}], "]"}], "]"}], "<>", "\"\< \>\"", "<>", 
-    RowBox[{"ToString", "[", 
-     RowBox[{"x", "[", 
-      RowBox[{"[", "3", "]"}], "]"}], "]"}], "<>", "\"\<</elem>\>\""}], 
-   "]"}]}]], "Input",
- CellChangeTimes->{{3.74323534002862*^9, 3.743235351000985*^9}, {
-  3.743235403233039*^9, 3.743235413488028*^9}, {3.743235473169856*^9, 
-  3.7432354747126904`*^9}},ExpressionUUID->"aea76313-c89e-45e8-b429-\
-3f454091666d"],
-
-Cell[CellGroupData[{
-
-Cell[BoxData[
- RowBox[{
-  RowBox[{
-   RowBox[{"xmlElem", "[", 
-    RowBox[{
-     RowBox[{"{", 
-      RowBox[{"x", ",", "y", ",", "z"}], "}"}], "/.", "#"}], "]"}], "&"}], "/@",
-   "sol"}]], "Input",
- CellChangeTimes->{{3.743235415820318*^9, 
-  3.743235467025091*^9}},ExpressionUUID->"07da3998-8eab-40ba-8c0b-\
-ac6b130cb4fb"],
-
-Cell[CellGroupData[{
-
-Cell[BoxData["\<\"<elem>-1 -1 0</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476581676*^9},ExpressionUUID->"c577ba06-b67a-405a-9ff5-\
-2bf7dc898d03"],
-
-Cell[BoxData["\<\"<elem>-1 0 -1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476588011*^9},ExpressionUUID->"d041aa36-0cea-457c-9d4b-\
-1fe9be66e2ab"],
-
-Cell[BoxData["\<\"<elem>-1 0 1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476596887*^9},ExpressionUUID->"bf141b55-86b2-4430-a994-\
-5c03d5a19441"],
-
-Cell[BoxData["\<\"<elem>-1 1 0</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476605785*^9},ExpressionUUID->"4968a660-4ecf-4b66-9071-\
-8bd798c18d21"],
-
-Cell[BoxData["\<\"<elem>0 -1 -1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476613523*^9},ExpressionUUID->"4e22d943-2680-416b-a1d7-\
-a16ca20b781f"],
-
-Cell[BoxData["\<\"<elem>0 -1 1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.7432354766218576`*^9},ExpressionUUID->"6dd38385-08b3-4dd9-932f-\
-98a00c6db1b2"],
-
-Cell[BoxData["\<\"<elem>0 1 -1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476629427*^9},ExpressionUUID->"ef3baad3-91d1-4735-9a22-\
-53495a624c15"],
-
-Cell[BoxData["\<\"<elem>0 1 1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476638257*^9},ExpressionUUID->"413fbb68-5017-4272-a62a-\
-fa234e6daaea"],
-
-Cell[BoxData["\<\"<elem>1 -1 0</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476646203*^9},ExpressionUUID->"3a832a60-ae00-414b-a9ac-\
-f5e86e67e917"],
-
-Cell[BoxData["\<\"<elem>1 0 -1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476653907*^9},ExpressionUUID->"bfc79ef6-f6c7-4f1e-88e8-\
-005ac314be9c"],
-
-Cell[BoxData["\<\"<elem>1 0 1</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.743235476662575*^9},ExpressionUUID->"0f892891-f885-489c-9925-\
-ddef4d698410"],
-
-Cell[BoxData["\<\"<elem>1 1 0</elem>\"\>"], "Print",
- CellChangeTimes->{
-  3.7432354766702337`*^9},ExpressionUUID->"2906f190-e673-4f33-9c34-\
-e8e56efe7a27"]
-}, Open  ]],
-
-Cell[BoxData[
- RowBox[{"{", 
-  RowBox[{
-  "Null", ",", "Null", ",", "Null", ",", "Null", ",", "Null", ",", "Null", 
-   ",", "Null", ",", "Null", ",", "Null", ",", "Null", ",", "Null", ",", 
-   "Null"}], "}"}]], "Output",
- CellChangeTimes->{
-  3.7432354246225967`*^9, {3.7432354674878073`*^9, 
-   3.743235476678007*^9}},ExpressionUUID->"500ca3c1-88d8-46e5-a1a1-\
-86a7878e5638"]
-}, Open  ]],
-
-Cell[CellGroupData[{
-
 Cell["Clifford algebra generation", "Section",
 CellChangeTimes->{{3.6942089434583883`*^9, 
  3.694208978559093*^9}},ExpressionUUID->"a5b064b3-3011-4922-8559-\
@ -1048,9 +820,10 @@ generated by the Mathematica notebook gamma-gen/gamma-gen.nb\n\n#include \
       "\"\<    static const std::array<const char *, nGamma>                \
 name;\n    static const std::array<std::array<Algebra, nGamma>, nGamma> mul;\n\
    static const std::array<Algebra, nGamma>                     adj;\n    \
-static const std::array<const Gamma, 4>                      gmu;\n    \
-Algebra                                                      g;\n  public:\n  \
-  Gamma(Algebra initg): g(initg) {}  \n};\n\n\>\""}]}], ";", 
+static const std::array<const Gamma, 4>                      gmu;\n    static \
+const std::array<const Gamma, 16>                     gall;\n    Algebra      \
+                                                g;\n  public:\n    \
+Gamma(Algebra initg): g(initg) {}  \n};\n\n\>\""}]}], ";", 
     "\[IndentingNewLine]", 
     RowBox[{"out", " ", "=", 
      RowBox[{"out", "<>", "funcCode"}]}], ";", "\[IndentingNewLine]", 
@ -1076,7 +849,8 @@ Algebra                                                      g;\n  public:\n  \
   3.694963343265525*^9}, {3.694964367519239*^9, 3.69496439461199*^9}, {
   3.694964462130747*^9, 3.6949644669959793`*^9}, 3.694964509762739*^9, {
   3.694964705045744*^9, 3.694964723148797*^9}, {3.694964992988984*^9, 
-   3.6949649968504257`*^9}},ExpressionUUID->"c7103bd6-b539-4495-b98c-\
+   3.6949649968504257`*^9}, {3.758291687176977*^9, 
+   3.758291694181189*^9}},ExpressionUUID->"c7103bd6-b539-4495-b98c-\
 d4d12ac6cad8"],

 Cell["Gamma enum generation:", "Text",
@ -1745,8 +1519,17 @@ namespace QCD {\>\""}]}], ";", "\[IndentingNewLine]",
       "\"\<\n\nconst std::array<const Gamma, 4> Gamma::gmu = {{\n  \
 Gamma(Gamma::Algebra::GammaX),\n  Gamma(Gamma::Algebra::GammaY),\n  \
 Gamma(Gamma::Algebra::GammaZ),\n  Gamma(Gamma::Algebra::GammaT)}};\n\nconst \
-std::array<const char *, Gamma::nGamma> Gamma::name = {{\n\>\""}]}], ";", 
-     "\[IndentingNewLine]", 
+std::array<const Gamma, 16> Gamma::gall = {{\n  \
+Gamma(Gamma::Algebra::Identity),\n  Gamma(Gamma::Algebra::Gamma5),\n  \
+Gamma(Gamma::Algebra::GammaX),\n  Gamma(Gamma::Algebra::GammaY),\n  \
+Gamma(Gamma::Algebra::GammaZ),\n  Gamma(Gamma::Algebra::GammaT),\n  \
+Gamma(Gamma::Algebra::GammaXGamma5),\n  Gamma(Gamma::Algebra::GammaYGamma5),\n\
+  Gamma(Gamma::Algebra::GammaZGamma5),\n  \
+Gamma(Gamma::Algebra::GammaTGamma5),\n  Gamma(Gamma::Algebra::SigmaXT),      \
+\n  Gamma(Gamma::Algebra::SigmaXY),      \n  Gamma(Gamma::Algebra::SigmaXZ),  \
+    \n  Gamma(Gamma::Algebra::SigmaYT),\n  Gamma(Gamma::Algebra::SigmaYZ),\n  \
+Gamma(Gamma::Algebra::SigmaZT)}};\n\nconst std::array<const char *, \
+Gamma::nGamma> Gamma::name = {{\n\>\""}]}], ";", "\[IndentingNewLine]", 
     RowBox[{"Do", "[", "\[IndentingNewLine]", 
      RowBox[{
       RowBox[{"out", " ", "=", " ", 
@ -1847,7 +1630,9 @@ Gamma::nGamma> Gamma::mul = {{\\n\>\""}]}], ";", "\[IndentingNewLine]",
   3.694963031525289*^9}, {3.694963065828494*^9, 3.694963098327538*^9}, {
   3.6949632020836153`*^9, 3.6949632715940027`*^9}, {3.694963440035037*^9, 
   3.6949634418966017`*^9}, {3.6949651447067547`*^9, 3.694965161228381*^9}, {
-   3.694967957845581*^9, 3.694967958364184*^9}}],
+   3.694967957845581*^9, 3.694967958364184*^9}, {3.758291673792514*^9, 
+   3.758291676983432*^9}},ExpressionUUID->"b1b309f8-a3a7-4081-a781-\
+c3845e3cd372"],

 Cell[BoxData[
 RowBox[{
@ -1867,8 +1652,8 @@ Cell[BoxData[""], "Input",
 },
 WindowSize->{1246, 1005},
 WindowMargins->{{282, Automatic}, {Automatic, 14}},
-FrontEndVersion->"11.2 for Mac OS X x86 (32-bit, 64-bit Kernel) (September \
-10, 2017)",
+FrontEndVersion->"11.3 for Mac OS X x86 (32-bit, 64-bit Kernel) (March 5, \
+2018)",
 StyleDefinitions->"Default.nb"
 ]
 (* End of Notebook Content *)
@ -1888,75 +1673,48 @@ Cell[1948, 43, 570, 11, 73, "Input",ExpressionUUID->"5c937a3e-adfd-4d7e-8fde-afb
 Cell[2521, 56, 1172, 17, 34, "Output",ExpressionUUID->"72817ba6-2f6a-4a4d-8212-6f0970f49e7c"]
 }, Open  ]],
 Cell[CellGroupData[{
-Cell[3730, 78, 248, 5, 30, "Input",ExpressionUUID->"d1a0fd03-85e1-43af-ba80-3ca4235675d8"],
-Cell[3981, 85, 299, 9, 34, "Output",ExpressionUUID->"16d3f953-4b24-4ed2-ae62-306dcab66ca7"]
+Cell[3730, 78, 174, 3, 67, "Section",ExpressionUUID->"a5b064b3-3011-4922-8559-ead857cad102"],
+Cell[3907, 83, 535, 16, 52, "Input",ExpressionUUID->"aa28f02b-31e1-4df2-9b5d-482177464b59"],
+Cell[4445, 101, 250, 4, 35, "Text",ExpressionUUID->"c8896b88-f1db-4ce4-b7a6-0c9838bdb8f1"],
+Cell[4698, 107, 5511, 169, 425, "Input",ExpressionUUID->"52a96ff6-047e-4043-86d0-e303866e5f8e"],
+Cell[CellGroupData[{
+Cell[10234, 280, 2183, 58, 135, "Input",ExpressionUUID->"8b0f4955-2c3f-418c-9226-9be8f87621e8"],
+Cell[12420, 340, 1027, 27, 56, "Output",ExpressionUUID->"edd0619f-6f12-4070-a1d2-6b547877fadc"]
 }, Open  ]],
 Cell[CellGroupData[{
-Cell[4317, 99, 469, 14, 33, "Input",ExpressionUUID->"f0fa2a5c-3d81-4d75-a447-50c7ca3459ff"],
-Cell[4789, 115, 2423, 77, 56, "Output",ExpressionUUID->"d9825c95-24bb-442a-8734-4c0f47e99dfc"]
+Cell[13484, 372, 1543, 46, 114, "Input",ExpressionUUID->"fb45123c-c610-4075-99b0-7cd71c728ae7"],
+Cell[15030, 420, 1311, 32, 87, "Output",ExpressionUUID->"2ae14565-b412-4dc0-9dce-bd6c1ba5ef27"]
 }, Open  ]],
-Cell[7227, 195, 751, 18, 30, "Input",ExpressionUUID->"aea76313-c89e-45e8-b429-3f454091666d"],
+Cell[16356, 455, 179, 3, 35, "Text",ExpressionUUID->"af247231-a58d-417b-987a-26908dafffdb"],
+Cell[16538, 460, 2175, 65, 94, "Input",ExpressionUUID->"7c44cadd-e488-4f51-87d8-c64eef11f40c"],
+Cell[18716, 527, 193, 3, 35, "Text",ExpressionUUID->"856f1746-1107-4509-a5ce-ac9c7f56cdb1"],
 Cell[CellGroupData[{
-Cell[8003, 217, 323, 10, 30, "Input",ExpressionUUID->"07da3998-8eab-40ba-8c0b-ac6b130cb4fb"],
-Cell[CellGroupData[{
-Cell[8351, 231, 156, 3, 24, "Print",ExpressionUUID->"c577ba06-b67a-405a-9ff5-2bf7dc898d03"],
-Cell[8510, 236, 156, 3, 24, "Print",ExpressionUUID->"d041aa36-0cea-457c-9d4b-1fe9be66e2ab"],
-Cell[8669, 241, 155, 3, 24, "Print",ExpressionUUID->"bf141b55-86b2-4430-a994-5c03d5a19441"],
-Cell[8827, 246, 155, 3, 24, "Print",ExpressionUUID->"4968a660-4ecf-4b66-9071-8bd798c18d21"],
-Cell[8985, 251, 156, 3, 24, "Print",ExpressionUUID->"4e22d943-2680-416b-a1d7-a16ca20b781f"],
-Cell[9144, 256, 157, 3, 24, "Print",ExpressionUUID->"6dd38385-08b3-4dd9-932f-98a00c6db1b2"],
-Cell[9304, 261, 155, 3, 24, "Print",ExpressionUUID->"ef3baad3-91d1-4735-9a22-53495a624c15"],
-Cell[9462, 266, 154, 3, 24, "Print",ExpressionUUID->"413fbb68-5017-4272-a62a-fa234e6daaea"],
-Cell[9619, 271, 155, 3, 24, "Print",ExpressionUUID->"3a832a60-ae00-414b-a9ac-f5e86e67e917"],
-Cell[9777, 276, 155, 3, 24, "Print",ExpressionUUID->"bfc79ef6-f6c7-4f1e-88e8-005ac314be9c"],
-Cell[9935, 281, 154, 3, 24, "Print",ExpressionUUID->"0f892891-f885-489c-9925-ddef4d698410"],
-Cell[10092, 286, 156, 3, 24, "Print",ExpressionUUID->"2906f190-e673-4f33-9c34-e8e56efe7a27"]
-}, Open  ]],
-Cell[10263, 292, 376, 9, 34, "Output",ExpressionUUID->"500ca3c1-88d8-46e5-a1a1-86a7878e5638"]
+Cell[18934, 534, 536, 16, 30, "Input",ExpressionUUID->"8674484a-8543-434f-b177-3b27f9353212"],
+Cell[19473, 552, 1705, 35, 87, "Output",ExpressionUUID->"c3b3f84d-91f6-41af-af6b-a394ca020511"]
 }, Open  ]],
+Cell[21193, 590, 170, 3, 35, "Text",ExpressionUUID->"518a3040-54b1-4d43-8947-5c7d12efa94d"],
 Cell[CellGroupData[{
-Cell[10676, 306, 174, 3, 67, "Section",ExpressionUUID->"a5b064b3-3011-4922-8559-ead857cad102"],
-Cell[10853, 311, 535, 16, 52, "Input",ExpressionUUID->"aa28f02b-31e1-4df2-9b5d-482177464b59"],
-Cell[11391, 329, 250, 4, 35, "Text",ExpressionUUID->"c8896b88-f1db-4ce4-b7a6-0c9838bdb8f1"],
-Cell[11644, 335, 5511, 169, 425, "Input",ExpressionUUID->"52a96ff6-047e-4043-86d0-e303866e5f8e"],
-Cell[CellGroupData[{
-Cell[17180, 508, 2183, 58, 135, "Input",ExpressionUUID->"8b0f4955-2c3f-418c-9226-9be8f87621e8"],
-Cell[19366, 568, 1027, 27, 67, "Output",ExpressionUUID->"edd0619f-6f12-4070-a1d2-6b547877fadc"]
-}, Open  ]],
-Cell[CellGroupData[{
-Cell[20430, 600, 1543, 46, 114, "Input",ExpressionUUID->"fb45123c-c610-4075-99b0-7cd71c728ae7"],
-Cell[21976, 648, 1311, 32, 98, "Output",ExpressionUUID->"2ae14565-b412-4dc0-9dce-bd6c1ba5ef27"]
-}, Open  ]],
-Cell[23302, 683, 179, 3, 35, "Text",ExpressionUUID->"af247231-a58d-417b-987a-26908dafffdb"],
-Cell[23484, 688, 2175, 65, 94, "Input",ExpressionUUID->"7c44cadd-e488-4f51-87d8-c64eef11f40c"],
-Cell[25662, 755, 193, 3, 35, "Text",ExpressionUUID->"856f1746-1107-4509-a5ce-ac9c7f56cdb1"],
-Cell[CellGroupData[{
-Cell[25880, 762, 536, 16, 30, "Input",ExpressionUUID->"8674484a-8543-434f-b177-3b27f9353212"],
-Cell[26419, 780, 1705, 35, 87, "Output",ExpressionUUID->"c3b3f84d-91f6-41af-af6b-a394ca020511"]
-}, Open  ]],
-Cell[28139, 818, 170, 3, 35, "Text",ExpressionUUID->"518a3040-54b1-4d43-8947-5c7d12efa94d"],
-Cell[CellGroupData[{
-Cell[28334, 825, 536, 14, 30, "Input",ExpressionUUID->"61a2e974-2b39-4a07-8043-2dfd39a70569"],
-Cell[28873, 841, 6754, 167, 303, "Output",ExpressionUUID->"73480ac0-3043-4077-80cc-b952a94c822a"]
+Cell[21388, 597, 536, 14, 30, "Input",ExpressionUUID->"61a2e974-2b39-4a07-8043-2dfd39a70569"],
+Cell[21927, 613, 6754, 167, 303, "Output",ExpressionUUID->"73480ac0-3043-4077-80cc-b952a94c822a"]
 }, Open  ]]
 }, Open  ]],
 Cell[CellGroupData[{
-Cell[35676, 1014, 226, 4, 67, "Section",ExpressionUUID->"4e833cd6-9f0e-4aa3-a873-3d579e874720"],
-Cell[35905, 1020, 188, 4, 44, "Text",ExpressionUUID->"6d27fc04-3a60-4e03-8df7-3dd3aeee35b4"],
-Cell[36096, 1026, 2980, 53, 703, "Input",ExpressionUUID->"c7103bd6-b539-4495-b98c-d4d12ac6cad8"],
-Cell[39079, 1081, 221, 4, 44, "Text",ExpressionUUID->"0625593d-290f-4a39-9d80-8e2c6fdbc94e"],
-Cell[39303, 1087, 4936, 150, 682, "Input",ExpressionUUID->"1ad4904c-352f-4b1d-a7c7-91e1b0549409"],
-Cell[44242, 1239, 2645, 56, 199, "Input",ExpressionUUID->"0221674f-9b63-4662-91bc-ccc8c6ae9589"],
-Cell[46890, 1297, 209, 4, 44, "Text",ExpressionUUID->"d2d2257a-487b-416f-bc40-abd4482225f7"],
-Cell[47102, 1303, 15306, 397, 2131, "Input",ExpressionUUID->"daea68a9-c9e8-46ab-9bc8-5186e2cf477c"],
-Cell[62411, 1702, 137, 2, 44, "Text",ExpressionUUID->"76ba9d5a-7ee3-4888-be7e-6377003275e8"],
-Cell[62551, 1706, 521, 12, 30, "Input",ExpressionUUID->"4ec61f4c-3fd3-49ea-b5ef-6f7f04a16b34"]
+Cell[28730, 786, 226, 4, 67, "Section",ExpressionUUID->"4e833cd6-9f0e-4aa3-a873-3d579e874720"],
+Cell[28959, 792, 188, 4, 44, "Text",ExpressionUUID->"6d27fc04-3a60-4e03-8df7-3dd3aeee35b4"],
+Cell[29150, 798, 3104, 55, 724, "Input",ExpressionUUID->"c7103bd6-b539-4495-b98c-d4d12ac6cad8"],
+Cell[32257, 855, 221, 4, 44, "Text",ExpressionUUID->"0625593d-290f-4a39-9d80-8e2c6fdbc94e"],
+Cell[32481, 861, 4936, 150, 682, "Input",ExpressionUUID->"1ad4904c-352f-4b1d-a7c7-91e1b0549409"],
+Cell[37420, 1013, 2645, 56, 199, "Input",ExpressionUUID->"0221674f-9b63-4662-91bc-ccc8c6ae9589"],
+Cell[40068, 1071, 209, 4, 44, "Text",ExpressionUUID->"d2d2257a-487b-416f-bc40-abd4482225f7"],
+Cell[40280, 1077, 15306, 397, 2131, "Input",ExpressionUUID->"daea68a9-c9e8-46ab-9bc8-5186e2cf477c"],
+Cell[55589, 1476, 137, 2, 44, "Text",ExpressionUUID->"76ba9d5a-7ee3-4888-be7e-6377003275e8"],
+Cell[55729, 1480, 521, 12, 30, "Input",ExpressionUUID->"4ec61f4c-3fd3-49ea-b5ef-6f7f04a16b34"]
 }, Open  ]],
 Cell[CellGroupData[{
-Cell[63109, 1723, 167, 2, 67, "Section",ExpressionUUID->"a4458b3a-09b5-4e36-a1fc-781d6702b2dc"],
-Cell[63279, 1727, 5693, 122, 829, "Input",ExpressionUUID->"b1b309f8-a3a7-4081-a781-c3845e3cd372"],
-Cell[68975, 1851, 448, 10, 30, "Input",ExpressionUUID->"cba42949-b0f2-42ce-aebd-ffadfd83ef88"],
-Cell[69426, 1863, 94, 1, 30, "Input",ExpressionUUID->"6175b72c-af9f-43c2-b4ca-bd84c48a456d"]
+Cell[56287, 1497, 167, 2, 67, "Section",ExpressionUUID->"a4458b3a-09b5-4e36-a1fc-781d6702b2dc"],
+Cell[56457, 1501, 6464, 133, 1207, "Input",ExpressionUUID->"b1b309f8-a3a7-4081-a781-c3845e3cd372"],
+Cell[62924, 1636, 448, 10, 30, "Input",ExpressionUUID->"cba42949-b0f2-42ce-aebd-ffadfd83ef88"],
+Cell[63375, 1648, 94, 1, 30, "Input",ExpressionUUID->"6175b72c-af9f-43c2-b4ca-bd84c48a456d"]
 }, Open  ]]
 }
 ]
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@ -27,12 +27,13 @@ public:

  typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;

-  static void MesonField(Eigen::Tensor<ComplexD,5> &mat, 
+  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void MesonField(TensorType &mat, 
 			 const FermionField *lhs_wi,
 			 const FermionField *rhs_vj,
 			 std::vector<Gamma::Algebra> gammas,
 			 const std::vector<ComplexField > &mom,
-			 int orthogdim);
+			 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);

  static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat, 
 			     const FermionField *wi,
@ -59,6 +60,14 @@ public:
 			  const FermionField *vj,
 			  int orthogdim);

+  // "A-slash" all-to-all field: contracts conj(lhs_wi[i]) with rhs_vj[j] through
+  // the spin structure built from the field pairs (emB0[m], emB1[m]).
+  //   mat       : output tensor, written as mat(m,0,t,i,j); the extents of its
+  //               dimensions 3 and 4 select how many lhs_wi / rhs_vj fields are used
+  //   emB0,emB1 : must have the same length (one pair per index m)
+  //   orthogdim : lattice direction that the index t runs over
+  //   t_kernel, t_gsum : optional; when non-null they receive the elapsed time,
+  //               as measured by usecond(), of the local contraction and of the
+  //               global sum respectively
+  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void AslashField(TensorType &mat, 
+        const FermionField *lhs_wi,
+        const FermionField *rhs_vj,
+        const std::vector<ComplexField> &emB0,
+        const std::vector<ComplexField> &emB1,
+        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+
  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
 			   const Eigen::Tensor<ComplexD,3> &WW_sd,
 			   const FermionField *vs,
@ -92,13 +101,14 @@ public:
 #endif
 };

-template<class FImpl>
-void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat, 
+template <class FImpl>
+template <typename TensorType>
+void A2Autils<FImpl>::MesonField(TensorType &mat, 
 				 const FermionField *lhs_wi,
 				 const FermionField *rhs_vj,
 				 std::vector<Gamma::Algebra> gammas,
 				 const std::vector<ComplexField > &mom,
-				 int orthogdim) 
+				 int orthogdim, double *t_kernel, double *t_gsum) 
 {
  typedef typename FImpl::SiteSpinor vobj;

@ -146,6 +156,7 @@ void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
  int stride=grid->_slice_stride[orthogdim];

  // potentially wasting cores here if local time extent too small
+  if (t_kernel) *t_kernel = -usecond();
  parallel_for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
@ -212,7 +223,7 @@ void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
      }
    }}}
  }
-
+  if (t_kernel) *t_kernel += usecond();
  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Ngamma);
  assert(mat.dimension(2) == Nt);
@ -256,9 +267,9 @@ void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
  // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
  // Healthy size that should suffice
  ////////////////////////////////////////////////////////////////////
-
+  if (t_gsum) *t_gsum = -usecond();
  grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-
+  if (t_gsum) *t_gsum += usecond();
 }


@ -614,6 +625,189 @@ void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
  PionFieldXX(mat,vi,vj,orthogdim,nog5);
 }

+// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
+//
+// With:
+//
+// B_0 = A_0 + i A_1
+// B_1 = A_2 + i A_3
+// 
+// then in spin space
+// 
+//                 ( 0          0          -conj(B_1) -B_0 )
+// i * A_mu g_mu = ( 0          0          -conj(B_0)  B_1 )
+//                 ( B_1        B_0        0          0    )
+//                 ( conj(B_0)  -conj(B_1) 0          0    )
+// Computes, for every EM field index m and every slice t along orthogdim,
+//
+//   mat(m,0,t,i,j) = sum over sites x of slice t (and over spin and colour) of
+//                    conj(w_i(x)) * M_m(x) * v_j(x)
+//
+// where M_m is the 4x4 spin matrix "i * A_mu g_mu" assembled from
+// B_0 = emB0[m] and B_1 = emB1[m].
+//
+//   mat       : rank-5 output tensor. Required extents: (Nem, 1, Nt, Lblock, Rblock)
+//               with Nem = emB0.size() and Nt the global extent of orthogdim.
+//               Lblock and Rblock are read from the tensor itself.
+//   lhs_wi    : array of at least Lblock fields (conjugated in the contraction)
+//   rhs_vj    : array of at least Rblock fields
+//   emB0/emB1 : equal-length vectors of complex fields
+//   orthogdim : direction that is NOT summed over
+//   t_kernel  : optional; receives the usecond() time spent in the local
+//               contraction plus the SIMD-lane reduction
+//   t_gsum    : optional; receives the usecond() time spent in the global sum
+//
+// Every rank fills its own slices, zeroes the slices owned by other ranks, and
+// the final GlobalSumVector assembles the full result on all ranks.
+template <class FImpl>
+template <typename TensorType>
+void A2Autils<FImpl>::AslashField(TensorType &mat, 
+          const FermionField *lhs_wi,
+          const FermionField *rhs_vj,
+          const std::vector<ComplexField> &emB0,
+          const std::vector<ComplexField> &emB1,
+          int orthogdim, double *t_kernel, double *t_gsum) 
+{
+    typedef typename FermionField::vector_object vobj;
+    typedef typename vobj::scalar_type           scalar_type;
+    typedef typename vobj::vector_type           vector_type;
+
+    typedef iSpinMatrix<vector_type> SpinMatrix_v;
+    
+    int Lblock = mat.dimension(3); 
+    int Rblock = mat.dimension(4);
+
+    GridBase *grid = lhs_wi[0]._grid;
+    
+    const int    Nd = grid->_ndimension;
+    const int Nsimd = grid->Nsimd();
+
+    int Nt  = grid->GlobalDimensions()[orthogdim];
+    int Nem = emB0.size();
+    assert(emB1.size() == emB0.size());
+
+    // The code below writes mat(m,0,t,i,j) and globally sums exactly
+    // Nem*Nt*Lblock*Rblock elements starting at &mat(0,0,0,0,0). That only
+    // covers the whole tensor if its first three extents are (Nem, 1, Nt),
+    // so check them up front (same spirit as the checks in MesonField).
+    assert(mat.dimension(0) == Nem);
+    assert(mat.dimension(1) == 1);
+    assert(mat.dimension(2) == Nt);
+
+    int ld=grid->_ldimensions[orthogdim];
+    int rd=grid->_rdimensions[orthogdim];
+
+    // will locally sum vectors first
+    // sum across these down to scalars
+    // splitting the SIMD
+    int MFrvol = rd*Lblock*Rblock*Nem;
+    int MFlvol = ld*Lblock*Rblock*Nem;
+
+    Vector<vector_type> lvSum(MFrvol);
+    parallel_for (int r = 0; r < MFrvol; r++)
+    {
+        lvSum[r] = zero;
+    }
+
+    Vector<scalar_type> lsSum(MFlvol);             
+    parallel_for (int r = 0; r < MFlvol; r++)
+    {
+        lsSum[r] = scalar_type(0.0);
+    }
+
+    int e1=    grid->_slice_nblock[orthogdim];
+    int e2=    grid->_slice_block [orthogdim];
+    int stride=grid->_slice_stride[orthogdim];
+
+    // Nested parallelism would be ok
+    // Wasting cores here. Test case r
+    if (t_kernel) *t_kernel = -usecond();
+    parallel_for(int r=0;r<rd;r++)
+    {
+        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+        for(int n=0;n<e1;n++)
+        for(int b=0;b<e2;b++)
+        {
+            int ss= so+n*stride+b;
+
+            for(int i=0;i<Lblock;i++)
+            {
+                auto left = conjugate(lhs_wi[i]._odata[ss]);
+
+                for(int j=0;j<Rblock;j++)
+                {
+                    SpinMatrix_v vv;
+                    auto right = rhs_vj[j]._odata[ss];
+
+                    // Colour-traced outer product in spin:
+                    // vv(s1,s2) = sum_c conj(w)(s2,c) * v(s1,c)
+                    for(int s1=0;s1<Ns;s1++)
+                    for(int s2=0;s2<Ns;s2++)
+                    {
+                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
+                                        + left()(s2)(1) * right()(s1)(1)
+                                        + left()(s2)(2) * right()(s1)(2);
+                    }
+                    
+                    // After getting the sitewise product, contract it with the
+                    // spin matrix of each EM field in turn
+                    int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
+
+                    for ( int m=0;m<Nem;m++)
+                    {
+                        int idx  = m+base;
+                        auto b0  = emB0[m]._odata[ss];
+                        auto b1  = emB1[m]._odata[ss];
+                        auto cb0 = conjugate(b0);
+                        auto cb1 = conjugate(b1);
+
+                        // Each term is M(s2,s1)*vv(s1,s2) for one non-zero
+                        // entry M(s2,s1) of the spin matrix
+                        lvSum[idx] += - vv()(3,0)()*b0()()()  - vv()(2,0)()*cb1()()()
+                                      + vv()(3,1)()*b1()()()  - vv()(2,1)()*cb0()()()
+                                      + vv()(0,2)()*b1()()()  + vv()(1,2)()*b0()()()
+                                      + vv()(0,3)()*cb0()()() - vv()(1,3)()*cb1()()();
+                    }
+                }
+            }
+        }
+    }
+
+    // Sum across simd lanes in the plane, breaking out orthog dir.
+    parallel_for(int rt=0;rt<rd;rt++)
+    {
+        std::vector<int> icoor(Nd);
+        std::vector<scalar_type> extracted(Nsimd);               
+
+        for(int i=0;i<Lblock;i++)
+        for(int j=0;j<Rblock;j++)
+        for(int m=0;m<Nem;m++)
+        {
+
+            int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
+
+            extract<vector_type,scalar_type>(lvSum[ij_rdx],extracted);
+            for(int idx=0;idx<Nsimd;idx++)
+            {
+                grid->iCoorFromIindex(icoor,idx);
+
+                // local slice index that SIMD lane idx of reduced slice rt maps to
+                int ldx    = rt+icoor[orthogdim]*rd;
+                int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
+
+                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
+            }
+        }
+    }
+    if (t_kernel) *t_kernel += usecond();
+
+    // ld loop and local only??
+    int pd = grid->_processors[orthogdim];
+    int pc = grid->_processor_coor[orthogdim];
+    parallel_for_nest2(int lt=0;lt<ld;lt++)
+    {
+        for(int pt=0;pt<pd;pt++)
+        {
+            int t = lt + pt*ld;
+            if (pt == pc)
+            {
+                // slice owned by this rank: copy the local result
+                for(int i=0;i<Lblock;i++)
+                for(int j=0;j<Rblock;j++)
+                for(int m=0;m<Nem;m++)
+                {
+                    int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
+
+                    mat(m,0,t,i,j) = lsSum[ij_dx];
+                }
+            } 
+            else 
+            { 
+                // slice owned by another rank: contribute zero to the global sum
+                const scalar_type zz(0.0);
+
+                for(int i=0;i<Lblock;i++)
+                for(int j=0;j<Rblock;j++)
+                for(int m=0;m<Nem;m++)
+                {
+                    mat(m,0,t,i,j) = zz;
+                }
+            }
+        }
+    }
+    if (t_gsum) *t_gsum = -usecond();
+    grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);
+    if (t_gsum) *t_gsum += usecond();
+}

 ////////////////////////////////////////////
 // Schematic thoughts about more generalised four quark insertion
@ -792,17 +986,18 @@ void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
      for(int t=0;t<N_t;t++){
      for(int s=0;s<N_s;s++){
 	auto tmp1 = vs[s]._odata[ss];
-	vobj tmp2 = zero;
+  vobj tmp2 = zero;
+  vobj tmp3 = zero;

 	for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
 	  Scalar_v coeff = WW_sd(t,s,d);
-	  mac(&tmp2 ,& coeff, & vd[d]._odata[ss]);
-	}
+	  tmp3 = conjugate(vd[d]._odata[ss]);
+	  mac(&tmp2, &coeff, &tmp3);
+  }

 	//////////////////////////
 	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
 	//////////////////////////
-	tmp2 = conjugate(tmp2);
 	for(int s1=0;s1<Ns;s1++){
 	for(int s2=0;s2<Ns;s2++){
 	  WWVV[t]._odata[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
--- a/Grid/qcd/utils/CovariantSmearing.h
+++ b/Grid/qcd/utils/CovariantSmearing.h
@ -0,0 +1,87 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/utils/CovariantSmearing.h
+
+Copyright (C) 2016
+
+Author: Azusa Yamaguchi
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+#pragma once
+
+namespace Grid {
+namespace QCD {
+
+// Gauge-covariant smearing kernels built on the covariant shifts of Gimpl.
+template <class Gimpl> class CovariantSmearing : public Gimpl 
+{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
+
+  // Gaussian-smears chi in place by applying
+  //     chi <- chi + (width^2 / (4*Iterations)) * Laplacian(chi)
+  // Iterations times, where Laplacian is the gauge-covariant nearest-neighbour
+  // second difference built from the links U[mu].
+  //
+  //   U          : one link field per direction, indexed by mu in [0,Nd)
+  //   chi        : field to smear (overwritten)
+  //   width      : smearing width in the Chroma convention (see below)
+  //   Iterations : number of smearing steps N
+  //   orthog     : direction excluded from the Laplacian when it lies in
+  //                [0,Nd); any other value (negative or >= Nd) smears in
+  //                all Nd directions
+  template<typename T>
+  static void GaussianSmear(const std::vector<LatticeColourMatrix>& U, 
+			    T& chi, 
+			    const Real& width, int Iterations, int orthog)
+  {
+    GridBase *grid = chi._grid;
+    T psi(grid);
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Follow Chroma conventions for width to keep compatibility with previous data
+    // Free field iterates 
+    //   chi = (1 - w^2/4N p^2)^N chi
+    //
+    //       ~ (e^(-w^2/4N p^2))^N chi
+    //       ~ e^(-w^2/4 p^2) chi
+    //       ~ e^(-w'^2/2 p^2) chi          [ w' = w/sqrt(2) ]
+    //
+    // Which in coordinate space is proportional to
+    //
+    //   e^(-x^2/w^2) = e^(-x^2/2w'^2) 
+    //
+    // The 4 is a bit unconventional from Gaussian width perspective, but... it's Chroma convention.
+    // 2nd derivative approx d^2/dx^2  =  x+mu + x-mu - 2x
+    //
+    // d^2/dx^2 = - p^2
+    //
+    // chi = ( 1 + w^2/4N d^2/dx^2 )^N chi
+    //
+    ////////////////////////////////////////////////////////////////////////////////////
+    Real coeff = (width*width) / Real(4*Iterations);
+  
+    // Number of directions that enter the Laplacian. It must match the set
+    // of mu visited in the loop below: a direction is dropped only when
+    // orthog names a real one. A negative orthog (e.g. -1 for "none") must
+    // NOT reduce the count, otherwise the -2*dims diagonal term would not
+    // balance the 2*Nd neighbour terms.
+    int dims = Nd;
+    if( (orthog >= 0) && (orthog < Nd) ) dims=Nd-1;
+
+    for(int n = 0; n < Iterations; ++n) {
+      // psi = Laplacian(chi): diagonal term first, then the neighbours
+      psi = (-2.0*dims)*chi;
+      for(int mu=0;mu<Nd;mu++) {
+	if ( mu != orthog ) { 
+	  psi = psi + Gimpl::CovShiftForward(U[mu],mu,chi);    
+	  psi = psi + Gimpl::CovShiftBackward(U[mu],mu,chi);    
+	}
+      }
+      chi = chi + coeff*psi;
+    }
+  }
+};
+}}
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@ -31,6 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 namespace QCD {

+
 template <class Gimpl> 
 class FourierAcceleratedGaugeFixer  : public Gimpl {
 public:
@ -45,30 +46,58 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
      A[mu] = Ta(U[mu]) * cmi;
    }
  }
-  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
+  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu,int orthog) {
    dmuAmu=zero;
    for(int mu=0;mu<Nd;mu++){
-      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+      if ( mu != orthog ) {
+	dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+      }
    }
  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false) {
+
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+    GridBase *grid = Umu._grid;
+    GaugeMat xform(grid);
+    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
+  }
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+
    GridBase *grid = Umu._grid;

    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
    Real old_trace = org_link_trace;
    Real trG;
+    
+    xform=1.0;

    std::vector<GaugeMat> U(Nd,grid);
-                 GaugeMat dmuAmu(grid);

-    for(int i=0;i<maxiter;i++){
-      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
-      if ( Fourier==false ) { 
-	trG = SteepestDescentStep(U,alpha,dmuAmu);
+    GaugeMat dmuAmu(grid);
+
+    {
+      Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+      Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+      if( (orthog>=0) && (orthog<Nd) ){
+	std::cout << GridLogMessage << " Gauge fixing to Coulomb gauge time="<<orthog<< " plaq= "<<plaq<<" link trace = "<<link_trace<<  std::endl;
      } else { 
-	trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
+	std::cout << GridLogMessage << " Gauge fixing to Landau gauge plaq= "<<plaq<<" link trace = "<<link_trace<<  std::endl;
      }
+    }
+    for(int i=0;i<maxiter;i++){
+
+      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
+
+      if ( Fourier==false ) { 
+	trG = SteepestDescentStep(U,xform,alpha,dmuAmu,orthog);
+      } else { 
+	trG = FourierAccelSteepestDescentStep(U,xform,alpha,dmuAmu,orthog);
+      }
+
+      //      std::cout << GridLogMessage << "trG   "<< trG<< std::endl;
+      //      std::cout << GridLogMessage << "xform "<< norm2(xform)<< std::endl;
+      //      std::cout << GridLogMessage << "dmuAmu "<< norm2(dmuAmu)<< std::endl;
+
      for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
      // Monitor progress and convergence test 
      // infrequently to minimise cost overhead
@ -84,7 +113,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
 	Real Phi  = 1.0 - old_trace / link_trace ;
 	Real Omega= 1.0 - trG;

-
 	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
 	if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
 	  std::cout << GridLogMessage << "Converged ! "<<std::endl;
@ -96,25 +124,26 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
      }
    }
  };
-  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0]._grid;

    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);

    GaugeLinkToLieAlgebraField(U,A);
-    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
+    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu,orthog);


    Real vol = grid->gSites();
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

+    xform = g*xform ;
    SU<Nc>::GaugeTransform(U,g);

    return trG;
  }

-  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {

    GridBase *grid = U[0]._grid;

@ -133,38 +162,41 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {

    GaugeLinkToLieAlgebraField(U,A);

-    DmuAmu(A,dmuAmu);
+    DmuAmu(A,dmuAmu,orthog);

-    theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);
+    std::vector<int> mask(Nd,1);
+    for(int mu=0;mu<Nd;mu++) if (mu==orthog) mask[mu]=0;
+    theFFT.FFT_dim_mask(dmuAmu_p,dmuAmu,mask,FFT::forward);

    //////////////////////////////////
    // Work out Fp = psq_max/ psq...
+    // Avoid singularities in Fp
    //////////////////////////////////
    std::vector<int> latt_size = grid->GlobalDimensions();
    std::vector<int> coor(grid->_ndimension,0);
    for(int mu=0;mu<Nd;mu++) {
-
-      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
-      LatticeCoordinate(pmu,mu);
-      pmu = TwoPiL * pmu ;
-      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
+      if ( mu != orthog ) { 
+	Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+	LatticeCoordinate(pmu,mu);
+	pmu = TwoPiL * pmu ;
+	psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
+      }
    }

    Complex psqMax(16.0);
    Fp =  psqMax*one/psq;

-    /*
-    static int once;
-    if ( once == 0 ) { 
-      std::cout << " Fp " << Fp <<std::endl;
-      once ++;
-      }*/
-
-    pokeSite(TComplex(1.0),Fp,coor);
-
+    pokeSite(TComplex(16.0),Fp,coor);
+    if( (orthog>=0) && (orthog<Nd) ){
+      for(int t=0;t<grid->GlobalDimensions()[orthog];t++){
+	coor[orthog]=t;
+	pokeSite(TComplex(16.0),Fp,coor);
+      }
+    }
+    
    dmuAmu_p  = dmuAmu_p * Fp; 

-    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
+    theFFT.FFT_dim_mask(dmuAmu,dmuAmu_p,mask,FFT::backward);

    GaugeMat ciadmam(grid);
    Complex cialpha(0.0,-alpha);
@ -173,16 +205,17 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {

    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

+    xform = g*xform ;
    SU<Nc>::GaugeTransform(U,g);

    return trG;
  }

-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) {
    GridBase *grid = g._grid;
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
-    DmuAmu(A,dmuAmu);
+    DmuAmu(A,dmuAmu,orthog);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
  }  
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@ -173,6 +173,39 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
    }
  }
 }
+}

-}}
+// I explicitly need these outside the QCD namespace
+// Generic version: z = gamma_5 * x, using the Gamma algebra object.
+// z takes over x's checkerboard; x and z must be conformable.
+template<typename vobj>
+void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
+{
+  z.checkerboard = x.checkerboard;
+  conformable(x, z);
+
+  QCD::Gamma G5(QCD::Gamma::Algebra::Gamma5);
+  z = G5 * x;
+}
+
+// Overload for lattices of iVector<CComplex, nbasis>: copies the first
+// nbasis/2 components of x into z unchanged and the remaining components
+// with their sign flipped. nbasis must be even.
+// NOTE(review): presumably the two halves of the basis are the two
+// chiralities of a coarse-grid vector -- confirm against the callers.
+template<class CComplex, int nbasis>
+void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
+{
+  static_assert(nbasis % 2 == 0, "");
+  const int half = nbasis / 2;
+
+  GridBase *grid = x._grid;
+  z.checkerboard = x.checkerboard;
+  conformable(x, z);
+
+  parallel_for(int site = 0; site < grid->oSites(); site++) {
+    for(int comp = 0; comp < nbasis; ++comp) {
+      if ( comp < half ) {
+        z._odata[site](comp) = x._odata[site](comp);   // lower half: unchanged
+      } else {
+        z._odata[site](comp) = -x._odata[site](comp);  // upper half: negated
+      }
+    }
+  }
+}
+
+}
 #endif 
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@ -676,10 +676,18 @@ class SU {
    }
  }
 /*
- add GaugeTrans
-*/
-
-template<typename GaugeField,typename GaugeMat>
+ * Fundamental rep gauge xform
+ */
+  // Replaces ferm by g*ferm (left multiplication by the gauge rotation g).
+  // ferm and g must live on conformable grids.
+  template<typename Fundamental,typename GaugeMat>
+  static void GaugeTransformFundamental( Fundamental &ferm, GaugeMat &g){
+    conformable(ferm._grid,g._grid);
+    ferm = g*ferm;
+  }
+/*
+ * Adjoint rep gauge xform
+ */
+  template<typename GaugeField,typename GaugeMat>
  static void GaugeTransform( GaugeField &Umu, GaugeMat &g){
    GridBase *grid = Umu._grid;
    conformable(grid,g._grid);
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -6,10 +6,12 @@

    Copyright (C) 2015

-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: neo <cossu@post.kek.jp>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: James Harrison <J.Harrison@soton.ac.uk>
+    Author: Antonin Portelli <antonin.portelli@me.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -645,6 +647,184 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      }
    }
  }
+
+  //////////////////////////////////////////////////
+  // Wilson loop of size (Rmu, Rnu), oriented in mu,nu plane
+  //
+  // wl  : output, overwritten with the accumulated product of links
+  // U   : gauge links, indexed by direction
+  // The path is Rnu links forward along nu, Rmu forward along mu, then Rnu
+  // back along nu and Rmu back along mu.
+  // NOTE(review): wl is seeded with U[nu], so the first leg always contains
+  // at least one nu link - this looks like it assumes Rnu >= 1; confirm.
+  //////////////////////////////////////////////////
+  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
+                           const int Rmu, const int Rnu,
+                           const int mu, const int nu) {
+    // first nu link
+    wl = U[nu];
+
+    // remaining Rnu-1 links forward along nu
+    for(int i = 0; i < Rnu-1; i++){
+      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
+    }
+
+    // Rmu links forward along mu
+    for(int i = 0; i < Rmu; i++){
+      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
+    }
+
+    // Rnu links backward along nu
+    for(int i = 0; i < Rnu; i++){
+      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
+    }
+
+    // Rmu links backward along mu
+    for(int i = 0; i < Rmu; i++){
+      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
+    }
+  }
+  //////////////////////////////////////////////////
+  // trace of Wilson Loop oriented in mu,nu plane
+  //////////////////////////////////////////////////
+  static void traceWilsonLoop(LatticeComplex &wl,
+                                const std::vector<GaugeMat> &U,
+                                const int Rmu, const int Rnu,
+                                const int mu, const int nu) {
+    // Build the untraced loop in a temporary, then overwrite wl with its trace.
+    GaugeMat loop(U[0]._grid);
+    wilsonLoop(loop, U, Rmu, Rnu, mu, nu);
+    wl = trace(loop);
+  }
+  //////////////////////////////////////////////////
+  // sum over all planes of Wilson loop
+  //////////////////////////////////////////////////
+  static void siteWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    const int ndim = U[0]._grid->_ndimension;
+    LatticeComplex planeWl(U[0]._grid);
+
+    Wl = zero;
+    // Every plane (nu < mu), accumulating both the R1xR2 and R2xR1 loops.
+    for (int mu = 1; mu < ndim; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceWilsonLoop(planeWl, U, R1, R2, mu, nu);
+        Wl = Wl + planeWl;
+        traceWilsonLoop(planeWl, U, R2, R1, mu, nu);
+        Wl = Wl + planeWl;
+      }
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum over planes of Wilson loop with length R1
+  // in the time direction
+  //////////////////////////////////////////////////
+  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    // The last grid direction is paired with each of the other directions.
+    const int tdir = U[0]._grid->_ndimension - 1;
+    LatticeComplex planeWl(U[0]._grid);
+
+    Wl = zero;
+    for (int nu = 0; nu < tdir; nu++) {
+      traceWilsonLoop(planeWl, U, R1, R2, tdir, nu);
+      Wl = Wl + planeWl;
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum Wilson loop over all planes orthogonal to the time direction
+  //////////////////////////////////////////////////
+  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    // Only directions below the last one take part.
+    const int nspatial = U[0]._grid->_ndimension - 1;
+    LatticeComplex planeWl(U[0]._grid);
+
+    Wl = zero;
+    for (int mu = 1; mu < nspatial; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceWilsonLoop(planeWl, U, R1, R2, mu, nu);
+        Wl = Wl + planeWl;
+        traceWilsonLoop(planeWl, U, R2, R1, mu, nu);
+        Wl = Wl + planeWl;
+      }
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of Wilson loop
+  //////////////////////////////////////////////////
+  static Real sumWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    // Size the per-direction link vector from the grid instead of a
+    // hard-coded 4, so it always agrees with the bound of the fill loop.
+    const int ndim = Umu._grid->_ndimension;
+    std::vector<GaugeMat> U(ndim, Umu._grid);
+
+    for (int mu = 0; mu < ndim; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteWilsonLoop(Wl, U, R1, R2);
+
+    // Reduce over all sites and return the real part of the total.
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of timelike Wilson loop
+  //////////////////////////////////////////////////
+  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    // Size the per-direction link vector from the grid instead of a
+    // hard-coded 4, so it always agrees with the bound of the fill loop.
+    const int ndim = Umu._grid->_ndimension;
+    std::vector<GaugeMat> U(ndim, Umu._grid);
+
+    for (int mu = 0; mu < ndim; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteTimelikeWilsonLoop(Wl, U, R1, R2);
+
+    // Reduce over all sites and return the real part of the total.
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of spatial Wilson loop
+  //////////////////////////////////////////////////
+  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    // Size the per-direction link vector from the grid instead of a
+    // hard-coded 4, so it always agrees with the bound of the fill loop.
+    const int ndim = Umu._grid->_ndimension;
+    std::vector<GaugeMat> U(ndim, Umu._grid);
+
+    for (int mu = 0; mu < ndim; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteSpatialWilsonLoop(Wl, U, R1, R2);
+
+    // Reduce over all sites and return the real part of the total.
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of Wilson loop
+  //////////////////////////////////////////////////
+  static Real avgWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    const int ndim = Umu._grid->_ndimension;
+    const Real total = sumWilsonLoop(Umu, R1, R2);
+    const Real volume = Umu._grid->gSites();
+    // ndim*(ndim-1): each plane nu < mu is summed in both orientations
+    const Real nLoops = 1.0 * ndim * (ndim - 1);
+    return total / volume / nLoops / Nc; // Nc dependent... FIXME
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of timelike Wilson loop
+  //////////////////////////////////////////////////
+  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    const int ndim = Umu._grid->_ndimension;
+    const Real total = sumTimelikeWilsonLoop(Umu, R1, R2);
+    const Real volume = Umu._grid->gSites();
+    // one loop per direction paired with the last direction
+    const Real nLoops = 1.0 * (ndim - 1);
+    return total / volume / nLoops / Nc; // Nc dependent... FIXME
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of spatial Wilson loop
+  //////////////////////////////////////////////////
+  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    const int ndim = Umu._grid->_ndimension;
+    const Real total = sumSpatialWilsonLoop(Umu, R1, R2);
+    const Real volume = Umu._grid->gSites();
+    // (ndim-1)*(ndim-2): each plane among the first ndim-1 directions, both orientations
+    const Real nLoops = 1.0 * (ndim - 1) * (ndim - 2);
+    return total / volume / nLoops / Nc; // Nc dependent... FIXME
+  }
 };

 typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
--- a/Grid/serialisation/BaseIO.h
+++ b/Grid/serialisation/BaseIO.h
@ -33,12 +33,76 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
 #include <type_traits>
 #include <Grid/tensors/Tensors.h>
 #include <Grid/serialisation/VectorUtils.h>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>

 namespace Grid {
+  // Compile-time traits used to select the serialisation overloads for
+  // Eigen tensors whose elements are plain scalars or Grid tensors.
+  namespace EigenIO {
+    // EigenIO works for scalars that are not just Grid supported scalars
+    template<typename T, typename V = void> struct is_complex : public std::false_type {};
+    // Support all complex types (not just Grid complex types) - even if the definitions overlap (!)
+    template<typename T> struct is_complex<             T , typename
+        std::enable_if< ::Grid::is_complex<             T >::value>::type> : public std::true_type {};
+    // The negated condition keeps this specialisation from colliding with the one above
+    template<typename T> struct is_complex<std::complex<T>, typename
+        std::enable_if<!::Grid::is_complex<std::complex<T>>::value>::type> : public std::true_type {};
+
+    // Helpers to support I/O for Eigen tensors of arithmetic scalars, complex types, or Grid tensors
+    template<typename T, typename V = void> struct is_scalar : public std::false_type {};
+    template<typename T> struct is_scalar<T, typename std::enable_if<std::is_arithmetic<T>::value || is_complex<T>::value>::type> : public std::true_type {};
+
+    // Is this an Eigen tensor
+    template<typename T> struct is_tensor : std::integral_constant<bool,
+      std::is_base_of<Eigen::TensorBase<T, Eigen::ReadOnlyAccessors>, T>::value> {};
+
+    // Is this an Eigen tensor of a supported scalar
+    template<typename T, typename V = void> struct is_tensor_of_scalar : public std::false_type {};
+    template<typename T> struct is_tensor_of_scalar<T, typename std::enable_if<is_tensor<T>::value && is_scalar<typename T::Scalar>::value>::type> : public std::true_type {};
+
+    // Is this an Eigen tensor of a supported container
+    template<typename T, typename V = void> struct is_tensor_of_container : public std::false_type {};
+    template<typename T> struct is_tensor_of_container<T, typename std::enable_if<is_tensor<T>::value && isGridTensor<typename T::Scalar>::value>::type> : public std::true_type {};
+
+    // These traits describe the scalars inside Eigen tensors
+    // I wish I could define these in reference to the scalar type (so there would be fewer traits defined)
+    // but I'm unable to find a syntax to make this work
+    template<typename T, typename V = void> struct Traits {};
+    // Tensors of plain scalars: everything except scalar_type / is_complex is
+    // inherited from GridTypeMapper_Base
+    template<typename T> struct Traits<T, typename std::enable_if<is_tensor_of_scalar<T>::value>::type>
+      : public GridTypeMapper_Base {
+      using scalar_type   = typename T::Scalar; // ultimate base scalar
+      static constexpr bool is_complex = ::Grid::EigenIO::is_complex<scalar_type>::value;
+    };
+    // Tensors of Grid containers: traits are forwarded from the GridTypeMapper
+    // of the container type
+    template<typename T> struct Traits<T, typename std::enable_if<is_tensor_of_container<T>::value>::type> {
+      using BaseTraits  = GridTypeMapper<typename T::Scalar>;
+      using scalar_type = typename BaseTraits::scalar_type; // ultimate base scalar
+      static constexpr bool   is_complex = ::Grid::EigenIO::is_complex<scalar_type>::value;
+      static constexpr int   TensorLevel = BaseTraits::TensorLevel;
+      static constexpr int          Rank = BaseTraits::Rank;
+      static constexpr std::size_t count = BaseTraits::count;
+      static constexpr int Dimension(int dim) { return BaseTraits::Dimension(dim); }
+    };
+
+    // Is this a fixed-size Eigen tensor
+    // (matches TensorFixedSize itself and a TensorMap over a TensorFixedSize)
+    template<typename T> struct is_tensor_fixed : public std::false_type {};
+    template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
+    struct is_tensor_fixed<Eigen::TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType>>
+        : public std::true_type {};
+    template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType,
+              int MapOptions_, template <class> class MapPointer_>
+    struct is_tensor_fixed<Eigen::TensorMap<Eigen::TensorFixedSize<Scalar_, Dimensions_,
+                                            Options_, IndexType>, MapOptions_, MapPointer_>>
+        : public std::true_type {};
+
+    // Is this a variable-size Eigen tensor
+    // (any Eigen tensor that is not matched by is_tensor_fixed)
+    template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};
+    template<typename T> struct is_tensor_variable<T, typename std::enable_if<is_tensor<T>::value
+        && !is_tensor_fixed<T>::value>::type> : public std::true_type {};
+  }
+
  // Abstract writer/reader classes ////////////////////////////////////////////
  // static polymorphism implemented using CRTP idiom
  class Serializable;
-  
+
  // Static abstract writer
  template <typename T>
  class Writer
@ -49,10 +113,10 @@ namespace Grid {
    void push(const std::string &s);
    void pop(void);
    template <typename U>
-    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+    typename std::enable_if<std::is_base_of<Serializable, U>::value>::type
    write(const std::string& s, const U &output);
    template <typename U>
-    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value && !EigenIO::is_tensor<U>::value>::type
    write(const std::string& s, const U &output);
    template <typename U>
    void write(const std::string &s, const iScalar<U> &output);
@ -60,6 +124,42 @@ namespace Grid {
    void write(const std::string &s, const iVector<U, N> &output);
    template <typename U, int N>
    void write(const std::string &s, const iMatrix<U, N> &output);
+    template <typename ETensor>
+    typename std::enable_if<EigenIO::is_tensor<ETensor>::value>::type
+    write(const std::string &s, const ETensor &output);
+
+    // Helper functions for Scalar vs Container specialisations
+    // Tensor of plain scalars: the element storage is already an array of
+    // scalars, so return the tensor's data pointer directly.
+    template <typename ETensor>
+    inline typename std::enable_if<EigenIO::is_tensor_of_scalar<ETensor>::value,
+    const typename ETensor::Scalar *>::type
+    getFirstScalar(const ETensor &output)
+    {
+      return output.data();
+    }
+    
+    // Tensor of Grid containers: return begin() of the first container,
+    // typed as a pointer to the underlying scalar.
+    template <typename ETensor>
+    inline typename std::enable_if<EigenIO::is_tensor_of_container<ETensor>::value,
+    const typename EigenIO::Traits<ETensor>::scalar_type *>::type
+    getFirstScalar(const ETensor &output)
+    {
+      return output.data()->begin();
+    }
+    
+    // Scalar case: store Source at pCopy and advance pCopy by one element.
+    template <typename S>
+    inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
+    copyScalars(S * &pCopy, const S &Source)
+    {
+      *pCopy = Source;
+      ++pCopy;
+    }
+    
+    // Grid tensor case: append every scalar of Source to the buffer at pCopy,
+    // leaving pCopy just past the last scalar written.
+    template <typename S>
+    inline typename std::enable_if<isGridTensor<S>::value, void>::type
+    copyScalars(typename GridTypeMapper<S>::scalar_type * &pCopy, const S &Source)
+    {
+      using ScalarT = typename GridTypeMapper<S>::scalar_type;
+      for( const ScalarT &element : Source ) {
+        *pCopy = element;
+        ++pCopy;
+      }
+    }
+
    void         scientificFormat(const bool set);
    bool         isScientific(void);
    void         setPrecision(const unsigned int prec);
@ -83,7 +183,8 @@ namespace Grid {
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
    template <typename U>
-    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value
+                         && !EigenIO::is_tensor<U>::value, void>::type
    read(const std::string& s, U &output);
    template <typename U>
    void read(const std::string &s, iScalar<U> &output);
@ -91,6 +192,32 @@ namespace Grid {
    void read(const std::string &s, iVector<U, N> &output);
    template <typename U, int N>
    void read(const std::string &s, iMatrix<U, N> &output);
+    template <typename ETensor>
+    typename std::enable_if<EigenIO::is_tensor<ETensor>::value, void>::type
+    read(const std::string &s, ETensor &output);
+    template <typename ETensor>
+    typename std::enable_if<EigenIO::is_tensor_fixed<ETensor>::value, void>::type
+    Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims );
+    template <typename ETensor>
+    typename std::enable_if<EigenIO::is_tensor_variable<ETensor>::value, void>::type
+    Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims );
+  
+    // Helper functions for Scalar vs Container specialisations
+    // Scalar case: load Dest from pSource and advance pSource by one element.
+    template <typename S>
+    inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
+    copyScalars(S &Dest, const S * &pSource)
+    {
+      Dest = *pSource;
+      ++pSource;
+    }
+    
+    // Grid tensor case: fill every scalar of Dest from the buffer at pSource,
+    // leaving pSource just past the last scalar consumed.
+    template <typename S>
+    inline typename std::enable_if<isGridTensor<S>::value, void>::type
+    copyScalars(S &Dest, const typename GridTypeMapper<S>::scalar_type * &pSource)
+    {
+      using ScalarT = typename GridTypeMapper<S>::scalar_type;
+      for( ScalarT &element : Dest ) {
+        element = *pSource;
+        ++pSource;
+      }
+    }
+    
  protected:
    template <typename U>
    void fromString(U &output, const std::string &s);
@ -135,12 +262,14 @@ namespace Grid {
  
  template <typename T>
  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value
+                       && !EigenIO::is_tensor<U>::value, void>::type
  Writer<T>::write(const std::string &s, const U &output)
  {
    upcast->writeDefault(s, output);
  }

+
  template <typename T>
  template <typename U>
  void Writer<T>::write(const std::string &s, const iScalar<U> &output)
@ -161,6 +290,57 @@ namespace Grid {
  {
    upcast->writeDefault(s, tensorToVec(output));
  }
+  
+  // Eigen::Tensors of scalars or of Grid tensors (iScalar, iVector, iMatrix)
+  //
+  // Flattens the tensor to a row-major array of its underlying scalars and
+  // passes it, with the combined tensor + container dimensions, to the
+  // derived writer's writeMultiDim. Asserts that the tensor is not empty.
+  template <typename T>
+  template <typename ETensor>
+  typename std::enable_if<EigenIO::is_tensor<ETensor>::value, void>::type
+  Writer<T>::write(const std::string &s, const ETensor &output)
+  {
+    using Index = typename ETensor::Index;
+    using Container = typename ETensor::Scalar; // NB: could be same as scalar
+    using Traits = EigenIO::Traits<ETensor>;
+    using Scalar = typename Traits::scalar_type; // type of the underlying scalar
+    constexpr unsigned int TensorRank{ETensor::NumIndices};
+    constexpr unsigned int ContainerRank{Traits::Rank}; // Only non-zero for containers
+    constexpr unsigned int TotalRank{TensorRank + ContainerRank};
+    const Index NumElements{output.size()};
+    assert( NumElements > 0 );
+
+    // Get the dimensionality of the tensor
+    // (loop counters are unsigned to match the unsigned rank constants)
+    std::vector<std::size_t>  TotalDims(TotalRank);
+    for(unsigned int i = 0; i < TensorRank; i++ ) {
+      auto dim = output.dimension(i);
+      TotalDims[i] = static_cast<size_t>(dim);
+      assert( TotalDims[i] == dim ); // check we didn't lose anything in the conversion
+    }
+    for(unsigned int i = 0; i < ContainerRank; i++ )
+      TotalDims[TensorRank + i] = Traits::Dimension(i);
+
+    // If the Tensor isn't in Row-Major order, then we'll need to copy its data
+    const bool CopyData{NumElements > 1 && ETensor::Layout != Eigen::StorageOptions::RowMajor};
+    const Scalar * pWriteBuffer;
+    std::vector<Scalar> CopyBuffer;
+    const Index TotalNumElements = NumElements * Traits::count;
+    if( !CopyData ) {
+      pWriteBuffer = getFirstScalar( output );
+    } else {
+      // Regardless of the Eigen::Tensor storage order, the copy will be Row Major
+      CopyBuffer.resize( TotalNumElements );
+      Scalar * pCopy = CopyBuffer.data();
+      pWriteBuffer = pCopy;
+      std::array<Index, TensorRank> MyIndex;
+      for( auto &idx : MyIndex ) idx = 0;
+      // Count with Index (not int) so the counter has the same range as NumElements
+      for( Index n = 0; n < NumElements; n++ ) {
+        const Container & c = output( MyIndex );
+        copyScalars( pCopy, c );
+        // Now increment the index (last index fastest, i.e. row-major order)
+        for( int i = output.NumDimensions - 1; i >= 0 && ++MyIndex[i] == output.dimension(i); i-- )
+          MyIndex[i] = 0;
+      }
+    }
+    upcast->template writeMultiDim<Scalar>(s, TotalDims, pWriteBuffer, TotalNumElements);
+  }

  template <typename T>
  void Writer<T>::scientificFormat(const bool set)
@ -215,7 +395,8 @@ namespace Grid {
  
  template <typename T>
  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value
+                       && !EigenIO::is_tensor<U>::value, void>::type
  Reader<T>::read(const std::string &s, U &output)
  {
    upcast->readDefault(s, output);
@ -251,6 +432,79 @@ namespace Grid {
    vecToTensor(output, v);
  }

+  // Reads an Eigen tensor of scalars or Grid tensors.
+  //
+  // The derived reader's readMultiDim supplies a flat row-major buffer plus
+  // its dimensions. The leading TensorRank dimensions give the tensor shape;
+  // any trailing ones must match the container's fixed dimensions. If the
+  // tensor's shape differs from the data it is reshaped (which asserts for
+  // fixed-size tensors), then the scalars are copied in element by element.
+  template <typename T>
+  template <typename ETensor>
+  typename std::enable_if<EigenIO::is_tensor<ETensor>::value, void>::type
+  Reader<T>::read(const std::string &s, ETensor &output)
+  {
+    using Index = typename ETensor::Index;
+    using Container = typename ETensor::Scalar; // NB: could be same as scalar
+    using Traits = EigenIO::Traits<ETensor>;
+    using Scalar = typename Traits::scalar_type; // type of the underlying scalar
+    constexpr unsigned int TensorRank{ETensor::NumIndices};
+    constexpr unsigned int ContainerRank{Traits::Rank}; // Only non-zero for containers
+    constexpr unsigned int TotalRank{TensorRank + ContainerRank};
+    using ETDims = std::array<Index, TensorRank>; // Dimensions of the tensor
+
+    // read the (flat) data and dimensionality
+    std::vector<std::size_t> dimData;
+    std::vector<Scalar> buf;
+    upcast->readMultiDim( s, buf, dimData );
+    assert(dimData.size() == TotalRank && "EigenIO: Tensor rank mismatch" );
+    // Make sure that the number of elements read matches dimensions read
+    std::size_t NumContainers = 1;
+    for( unsigned int i = 0 ; i < TensorRank ; i++ )
+      NumContainers *= dimData[i];
+    // If our scalar object is a Container, make sure its dimensions match what we read back
+    std::size_t ElementsPerContainer = 1;
+    for( unsigned int i = 0 ; i < ContainerRank ; i++ ) {
+      assert( dimData[TensorRank+i] == Traits::Dimension(i) && "Tensor Container dimensions don't match data" );
+      ElementsPerContainer *= dimData[TensorRank+i];
+    }
+    assert( NumContainers * ElementsPerContainer == buf.size() && "EigenIO: Number of elements != product of dimensions" );
+    // Now see whether the tensor is the right shape, or can be made to be
+    const auto & dims = output.dimensions();
+    bool bShapeOK = (output.data() != nullptr);
+    for( unsigned int i = 0; bShapeOK && i < TensorRank ; i++ )
+      if( dims[i] != dimData[i] )
+        bShapeOK = false;
+    // Make the tensor the same size as the data read
+    ETDims MyIndex;
+    if( !bShapeOK ) {
+      for( unsigned int i = 0 ; i < TensorRank ; i++ )
+        MyIndex[i] = dimData[i];
+      Reshape(output, MyIndex);
+    }
+    // Copy the data into the tensor
+    for( auto &d : MyIndex ) d = 0;
+    // data() rather than &buf[0]: well-defined even if the buffer is empty
+    const Scalar * pSource = buf.data();
+    for( std::size_t n = 0 ; n < NumContainers ; n++ ) {
+      Container & c = output( MyIndex );
+      copyScalars( c, pSource );
+      // Now increment the index (last index fastest, i.e. row-major order)
+      for( int i = TensorRank - 1; i != -1 && ++MyIndex[i] == dims[i]; i-- )
+        MyIndex[i] = 0;
+    }
+    // Pointer arithmetic on data() avoids indexing one past the end with operator[]
+    assert( pSource == buf.data() + NumContainers * ElementsPerContainer );
+  }
+
+  // Fixed-size tensors cannot change shape: reaching this overload is an
+  // error, reported through assert (so it is a no-op when asserts are disabled).
+  template <typename T>
+  template <typename ETensor>
+  typename std::enable_if<EigenIO::is_tensor_fixed<ETensor>::value, void>::type
+  Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
+  {
+    assert( 0 && "EigenIO: Fixed tensor dimensions can't be changed" );
+  }
+
+  // Variable-size tensors: give t the requested dimensions.
+  template <typename T>
+  template <typename ETensor>
+  typename std::enable_if<EigenIO::is_tensor_variable<ETensor>::value, void>::type
+  Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
+  {
+    // resize() is used here; an earlier reshape() call was dropped in its favour
+    t.resize( dims );
+  }
+
  template <typename T>
  template <typename U>
  void Reader<T>::fromString(U &output, const std::string &s)
@ -289,8 +543,70 @@ namespace Grid {
    {
      return os;
    }
+
+    // Default member comparison: selected when at least one operand is not an
+    // Eigen tensor, and simply defers to operator==.
+    template <typename T1, typename T2>
+    static inline typename std::enable_if<!EigenIO::is_tensor<T1>::value || !EigenIO::is_tensor<T2>::value, bool>::type
+    CompareMember(const T1 &lhs, const T2 &rhs) {
+      return lhs == rhs;
+    }
+
+    // Eigen tensor comparison: true only if ranks and every dimension match
+    // and all elements compare equal.
+    template <typename T1, typename T2>
+    static inline typename std::enable_if<EigenIO::is_tensor<T1>::value && EigenIO::is_tensor<T2>::value, bool>::type
+    CompareMember(const T1 &lhs, const T2 &rhs) {
+      // Check the shapes first (Eigen tensor library will assert if they don't match)
+      if( T1::NumIndices != T2::NumIndices )
+        return false;
+      for( auto i = 0 ; i < T1::NumIndices ; i++ ) {
+        if( !( lhs.dimension(i) == rhs.dimension(i) ) )
+          return false;
+      }
+      Eigen::Tensor<bool, 0, T1::Options> allEqual = (lhs == rhs).all();
+      return allEqual(0);
+    }
+
+    // Vectors of Eigen tensors: equal when the sizes match and every pair of
+    // elements compares equal. Stops at the first mismatch.
+    template <typename T>
+    static inline typename std::enable_if<EigenIO::is_tensor<T>::value, bool>::type
+    CompareMember(const std::vector<T> &lhs, const std::vector<T> &rhs) {
+      const auto NumElements = lhs.size();
+      bool bResult = ( NumElements == rhs.size() );
+      // Index with size_t (not int) so it has the same range as the vector size
+      for( std::size_t i = 0 ; i < NumElements && bResult ; i++ )
+        bResult = CompareMember(lhs[i], rhs[i]);
+      return bResult;
+    }
+
+    // Default member output: anything that is not an Eigen tensor is streamed
+    // with operator<<.
+    template <typename T>
+    static inline typename std::enable_if<!EigenIO::is_tensor<T>::value, void>::type
+    WriteMember(std::ostream &os, const T &object) {
+      os << object;
+    }
+    
+    // Eigen tensor output, formatted as T<d0,d1,...>{e0,e1,...} with the
+    // elements taken in storage order from data(). Asserts on empty tensors.
+    template <typename T>
+    static inline typename std::enable_if<EigenIO::is_tensor<T>::value, void>::type
+    WriteMember(std::ostream &os, const T &object) {
+      using Index = typename T::Index;
+      const Index NumElements{object.size()};
+      assert( NumElements > 0 );
+
+      // Dimension list, accumulating the product for the sanity check below
+      Index product = 1;
+      os << "T<";
+      for( int d = 0; d < T::NumIndices; d++ ) {
+        const Index extent = object.dimension(d);
+        product *= extent;
+        if( d != 0 )
+          os << ",";
+        os << extent;
+      }
+      assert( product == NumElements && "Number of elements doesn't match tensor dimensions" );
+
+      // Element list
+      os << ">{";
+      const typename T::Scalar * first = object.data();
+      for( Index k = 0; k < product; k++ ) {
+        if( k != 0 )
+          os << ",";
+        os << first[k];
+      }
+      os << "}";
+    }
  };
-  
+
  // Generic writer interface //////////////////////////////////////////////////
  template <typename T>
  inline void push(Writer<T> &w, const std::string &s) {
--- a/Grid/serialisation/BinaryIO.h
+++ b/Grid/serialisation/BinaryIO.h
@ -51,6 +51,8 @@ namespace Grid {
    template <typename U>
    void writeDefault(const std::string &s, const std::vector<U> &x);
    void writeDefault(const std::string &s, const char *x);
+    template <typename U>
+    void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
  private:
    std::ofstream file_;
  };
@ -66,6 +68,8 @@ namespace Grid {
    void readDefault(const std::string &s, U &output);
    template <typename U>
    void readDefault(const std::string &s, std::vector<U> &output);
+    template <typename U>
+    void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
  private:
    std::ifstream file_;
  };
@ -92,6 +96,27 @@ namespace Grid {
    }
  }
  
+  // Writes a multi-dimensional array as: total element count, rank, each
+  // dimension (all as uint64_t), followed by the elements in the order given.
+  // Asserts that the product of Dimensions equals NumElements.
+  template <typename U>
+  void BinaryWriter::writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements)
+  {
+    uint64_t rank = static_cast<uint64_t>( Dimensions.size() );
+    uint64_t tmp = 1;
+    // Loop counters use the same unsigned types as their bounds (not int)
+    for( uint64_t i = 0 ; i < rank ; i++ )
+      tmp *= Dimensions[i];
+    assert( tmp == NumElements && "Dimensions don't match size of data being written" );
+    // Total number of elements
+    write("", tmp);
+    // Number of dimensions
+    write("", rank);
+    // Followed by each dimension
+    for( uint64_t i = 0 ; i < rank ; i++ ) {
+      tmp = Dimensions[i];
+      write("", tmp);
+    }
+    for( size_t i = 0; i < NumElements; ++i)
+      write("", pDataRowMajor[i]);
+  }
+
  // Reader template implementation ////////////////////////////////////////////
  template <typename U>
  void BinaryReader::readDefault(const std::string &s, U &output)
@ -114,6 +139,30 @@ namespace Grid {
      read("", output[i]);
    }
  }
+
+  // Reads a multi-dimensional array laid out as: total element count, rank,
+  // each dimension (all as uint64_t), followed by the elements.
+  // On return dim holds the dimensions and buf the flat element data.
+  // Asserts that the product of the dimensions equals the stored count.
+  template <typename U>
+  void BinaryReader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
+  {
+    // Number of elements
+    uint64_t NumElements;
+    read("", NumElements);
+    // Number of dimensions
+    uint64_t rank;
+    read("", rank);
+    // Followed by each dimension
+    uint64_t count = 1;
+    dim.resize(rank);
+    uint64_t tmp;
+    // Loop counters use uint64_t to match their bounds (not int)
+    for( uint64_t i = 0 ; i < rank ; i++ ) {
+      read("", tmp);
+      dim[i] = tmp;
+      count *= tmp;
+    }
+    assert( count == NumElements && "Dimensions don't match size of data being read" );
+    buf.resize(count);
+    for( uint64_t i = 0; i < count; ++i)
+      read("", buf[i]);
+  }
 }

 #endif
--- a/Grid/serialisation/Hdf5IO.cc
+++ b/Grid/serialisation/Hdf5IO.cc
@ -61,9 +61,9 @@ Group & Hdf5Writer::getGroup(void)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-Hdf5Reader::Hdf5Reader(const std::string &fileName)
+Hdf5Reader::Hdf5Reader(const std::string &fileName, const bool readOnly)
 : fileName_(fileName)
-, file_(fileName.c_str(), H5F_ACC_RDWR)
+, file_(fileName.c_str(), readOnly ? H5F_ACC_RDONLY : H5F_ACC_RDWR)
 {
  group_ = file_.openGroup("/");
  readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
--- a/Grid/serialisation/Hdf5IO.h
+++ b/Grid/serialisation/Hdf5IO.h
@ -3,6 +3,7 @@

 #include <stack>
 #include <string>
+#include <list>
 #include <vector>
 #include <H5Cpp.h>
 #include <Grid/tensors/Tensors.h>
@ -38,6 +39,8 @@ namespace Grid
    template <typename U>
    typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
    writeDefault(const std::string &s, const std::vector<U> &x);
+    template <typename U>
+    void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
    H5NS::Group & getGroup(void);
  private:
    template <typename U>
@ -48,13 +51,13 @@ namespace Grid
    std::vector<std::string> path_;
    H5NS::H5File             file_;
    H5NS::Group              group_;
-    unsigned int             dataSetThres_{HDF5_DEF_DATASET_THRES};
+    const unsigned int       dataSetThres_{HDF5_DEF_DATASET_THRES};
  };
  
  class Hdf5Reader: public Reader<Hdf5Reader>
  {
  public:
-    Hdf5Reader(const std::string &fileName);
+    Hdf5Reader(const std::string &fileName, const bool readOnly = true);
    virtual ~Hdf5Reader(void) = default;
    bool push(const std::string &s);
    void pop(void);
@ -66,6 +69,8 @@ namespace Grid
    template <typename U>
    typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
    readDefault(const std::string &s, std::vector<U> &x);
+    template <typename U>
+    void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
    H5NS::Group & getGroup(void);
  private:
    template <typename U>
@ -101,6 +106,75 @@ namespace Grid
  template <>
  void Hdf5Writer::writeDefault(const std::string &s, const std::string &x);
  
+  // Write NumElements values of type U (row-major, shape given by Dimensions) under name s:
+  // a chunked dataset with Fletcher32 enabled above dataSetThres_ elements, otherwise an attribute.
+  template <typename U>
+  void Hdf5Writer::writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements)
+  {
+    // Hdf5 needs the dimensions as hsize_t
+    const int rank = static_cast<int>(Dimensions.size());
+    std::vector<hsize_t> dim(rank);
+    for(int i = 0; i < rank; i++)
+      dim[i] = Dimensions[i];
+    // the data space is created from the full shape; dim is then reused for the chunk shape
+    H5NS::DataSpace dataSpace(rank, dim.data());
+
+    if (NumElements > dataSetThres_)
+    {
+      // Make sure 1) each dimension; and 2) chunk size is < 4GB, i.e. the byte count fits in 32 bits
+      const hsize_t MaxElements = 0xffffffff / sizeof( U );
+      hsize_t ElementsPerChunk = 1;
+      bool bTooBig = false;
+      for( int i = rank - 1 ; i != -1 ; i-- ) {
+        auto &d = dim[i];
+        if( bTooBig )
+          d = 1; // Chunk size is already as big as can be - remaining dimensions = 1
+        else {
+          // If individual dimension too big, halve it for as long as it stays even
+          while( d > MaxElements && ( d & 1 ) == 0 )
+            d >>= 1;
+          const char ErrorMsg[] = " dimension > 4GB and not divisible by 2^n. "
+                                  "Hdf5IO chunk size will be inefficient. NB Serialisation is not intended for large datasets - please consider alternatives.";
+          if( d > MaxElements ) {
+            std::cout << GridLogWarning << "Individual" << ErrorMsg << std::endl;
+            hsize_t quotient = d / MaxElements;
+            if( d % MaxElements )
+              quotient++;
+            d /= quotient;
+          }
+          // Now make sure overall size is not too big
+          hsize_t OverflowCheck = ElementsPerChunk;
+          ElementsPerChunk *= d;
+          assert( OverflowCheck == ElementsPerChunk / d && "Product of dimensions overflowed hsize_t" );
+          // If product of dimensions too big, halve this dimension while the product stays even
+          while( ElementsPerChunk > MaxElements && ( ElementsPerChunk & 1 ) == 0 ) {
+            bTooBig = true;
+            d >>= 1;
+            ElementsPerChunk >>= 1;
+          }
+          if( ElementsPerChunk > MaxElements ) {
+            std::cout << GridLogWarning << "Product of" << ErrorMsg << std::endl;
+            hsize_t quotient = ElementsPerChunk / MaxElements;
+            if( ElementsPerChunk % MaxElements )
+              quotient++;
+            d /= quotient;
+            ElementsPerChunk /= quotient;
+          }
+        }
+      }
+      H5NS::DSetCreatPropList plist;
+      plist.setChunk(rank, dim.data());
+      plist.setFletcher32();
+      H5NS::DataSet dataSet = group_.createDataSet(s, Hdf5Type<U>::type(), dataSpace, plist);
+      dataSet.write(pDataRowMajor, Hdf5Type<U>::type());
+    }
+    else
+    {
+      H5NS::Attribute attribute = group_.createAttribute(s, Hdf5Type<U>::type(), dataSpace);
+      attribute.write(Hdf5Type<U>::type(), pDataRowMajor);
+    }
+  }
+
  template <typename U>
  typename std::enable_if<element<std::vector<U>>::is_number, void>::type
  Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
@ -110,31 +184,11 @@ namespace Grid
    
    // flatten the vector and getting dimensions
    Flatten<std::vector<U>> flat(x);
-    std::vector<hsize_t> dim;
+    std::vector<size_t> dim;
    const auto           &flatx = flat.getFlatVector();
-    
    for (auto &d: flat.getDim())
-    {
      dim.push_back(d);
-    }
-    
-    // write to file
-    H5NS::DataSpace dataSpace(dim.size(), dim.data());
-    
-    if (flatx.size() > dataSetThres_)
-    {
-      H5NS::DataSet dataSet;
-      
-      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace);
-      dataSet.write(flatx.data(), Hdf5Type<Element>::type());
-    }
-    else
-    {
-      H5NS::Attribute attribute;
-      
-      attribute = group_.createAttribute(s, Hdf5Type<Element>::type(), dataSpace);
-      attribute.write(Hdf5Type<Element>::type(), flatx.data());
-    }
+    writeMultiDim<Element>(s, dim, &flatx[0], flatx.size());
  }
  
  template <typename U>
@ -170,10 +224,9 @@ namespace Grid
  
  template <>
  void Hdf5Reader::readDefault(const std::string &s, std::string &x);
-  
+
  template <typename U>
-  typename std::enable_if<element<std::vector<U>>::is_number, void>::type
-  Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
+  void Hdf5Reader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
  {
    // alias to element type
    typedef typename element<std::vector<U>>::type Element;
@ -181,7 +234,6 @@ namespace Grid
    // read the dimensions
    H5NS::DataSpace       dataSpace;
    std::vector<hsize_t>  hdim;
-    std::vector<size_t>   dim;
    hsize_t               size = 1;
    
    if (group_.attrExists(s))
@ -201,8 +253,8 @@ namespace Grid
    }
    
    // read the flat vector
-    std::vector<Element> buf(size);
-
+    buf.resize(size);
+    
    if (size > dataSetThres_)
    {
      H5NS::DataSet dataSet;
@ -217,7 +269,19 @@ namespace Grid
      attribute = group_.openAttribute(s);
      attribute.read(Hdf5Type<Element>::type(), buf.data());
    }
-    
+  }
+
+  template <typename U>
+  typename std::enable_if<element<std::vector<U>>::is_number, void>::type
+  Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
+  {
+    // alias to element type
+    typedef typename element<std::vector<U>>::type Element;
+
+    std::vector<size_t>   dim;
+    std::vector<Element>  buf;
+    readMultiDim( s, buf, dim );
+
    // reconstruct the multidimensional vector
    Reconstruct<std::vector<U>> r(buf, dim);
    
--- a/Grid/serialisation/MacroMagic.h
+++ b/Grid/serialisation/MacroMagic.h
@ -109,8 +109,8 @@ THE SOFTWARE.
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 #define GRID_MACRO_MEMBER(A,B)        A B;
-#define GRID_MACRO_COMP_MEMBER(A,B) result = (result and (lhs. B == rhs. B));
-#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" " #B << " = " << obj. B << " ; " <<std::endl;
+#define GRID_MACRO_COMP_MEMBER(A,B) result = (result and CompareMember(lhs. B, rhs. B));
+#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" " #B << " = "; WriteMember( os, obj. B ); os << " ; " <<std::endl;
 #define GRID_MACRO_READ_MEMBER(A,B) Grid::read(RD,#B,obj. B);
 #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);

--- a/Grid/serialisation/TextIO.h
+++ b/Grid/serialisation/TextIO.h
@ -51,6 +51,8 @@ namespace Grid
    void writeDefault(const std::string &s, const U &x);
    template <typename U>
    void writeDefault(const std::string &s, const std::vector<U> &x);
+    template <typename U>
+    void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
  private:
    void indent(void);
  private:
@ -69,6 +71,8 @@ namespace Grid
    void readDefault(const std::string &s, U &output);
    template <typename U>
    void readDefault(const std::string &s, std::vector<U> &output);
+    template <typename U>
+    void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
  private:
    void checkIndent(void);
  private:
@ -95,7 +99,18 @@ namespace Grid
      write(s, x[i]);
    }
  }
-  
+
+  template <typename U>
+  void TextWriter::writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements)
+  {
+    uint64_t rank = Dimensions.size(); // output order: rank, each dimension, then the elements
+    write(s, rank);
+    for( uint64_t extent : Dimensions )
+      write(s, extent);
+    for( size_t n = 0 ; n < NumElements ; n++ )
+      write(s, pDataRowMajor[n]);
+  }
+
  // Reader template implementation ////////////////////////////////////////////
  template <typename U>
  void TextReader::readDefault(const std::string &s, U &output)
@ -121,6 +136,23 @@ namespace Grid
      read("", output[i]);
    }
  }
+
+  template <typename U>
+  void TextReader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
+  {
+    // The rank and the dimensions are read with an empty name; the elements are read with s
+    const char noName[] = "";
+    uint64_t rank;
+    read(noName, rank);
+    dim.resize( rank );
+    size_t total = 1;
+    for( size_t i = 0 ; i < dim.size() ; i++ ) {
+      read(noName, dim[i]);
+      total *= dim[i];
+    }
+    buf.resize( total );
+    for( size_t i = 0 ; i < buf.size() ; i++ ) read(s, buf[i]);
+  }
 }

 #endif
--- a/Grid/serialisation/VectorUtils.h
+++ b/Grid/serialisation/VectorUtils.h
@ -1,3 +1,32 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./Grid/serialisation/VectorUtils.h
+ 
+ Copyright (C) 2015
+ 
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ Author: paboyle <paboyle@ph.ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_SERIALISATION_VECTORUTILS_H
 #define GRID_SERIALISATION_VECTORUTILS_H

@ -53,6 +82,17 @@ namespace Grid {
    return os;
  }
  
+  // std::vector<std::vector<...>> nested to specified Rank //////////////////////////////////
+  template<typename ElementT, unsigned int Depth>
+  struct NestedStdVector {  // one std::vector layer around the type for Depth - 1
+    using type = std::vector<typename NestedStdVector<ElementT, Depth - 1>::type>;
+  };
+  
+  template<typename ElementT>
+  struct NestedStdVector<ElementT,0> {  // recursion ends here: no vector layer at all
+    using type = ElementT;
+  };
+  
  // Grid scalar tensors to nested std::vectors //////////////////////////////////
  template <typename T>
  struct TensorToVec
@ -436,4 +476,4 @@ std::string vecToStr(const std::vector<T> &v)
  return sstr.str();
 }

-#endif
+#endif
--- a/Grid/serialisation/XmlIO.h
+++ b/Grid/serialisation/XmlIO.h
@ -57,6 +57,8 @@ namespace Grid
    void writeDefault(const std::string &s, const U &x);
    template <typename U>
    void writeDefault(const std::string &s, const std::vector<U> &x);
+    template <typename U>
+    void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
    std::string docString(void);
    std::string string(void);
  private:
@ -79,6 +81,8 @@ namespace Grid
    void readDefault(const std::string &s, U &output);
    template <typename U>
    void readDefault(const std::string &s, std::vector<U> &output);
+    template <typename U>
+    void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
    void readCurrentSubtree(std::string &s);
  private:
    void checkParse(const pugi::xml_parse_result &result, const std::string name);
@ -122,13 +126,45 @@ namespace Grid
  void XmlWriter::writeDefault(const std::string &s, const std::vector<U> &x)
  {
    push(s);
-    for (auto &x_i: x)
+    for( auto &u : x )
    {
-      write("elem", x_i);
+      write("elem", u);
    }
    pop();
  }
-  
+
+  // Write a row-major tensor under node s: "rank", a "dim" per dimension, then "elem" leaves inside one nested "tensor" node per dimension.
+  template <typename U>
+  void XmlWriter::writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements)
+  {
+    push(s);
+    size_t count = 1;
+    const int Rank = static_cast<int>( Dimensions.size() );
+    write("rank", Rank );
+    std::vector<size_t> MyIndex( Rank ); // current multi-index, value-initialised to zero
+    for( auto d : Dimensions ) {
+      write("dim", d);
+      count *= d;
+    }
+    assert( count == NumElements && "XmlIO : element count doesn't match dimensions" );
+    static const char sName[] = "tensor";
+    // An empty tensor opens no "tensor" nodes: only the element loop below closes them
+    for( int i = 0 ; NumElements && i < Rank ; i++ )
+      push(sName);
+    while (NumElements--) {
+      write("elem", *pDataRowMajor++);
+      int i; // after the carry loop: innermost index that did not wrap (-1 if all wrapped)
+      for( i = Rank - 1 ; i != -1 && ++MyIndex[i] == Dimensions[i] ; i-- )
+        MyIndex[i] = 0;
+      int Rollover = Rank - 1 - i; // number of nesting levels just completed
+      for( i = 0 ; i < Rollover ; i++ )
+        pop();
+      for( i = 0 ; NumElements && i < Rollover ; i++ ) // reopen unless that was the last element
+        push(sName);
+    }
+    pop();
+  }
+
  // Reader template implementation ////////////////////////////////////////////
  template <typename U>
  void XmlReader::readDefault(const std::string &s, U &output)
@ -145,25 +181,66 @@ namespace Grid
  template <typename U>
  void XmlReader::readDefault(const std::string &s, std::vector<U> &output)
  {
-    std::string    buf;
-    unsigned int   i = 0;
-    
    if (!push(s))
    {
      std::cout << GridLogWarning << "XML: cannot open node '" << s << "'";
      std::cout << std::endl;
-
-      return; 
+    } else {
+      for(unsigned int i = 0; node_.child("elem"); )
+      {
+        output.resize(i + 1);
+        read("elem", output[i++]);
+        node_.child("elem").set_name("elem-done");
+      }
+      pop();
+    }
+  }
+
+  // Read a tensor laid out as by XmlWriter::writeMultiDim: fills dim with the dimensions and buf with the row-major elements.
+  template <typename U>
+  void XmlReader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
+  {
+    if (!push(s))
+    {
+      std::cout << GridLogWarning << "XML: cannot open node '" << s << "'";
+      std::cout << std::endl;
+    } else {
+      static const char sName[] = "tensor";
+      static const char sNameDone[] = "tensor-done";
+      int Rank;
+      read("rank", Rank);
+      dim.resize( Rank );
+      size_t NumElements = 1;
+      for( auto &d : dim )
+      {
+        read("dim", d);
+        node_.child("dim").set_name("dim-done"); // renamed so this node is not matched as "dim" again
+        NumElements *= d;
+      }
+      buf.resize( NumElements );
+      std::vector<size_t> MyIndex( Rank ); // current multi-index, value-initialised to zero
+      // Descend only when there is data: with no elements the loop below never pops back out
+      for( int i = 0 ; NumElements && i < Rank ; i++ )
+        push(sName);
+
+      for( auto &x : buf )
+      {
+        NumElements--;
+        read("elem", x);
+        node_.child("elem").set_name("elem-done");
+        int i;
+        for( i = Rank - 1 ; i != -1 && ++MyIndex[i] == dim[i] ; i-- )
+          MyIndex[i] = 0;
+        int Rollover = Rank - 1 - i; // number of nesting levels just completed
+        for( i = 0 ; i < Rollover ; i++ ) {
+          node_.set_name(sNameDone);
+          pop();
+        }
+        for( i = 0 ; NumElements && i < Rollover ; i++ )
+          push(sName);
+      }
+      pop();
     }
-    while (node_.child("elem"))
-    {
-      output.resize(i + 1);
-      read("elem", output[i]);
-      node_.child("elem").set_name("elem-done");
-      i++;
-    }
-    pop();
   }
-  
 }
 #endif
--- a/Grid/simd/Grid_avx512.h
+++ b/Grid/simd/Grid_avx512.h
@ -485,83 +485,6 @@ namespace Optimization {
  // Some Template specialization

  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-#ifndef __INTEL_COMPILER
-#warning "Slow reduction due to incomplete reduce intrinsics"
-  //Complex float Reduce
-  template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1=Optimization::Permute::Permute0(in); // avx 512; quad complex single
-    v1= _mm512_add_ps(v1,in);
-    v2=Optimization::Permute::Permute1(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2=Optimization::Permute::Permute2(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v = v1;
-    return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
-  
-  //Real float Reduce
-  template<>
-    inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; octo-double
-    v1 = _mm512_add_ps(v1,in);
-    v2 = Optimization::Permute::Permute1(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute2(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute3(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v=v1;
-    return conv.f[0];
-  }
-  
-  
-  //Complex double Reduce
-  template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
-    __m512d v1;
-    v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    v1 = Optimization::Permute::Permute1(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    u512d conv; conv.v = v1;
-    return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
-  
-  //Real double Reduce
-  template<>
-    inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
-    __m512d v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; quad double
-    v1 = _mm512_add_pd(v1,in);
-      v2 = Optimization::Permute::Permute1(v1); 
-      v1 = _mm512_add_pd(v1,v2);
-      v2 = Optimization::Permute::Permute2(v1); 
-      v1 = _mm512_add_pd(v1,v2);
-     u512d conv; conv.v = v1;
-     return conv.f[0];
-  }
-  
-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // No full vector reduce, use AVX to add upper and lower halves of register
-    // and perform AVX reduction.
-    __m256i v1, v2, v3;
-    __m128i u1, u2, ret;
-    v1  = _mm512_castsi512_si256(in);       // upper half
-    v2  = _mm512_extracti32x8_epi32(in, 1); // lower half
-    v3  = _mm256_add_epi32(v1, v2);
-    v1  = _mm256_hadd_epi32(v3, v3);
-    v2  = _mm256_hadd_epi32(v1, v1);
-    u1  = _mm256_castsi256_si128(v2);        // upper half
-    u2  = _mm256_extracti128_si256(v2, 1);  // lower half
-    ret = _mm_add_epi32(u1, u2);
-    return _mm_cvtsi128_si32(ret);
-  }
-#else
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
@ -590,8 +513,6 @@ namespace Optimization {
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
    return _mm512_reduce_add_epi32(in);
  }
-#endif
-  
  
 }

--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@ -10,6 +10,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
+Author: Michael Marshall <michael.marshall@ed.ac.au>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -89,17 +90,25 @@ template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<st
 ////////////////////////////////////////////////////////
 // Check for complexity with type traits
 template <typename T> struct is_complex : public std::false_type {};
-template <> struct is_complex<std::complex<double> > : public std::true_type {};
-template <> struct is_complex<std::complex<float> > : public std::true_type {};
+template <> struct is_complex<ComplexD> : public std::true_type {};
+template <> struct is_complex<ComplexF> : public std::true_type {};

-template <typename T>              using IfReal    = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
+template<typename T, typename V=void> struct is_real : public std::false_type {};
+template<typename T> struct is_real<T, typename std::enable_if<std::is_floating_point<T>::value,
+  void>::type> : public std::true_type {};
+
+template<typename T, typename V=void> struct is_integer : public std::false_type {};
+template<typename T> struct is_integer<T, typename std::enable_if<std::is_integral<T>::value,
+  void>::type> : public std::true_type {};
+  
+template <typename T>              using IfReal    = Invoke<std::enable_if<is_real<T>::value, int> >;
 template <typename T>              using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
-template <typename T>              using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
+template <typename T>              using IfInteger = Invoke<std::enable_if<is_integer<T>::value, int> >;
 template <typename T1,typename T2> using IfSame    = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;

-template <typename T>              using IfNotReal    = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
+template <typename T>              using IfNotReal    = Invoke<std::enable_if<!is_real<T>::value, int> >;
 template <typename T>              using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
-template <typename T>              using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
+template <typename T>              using IfNotInteger = Invoke<std::enable_if<!is_integer<T>::value, int> >;
 template <typename T1,typename T2> using IfNotSame    = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;

 ////////////////////////////////////////////////////////
@ -857,8 +866,10 @@ template <typename T>
 struct is_simd : public std::false_type {};
 template <> struct is_simd<vRealF>     : public std::true_type {};
 template <> struct is_simd<vRealD>     : public std::true_type {};
+template <> struct is_simd<vRealH>     : public std::true_type {};
 template <> struct is_simd<vComplexF>  : public std::true_type {};
 template <> struct is_simd<vComplexD>  : public std::true_type {};
+template <> struct is_simd<vComplexH>  : public std::true_type {};
 template <> struct is_simd<vInteger>   : public std::true_type {};

 template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
--- a/Grid/tensors/Tensor_class.h
+++ b/Grid/tensors/Tensor_class.h
@ -5,6 +5,7 @@ Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Michael Marshall <michael.marshall@ed.ac.au>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -42,27 +43,26 @@ namespace Grid {
 //
 class GridTensorBase {};

+// Too late to remove these traits from Grid Tensors, so inherit from GridTypeMapper
+#define GridVector_CopyTraits \
+  using element = vtype; \
+  using scalar_type     = typename Traits::scalar_type; \
+  using vector_type     = typename Traits::vector_type; \
+  using vector_typeD    = typename Traits::vector_typeD; \
+  using tensor_reduced  = typename Traits::tensor_reduced; \
+  using scalar_object   = typename Traits::scalar_object; \
+  using Complexified    = typename Traits::Complexified; \
+  using Realified       = typename Traits::Realified; \
+  using DoublePrecision = typename Traits::DoublePrecision; \
+  static constexpr int TensorLevel = Traits::TensorLevel
+
 template <class vtype>
 class iScalar {
 public:
  vtype _internal;

-  typedef vtype element;
-  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
-  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
-  typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
-  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
-  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
-  typedef iScalar<tensor_reduced_v> tensor_reduced;
-  typedef iScalar<recurse_scalar_object> scalar_object;
-  // substitutes a real or complex version with same tensor structure
-  typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified;
-  typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified;
-
-  // get double precision version
-  typedef iScalar<typename GridTypeMapper<vtype>::DoublePrecision> DoublePrecision;
-  
-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
+  using Traits = GridTypeMapper<iScalar<vtype> >;
+  GridVector_CopyTraits;

  // Scalar no action
  //  template<int Level> using tensor_reduce_level = typename
@ -173,7 +173,10 @@ class iScalar {
    return stream;
  };

-
+  strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(&_internal); }
+  strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(&_internal); }
+  strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
+  strong_inline       scalar_type * end()         { return begin() + Traits::count; }
 };
 ///////////////////////////////////////////////////////////
 // Allows to turn scalar<scalar<scalar<double>>>> back to double.
@ -194,22 +197,9 @@ class iVector {
 public:
  vtype _internal[N];

-  typedef vtype element;
-  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
-  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
-  typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
-  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
-  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
-  typedef iScalar<tensor_reduced_v> tensor_reduced;
-  typedef iVector<recurse_scalar_object, N> scalar_object;
+  using Traits = GridTypeMapper<iVector<vtype, N> >;
+  GridVector_CopyTraits;

-  // substitutes a real or complex version with same tensor structure
-  typedef iVector<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
-  typedef iVector<typename GridTypeMapper<vtype>::Realified, N> Realified;
-
-  // get double precision version
-  typedef iVector<typename GridTypeMapper<vtype>::DoublePrecision, N> DoublePrecision;
-  
  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
                         * = nullptr>
  strong_inline auto operator=(T arg) -> iVector<vtype, N> {
@ -218,7 +208,6 @@ class iVector {
    return *this;
  }

-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
  iVector(const Zero &z) { *this = zero; };
  iVector() = default;
  /*
@ -303,6 +292,11 @@ class iVector {
  //    strong_inline vtype && operator ()(int i) {
  //      return _internal[i];
  //    }
+
+  strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal); }
+  strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(_internal); }
+  strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
+  strong_inline       scalar_type * end()         { return begin() + Traits::count; }
 };

 template <class vtype, int N>
@ -310,25 +304,8 @@ class iMatrix {
 public:
  vtype _internal[N][N];

-  typedef vtype element;
-  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
-  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
-  typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
-  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
-  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
-
-  // substitutes a real or complex version with same tensor structure
-  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
-  typedef iMatrix<typename GridTypeMapper<vtype>::Realified, N> Realified;
-
-  // get double precision version
-  typedef iMatrix<typename GridTypeMapper<vtype>::DoublePrecision, N> DoublePrecision;
-  
-  // Tensor removal
-  typedef iScalar<tensor_reduced_v> tensor_reduced;
-  typedef iMatrix<recurse_scalar_object, N> scalar_object;
-
-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
+  using Traits = GridTypeMapper<iMatrix<vtype, N> >;
+  GridVector_CopyTraits;

  iMatrix(const Zero &z) { *this = zero; };
  iMatrix() = default;
@ -458,6 +435,11 @@ class iMatrix {
  //  strong_inline vtype && operator ()(int i,int j) {
  //    return _internal[i][j];
  //  }
+
+  strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal[0]); }
+  strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(_internal[0]); }
+  strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
+  strong_inline       scalar_type * end()         { return begin() + Traits::count; }
 };

 template <class v>
@ -480,6 +462,3 @@ void vprefetch(const iMatrix<v, N> &vv) {
 }
 }
 #endif
-
-
-
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@ -5,6 +5,7 @@
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
+Author: Michael Marshall <michael.marshall@ed.ac.au>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@ -26,6 +27,17 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>

 namespace Grid {

+  // Forward declarations
+  template<class T>        class iScalar;
+  template<class T, int N> class iVector;
+  template<class T, int N> class iMatrix;
+
+  // These are the Grid tensors
+  template<typename T>     struct isGridTensor                : public std::false_type { static constexpr bool notvalue = true; };
+  template<class T>        struct isGridTensor<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
+
 //////////////////////////////////////////////////////////////////////////////////
 // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
 // Use of a helper class like this allows us to template specialise and "dress"
@ -40,25 +52,26 @@ namespace Grid {
 // to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
 //
 //////////////////////////////////////////////////////////////////////////////////
-  
-  template <class T> class GridTypeMapper {
-  public:
-    typedef typename T::scalar_type scalar_type;
-    typedef typename T::vector_type vector_type;
-    typedef typename T::vector_typeD vector_typeD;
-    typedef typename T::tensor_reduced tensor_reduced;
-    typedef typename T::scalar_object scalar_object;
-    typedef typename T::Complexified Complexified;
-    typedef typename T::Realified Realified;
-    typedef typename T::DoublePrecision DoublePrecision;
-    enum { TensorLevel = T::TensorLevel };
+
+  // This saves repeating common properties for supported Grid Scalar types
+  // TensorLevel    How many nested grid tensors
+  // Rank           Rank of the grid tensor
+  // count          Total number of elements, i.e. product of dimensions
+  // Dimension(dim) Size of dimension dim
+  struct GridTypeMapper_Base {
+    static constexpr int TensorLevel = 0;
+    static constexpr int Rank = 0;
+    static constexpr std::size_t count = 1;
+    static constexpr int Dimension(int dim) { return 0; }
  };

 //////////////////////////////////////////////////////////////////////////////////
 // Recursion stops with these template specialisations
 //////////////////////////////////////////////////////////////////////////////////
-  template<> class GridTypeMapper<RealF> {
-  public:
+
+  template<typename T> struct GridTypeMapper {};
+
+  template<> struct GridTypeMapper<RealF> : public GridTypeMapper_Base {
    typedef RealF scalar_type;
    typedef RealF vector_type;
    typedef RealD vector_typeD;
@ -67,10 +80,8 @@ namespace Grid {
    typedef ComplexF Complexified;
    typedef RealF Realified;
    typedef RealD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<RealD> {
-  public:
+  template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base {
    typedef RealD scalar_type;
    typedef RealD vector_type;
    typedef RealD vector_typeD;
@ -79,10 +90,8 @@ namespace Grid {
    typedef ComplexD Complexified;
    typedef RealD Realified;
    typedef RealD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<ComplexF> {
-  public:
+  template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base {
    typedef ComplexF scalar_type;
    typedef ComplexF vector_type;
    typedef ComplexD vector_typeD;
@ -91,10 +100,8 @@ namespace Grid {
    typedef ComplexF Complexified;
    typedef RealF Realified;
    typedef ComplexD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<ComplexD> {
-  public:
+  template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base {
    typedef ComplexD scalar_type;
    typedef ComplexD vector_type;
    typedef ComplexD vector_typeD;
@ -103,10 +110,8 @@ namespace Grid {
    typedef ComplexD Complexified;
    typedef RealD Realified;
    typedef ComplexD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<Integer> {
-  public:
+  template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base {
    typedef Integer scalar_type;
    typedef Integer vector_type;
    typedef Integer vector_typeD;
@ -115,11 +120,9 @@ namespace Grid {
    typedef void Complexified;
    typedef void Realified;
    typedef void DoublePrecision;
-    enum { TensorLevel = 0 };
  };

-  template<> class GridTypeMapper<vRealF> {
-  public:
+  template<> struct GridTypeMapper<vRealF> : public GridTypeMapper_Base {
    typedef RealF  scalar_type;
    typedef vRealF vector_type;
    typedef vRealD vector_typeD;
@ -128,10 +131,8 @@ namespace Grid {
    typedef vComplexF Complexified;
    typedef vRealF Realified;
    typedef vRealD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<vRealD> {
-  public:
+  template<> struct GridTypeMapper<vRealD> : public GridTypeMapper_Base {
    typedef RealD  scalar_type;
    typedef vRealD vector_type;
    typedef vRealD vector_typeD;
@ -140,10 +141,20 @@ namespace Grid {
    typedef vComplexD Complexified;
    typedef vRealD Realified;
    typedef vRealD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<vComplexH> {
-  public:
+  template<> struct GridTypeMapper<vRealH> : public GridTypeMapper_Base {
+    // FIXME: incomplete until Grid supports fp16 or bfp16 arithmetic types; meanwhile the scalar types below are RealF
+    typedef RealF  scalar_type;
+    typedef vRealH vector_type;
+    typedef vRealD vector_typeD;
+    typedef vRealH tensor_reduced;
+    typedef RealF  scalar_object;
+    typedef vComplexH Complexified;
+    typedef vRealH Realified;
+    typedef vRealD DoublePrecision;
+  };
+  template<> struct GridTypeMapper<vComplexH> : public GridTypeMapper_Base {
+    // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
    typedef ComplexF  scalar_type;
    typedef vComplexH vector_type;
    typedef vComplexD vector_typeD;
@ -152,10 +163,8 @@ namespace Grid {
    typedef vComplexH Complexified;
    typedef vRealH Realified;
    typedef vComplexD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<vComplexF> {
-  public:
+  template<> struct GridTypeMapper<vComplexF> : public GridTypeMapper_Base {
    typedef ComplexF  scalar_type;
    typedef vComplexF vector_type;
    typedef vComplexD vector_typeD;
@ -164,10 +173,8 @@ namespace Grid {
    typedef vComplexF Complexified;
    typedef vRealF Realified;
    typedef vComplexD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<vComplexD> {
-  public:
+  template<> struct GridTypeMapper<vComplexD> : public GridTypeMapper_Base {
    typedef ComplexD  scalar_type;
    typedef vComplexD vector_type;
    typedef vComplexD vector_typeD;
@ -176,10 +183,8 @@ namespace Grid {
    typedef vComplexD Complexified;
    typedef vRealD Realified;
    typedef vComplexD DoublePrecision;
-    enum { TensorLevel = 0 };
  };
-  template<> class GridTypeMapper<vInteger> {
-  public:
+  template<> struct GridTypeMapper<vInteger> : public GridTypeMapper_Base {
    typedef  Integer scalar_type;
    typedef vInteger vector_type;
    typedef vInteger vector_typeD;
@ -188,57 +193,52 @@ namespace Grid {
    typedef void Complexified;
    typedef void Realified;
    typedef void DoublePrecision;
-    enum { TensorLevel = 0 };
  };

-  // First some of my own traits
-  template<typename T> struct isGridTensor {
-    static const bool value = true;
-    static const bool notvalue = false;
+#define GridTypeMapper_RepeatedTypes \
+  using BaseTraits   = GridTypeMapper<T>; \
+  using scalar_type  = typename BaseTraits::scalar_type; \
+  using vector_type  = typename BaseTraits::vector_type; \
+  using vector_typeD = typename BaseTraits::vector_typeD; \
+  static constexpr int TensorLevel = BaseTraits::TensorLevel + 1
+
+  template<typename T> struct GridTypeMapper<iScalar<T>> { // traits of iScalar<T>, built from the traits of T
+    GridTypeMapper_RepeatedTypes;
+    using tensor_reduced  = iScalar<typename BaseTraits::tensor_reduced>;
+    using scalar_object   = iScalar<typename BaseTraits::scalar_object>;
+    using Complexified    = iScalar<typename BaseTraits::Complexified>;
+    using Realified       = iScalar<typename BaseTraits::Realified>;
+    using DoublePrecision = iScalar<typename BaseTraits::DoublePrecision>;
+    static constexpr int Rank = BaseTraits::Rank + 1;        // an iScalar level still counts as one index ...
+    static constexpr std::size_t count = BaseTraits::count;  // ... of size 1, so the element count is unchanged
+    static constexpr int Dimension(int dim) {                // dim 0 is this level; higher dims are delegated to T
+      return ( dim == 0 ) ? 1 : BaseTraits::Dimension(dim - 1); }
   };
-  template<> struct isGridTensor<int > {
-    static const bool value = false;
-    static const bool notvalue = true;
+
+  template<typename T, int N> struct GridTypeMapper<iVector<T, N>> { // traits of iVector<T,N>, built from the traits of T
+    GridTypeMapper_RepeatedTypes;
+    using tensor_reduced  = iScalar<typename BaseTraits::tensor_reduced>; // the reduced type replaces this level by iScalar
+    using scalar_object   = iVector<typename BaseTraits::scalar_object,   N>;
+    using Complexified    = iVector<typename BaseTraits::Complexified,    N>;
+    using Realified       = iVector<typename BaseTraits::Realified,       N>;
+    using DoublePrecision = iVector<typename BaseTraits::DoublePrecision, N>;
+    static constexpr int Rank = BaseTraits::Rank + 1;            // a vector level adds one index ...
+    static constexpr std::size_t count = BaseTraits::count * N;  // ... of size N
+    static constexpr int Dimension(int dim) {                    // dim 0 is this level; higher dims are delegated to T
+      return ( dim == 0 ) ? N : BaseTraits::Dimension(dim - 1); }
   };
-  template<> struct isGridTensor<RealD > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<RealF > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<ComplexD > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<ComplexF > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<Integer > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<vRealD > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<vRealF > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<vComplexD > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<vComplexF > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<vInteger > {
-    static const bool value = false;
-    static const bool notvalue = true;
+
+  template<typename T, int N> struct GridTypeMapper<iMatrix<T, N>> { // traits of iMatrix<T,N>, built from the traits of T
+    GridTypeMapper_RepeatedTypes;
+    using tensor_reduced  = iScalar<typename BaseTraits::tensor_reduced>; // the reduced type replaces this level by iScalar
+    using scalar_object   = iMatrix<typename BaseTraits::scalar_object,   N>;
+    using Complexified    = iMatrix<typename BaseTraits::Complexified,    N>;
+    using Realified       = iMatrix<typename BaseTraits::Realified,       N>;
+    using DoublePrecision = iMatrix<typename BaseTraits::DoublePrecision, N>;
+    static constexpr int Rank = BaseTraits::Rank + 2;                // a matrix level adds two indices ...
+    static constexpr std::size_t count = BaseTraits::count * N * N;  // ... each of size N
+    static constexpr int Dimension(int dim) {                        // dims 0 and 1 are this level; higher dims are delegated to T
+      return ( dim == 0 || dim == 1 ) ? N : BaseTraits::Dimension(dim - 2); }
   };

  // Match the index
@ -263,20 +263,13 @@ namespace Grid {
    typedef T type;
  };
  
-  //Query if a tensor or Lattice<Tensor> is SIMD vector or scalar
-  template<typename T>
-  class isSIMDvectorized{
-    template<typename U>
-    static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   
-      typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
+  //Query whether a tensor or Lattice<Tensor> is SIMD vector or scalar (value is true for SIMD vector types)
+  template<typename T, typename V=void> struct isSIMDvectorized : public std::false_type {}; // primary template: not vectorised
+  template<typename U> struct isSIMDvectorized<U, typename std::enable_if< !std::is_same<     // chosen when, for getVectorType<U>::type,
+    typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,                     // scalar_type and vector_type differ
+    typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, void>::type>
+  : public std::true_type {};

-    template<typename U>
-    static double test(...);
-  
-  public:
-    enum {value = sizeof(test<T>(0)) == sizeof(char) };
-  };
-  
  //Get the precision of a Lattice, tensor or scalar type in units of sizeof(float)
  template<typename T>
  class getPrecision{
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@ -47,6 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
+#define PARALLEL_FOR_LOOP_REDUCE(op, var)
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_NESTED_LOOP5
 #define PARALLEL_REGION
@ -58,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
+#define parallel_critical PARALLEL_CRITICAL

 namespace Grid {

--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -289,6 +289,11 @@ void Grid_init(int *argc,char ***argv)
    std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
    std::cout << "GNU General Public License for more details."<<std::endl;
    printHash();
+  #ifdef GRID_BUILD_REF  // print the build reference when the build system defines one
+  #define GRID_BUILD_STR_IMPL(x) #x  // stringify; no leading underscore + capital, those names are reserved
+  #define GRID_BUILD_STR(x) GRID_BUILD_STR_IMPL(x)  // extra level so x is macro-expanded before stringification
+    std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
+  #endif
    std::cout << std::endl;
  }

--- a/Grid/util/Sha.h
+++ b/Grid/util/Sha.h
@ -28,17 +28,46 @@
 extern "C" {
 #include <openssl/sha.h>
 }
+#ifdef USE_IPP
+#include "ipp.h"
+#endif

 #pragma once

 class GridChecksum
 {
 public:
-  static inline uint32_t crc32(void *data,size_t bytes)
+  static inline uint32_t crc32(const void *data, size_t bytes)
  {
    return ::crc32(0L,(unsigned char *)data,bytes);
  }
-  static inline std::vector<unsigned char> sha256(void *data,size_t bytes)
+
+#ifdef USE_IPP
+  static inline uint32_t crc32c(const void* data, size_t bytes)
+  {
+      const unsigned char *buf = reinterpret_cast<const unsigned char *>(data);
+      uint32_t             crc = ~(uint32_t)0;  // start from an all-ones seed
+      ippsCRC32C_8u(buf, bytes, &crc);          // IPP updates crc in place
+      ippsSwapBytes_32u_I(&crc, 1);             // byte-swap the single 32-bit word in place
+      return ~crc;                              // the returned checksum is the bitwise complement
+  }
+#endif
+
+  template <typename T>
+  static inline std::string sha256_string(const std::vector<T> &hash)
+  {
+    // Render each element of hash as lower-case hex, zero-padded to two digits.
+    // Without the padding a byte below 0x10 prints as one digit, so the string
+    // is shorter than 2*hash.size() and different digests can print identically.
+    std::stringstream sha;
+    sha.fill('0');
+    for(unsigned int i = 0; i < hash.size(); i++) 
+    { 
+        sha.width(2); // the field width resets after every formatted insertion
+        sha << std::hex << static_cast<unsigned int>(hash[i]);
+    }
+    return sha.str();
+  }
+  static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
  {
    std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
    SHA256_CTX sha256;
--- a/HMC/Makefile.am
+++ b/HMC/Makefile.am
@ -0,0 +1,6 @@
+SUBDIRS = . 
+
+include Make.inc
+
+
+
--- a/HMC/Mobius2p1f.cc
+++ b/HMC/Mobius2p1f.cc
@ -0,0 +1,198 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1f.cc
+
+Copyright (C) 2015-2016
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+int main(int argc, char **argv) { // HMC driver: Mobius fermions, Iwasaki gauge action, two-flavour quotient pseudofermions
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef MobiusFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef Grid::XmlReader       Serialiser;
+  
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  MD.name    = std::string("Leap Frog");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 20;
+  MD.trajL   = 1.0;
+  
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 0;
+  HMCparams.Trajectories     = 200;
+  HMCparams.NoMetropolisUntil=  20;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  HMCparams.StartingType     =std::string("ColdStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+  
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 10;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection 
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.04;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD b   = 1.0; // Scale factor two
+  RealD c   = 0.0;
+
+  OneFlavourRationalParams OFRp;
+  OFRp.lo       = 1.0e-2;
+  OFRp.hi       = 64;
+  OFRp.MaxIter  = 10000;
+  OFRp.tolerance= 1.0e-10;
+  OFRp.degree   = 14;
+  OFRp.precision= 40;
+
+  std::vector<Real> hasenbusch({ 0.1 });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+  
+  double StoppingCondition = 1e-10;
+  double MaxCGIterations = 30000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(4);
+
+  ////////////////////////////////////
+  // Strange action (constructed here, but every push_back onto Level1 below is commented out)
+  ////////////////////////////////////
+
+  //  FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params);
+  //  DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
+  //  DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
+  //  ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false);
+
+  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
+  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
+
+  //  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
+  OneFlavourRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
+  //  TwoFlavourRationalTesterPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion1F(StrangeOp,OFRp);
+  //  TwoFlavourPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion2F(StrangeOp,CG,CG);
+  //  Level1.push_back(&StrangePseudoFermion2F);
+  //  Level1.push_back(&StrangePseudoFermion);
+
+  ////////////////////////////////////
+  // up down action: one quotient per adjacent pair in (light_mass, hasenbusch..., pv_mass)
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+
+  for(int h=0;h<n_hasenbusch+1;h++){ // the objects created with new here are never deleted in this function
+    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
+    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+  }
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level2.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+  /////////////////////////////////////////////////////////////
+  // HMC parameters are serialisable
+
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();  // no smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@ -0,0 +1,452 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1fEOFA.cc
+
+Copyright (C) 2015-2016
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu
+Author: David Murphy
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+#define MIXED_PRECISION
+#endif
+
+namespace Grid{ 
+  namespace QCD{
+
+  /*
+   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
+   *    -- Store the single prec action operator.
+   *    -- Clone the gauge field from the operator function argument.
+   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
+   */
+
+  // OperatorFunction wrapper around MixedPrecisionConjugateGradient. Each call copies the links of
+  // FermOpD.Umu into FermOpF.Umu (precisionChange), re-picks FermOpF's even/odd checkerboards and
+  // then solves with (LinOpF, LinOpD). All four operators are held by reference, not owned.
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed (not forwarded by operator() below)
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+
+    Integer TotalInnerIterations = 0;     //Number of inner CG iterations (zero-initialised; not updated in this class)
+    Integer TotalOuterIterations = 0;     //Number of restarts (zero-initialised; not updated in this class)
+    Integer TotalFinalStepIterations = 0; //Number of CG iterations in final patch-up step (zero-initialised; not updated in this class)
+
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+                                                    Integer maxinnerit, 
+                                                    Integer maxouterit, 
+                                                    GridBase* _sp_grid4, 
+                                                    GridBase* _sp_grid5, 
+                                                    FermionOperatorF &_FermOpF,
+                                                    FermionOperatorD &_FermOpD,
+                                                    SchurOperatorF   &_LinOpF,
+                                                    SchurOperatorD   &_LinOpD): 
+      Tolerance(tol),  // initialisers are listed in member declaration order, the order in which they actually run
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD)
+    { 
+      /* Debugging instances of objects; references are stored
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
+      */
+    }
+
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU); // unchecked downcast; the assert below is the only guard
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
+      // Assumption made in code to extract gauge field
+      // We could avoid storing LinopD reference alltogether ?
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Must snarf a single precision copy of the gauge field in Linop_d argument
+      ////////////////////////////////////////////////////////////////////////////////////
+      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
+      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
+      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
+      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
+
+      GridBase * GridPtrF = SinglePrecGrid4;
+      GridBase * GridPtrD = FermOpD.Umu._grid;
+      GaugeLinkFieldF Umu_f(GridPtrF);
+      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
+      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Moving this to a Clone method of fermion operator would allow to duplicate the 
+      // physics parameters and decrease gauge field copies
+      ////////////////////////////////////////////////////////////////////////////////////
+      GaugeLinkFieldD Umu_d(GridPtrD);
+      for(int mu=0;mu<Nd*2;mu++){ // one link field per Lorentz index of Umu, 2*Nd of them
+        Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
+        precisionChange(Umu_f,Umu_d);
+        PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+      }
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Could test to make sure that LinOpF and LinOpD agree to single prec?
+      ////////////////////////////////////////////////////////////////////////////////////
+      /*
+      GridBase *Fgrid = psi._grid;
+      FieldD tmp2(Fgrid);
+      FieldD tmp1(Fgrid);
+      LinOpU.Op(src,tmp1);
+      LinOpD.Op(src,tmp2);
+      std::cout << " Double gauge field "<< norm2(FermOpD.Umu)<<std::endl;
+      std::cout << " Single gauge field "<< norm2(FermOpF.Umu)<<std::endl;
+      std::cout << " Test of operators "<<norm2(tmp1)<<std::endl;
+      std::cout << " Test of operators "<<norm2(tmp2)<<std::endl;
+      tmp1=tmp1-tmp2;
+      std::cout << " Test of operators diff "<<norm2(tmp1)<<std::endl;
+      */
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+}};
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef MobiusFermionR FermionAction;
+  typedef MobiusFermionF FermionActionF;
+  typedef MobiusEOFAFermionR FermionEOFAAction;
+  typedef MobiusEOFAFermionF FermionEOFAActionF;
+  typedef typename FermionAction::FermionField FermionField;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef Grid::XmlReader       Serialiser;
+  
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  MD.name    = std::string("Leap Frog");
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  MD.name    = std::string("Force Gradient");
+  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  //  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 6;
+  MD.trajL   = 1.0;
+  
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 590;
+  HMCparams.Trajectories     = 1000;
+  HMCparams.NoMetropolisUntil=  0;
+  //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  //  HMCparams.StartingType     =std::string("ColdStart");
+  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+  
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 10;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection 
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.04;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD b   = 1.0; 
+  RealD c   = 0.0;
+
+  std::vector<Real> hasenbusch({ 0.1, 0.3, 0.6 });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  std::vector<int> latt  = GridDefaultLatt();
+  std::vector<int> mpi   = GridDefaultMpi();
+  std::vector<int> simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
+  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+  LatticeGaugeFieldF UF(GridPtrF);
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+  FermionActionF::ImplParams ParamsF(boundary);
+  
+  double ActionStoppingCondition     = 1e-10;
+  double DerivativeStoppingCondition = 1e-6;
+  double MaxCGIterations = 30000;
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(8);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
+  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
+  typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
+  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
+
+  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
+  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
+
+  // DJM: setup for EOFA ratio (Mobius)
+  OneFlavourRationalParams OFRp;
+  OFRp.lo       = 0.1;
+  OFRp.hi       = 25.0;
+  OFRp.MaxIter  = 10000;
+  OFRp.tolerance= 1.0e-9;
+  OFRp.degree   = 14;
+  OFRp.precision= 50;
+
+  
+  MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+  MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+
+  ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
+#ifdef MIXED_PRECISION
+  const int MX_inner = 1000;
+  // Mixed precision EOFA
+  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
+  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
+  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
+  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
+
+  MxPCG_EOFA ActionCGL(ActionStoppingCondition,
+		       MX_inner,
+		       MaxCGIterations,
+		       GridPtrF,
+		       FrbGridF,
+		       Strange_Op_LF,Strange_Op_L,
+		       Strange_LinOp_LF,Strange_LinOp_L);
+
+  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
+			   MX_inner,
+			   MaxCGIterations,
+			   GridPtrF,
+			   FrbGridF,
+			   Strange_Op_LF,Strange_Op_L,
+			   Strange_LinOp_LF,Strange_LinOp_L);
+  
+  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
+		       MX_inner,
+		       MaxCGIterations,
+		       GridPtrF,
+		       FrbGridF,
+		       Strange_Op_RF,Strange_Op_R,
+		       Strange_LinOp_RF,Strange_LinOp_R);
+  
+  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
+			   MX_inner,
+			   MaxCGIterations,
+			   GridPtrF,
+			   FrbGridF,
+			   Strange_Op_RF,Strange_Op_R,
+			   Strange_LinOp_RF,Strange_LinOp_R);
+
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG, 
+	 ActionCGL, ActionCGR,
+	 DerivativeCGL, DerivativeCGR,
+	 OFRp, true);
+#else
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG,
+	 ActionCG, ActionCG,
+	 DerivativeCG, DerivativeCG,
+	 OFRp, true);
+#endif
+  Level1.push_back(&EOFA);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  //////////////////////////////////////////////////////////////
+  // Forced to replicate the MxPCG and DenominatorsF etc.. because
+  // there is no convenient way to "Clone" physics params from double op
+  // into single op for any operator pair.
+  // Same issue prevents using MxPCG in the Heatbath step
+  //////////////////////////////////////////////////////////////
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+  std::vector<MxPCG *> ActionMPCG;
+  std::vector<MxPCG *> MPCG;
+  std::vector<FermionActionF *> DenominatorsF;
+  std::vector<LinearOperatorD *> LinOpD;
+  std::vector<LinearOperatorF *> LinOpF; 
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+
+    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
+
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
+
+#ifdef MIXED_PRECISION
+    ////////////////////////////////////////////////////////////////////////////
+    // Mixed precision CG for 2f force
+    ////////////////////////////////////////////////////////////////////////////
+
+    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsF));
+    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
+    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
+
+    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,
+			     MX_inner,
+			     MaxCGIterations,
+			     GridPtrF,
+			     FrbGridF,
+			     *DenominatorsF[h],*Denominators[h],
+			     *LinOpF[h], *LinOpD[h]) );
+
+    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
+				   MX_inner,
+				   MaxCGIterations,
+				   GridPtrF,
+				   FrbGridF,
+				   *DenominatorsF[h],*Denominators[h],
+				   *LinOpF[h], *LinOpD[h]) );
+
+    // Heatbath solve is not mixed precision yet. It inverts the numerators, so this matters less because of their raised mass.
+    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
+#else
+    ////////////////////////////////////////////////////////////////////////////
+    // Standard CG for 2f force
+    ////////////////////////////////////////////////////////////////////////////
+    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG));
+#endif
+
+  }
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level2.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+  /////////////////////////////////////////////////////////////
+  // HMC parameters are serialisable
+
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();  // no smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/Mobius2p1fRHMC.cc
+++ b/HMC/Mobius2p1fRHMC.cc
@ -0,0 +1,198 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1fRHMC.cc
+
+Copyright (C) 2015-2016
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef MobiusFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef Grid::XmlReader       Serialiser;
+  
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  MD.name    = std::string("Leap Frog");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 20;
+  MD.trajL   = 1.0;
+  
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 30;
+  HMCparams.Trajectories     = 200;
+  HMCparams.NoMetropolisUntil=  0;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  //  HMCparams.StartingType     =std::string("ColdStart");
+  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+  
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 10;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection 
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.04;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD b   = 1.0; 
+  RealD c   = 0.0;
+  
+  // FIXME:
+  // Same in MC and MD 
+  // Need to mix precision too
+  OneFlavourRationalParams OFRp;
+  OFRp.lo       = 4.0e-3;
+  OFRp.hi       = 30.0;
+  OFRp.MaxIter  = 10000;
+  OFRp.tolerance= 1.0e-10;
+  OFRp.degree   = 16;
+  OFRp.precision= 50;
+
+  std::vector<Real> hasenbusch({ 0.1 });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+  
+  double StoppingCondition = 1e-10;
+  double MaxCGIterations = 30000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(4);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+
+  //  FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params);
+  //  DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
+  //  DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
+  //  ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false);
+
+  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
+  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
+
+  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
+  Level1.push_back(&StrangePseudoFermion);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
+    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+  }
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level2.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+  /////////////////////////////////////////////////////////////
+  // HMC parameters are serialisable
+
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();  // no smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/README
+++ b/HMC/README
@ -0,0 +1,109 @@
+********************************************************************
+TODO: 
+********************************************************************
+
+i) Got mixed precision in 2f and EOFA force and action solves.
+   But need mixed precision in the heatbath solve. Best for Fermop to have a "clone" method, to
+   reduce the number of solver and action objects. Needed ideally for the EOFA heatbath.
+   15% perhaps
+   Combine with 2x trajectory length?
+
+ii) Rational on EOFA HB  -- relax order
+                         -- Test the approx as per David email
+
+Resume / roll.sh 
+
+----------------------------------------------------------------
+
+- 16^3 Currently 10 traj per hour
+
+- EOFA use a different derivative solver from action solver
+- EOFA fix David's hack to the SchurRedBlack guessing
+
+*** Reduce precision/tolerance  in EOFA with second CG param.                          (10% speed up)
+*** Force gradient - reduced precision solve for the gradient                          (4/3x speedup)
+
+
+*** Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
+    -- Store the single prec action operator.
+    -- Clone the gauge field from the operator function argument.
+    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
+
+*** Mixed precision CG into EOFA portion         
+*** Further reduce precision in forces to 10^-6 ?
+
+*** Overall: a 3x or so is still possible => 500s -> 160s and 20 traj per hour on 16^3.
+
+- Use mixed precision CG in HMC                           
+- SchurRedBlack.h: stop use of operator function; use LinearOperator or similar instead.
+- Or make an OperatorFunction for mixed precision as a wrapper
+
+********************************************************************
+* Signed off 2+1f HMC with Hasenbusch and strange RHMC 16^3 x 32 DWF Ls=16 Plaquette 0.5883 ish
+* Signed off 2+1f HMC with Hasenbusch and strange EOFA 16^3 x 32 DWF Ls=16 Plaquette 0.5883 ish
+* Wilson plaquette cross checked against CPS and literature GwilsonFnone
+********************************************************************
+
+********************************************************************
+* RHMC: Timesteps & eigenranges matched from previous CPS 16^3 x 32 runs:
+********************************************************************
+
+****
+Strange (m=0.04)  has eigenspan 
+**** 
+16^3 done as 1+1+1 with separate PV's. 
+/dirac1/archive/QCDOC/host/QCDDWF/DWF/2+1f/16nt32/IWASAKI/b2.13/ls16/M1_8/ms0.04/mu0.01/rhmc_multitimescale/evol5/work
+****
+2+1f 16^3  - [ 4e-4, 2.42 ]    for strange
+
+****
+24^3 done as 1+1+1 at strange, and single quotient https://arxiv.org/pdf/0804.0473.pdf Eq 83,
+****
+double lambda_low =   4.0000000000000002e-04 <- strange
+double lambda_low =   1.0000000000000000e-02 <- pauli villars
+And high = 2.5
+
+Array bsn_mass[3] = { 
+double bsn_mass[0] =   1.0000000000000000e+00
+double bsn_mass[1] =   1.0000000000000000e+00
+double bsn_mass[2] =   1.0000000000000000e+00
+}
+Array frm_mass[3] = { 
+double frm_mass[0] =   4.0000000000000001e-02
+double frm_mass[1] =   4.0000000000000001e-02
+double frm_mass[2] =   4.0000000000000001e-02
+}
+
+***
+32^3 
+/dirac1/archive/QCDOC/host/QCDDWF/DWF/2+1f/32nt64/IWASAKI/b2.25/ls16/M1_8/ms0.03/mu0.004/evol6/work
+***
+Similar det scheme
+double lambda_low =   4.0000000000000002e-04
+double lambda_low =   1.0000000000000000e-02
+
+Array bsn_mass[3] = { 
+double bsn_mass[0] =   1.0000000000000000e+00
+double bsn_mass[1] =   1.0000000000000000e+00
+double bsn_mass[2] =   1.0000000000000000e+00
+}
+Array frm_mass[3] = { 
+double frm_mass[0] =   3.0000000000000002e-02
+double frm_mass[1] =   3.0000000000000002e-02
+double frm_mass[2] =   3.0000000000000002e-02
+}
+
+********************************************************************
+* Grid: Power method bounds check
+********************************************************************
+- Finding largest eigenvalue approx 25 not 2.5
+- Conventions:
+
+Grid MpcDagMpc based on:
+
+   (Moo-Moe Mee^-1 Meo)^dag(Moo-Moe Mee^-1 Meo)
+
+- with  Moo = 5-M5 = 3.2
+- CPS use(d) Moo = 1
+- Eigenrange in Grid is 3.2^2 rescaled so factor of 10 accounted for
+
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid

 Source file: Hadrons/A2AMatrix.hpp

-Copyright (C) 2015-2018
+Copyright (C) 2015-2019

 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -29,38 +30,394 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define A2A_Matrix_hpp_

 #include <Hadrons/Global.hpp>
+#include <Hadrons/TimerArray.hpp>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
+
+#ifndef HADRONS_A2AM_NAME 
+#define HADRONS_A2AM_NAME "a2aMatrix"
+#endif
+
+#ifndef HADRONS_A2AM_IO_TYPE
+#define HADRONS_A2AM_IO_TYPE ComplexF
+#endif
+
+#define HADRONS_A2AM_PARALLEL_IO

 BEGIN_HADRONS_NAMESPACE

-template <typename T, typename MetadataType>
+// general A2A matrix set based on Eigen tensors and Grid-allocated memory
+// Dimensions:
+//   0 - ext - external field (momentum, EM field, ...)
+//   1 - str - spin-color structure
+//   2 - t   - timeslice
+//   3 - i   - left  A2A mode index
+//   4 - j   - right A2A mode index
+template <typename T>
+using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
+
+template <typename T>
+using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
+
+template <typename T>
+using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
+
+/******************************************************************************
+ *                      Abstract class for A2A kernels                        *
+ ******************************************************************************/
+template <typename T, typename Field>
+class A2AKernel
+{
+public:
+    A2AKernel(void) = default;
+    virtual ~A2AKernel(void) = default;
+    virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
+                          const unsigned int orthogDim, double &time) = 0;
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
+};
+
+/******************************************************************************
+ *                  Class to handle A2A matrix block HDF5 I/O                 *
+ ******************************************************************************/
+template <typename T>
 class A2AMatrixIo
 {
 public:
+    // constructors
    A2AMatrixIo(void) = default;
    A2AMatrixIo(std::string filename, std::string dataname, 
-                const unsigned int nt, const unsigned int ni,
-                const unsigned int nj);
+                const unsigned int nt, const unsigned int ni = 0,
+                const unsigned int nj = 0);
+    // destructor
    ~A2AMatrixIo(void) = default;
+    // access
+    unsigned int getNi(void) const;
+    unsigned int getNj(void) const;
+    unsigned int getNt(void) const;
+    size_t       getSize(void) const;
+    // file allocation
+    template <typename MetadataType>
    void initFile(const MetadataType &d, const unsigned int chunkSize);
+    // block I/O
    void saveBlock(const T *data, const unsigned int i, const unsigned int j,
                   const unsigned int blockSizei, const unsigned int blockSizej);
+    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
+                   const unsigned int i, const unsigned int j);
+    template <template <class> class Vec, typename VecT>
+    void load(Vec<VecT> &v, double *tRead = nullptr);
 private:
-    std::string  filename_, dataname_;
-    unsigned int nt_, ni_, nj_;
+    std::string  filename_{""}, dataname_{""};
+    unsigned int nt_{0}, ni_{0}, nj_{0};
 };

-template <typename T, typename MetadataType>
-A2AMatrixIo<T, MetadataType>::A2AMatrixIo(std::string filename, 
-                                          std::string dataname, 
-                                          const unsigned int nt, 
-                                          const unsigned int ni,
-                                          const unsigned int nj)
+/******************************************************************************
+ *                  Wrapper for A2A matrix block computation                  *
+ ******************************************************************************/
+template <typename T, typename Field, typename MetadataType, typename TIo = T>
+class A2AMatrixBlockComputation
+{
+private:
+    struct IoHelper
+    {
+        A2AMatrixIo<TIo> io;
+        MetadataType     md;
+        unsigned int     e, s, i, j;
+    };
+    typedef std::function<std::string(const unsigned int, const unsigned int)>  FilenameFn;
+    typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn;
+public:
+    // constructor
+    A2AMatrixBlockComputation(GridBase *grid,
+                              const unsigned int orthogDim,
+                              const unsigned int next,
+                              const unsigned int nstr,
+                              const unsigned int blockSize,
+                              const unsigned int cacheBlockSize,
+                              TimerArray *tArray = nullptr);
+    // execution
+    void execute(const std::vector<Field> &left, 
+                 const std::vector<Field> &right,
+                 A2AKernel<T, Field> &kernel,
+                 const FilenameFn &ionameFn,
+                 const FilenameFn &filenameFn,
+                 const MetadataFn &metadataFn);
+private:
+    // I/O handler
+    void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
+private:
+    TimerArray            *tArray_;
+    GridBase              *grid_;
+    unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
+    Vector<T>             mCache_;
+    Vector<TIo>           mBuf_;
+    std::vector<IoHelper> nodeIo_;
+};
+
+/******************************************************************************
+ *                       A2A matrix contraction kernels                       *
+ ******************************************************************************/
+class A2AContraction
+{
+public:
+    // accTrMul(acc, a, b): acc += tr(a*b)
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
+    {
+        if ((MatLeft::Options == Eigen::RowMajor) and
+            (MatRight::Options == Eigen::ColMajor))
+        {
+            parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+            {
+                C tmp;
+#ifdef USE_MKL
+                dotuRow(tmp, r, a, b);
+#else
+                tmp = a.row(r).conjugate().dot(b.col(r));
+#endif
+                parallel_critical
+                {
+                    acc += tmp;
+                }
+            }
+        }
+        else
+        {
+            parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+            {
+                C tmp;
+#ifdef USE_MKL 
+                dotuCol(tmp, c, a, b);
+#else
+                tmp = a.col(c).conjugate().dot(b.row(c));
+#endif
+                parallel_critical
+                {
+                    acc += tmp;
+                }
+            }
+        }
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
+    {
+        double n = a.rows()*a.cols();
+
+        return 8.*n;
+    }
+
+    // mul(res, a, b): res = a*b
+#ifdef USE_MKL
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexD, Opts...> &res, 
+                           const Mat<ComplexD, Opts...> &a, 
+                           const Mat<ComplexD, Opts...> &b)
+    {
+        static const ComplexD one(1., 0.), zero(0., 0.);
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols());
+        }
+        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexF, Opts...> &res, 
+                           const Mat<ComplexF, Opts...> &a, 
+                           const Mat<ComplexF, Opts...> &b)
+    {
+        static const ComplexF one(1., 0.), zero(0., 0.);
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols());
+        }
+        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+#else
+    template <typename Mat>
+    static inline void mul(Mat &res, const Mat &a, const Mat &b)
+    {
+        res = a*b;
+    }
+#endif
+    template <typename Mat>
+    static inline double mulFlops(const Mat &a, const Mat &b)
+    {
+        double nr = a.rows(), nc = a.cols();
+
+        return nr*nr*(6.*nc + 2.*(nc - 1.));
+    }
+private:
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aRow, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aRow*a.cols();
+            aInc = 1;
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aRow;
+            aInc = a.rows();
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aRow;
+            bInc = b.cols();
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aRow*b.rows();
+            bInc = 1;
+        }
+    }
+
+#ifdef USE_MKL
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aCol, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aCol;
+            aInc = a.cols();
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aCol*a.rows();
+            aInc = 1;
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aCol*b.cols();
+            bInc = 1;
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aCol;
+            bInc = b.rows();
+        }
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+#endif
+};
+
+/******************************************************************************
+ *                     A2AMatrixIo template implementation                    *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename T>
+A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname, 
+                            const unsigned int nt, const unsigned int ni,
+                            const unsigned int nj)
 : filename_(filename), dataname_(dataname)
 , nt_(nt), ni_(ni), nj_(nj)
 {}

-template <typename T, typename MetadataType>
-void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d, const unsigned int chunkSize)
+// access //////////////////////////////////////////////////////////////////////
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNt(void) const
+{
+    return nt_;
+}
+
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNi(void) const
+{
+    return ni_;
+}
+
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNj(void) const
+{
+    return nj_;
+}
+
+template <typename T>
+size_t A2AMatrixIo<T>::getSize(void) const
+{
+    return static_cast<size_t>(nt_)*ni_*nj_*sizeof(T); // widen first: nt_*ni_*nj_ can overflow 32-bit unsigned int
+}
+
+// file allocation /////////////////////////////////////////////////////////////
+template <typename T>
+template <typename MetadataType>
+void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
 {
 #ifdef HAVE_HDF5
    std::vector<hsize_t>    dim = {static_cast<hsize_t>(nt_), 
@ -80,26 +437,28 @@ void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d, const unsigne
    }

    // create the dataset
-    Hdf5Reader reader(filename_);
+    Hdf5Reader reader(filename_, false);

    push(reader, dataname_);
    auto &group = reader.getGroup();
    plist.setChunk(chunk.size(), chunk.data());
-    dataset = group.createDataSet("data", Hdf5Type<T>::type(), dataspace, plist);
+    plist.setFletcher32();
+    dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }

-template <typename T, typename MetadataType>
-void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data, 
-                                             const unsigned int i, 
-                                             const unsigned int j,
-                                             const unsigned int blockSizei,
-                                             const unsigned int blockSizej)
+// block I/O ///////////////////////////////////////////////////////////////////
+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const T *data, 
+                               const unsigned int i, 
+                               const unsigned int j,
+                               const unsigned int blockSizei,
+                               const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
-    Hdf5Reader           reader(filename_);
+    Hdf5Reader           reader(filename_, false);
    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
@ -111,7 +470,7 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,

    push(reader, dataname_);
    auto &group = reader.getGroup();
-    dataset     = group.openDataSet("data");
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    dataspace   = dataset.getSpace();
    dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                              stride.data(), block.data());
@ -121,6 +480,267 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,
 #endif
 }

+// Convenience overload: write the (ext, str) sub-block of the matrix set m
+// at position (i, j) of the file dataset.
+//   m        : matrix set; dimension(1) is read as the number of "str"
+//              entries, dimension(3)/dimension(4) as the block extents
+//   ext, str : indices selecting which sub-block of m is written
+//   i, j     : offsets of the block inside the stored matrix
+// The data pointer handed to the raw saveBlock overload is m.data() advanced
+// by (ext*nstr + str)*nt_*blockSizei*blockSizej elements.
+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
+                               const unsigned int ext, const unsigned int str,
+                               const unsigned int i, const unsigned int j)
+{
+    unsigned int blockSizei = m.dimension(3);
+    unsigned int blockSizej = m.dimension(4);
+    unsigned int nstr       = m.dimension(1);
+    // compute in size_t from the first factor on, so the element offset
+    // cannot wrap in 32-bit unsigned arithmetic for large blocks
+    size_t       offset     = (static_cast<size_t>(ext)*nstr + str)
+                              *nt_*blockSizei*blockSizej;
+
+    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
+}
+
+// Load the matrix stored in dataset HADRONS_A2AM_NAME of group dataname_ in
+// file filename_, one time slice per read, and store slice t in v[t] after an
+// element-wise cast to VecT.
+//   v     : destination, indexed by time slice; it is not resized here, so it
+//           is assumed to already provide at least nt_ entries
+//   tRead : optional; when non-null it receives the time spent in the dataset
+//           reads, accumulated from usecond() differences
+// Dimension handling:
+//   - if nt_, ni_ and nj_ are all non-zero they must match the file extents,
+//     otherwise a Size error is raised;
+//   - if ni_*nj_ is zero, nt_ must match the file and ni_, nj_ are taken
+//     from the file.
+// Without HDF5 support an Implementation error is raised.
+template <typename T>
+template <template <class> class Vec, typename VecT>
+void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
+{
+#ifdef HAVE_HDF5
+    Hdf5Reader           reader(filename_);
+    std::vector<hsize_t> hdim;
+    H5NS::DataSet        dataset;
+    H5NS::DataSpace      dataspace;
+    H5NS::CompType       datatype;
+    
+    push(reader, dataname_);
+    auto &group = reader.getGroup();
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
+    datatype    = dataset.getCompType();
+    dataspace   = dataset.getSpace();
+    hdim.resize(dataspace.getSimpleExtentNdims());
+    dataspace.getSimpleExtentDims(hdim.data());
+    // hdim[0..2] are read below: refuse datasets of lower rank instead of
+    // indexing past the end of hdim
+    if (hdim.size() < 3)
+    {
+        HADRONS_ERROR(Size, "all-to-all matrix dataset has rank "
+            + std::to_string(hdim.size()) + ", expected 3");
+    }
+    if ((nt_*ni_*nj_ != 0) and
+        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
+    {
+        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+            + std::to_string(hdim[2]) + ", expected "
+            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+            + std::to_string(nj_) + ")");
+    }
+    else if (ni_*nj_ == 0)
+    {
+        if (hdim[0] != nt_)
+        {
+            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+                + std::to_string(hdim[0]) + ", expected "
+                + std::to_string(nt_) + ")");
+        }
+        ni_ = hdim[1];
+        nj_ = hdim[2];
+    }
+
+    A2AMatrix<T>         buf(ni_, nj_);
+    std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
+                                     static_cast<hsize_t>(nj_)},
+                         stride   = {1, 1, 1},
+                         block    = {1, 1, 1},
+                         memCount = {static_cast<hsize_t>(ni_),
+                                     static_cast<hsize_t>(nj_)};
+    H5NS::DataSpace      memspace(memCount.size(), memCount.data());
+
+    std::cout << "Loading timeslice";
+    std::cout.flush();
+    // tRead is optional (it is null-checked around the reads below), so the
+    // reset must be guarded as well
+    if (tRead) *tRead = 0.;
+    // time slices are read from the last one down to 0
+    for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
+    {
+        unsigned int         t      = tp1 - 1;
+        std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
+        
+        if (t % 10 == 0)
+        {
+            std::cout << " " << t;
+            std::cout.flush();
+        }
+        dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
+                                  stride.data(), block.data());
+        if (tRead) *tRead -= usecond();    
+        dataset.read(buf.data(), datatype, memspace, dataspace);
+        if (tRead) *tRead += usecond();
+        v[t] = buf.template cast<VecT>();
+    }
+    std::cout << std::endl;
+#else
+    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
+#endif
+}
+
+/******************************************************************************
+ *               A2AMatrixBlockComputation template implementation            *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Stores the computation parameters and allocates the two work buffers:
+//   mCache_ : nt_*next_*nstr_*cacheBlockSize_^2 elements
+//   mBuf_   : nt_*next_*nstr_*blockSize_^2 elements
+// nt_ is the global lattice extent along orthogDim. tArray may be null, in
+// which case the timer macros below do nothing.
+template <typename T, typename Field, typename MetadataType, typename TIo>
+A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::A2AMatrixBlockComputation(GridBase *grid,
+                            const unsigned int orthogDim,
+                            const unsigned int next, 
+                            const unsigned int nstr,
+                            const unsigned int blockSize, 
+                            const unsigned int cacheBlockSize,
+                            TimerArray *tArray)
+: grid_(grid), nt_(grid->GlobalDimensions()[orthogDim]), orthogDim_(orthogDim)
+, next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
+, tArray_(tArray)
+{
+    // Widen to size_t before multiplying: the factors are presumably 32-bit
+    // integers (the constructor arguments are unsigned int), and the element
+    // count of a large block would otherwise wrap before reaching resize().
+    mCache_.resize(static_cast<size_t>(nt_)*next_*nstr_
+                   *cacheBlockSize_*cacheBlockSize_);
+    mBuf_.resize(static_cast<size_t>(nt_)*next_*nstr_
+                 *blockSize_*blockSize_);
+}
+
+// Timer helpers: tArray_ is optional, so every access is null-guarded.
+// GET_TIMER evaluates to 0. when no timer array is attached.
+#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
+#define STOP_TIMER(name)  if (tArray_) tArray_->stopTimer(name)
+#define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
+
+// execution ///////////////////////////////////////////////////////////////////
+// Compute the all-to-all matrix between the `left` and `right` vector sets
+// block by block, and write each finished block to disk.
+//   left, right : input vectors; the matrix has left.size() x right.size()
+//                 entries per (ext, str, t)
+//   kernel      : called on each cache block with pointers to the first left
+//                 and right vector of that block; also queried for flop and
+//                 byte counts
+//   ionameFn    : (e, s) -> dataset name passed to A2AMatrixIo
+//   filenameFn  : (e, s) -> file name passed to A2AMatrixIo
+//   metadataFn  : (e, s) -> metadata stored in the I/O helper
+// With HADRONS_A2AM_PARALLEL_IO the (e, s) files are distributed round-robin
+// over the ranks; otherwise every (e, s) pair is written in sequence.
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::execute(const std::vector<Field> &left, const std::vector<Field> &right,
+          A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
+          const FilenameFn &filenameFn, const MetadataFn &metadataFn)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // i,j   is first  loop over blockSize_ factors
+    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
+    // iii,jjj are loops within cacheBlock
+    // Total index is sum of these  i+ii+iii etc...
+    //////////////////////////////////////////////////////////////////////////
+    int    N_i = left.size();
+    int    N_j = right.size();
+    double flops, bytes, t_kernel;
+    double nodes = grid_->NodeCount();
+    
+    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
+    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
+
+    for(int i=0;i<N_i;i+=blockSize_)
+    for(int j=0;j<N_j;j+=blockSize_)
+    {
+        // Get the W and V vectors for this block^2 set of terms
+        int N_ii = MIN(N_i-i,blockSize_);
+        int N_jj = MIN(N_j-j,blockSize_);
+        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
+
+        LOG(Message) << "All-to-all matrix block " 
+                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
+                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
+                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
+                     << std::endl;
+        // Series of cache blocked chunks of the contractions within this block
+        flops    = 0.0;
+        bytes    = 0.0;
+        t_kernel = 0.0;
+        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
+        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
+        {
+            // time reported by this kernel call; named so that it is not
+            // shadowed by the time-slice index of the copy loop below
+            double tCall;
+            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
+            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
+            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
+
+            START_TIMER("kernel");
+            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, tCall);
+            STOP_TIMER("kernel");
+            t_kernel += tCall;
+            flops    += kernel.flops(N_iii, N_jjj);
+            bytes    += kernel.bytes(N_iii, N_jjj);
+
+            // copy the cache block into its place inside the full block
+            START_TIMER("cache copy");
+            parallel_for_nest5(int e =0;e<next_;e++)
+            for(int s =0;s< nstr_;s++)
+            for(int t =0;t< nt_;t++)
+            for(int iii=0;iii< N_iii;iii++)
+            for(int jjj=0;jjj< N_jjj;jjj++)
+            {
+                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
+            }
+            STOP_TIMER("cache copy");
+        }
+
+        // perf
+        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
+                     << " Gflop/s/node " << std::endl;
+        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
+                     << " GB/s/node "  << std::endl;
+
+        // IO
+        double       blockSize, ioTime;
+        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
+    
+        LOG(Message) << "Writing block to disk" << std::endl;
+        ioTime = -GET_TIMER("IO: write block");
+        START_TIMER("IO: total");
+        makeFileDir(filenameFn(0, 0), grid_);
+#ifdef HADRONS_A2AM_PARALLEL_IO
+        grid_->Barrier();
+        // make task list for current node
+        nodeIo_.clear();
+        for(int f = myRank; f < next_*nstr_; f += nRank)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = f/nstr_;
+            h.s  = f % nstr_;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            nodeIo_.push_back(h);
+        }
+        // parallel IO
+        for (auto &h: nodeIo_)
+        {
+            saveBlock(mBlock, h);
+        }
+        grid_->Barrier();
+#else
+        // serial IO, for testing purposes only
+        for(int e = 0; e < next_; e++)
+        for(int s = 0; s < nstr_; s++)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = e;
+            h.s  = s;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            // the block buffer of this function is mBlock (the name used here
+            // before was not declared anywhere in the function)
+            saveBlock(mBlock, h);
+        }
+#endif
+        STOP_TIMER("IO: total");
+        // start the product in double so the byte count cannot wrap in
+        // integer arithmetic before the conversion
+        blockSize  = static_cast<double>(next_)*nstr_*nt_*N_ii*N_jj*sizeof(TIo);
+        ioTime    += GET_TIMER("IO: write block");
+        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
+                     << ioTime  << " us (" 
+                     << blockSize/ioTime*1.0e6/1024/1024
+                     << " MB/s)" << std::endl;
+    }
+}
+
+// I/O handler /////////////////////////////////////////////////////////////////
+// Write block m through the I/O helper h. The file is initialised (with the
+// helper metadata and blockSize_ as chunk size) only when the block sits at
+// offset (0, 0); every call then writes the (h.e, h.s) sub-block at (h.i, h.j).
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
+{
+    const bool isFirstBlock = (h.i == 0) and (h.j == 0);
+
+    if (isFirstBlock)
+    {
+        START_TIMER("IO: file creation");
+        h.io.initFile(h.md, blockSize_);
+        STOP_TIMER("IO: file creation");
+    }
+
+    START_TIMER("IO: write block");
+    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
+    STOP_TIMER("IO: write block");
+}
+
+#undef START_TIMER
+#undef STOP_TIMER
+#undef GET_TIMER
+
 END_HADRONS_NAMESPACE

 #endif // A2A_Matrix_hpp_
--- a/Hadrons/A2AVectors.hpp
+++ b/Hadrons/A2AVectors.hpp
@ -4,7 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid

 Source file: Hadrons/A2AVectors.hpp

-Copyright (C) 2015-2018
+Copyright (C) 2015-2019

 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: fionnoh <fionnoh@gmail.com>
@ -36,7 +36,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
- *               Classes to generate V & W all-to-all vectors                 *
+ *                 Class to generate V & W all-to-all vectors                 *
 ******************************************************************************/
 template <typename FImpl>
 class A2AVectorsSchurDiagTwo
@ -70,6 +70,42 @@ private:
    SchurDiagTwoOperator<FMat, FermionField> op_;
 };

+/******************************************************************************
+ *                  Methods for V & W all-to-all vectors I/O                  *
+ ******************************************************************************/
+class A2AVectorsIo
+{
+public:
+    // record stored alongside each vector: its index in the set
+    struct Record: Serializable
+    {
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
+                                        unsigned int, index);
+        Record(void): index(0) {}
+    };
+public:
+    // write/read a set of vectors; with multiFile each vector goes to its own
+    // "elem<i>.bin" file below the stem, otherwise all vectors share one
+    // ".bin" file. A negative trajectory leaves the trajectory suffix out.
+    template <typename Field>
+    static void write(const std::string fileStem, std::vector<Field> &vec, 
+                      const bool multiFile, const int trajectory = -1);
+    template <typename Field>
+    static void read(std::vector<Field> &vec, const std::string fileStem,
+                     const bool multiFile, const int trajectory = -1);
+private:
+    // builds "<stem>[.<traj>]", with ".bin" appended in single-file mode only
+    static inline std::string vecFilename(const std::string stem, const int traj, 
+                                          const bool multiFile)
+    {
+        std::string name = stem;
+
+        if (traj >= 0)
+        {
+            name += "." + std::to_string(traj);
+        }
+        if (!multiFile)
+        {
+            name += ".bin";
+        }
+
+        return name;
+    }
+};
+
 /******************************************************************************
 *               A2AVectorsSchurDiagTwo template implementation               *
 ******************************************************************************/
@ -217,6 +253,90 @@ void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d,
    }
 }

+/******************************************************************************
+ *               all-to-all vectors I/O template implementation               *
+ ******************************************************************************/
+// Write every vector of vec through a ScidacWriter, each with a Record
+// carrying its index.
+//   fileStem   : base name, expanded by vecFilename()
+//   vec        : vectors to write; must not be empty, since the grid is
+//                taken from vec[0]
+//   multiFile  : true  -> one file "<name>/elem<i>.bin" per vector,
+//                false -> a single file holding all vectors in order
+//   trajectory : forwarded to vecFilename()
+// Raises a Size error when vec is empty.
+template <typename Field>
+void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec, 
+                         const bool multiFile, const int trajectory)
+{
+    // vec[0] is dereferenced below to get the grid: fail with a clear error
+    // instead of reading past the end of an empty vector
+    if (vec.empty())
+    {
+        HADRONS_ERROR(Size, "cannot write an empty set of all-to-all vectors");
+    }
+
+    Record       record;
+    GridBase     *grid = vec[0]._grid;
+    ScidacWriter binWriter(grid->IsBoss());
+    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
+
+    if (multiFile)
+    {
+        std::string fullFilename;
+
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
+
+            LOG(Message) << "Writing vector " << i << std::endl;
+            makeFileDir(fullFilename, grid);
+            binWriter.open(fullFilename);
+            record.index = i;
+            binWriter.writeScidacFieldRecord(vec[i], record);
+            binWriter.close();
+        }
+    }
+    else
+    {
+        makeFileDir(filename, grid);
+        binWriter.open(filename);
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            LOG(Message) << "Writing vector " << i << std::endl;
+            record.index = i;
+            binWriter.writeScidacFieldRecord(vec[i], record);
+        }
+        binWriter.close();
+    }
+}
+
+// Read vec.size() vectors through a ScidacReader, in index order, and check
+// that the Record stored with each one carries the expected index (an Io
+// error is raised otherwise). The file layout mirrors write(): one
+// "<name>/elem<i>.bin" per vector when multiFile is set, a single file
+// otherwise. vec is filled in place and is not resized.
+template <typename Field>
+void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem, 
+                        const bool multiFile, const int trajectory)
+{
+    Record             header;
+    ScidacReader       reader;
+    const std::string  baseName = vecFilename(fileStem, trajectory, multiFile);
+    const unsigned int nVec     = vec.size();
+
+    if (!multiFile)
+    {
+        // single file: all records are read back to back
+        reader.open(baseName);
+        for (unsigned int k = 0; k < nVec; ++k)
+        {
+            LOG(Message) << "Reading vector " << k << std::endl;
+            reader.readScidacFieldRecord(vec[k], header);
+            if (header.index != k)
+            {
+                HADRONS_ERROR(Io, "vector index mismatch");
+            }
+        }
+        reader.close();
+
+        return;
+    }
+
+    // one file per vector: open, read and close each in turn
+    for (unsigned int k = 0; k < nVec; ++k)
+    {
+        const std::string elemName = baseName + "/elem" + std::to_string(k) + ".bin";
+
+        LOG(Message) << "Reading vector " << k << std::endl;
+        reader.open(elemName);
+        reader.readScidacFieldRecord(vec[k], header);
+        reader.close();
+        if (header.index != k)
+        {
+            HADRONS_ERROR(Io, "vector index mismatch");
+        }
+    }
+}
+
 END_HADRONS_NAMESPACE

 #endif // A2A_Vectors_hpp_
--- a/Hadrons/Application.cc
+++ b/Hadrons/Application.cc
@ -4,7 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid

 Source file: Hadrons/Application.cc

-Copyright (C) 2015-2018
+Copyright (C) 2015-2019

 Author: Antonin Portelli <antonin.portelli@me.com>

@ -48,28 +48,32 @@ Application::Application(void)
 {
    initLogger();
    auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim);
-    locVol_ = 1;
-    for (unsigned int d = 0; d < dim.size(); ++d)
+
+    if (dim.size())
    {
-        loc[d]  /= mpi[d];
-        locVol_ *= loc[d];
+        locVol_ = 1;
+        for (unsigned int d = 0; d < dim.size(); ++d)
+        {
+            loc[d]  /= mpi[d];
+            locVol_ *= loc[d];
+        }
+        LOG(Message) << "====== HADRONS APPLICATION INITIALISATION ======" << std::endl;
+        LOG(Message) << "** Dimensions" << std::endl;
+        LOG(Message) << "Global lattice: " << dim << std::endl;
+        LOG(Message) << "MPI partition : " << mpi << std::endl;
+        LOG(Message) << "Local lattice : " << loc << std::endl;
+        LOG(Message) << std::endl;
+        LOG(Message) << "** Default parameters (and associated C macros)" << std::endl;
+        LOG(Message) << "ASCII output precision  : " << MACOUT(DEFAULT_ASCII_PREC) << std::endl;
+        LOG(Message) << "Fermion implementation  : " << MACOUTS(FIMPLBASE) << std::endl;
+        LOG(Message) << "z-Fermion implementation: " << MACOUTS(ZFIMPLBASE) << std::endl;
+        LOG(Message) << "Scalar implementation   : " << MACOUTS(SIMPLBASE) << std::endl;
+        LOG(Message) << "Gauge implementation    : " << MACOUTS(GIMPLBASE) << std::endl;
+        LOG(Message) << "Eigenvector base size   : " 
+                    << MACOUT(HADRONS_DEFAULT_LANCZOS_NBASIS) << std::endl;
+        LOG(Message) << "Schur decomposition     : " << MACOUTS(HADRONS_DEFAULT_SCHUR) << std::endl;
+        LOG(Message) << std::endl;
    }
-    LOG(Message) << "====== HADRONS APPLICATION INITIALISATION ======" << std::endl;
-    LOG(Message) << "** Dimensions" << std::endl;
-    LOG(Message) << "Global lattice: " << dim << std::endl;
-    LOG(Message) << "MPI partition : " << mpi << std::endl;
-    LOG(Message) << "Local lattice : " << loc << std::endl;
-    LOG(Message) << std::endl;
-    LOG(Message) << "** Default parameters (and associated C macros)" << std::endl;
-    LOG(Message) << "ASCII output precision  : " << MACOUT(DEFAULT_ASCII_PREC) << std::endl;
-    LOG(Message) << "Fermion implementation  : " << MACOUTS(FIMPLBASE) << std::endl;
-    LOG(Message) << "z-Fermion implementation: " << MACOUTS(ZFIMPLBASE) << std::endl;
-    LOG(Message) << "Scalar implementation   : " << MACOUTS(SIMPLBASE) << std::endl;
-    LOG(Message) << "Gauge implementation    : " << MACOUTS(GIMPLBASE) << std::endl;
-    LOG(Message) << "Eigenvector base size   : " 
-                 << MACOUT(HADRONS_DEFAULT_LANCZOS_NBASIS) << std::endl;
-    LOG(Message) << "Schur decomposition     : " << MACOUTS(HADRONS_DEFAULT_SCHUR) << std::endl;
-    LOG(Message) << std::endl;
 }

 Application::Application(const Application::GlobalPar &par)
@ -108,10 +112,28 @@ void Application::run(void)
        HADRONS_ERROR(Definition, "run id is empty");
    }
    LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
+    BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
+    LOG(Message) << "Attempt(s) for resilient parallel I/O: " 
+                 << BinaryIO::latticeWriteMaxRetry << std::endl;
    vm().setRunId(getPar().runId);
    vm().printContent();
    env().printContent();
-    schedule();
+    if (getPar().saveSchedule or getPar().scheduleFile.empty())
+    {
+        schedule();
+        if (getPar().saveSchedule)
+        {
+            std::string filename;
+
+            filename = (getPar().scheduleFile.empty()) ? 
+                         "hadrons.sched" : getPar().scheduleFile;
+            saveSchedule(filename);
+        }
+    }
+    else
+    {
+        loadSchedule(getPar().scheduleFile);
+    }
    printSchedule();
    if (!getPar().graphFile.empty())
    {
@ -158,12 +180,13 @@ void Application::parseParameterFile(const std::string parameterFileName)
    pop(reader);
 }

-void Application::saveParameterFile(const std::string parameterFileName)
+void Application::saveParameterFile(const std::string parameterFileName, unsigned int prec)
 {
    LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
    if (env().getGrid()->IsBoss())
    {
        XmlWriter          writer(parameterFileName);
+        writer.setPrecision(prec);
        ObjectId           id;
        const unsigned int nMod = vm().getNModule();

--- a/Show More
+++ b/Show More