Hadrons: faster A2A matrix load

Hadrons: contractor fixes
Hadrons: first stab at MPI contractor
2025-08-17 11:41:53 +01:00 · 2019-01-11 16:12:49 +00:00 · 2019-01-11 16:12:16 +00:00 · 2019-01-10 16:29:57 +00:00 · 2019-01-02 14:40:31 +00:00 · 2019-01-02 14:39:59 +00:00
157 changed files with 12184 additions and 2232 deletions
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -48,11 +48,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
+#include <Grid/algorithms/iterative/MinimalResidual.h>
+#include <Grid/algorithms/iterative/GeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
+#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/CoarsenedMatrix.h>
 #include <Grid/algorithms/FFT.h>

-
 // EigCg
 // Pcg
 // Hdcg
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -211,6 +211,7 @@ namespace Grid {

      for(int b=0;b<nn;b++){
 	
+	subspace[b] = zero;
 	gaussian(RNG,noise);
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;
@@ -295,13 +296,58 @@ namespace Grid {
      return norm2(out);
    };

-    RealD Mdag (const CoarseVector &in, CoarseVector &out){ 
-      return M(in,out);
+    RealD Mdag (const CoarseVector &in, CoarseVector &out){
+      // Alternative kept for reference: a plain `return M(in,out);`, which
+      // corresponds to Petrov-Galerkin coarsening.
+      //
+      // Active path, corresponds to Galerkin coarsening: G5C, then M, then G5C again.
+      CoarseVector tmp(Grid());
+      G5C(tmp, in);
+      M(tmp, out);
+      G5C(out, out);
+      return norm2(out); // M's own return value is discarded; the norm is taken after the final G5C
    };

-    // Defer support for further coarsening for now
-    void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
-    void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};
+    void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
+      // Applies one stencil point of the coarse operator: out[ss] = A[point][ss] * neighbour(in).
+      conformable(_grid,in._grid);
+      conformable(in._grid,out._grid);
+      // Halo exchange on `in`; non-local neighbours are read from Stencil.CommBuf() below.
+      SimpleCompressor<siteVector> compressor;
+      Stencil.HaloExchange(in,compressor);
+      // Map (dir,disp) to a stencil index: (0,0) -> 8; otherwise disp=+1 -> 2*dir, disp=-1 -> 2*dir+1.
+      auto point = [dir, disp](){
+        if(dir == 0 and disp == 0)
+          return 8;
+        else
+          return (4 * dir + 1 - disp) / 2;
+      }();
+      // NOTE(review): other (dir,disp) combinations are not validated here.
+      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
+        siteVector res = zero;
+        siteVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        SE=Stencil.GetEntry(ptype,point,ss);
+        // Fetch the neighbour: local with permute, local as-is, or from the comm buffer.
+        if(SE->_is_local&&SE->_permute) {
+          permute(nbr,in._odata[SE->_offset],ptype);
+        } else if(SE->_is_local) {
+          nbr = in._odata[SE->_offset];
+        } else {
+          nbr = Stencil.CommBuf()[SE->_offset];
+        }
+        // res starts at zero, so this is just A[point] * nbr.
+        res = res + A[point]._odata[ss]*nbr;
+
+        vstream(out._odata[ss],res);
+      }
+    };
+
+    void Mdiag(const CoarseVector &in, CoarseVector &out){
+      Mdir(in, out, 0, 0); // dir=0, disp=0 makes Mdir pick stencil point 8: the self coupling (= last) point of the stencil
+    };

    CoarsenedMatrix(GridCartesian &CoarseGrid) 	: 

@@ -417,7 +463,7 @@ namespace Grid {
      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
      //      ForceHermitian();
-      AssertHermitian();
+      // AssertHermitian();
      // ForceDiagonal();
    }
    void ForceDiagonal(void) {
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -380,6 +380,12 @@ namespace Grid {
    template<class Field> class OperatorFunction {
    public:
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+      virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
+	assert(in.size()==out.size()); // default multi-RHS behaviour: apply the single-field operator() to each pair in turn
+	for(std::size_t k=0;k<in.size();k++){ // size_t index: no signed/unsigned comparison against size()
+	  (*this)(Linop,in[k],out[k]);
+	}
+      };
    };

    template<class Field> class LinearFunction {
@@ -421,7 +427,7 @@ namespace Grid {
  // Hermitian operator Linear function and operator function
  ////////////////////////////////////////////////////////////////////////////////////////////
    template<class Field>
-      class HermOpOperatorFunction : public OperatorFunction<Field> {
+    class HermOpOperatorFunction : public OperatorFunction<Field> {
      void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 	Linop.HermOp(in,out);
      };
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -55,6 +55,14 @@ namespace Grid {
    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
    public:
      virtual GridBase *RedBlackGrid(void)=0;
+
+      //////////////////////////////////////////////////////////////////////
+      // Query the even-even (EE) properties to make algorithmic decisions
+      //////////////////////////////////////////////////////////////////////
+      virtual RealD  Mass(void)        { return 0.0; };
+      virtual int    ConstEE(void)     { return 0; }; // Disable assumptions unless overridden
+      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+
      // half checkerboard operaions
      virtual  void Meooe    (const Field &in, Field &out)=0;
      virtual  void Mooee    (const Field &in, Field &out)=0;
--- a/Grid/algorithms/iterative/BlockConjugateGradient.h
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@@ -33,7 +33,7 @@ directory

 namespace Grid {

-enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; // BlockCGrQVec selects the std::vector<Field> solver; BlockCG and BlockCGVec have no solver dispatched in operator() and hit assert(0)

 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
@@ -42,7 +42,6 @@ template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
 public:

-
  typedef typename Field::scalar_type scomplex;

  int blockDim ;
@@ -54,21 +53,15 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer PrintInterval; //GridLogMessages or Iterative
  
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
+    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
  {};

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-void ThinQRfact (Eigen::MatrixXcd &m_rr,
-		 Eigen::MatrixXcd &C,
-		 Eigen::MatrixXcd &Cinv,
-		 Field & Q,
-		 const Field & R)
-{
-  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  //Dimensions
  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
@@ -85,22 +78,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  // Cdag C = Rdag R ; passes.
  // QdagQ  = 1      ; passes
  ////////////////////////////////////////////////////////////////////////////////////////////////////
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+		 Eigen::MatrixXcd &C,
+		 Eigen::MatrixXcd &Cinv,
+		 Field & Q,
+		 const Field & R)
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  sliceInnerProductMatrix(m_rr,R,R,Orthog);

  // Force manifest hermitian to avoid rounding related
  m_rr = 0.5*(m_rr+m_rr.adjoint());

-#if 0
-  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
-  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
-  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
-  auto  D_ldlt = m_rr.ldlt().vectorD(); 
-  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
-#endif
-
-  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
-  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
+
  C    = L.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -112,6 +103,25 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceMulMatrix(Q,Cinv,R,Orthog);
 }
+// std::vector<Field> variant of the thin QR factorisation; see comments above
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+		 Eigen::MatrixXcd &C,
+		 Eigen::MatrixXcd &Cinv,
+		 std::vector<Field> & Q,
+		 const std::vector<Field> & R)
+{
+  InnerProductMatrix(m_rr,R,R);
+  // Symmetrise explicitly so the Cholesky input is hermitian despite rounding.
+  m_rr = 0.5*(m_rr+m_rr.adjoint());
+  // Cholesky factor L of m_rr; C is then set to L^dag.
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+
+  C    = L.adjoint();
+  Cinv = C.inverse();
+  // MulMatrix forms Q[b] = sum_bp Cinv(bp,b) * R[bp], i.e. Q = R C^{-1}.
+  MulMatrix(Q,Cinv,R);
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Call one of several implementations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -119,14 +129,20 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 {
  if ( CGtype == BlockCGrQ ) {
    BlockCGrQsolve(Linop,Src,Psi);
-  } else if (CGtype == BlockCG ) {
-    BlockCGsolve(Linop,Src,Psi);
  } else if (CGtype == CGmultiRHS ) {
    CGmultiRHSsolve(Linop,Src,Psi);
  } else {
    assert(0);
  }
 }
+virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) 
+{ // Multi-RHS entry point taking one Field per vector entry; dispatches on CGtype.
+  if ( CGtype == BlockCGrQVec ) {
+    BlockCGrQsolveVec(Linop,Src,Psi);
+  } else {
+    assert(0); // no other CGtype has a std::vector<Field> implementation here
+  }
+}

 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQ implementation:
@@ -139,7 +155,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = B._grid->_fdimensions[Orthog];
-
+  // Nblock is the grid extent along the block (Orthog) direction, read just above.
+  // A leftover debug override that forced Nblock=8 regardless of the grid was dropped here.
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

  X.checkerboard = B.checkerboard;
@@ -202,15 +219,10 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;

  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
-
  Linop.HermOp(X, AD);
  tmp = B - AD;  
-  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
+
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
-  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
-  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
-  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
-  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;

  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -232,14 +244,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
-    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;

    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
-    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
    
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
@@ -257,6 +267,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    
    //7. D  = Q + D S^dag
    m_tmp = m_S.adjoint();
+
    sliceMaddTimer.Start();
    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
    sliceMaddTimer.Stop();
@@ -317,152 +328,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
-// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
-//////////////////////////////////////////////////////////////////////////
-void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-{
-  int Orthog = blockDim; // First dimension is block dim; this is an assumption
-  Nblock = Src._grid->_fdimensions[Orthog];
-
-  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-  Psi.checkerboard = Src.checkerboard;
-  conformable(Psi, Src);
-
-  Field P(Src);
-  Field AP(Src);
-  Field R(Src);
-  
-  Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  // Initial residual computation & set up
-  std::vector<RealD> residuals(Nblock);
-  std::vector<RealD> ssq(Nblock);
-
-  sliceNorm(ssq,Src,Orthog);
-  RealD sssum=0;
-  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-  sliceNorm(residuals,Src,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  sliceNorm(residuals,Psi,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  // Initial search dir is guess
-  Linop.HermOp(Psi, AP);
-  
-
-  /************************************************************************
-   * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
-   ************************************************************************
-   * O'Leary : R = B - A X
-   * O'Leary : P = M R ; preconditioner M = 1
-   * O'Leary : alpha = PAP^{-1} RMR
-   * O'Leary : beta  = RMR^{-1}_old RMR_new
-   * O'Leary : X=X+Palpha
-   * O'Leary : R_new=R_old-AP alpha
-   * O'Leary : P=MR_new+P beta
-   */
-
-  R = Src - AP;  
-  P = R;
-  sliceInnerProductMatrix(m_rr,R,R,Orthog);
-
-  GridStopWatch sliceInnerTimer;
-  GridStopWatch sliceMaddTimer;
-  GridStopWatch MatrixTimer;
-  GridStopWatch SolverTimer;
-  SolverTimer.Start();
-
-  int k;
-  for (k = 1; k <= MaxIterations; k++) {
-
-    RealD rrsum=0;
-    for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
-
-    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
-
-    MatrixTimer.Start();
-    Linop.HermOp(P, AP);
-    MatrixTimer.Stop();
-
-    // Alpha
-    sliceInnerTimer.Start();
-    sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
-    sliceInnerTimer.Stop();
-    m_pAp_inv = m_pAp.inverse();
-    m_alpha   = m_pAp_inv * m_rr ;
-
-    // Psi, R update
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
-    sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
-    sliceMaddTimer.Stop();
-
-    // Beta
-    m_rr_inv = m_rr.inverse();
-    sliceInnerTimer.Start();
-    sliceInnerProductMatrix(m_rr,R,R,Orthog);
-    sliceInnerTimer.Stop();
-    m_beta = m_rr_inv *m_rr;
-
-    // Search update
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(AP,m_beta,P,R,Orthog);
-    sliceMaddTimer.Stop();
-    P= AP;
-
-    /*********************
-     * convergence monitor
-     *********************
-     */
-    RealD max_resid=0;
-    RealD rr;
-    for(int b=0;b<Nblock;b++){
-      rr = real(m_rr(b,b))/ssq[b];
-      if ( rr > max_resid ) max_resid = rr;
-    }
-    
-    if ( max_resid < Tolerance*Tolerance ) { 
-
-      SolverTimer.Stop();
-
-      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
-      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
-		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
-      }
-      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-      Linop.HermOp(Psi, AP);
-      AP = AP-Src;
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
-
-      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-	    
-      IterationsToComplete = k;
-      return;
-    }
-
-  }
-  std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
-
-  if (ErrorOnNoConverge) assert(0);
-  IterationsToComplete = k;
-}
-//////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
 // Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
@@ -600,6 +465,233 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
  IterationsToComplete = k;
 }

+void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
+  // Fill m(row,col) = innerProduct(X[row],Y[col]) for all Nblock x Nblock pairs.
+  for(int row=0;row<Nblock;row++){
+    for(int col=0;col<Nblock;col++) m(row,col) = innerProduct(X[row],Y[col]);
+  }
+}
+void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
+  // AP[col] = Y[col] + sum over row of (scale*m(row,col)) * X[row].
+  // The result is built in a scratch copy first, so AP may alias X or Y.
+  // Possible optimisation noted by the author: site outermost, parallel_for.
+  std::vector<Field> staged(Nblock,X[0]);
+  for(int col=0;col<Nblock;col++){
+    staged[col] = Y[col];
+    for(int row=0;row<Nblock;row++){
+      staged[col] = staged[col] + (scale*m(row,col))*X[row];
+    }
+  }
+  // Copy back only after every column is complete.
+  for(int col=0;col<Nblock;col++) AP[col] = staged[col];
+}
+void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
+  // AP[col] = sum over row of m(row,col) * X[row]. AP is zeroed and accumulated
+  // in place, so it must not alias X.
+  // Possible optimisation noted by the author: site outermost, parallel_for.
+  for(int col=0;col<Nblock;col++){
+    AP[col] = zero;
+    for(int row=0;row<Nblock;row++) AP[col] += (m(row,col))*X[row];
+  }
+}
+double normv(const std::vector<Field> &P){
+  // Sum of norm2 over the first Nblock entries of P.
+  double total = 0.0;
+  int b = 0;
+  while(b<Nblock) total += norm2(P[b++]);
+  return total;
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQvec implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) 
+{
+  Nblock = B.size();
+  assert(B.size() > 0 && B.size() == X.size()); // B[0] is used as the template field below, so B must be non-empty
+  // Solves A X[b] = B[b] for all b at once; X holds the guess on entry and the solution on exit.
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
+
+  for(int b=0;b<Nblock;b++){ 
+    X[b].checkerboard = B[b].checkerboard;
+    conformable(X[b], B[b]);
+    conformable(X[b], X[0]); 
+  }
+
+  Field Fake(B[0]);
+
+  std::vector<Field> tmp(Nblock,Fake);
+  std::vector<Field>   Q(Nblock,Fake);
+  std::vector<Field>   D(Nblock,Fake);
+  std::vector<Field>   Z(Nblock,Fake);
+  std::vector<Field>  AD(Nblock,Fake);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  // Scratch for the small-matrix products (M*C and S^dag) formed each iteration.
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  // NaN check on the source norms: reuse ssq instead of recomputing norm2(B[b]).
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(ssq[b])==0); }
+
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  for(int b=0;b<Nblock;b++) {
+    Linop.HermOp(X[b], AD[b]);
+    tmp[b] = B[b] - AD[b];  
+  }
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+
+  for(int b=0;b<Nblock;b++) D[b]=Q[b];
+
+  std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    InnerProductMatrix(m_DZ,D,Z);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    MaddMatrix(X,m_tmp, D,X);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    MaddMatrix(tmp,m_M,Z,Q,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+    sliceMaddTimer.Start();
+    MaddMatrix(D,m_tmp,D,Q);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+      // Recompute the residual from scratch (AD = A X - B) to report the true value.
+      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
+      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+
+
+
 };

 }
--- a/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
@@ -0,0 +1,244 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+template<class Field>
+class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> { // restarted GMRES solver; entry point is operator()
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance;
+
+  Integer MaxIterations;
+  Integer RestartLength;
+  Integer MaxNumberOfRestarts;
+  Integer IterationCount; // Number of iterations the CAGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  Eigen::MatrixXcd H; // H(iter, i) holds the coefficient of v[i] produced by Arnoldi step iter
+
+  std::vector<std::complex<double>> y;
+  std::vector<std::complex<double>> gamma;
+  std::vector<std::complex<double>> c;
+  std::vector<std::complex<double>> s;
+
+  CommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
+                                                  Integer maxit,
+                                                  Integer restart_length,
+                                                  bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv) // initialisers listed in declaration order, the order they actually run in
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.) {};
+
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+    // Solves LinOp.Op(psi) = src starting from the guess in psi, running up to MaxNumberOfRestarts restart cycles.
+    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+        // Recompute the residual explicitly (r = LinOp.Op(psi) - src) for the report.
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+    // One restart cycle: up to RestartLength Arnoldi steps, then psi is updated. Returns the squared residual estimate.
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // this should probably be made a class member so that it is only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+    cp = norm2(r);
+    gamma[0] = sqrt(cp);
+    if (cp <= rsq) { LinalgTimer.Stop(); return cp; } // residual already at target; also avoids dividing by gamma[0]==0 below
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(v, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
+    // w = LinOp.Op(v[iter]), orthogonalised against v[0..iter]; the normalised remainder becomes v[iter+1].
+    MatrixTimer.Start();
+    LinOp.Op(v[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  void qrUpdate(int iter) {
+    // Apply the stored rotations (c[i], s[i]) to the new entries H(iter, .), then build the rotation that zeroes H(iter, iter+1).
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+    // Rotate the right-hand side as well; std::norm(gamma[iter+1]) is what the caller uses as the residual estimate.
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
+    // Back-substitution for y[0..iter] from gamma and H, then psi += sum_i v[i] * y[i].
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + v[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -133,7 +133,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
      LinalgTimer.Stop();

      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-                << " residual " << cp << " target " << rsq << std::endl;
+                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; // relative residual norm, same scale as Tolerance

      // Stopping condition
      if (cp <= rsq) {
--- a/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
@@ -0,0 +1,256 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+// Restarted flexible GMRES with a caller-supplied preconditioner; operator() warns that it does not yet differ from regular FGMRES.
+template<class Field>
+class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance; // Target for |residual| / |src|; compared in squared form (see rsq in operator())
+
+  Integer MaxIterations;
+  Integer RestartLength; // Number of Arnoldi steps per restart cycle
+  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength rounded up, computed in the constructor
+  Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch PrecTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  Eigen::MatrixXcd H; // Arnoldi coefficients, indexed H(iteration, basis index); made triangular in place by qrUpdate
+
+  std::vector<std::complex<double>> y;     // Solution of the small triangular system (computeSolution)
+  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 is the residual estimate after step i
+  std::vector<std::complex<double>> c;     // Givens rotation coefficients, one pair (c[i], s[i]) per iteration
+  std::vector<std::complex<double>> s;
+
+  LinearFunction<Field> &Preconditioner; // Held by reference, so it has to outlive this solver
+  // restart_length must be non-zero: it is used as a divisor when computing MaxNumberOfRestarts.
+  FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
+                                                          Integer maxit,
+                                                          LinearFunction<Field> &Prec,
+                                                          Integer restart_length,
+                                                          bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv) // listed in declaration order, which is the order the members are initialised in
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.)
+      , Preconditioner(Prec) {}
+
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+    // Solves LinOp * psi = src. psi holds the initial guess on entry and is updated in place.
+    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq; // squared absolute target, compared against squared residual norms
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    PrecTimer.Reset();
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+        // Recompute the residual from psi for the report; only its norm is used.
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+    // One restart cycle of at most RestartLength iterations starting from psi; returns the squared residual norm (estimate).
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // these should probably be made class members so that they are only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+    cp = norm2(r);
+    gamma[0] = sqrt(cp);
+    if (cp <= rsq) { LinalgTimer.Stop(); return cp; } // already at target; also avoids the 1/gamma[0] below when r == 0
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, z, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(z, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
+    // z[iter] = Preconditioner(v[iter]); w = LinOp * z[iter], orthogonalised against v[0..iter]; fills row `iter` of H and v[iter + 1].
+    PrecTimer.Start();
+    Preconditioner(v[iter], z[iter]);
+    PrecTimer.Stop();
+
+    MatrixTimer.Start();
+    LinOp.Op(z[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  void qrUpdate(int iter) {
+    // Updates row `iter` of H with Givens rotations so that H(iter, iter + 1) becomes zero, and rotates gamma accordingly.
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
+    // Back substitution on the triangular H for y, then psi += sum_i z[i] * y[i] (the preconditioned vectors, not v).
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + z[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
@@ -0,0 +1,254 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+// Restarted flexible GMRES: the preconditioner is applied inside every Arnoldi step and its outputs z[i] are kept to build the solution.
+template<class Field>
+class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance; // Target for |residual| / |src|; compared in squared form (see rsq in operator())
+
+  Integer MaxIterations;
+  Integer RestartLength; // Number of Arnoldi steps per restart cycle
+  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength rounded up, computed in the constructor
+  Integer IterationCount; // Number of iterations the FGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch PrecTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  Eigen::MatrixXcd H; // Arnoldi coefficients, indexed H(iteration, basis index); made triangular in place by qrUpdate
+
+  std::vector<std::complex<double>> y;     // Solution of the small triangular system (computeSolution)
+  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 is the residual estimate after step i
+  std::vector<std::complex<double>> c;     // Givens rotation coefficients, one pair (c[i], s[i]) per iteration
+  std::vector<std::complex<double>> s;
+
+  LinearFunction<Field> &Preconditioner; // Held by reference, so it has to outlive this solver
+  // restart_length must be non-zero: it is used as a divisor when computing MaxNumberOfRestarts.
+  FlexibleGeneralisedMinimalResidual(RealD   tol,
+                                     Integer maxit,
+                                     LinearFunction<Field> &Prec,
+                                     Integer restart_length,
+                                     bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv) // listed in declaration order, which is the order the members are initialised in
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.)
+      , Preconditioner(Prec) {}
+
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+    // Solves LinOp * psi = src. psi holds the initial guess on entry and is updated in place.
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq; // squared absolute target, compared against squared residual norms
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    PrecTimer.Reset();
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+        // Recompute the residual from psi for the report; only its norm is used.
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+    // One restart cycle of at most RestartLength iterations starting from psi; returns the squared residual norm (estimate).
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // these should probably be made class members so that they are only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+    cp = norm2(r);
+    gamma[0] = sqrt(cp);
+    if (cp <= rsq) { LinalgTimer.Stop(); return cp; } // already at target; also avoids the 1/gamma[0] below when r == 0
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, z, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(z, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
+    // z[iter] = Preconditioner(v[iter]); w = LinOp * z[iter], orthogonalised against v[0..iter]; fills row `iter` of H and v[iter + 1].
+    PrecTimer.Start();
+    Preconditioner(v[iter], z[iter]);
+    PrecTimer.Stop();
+
+    MatrixTimer.Start();
+    LinOp.Op(z[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  void qrUpdate(int iter) {
+    // Updates row `iter` of H with Givens rotations so that H(iter, iter + 1) becomes zero, and rotates gamma accordingly.
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
+    // Back substitution on the triangular H for y, then psi += sum_i z[i] * y[i] (the preconditioned vectors, not v).
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + z[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
@@ -0,0 +1,242 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/GeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+// Restarted GMRES without preconditioning: Arnoldi basis in v, coefficients in H, triangularised step by step with Givens rotations.
+template<class Field>
+class GeneralisedMinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance; // Target for |residual| / |src|; compared in squared form (see rsq in operator())
+
+  Integer MaxIterations;
+  Integer RestartLength; // Number of Arnoldi steps per restart cycle
+  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength rounded up, computed in the constructor
+  Integer IterationCount; // Number of iterations the GMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+
+  Eigen::MatrixXcd H; // Arnoldi coefficients, indexed H(iteration, basis index); made triangular in place by qrUpdate
+
+  std::vector<std::complex<double>> y;     // Solution of the small triangular system (computeSolution)
+  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 is the residual estimate after step i
+  std::vector<std::complex<double>> c;     // Givens rotation coefficients, one pair (c[i], s[i]) per iteration
+  std::vector<std::complex<double>> s;
+  // restart_length must be non-zero: it is used as a divisor when computing MaxNumberOfRestarts.
+  GeneralisedMinimalResidual(RealD   tol,
+                             Integer maxit,
+                             Integer restart_length,
+                             bool    err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv) // listed in declaration order, which is the order the members are initialised in
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.) {}
+
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
+    // Solves LinOp * psi = src. psi holds the initial guess on entry and is updated in place.
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD cp;
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq; // squared absolute target, compared against squared residual norms
+
+    Field r(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "GeneralisedMinimalResidual:   src " << ssq   << std::endl;
+
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int k=0; k<MaxNumberOfRestarts; k++) {
+
+      cp = outerLoopBody(LinOp, src, psi, rsq);
+
+      // Stopping condition
+      if (cp <= rsq) {
+
+        SolverTimer.Stop();
+        // Recompute the residual from psi for the report; only its norm is used.
+        LinOp.Op(psi,r);
+        axpy(r,-1.0,src,r);
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "GMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
+    // One restart cycle of at most RestartLength iterations starting from psi; returns the squared residual norm (estimate).
+    RealD cp = 0;
+
+    Field w(src._grid);
+    Field r(src._grid);
+
+    // this should probably be made a class member so that it is only allocated once, not in every restart
+    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r = src - w;
+    cp = norm2(r);
+    gamma[0] = sqrt(cp);
+    if (cp <= rsq) { LinalgTimer.Stop(); return cp; } // already at target; also avoids the 1/gamma[0] below when r == 0
+    v[0] = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+
+      IterationCount++;
+
+      arnoldiStep(LinOp, v, w, i);
+
+      qrUpdate(i);
+
+      cp = std::norm(gamma[i+1]);
+
+      std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+
+        computeSolution(v, psi, i);
+
+        return cp;
+      }
+    }
+
+    assert(0); // Never reached
+    return cp;
+  }
+
+  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
+    // w = LinOp * v[iter], orthogonalised against v[0..iter]; fills row `iter` of H and v[iter + 1].
+    MatrixTimer.Start();
+    LinOp.Op(v[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  void qrUpdate(int iter) {
+    // Updates row `iter` of H with Givens rotations so that H(iter, iter + 1) becomes zero, and rotates gamma accordingly.
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
+    // Back substitution on the triangular H for y, then psi += sum_i v[i] * y[i].
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+
+    for (int i = 0; i <= iter; i++)
+      psi = psi + v[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/MinimalResidual.h
+++ b/Grid/algorithms/iterative/MinimalResidual.h
@@ -0,0 +1,156 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/MinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MINIMAL_RESIDUAL_H
+#define GRID_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+// Minimal residual iteration: each step moves psi along the current residual r by a = <Mr, r> / |Mr|^2, scaled by overRelaxParam.
+template<class Field> class MinimalResidual : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
+                          // Defaults true.
+  RealD   Tolerance;      // Target for |residual| / |src|; compared in squared form (see rsq in operator())
+  Integer MaxIterations;
+  RealD   overRelaxParam; // Factor applied to the step length a in every iteration
+  Integer IterationsToComplete; // Number of iterations the MR took to finish.
+                                // Filled in upon completion
+
+  MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
+    : ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), overRelaxParam(ovrelparam), IterationsToComplete(0) {}
+
+  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+    // Solves Linop * psi = src. psi holds the initial guess on entry and is updated in place.
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    Complex a, c;
+    Real    d;
+
+    Field Mr(src);
+    Field r(src);
+
+    // Initial residual computation & set up
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    RealD ssq = norm2(src);
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    Linop.Op(psi, Mr);
+
+    r = src - Mr;
+
+    RealD cp = norm2(r);
+
+    IterationsToComplete = 0; // value reported if the guess already meets the target and we return before iterating
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
+    std::cout << GridLogIterative << "MinimalResidual:   src " << ssq << std::endl;
+    std::cout << GridLogIterative << "MinimalResidual:  cp,r " << cp << std::endl;
+
+    if (cp <= rsq) {
+      return;
+    }
+
+    std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
+
+    GridStopWatch LinalgTimer;
+    GridStopWatch MatrixTimer;
+    GridStopWatch SolverTimer;
+
+    SolverTimer.Start();
+    int k;
+    for (k = 1; k <= MaxIterations; k++) {
+
+      MatrixTimer.Start();
+      Linop.Op(r, Mr);
+      MatrixTimer.Stop();
+
+      LinalgTimer.Start();
+
+      c = innerProduct(Mr, r);
+
+      d = norm2(Mr);
+
+      a = c / d;
+
+      a = a * overRelaxParam;
+
+      psi = psi + r * a;
+
+      r = r - Mr * a; // residual updated by recurrence; the true residual is recomputed only on convergence
+
+      cp = norm2(r);
+
+      LinalgTimer.Stop();
+
+      std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
+                << " residual " << cp << " target " << rsq << std::endl;
+      std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
+
+      // Stopping condition
+      if (cp <= rsq) {
+        SolverTimer.Stop();
+
+        Linop.Op(psi, Mr);
+        r = src - Mr;
+
+        RealD srcnorm       = sqrt(ssq);
+        RealD resnorm       = sqrt(norm2(r));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage        << "MinimalResidual Converged on iteration " << k
+                  << " computed residual " << sqrt(cp / ssq)
+                  << " true residual "     << true_residual
+                  << " target "            << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "MR Time elapsed: Total   " << SolverTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MR Time elapsed: Matrix  " << MatrixTimer.Elapsed() << std::endl;
+        std::cout << GridLogMessage << "MR Time elapsed: Linalg  " << LinalgTimer.Elapsed() << std::endl;
+
+        if (ErrorOnNoConverge)
+          assert(true_residual / Tolerance < 10000.0);
+
+        IterationsToComplete = k;
+
+        return;
+      }
+    }
+
+    std::cout << GridLogMessage << "MinimalResidual did NOT converge"
+              << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+
+    IterationsToComplete = MaxIterations; // k has already stepped one past the last iteration actually performed
+  }
+};
+} // namespace Grid
+#endif
--- a/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
@@ -0,0 +1,273 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+#define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
+
+namespace Grid {
+
+template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
+class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
+ public:
+  bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
+                          // defaults to true
+
+  RealD   Tolerance;
+
+  Integer MaxIterations;
+  Integer RestartLength;
+  Integer MaxNumberOfRestarts;
+  Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
+                          // filled in upon completion
+
+  GridStopWatch MatrixTimer;
+  GridStopWatch PrecTimer;
+  GridStopWatch LinalgTimer;
+  GridStopWatch QrTimer;
+  GridStopWatch CompSolutionTimer;
+  GridStopWatch ChangePrecTimer;
+
+  Eigen::MatrixXcd H;
+
+  std::vector<std::complex<double>> y;
+  std::vector<std::complex<double>> gamma;
+  std::vector<std::complex<double>> c;
+  std::vector<std::complex<double>> s;
+
+  GridBase* SinglePrecGrid;
+
+  LinearFunction<FieldF> &Preconditioner;
+
+  // tol: relative residual target; maxit: cap on total Arnoldi iterations; sp_grid: grid for the FieldF work vectors;
+  // Prec: preconditioner, applied to FieldF vectors; restart_length: Krylov vectors built per restart cycle.
+  MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD tol, Integer maxit, GridBase *sp_grid,
+                                                   LinearFunction<FieldF> &Prec, Integer restart_length,
+                                                   bool err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv) // list follows declaration order, the order members are actually initialised in
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , RestartLength(restart_length)
+      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1)) // ceil(maxit / restart_length)
+      , IterationCount(0) // was left indeterminate until the first solve
+      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
+      , y(RestartLength + 1, 0.)
+      , gamma(RestartLength + 1, 0.)
+      , c(RestartLength + 1, 0.)
+      , s(RestartLength + 1, 0.)
+      , SinglePrecGrid(sp_grid)
+      , Preconditioner(Prec) {}
+
+  // Solve LinOp * psi = src by restarted MPFGMRES, with the incoming psi as initial guess. Restart cycles run
+  // until outerLoopBody returns a squared residual <= Tolerance^2 * norm2(src) or MaxNumberOfRestarts is hit.
+  void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    const RealD guessNorm2 = norm2(psi);
+    assert(std::isnan(guessNorm2) == 0);
+
+    const RealD srcNorm2    = norm2(src);
+    const RealD targetNorm2 = Tolerance * Tolerance * srcNorm2;
+
+    FieldD residual(src._grid);
+
+    std::cout << std::setprecision(4) << std::scientific;
+    std::cout << GridLogIterative << "MPFGMRES: guess " << guessNorm2 << std::endl;
+    std::cout << GridLogIterative << "MPFGMRES:   src " << srcNorm2   << std::endl;
+
+    // Phase timers are cumulative over one solve
+    PrecTimer.Reset();
+    MatrixTimer.Reset();
+    LinalgTimer.Reset();
+    QrTimer.Reset();
+    CompSolutionTimer.Reset();
+    ChangePrecTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    IterationCount = 0;
+
+    for (int restart = 0; restart < MaxNumberOfRestarts; ++restart) {
+      const RealD estimate = outerLoopBody(LinOp, src, psi, targetNorm2);
+
+      // Target not met (a NaN estimate also lands here): go round again
+      if (!(estimate <= targetNorm2)) continue;
+
+      SolverTimer.Stop();
+
+      // Re-evaluate the residual with the operator instead of trusting the estimate
+      LinOp.Op(psi, residual);
+      axpy(residual, -1.0, src, residual);
+
+      const RealD srcNorm      = sqrt(srcNorm2);
+      const RealD residualNorm = sqrt(norm2(residual));
+      const RealD trueResidual = residualNorm / srcNorm;
+
+      std::cout << GridLogMessage        << "MPFGMRES: Converged on iteration " << IterationCount
+                << " computed residual " << sqrt(estimate / srcNorm2)
+                << " true residual "     << trueResidual
+                << " target "            << Tolerance << std::endl;
+
+      std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total      " <<       SolverTimer.Elapsed() << std::endl;
+      std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon     " <<         PrecTimer.Elapsed() << std::endl;
+      std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix     " <<       MatrixTimer.Elapsed() << std::endl;
+      std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg     " <<       LinalgTimer.Elapsed() << std::endl;
+      std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR         " <<           QrTimer.Elapsed() << std::endl;
+      std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol    " << CompSolutionTimer.Elapsed() << std::endl;
+      std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " <<   ChangePrecTimer.Elapsed() << std::endl;
+      return;
+    }
+
+    std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
+
+    if (ErrorOnNoConverge)
+      assert(0);
+  }
+
+  // One restart cycle: build up to RestartLength Krylov vectors from the current residual and update psi. Returns the
+  // squared residual estimate of the last step, or the squared initial residual (psi untouched) if that meets rsq.
+  RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
+    RealD cp = 0;
+    FieldD w(src._grid);
+    FieldD r(src._grid);
+
+    MatrixTimer.Start();
+    LinOp.Op(psi, w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    r  = src - w;
+    cp = norm2(r);
+    LinalgTimer.Stop();
+
+    // Already converged (includes an exact guess, r == 0): normalising r below would divide by zero and
+    // push NaN into v[0] and from there into psi. Checked before the basis vectors are allocated.
+    if (cp <= rsq) return cp;
+
+    // these should probably be made class members so that they are only allocated once, not in every restart
+    std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
+    std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
+
+    LinalgTimer.Start();
+    gamma[0] = sqrt(cp);
+    v[0]     = (1. / gamma[0]) * r;
+    LinalgTimer.Stop();
+
+    for (int i=0; i<RestartLength; i++) {
+      IterationCount++;
+      arnoldiStep(LinOp, v, z, w, i); // fills z[i], v[i+1] and H(i, 0..i+1)
+      qrUpdate(i);                    // rotates H(i, .) and updates gamma[i], gamma[i+1]
+
+      cp = std::norm(gamma[i+1]);
+      std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
+                << " residual " << cp << " target " << rsq << std::endl;
+
+      // Leave on a full cycle, an exhausted iteration budget, or convergence; psi is only updated here
+      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
+        computeSolution(z, psi, i);
+        return cp;
+      }
+    }
+    assert(0); // Never reached
+    return cp;
+  }
+
+  // Arnoldi step `iter`: z[iter] = Preconditioner(v[iter]) via FieldF, w = LinOp z[iter], then w is orthogonalised against v[0..iter].
+  void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
+    FieldF v_f(SinglePrecGrid);
+    FieldF z_f(SinglePrecGrid);
+
+    ChangePrecTimer.Start();
+    precisionChange(v_f, v[iter]);
+    precisionChange(z_f, z[iter]); // z[iter] is still the zero assigned in outerLoopBody, so z_f starts from zero
+    ChangePrecTimer.Stop();
+
+    PrecTimer.Start();
+    Preconditioner(v_f, z_f);
+    PrecTimer.Stop();
+
+    ChangePrecTimer.Start();
+    precisionChange(z[iter], z_f);
+    ChangePrecTimer.Stop();
+
+    MatrixTimer.Start();
+    LinOp.Op(z[iter], w);
+    MatrixTimer.Stop();
+
+    LinalgTimer.Start();
+    for (int i = 0; i <= iter; ++i) {
+      H(iter, i) = innerProduct(v[i], w);
+      w = w - H(iter, i) * v[i];
+    }
+    // Norm of the remainder goes to H(iter, iter + 1); NOTE(review): nothing here guards against w == 0
+    H(iter, iter + 1) = sqrt(norm2(w));
+    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
+    LinalgTimer.Stop();
+  }
+
+  // Apply the stored rotations (c[i], s[i], i < iter) to H(iter, .), then build rotation `iter` that zeroes H(iter, iter + 1).
+  void qrUpdate(int iter) {
+    QrTimer.Start();
+    for (int i = 0; i < iter ; ++i) {
+      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
+      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
+      H(iter, i + 1) = tmp;
+    }
+
+    // Compute new Givens Rotation
+    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
+    c[iter]     = H(iter, iter) / nu;
+    s[iter]     = H(iter, iter + 1) / nu;
+
+    // Apply new Givens rotation
+    H(iter, iter)     = nu;
+    H(iter, iter + 1) = 0.;
+    // Rotate the right-hand side as well; outerLoopBody reads std::norm(gamma[iter + 1]) as the residual estimate
+    gamma[iter + 1] = -s[iter] * gamma[iter];
+    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
+    QrTimer.Stop();
+  }
+
+  // Back-substitute for y[0..iter] from gamma and the entries H(k, i), k >= i, then add sum_i z[i] * y[i] to psi.
+  void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
+    CompSolutionTimer.Start();
+    for (int i = iter; i >= 0; i--) {
+      y[i] = gamma[i];
+      for (int k = i + 1; k <= iter; k++)
+        y[i] = y[i] - H(k, i) * y[k];
+      y[i] = y[i] / H(i, i);
+    }
+    // The correction is assembled from the preconditioned vectors z, not from the basis v
+    for (int i = 0; i <= iter; i++)
+      psi = psi + z[i] * y[i];
+    CompSolutionTimer.Stop();
+  }
+};
+}
+#endif
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -139,8 +139,11 @@ namespace Grid {
      MatTimer.Start();
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
      MatTimer.Stop();
+
+      LinalgTimer.Start();
      r=src-Az;
-      
+      LinalgTimer.Stop();
+
      /////////////////////
      // p = Prec(r)
      /////////////////////
@@ -152,8 +155,10 @@ namespace Grid {
      Linop.HermOp(z,tmp); 
      MatTimer.Stop();

+      LinalgTimer.Start();
      ttmp=tmp;
      tmp=tmp-r;
+      LinalgTimer.Stop();

      /*
      std::cout<<GridLogMessage<<r<<std::endl;
@@ -166,12 +171,14 @@ namespace Grid {
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
      MatTimer.Stop();

+      LinalgTimer.Start();
      //p[0],q[0],qq[0] 
      p[0]= z;
      q[0]= Az;
      qq[0]= zAAz;

      cp =norm2(r);
+      LinalgTimer.Stop();

      for(int k=0;k<nstep;k++){

@@ -181,12 +188,14 @@ namespace Grid {
 	int peri_k = k %mmax;
 	int peri_kp= kp%mmax;

+        LinalgTimer.Start();
 	rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
 	a = rq/qq[peri_k];

 	axpy(psi,a,p[peri_k],psi);         

-	cp = axpy_norm(r,-a,q[peri_k],r);  
+	cp = axpy_norm(r,-a,q[peri_k],r);
+        LinalgTimer.Stop();

 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
@@ -202,6 +211,8 @@ namespace Grid {
 	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 	Linop.HermOp(z,tmp);
 	MatTimer.Stop();
+
+        LinalgTimer.Start();
        tmp=tmp-r;
 	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 

@@ -219,9 +230,9 @@ namespace Grid {

 	}
 	qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
-
-
+        LinalgTimer.Stop();
      }
+
      assert(0); // never reached
      return cp;
    }
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -86,229 +86,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   */
 namespace Grid {

+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Use base class to share code
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Now make the norm reflect extra factor of Mee
-  template<class Field> class SchurRedBlackStaggeredSolve {
-  private:
+  template<class Field> class SchurRedBlackBase {
+  protected:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
  public:

-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
-    { 
-      CBfactorise=0;
-      subtractGuess(initSubGuess);
-    };
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-      
-      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
-    
-      /////////////////////////////////////////////////////
-      // src_o = (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      //src_o = tmp;     assert(src_o.checkerboard ==Odd);
-      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
-      guess(src_o, sol_o);
-      Mtmp = sol_o;
-      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
-      // Fionn A2A boolean behavioural control
-      if (subGuess)        sol_o = sol_o-Mtmp;
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
-
-      // Verify the unprec residual
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-        std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackDiagMooeeSolve {
-  private:
-    OperatorFunction<Field> & _HermitianRBSolver;
-    int CBfactorise;
-    bool subGuess;
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false)  :  _HermitianRBSolver(HermitianRBSolver) 
-  { 
-    CBfactorise=cb;
-    subtractGuess(initSubGuess);
-  };
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-    
-      /////////////////////////////////////////////////////
-      // src_o = Mdag * (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-      guess(src_o,sol_o);
-      Mtmp = sol_o;
-      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      // Fionn A2A boolean behavioural control
-      if (subGuess)        sol_o = sol_o-Mtmp;
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-
-      // Verify the unprec residual
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-
-        std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackDiagTwoSolve {
-  private:
-    OperatorFunction<Field> & _HermitianRBSolver;
-    int CBfactorise;
-    bool subGuess;
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
+    SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
+    _HermitianRBSolver(HermitianRBSolver) 
    { 
      CBfactorise = 0;
      subtractGuess(initSubGuess);
@@ -322,12 +116,86 @@ namespace Grid {
      return subGuess;
    }

-    template<class Matrix>
+    /////////////////////////////////////////////////////////////
+    // Shared code
+    /////////////////////////////////////////////////////////////
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
-    template<class Matrix,class Guesser>
+    // Multi-RHS entry point without a caller-supplied guesser: forwards with a default-constructed ZeroGuesser<Field>
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out) {
+      ZeroGuesser<Field> zeroGuess;
+      (*this)(_Matrix, in, out, zeroGuess);
+    }
+
+    // Multi-RHS Schur solve: red-black sources for all in[b], one block RedBlackSolve, then per-RHS reconstruction into out[b]
+    template<class Guesser>
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+      int nblock = in.size();
+      assert(out.size() >= in.size()); // out[b] is written for every b < nblock; a short vector would be indexed out of range
+      std::vector<Field> src_o(nblock,grid);
+      std::vector<Field> sol_o(nblock,grid);
+
+      std::vector<Field> guess_save;
+
+      Field resid(fgrid);
+      Field tmp(grid);
+
+      ////////////////////////////////////////////////
+      // Prepare RedBlack source
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++){
+        RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
+      }
+      ////////////////////////////////////////////////
+      // Make the guesses
+      ////////////////////////////////////////////////
+      if ( subGuess ) guess_save.resize(nblock,grid);
+
+      for(int b=0;b<nblock;b++){
+        guess(src_o[b],sol_o[b]);
+
+        if ( subGuess ) {
+          guess_save[b] = sol_o[b];
+        }
+      }
+      //////////////////////////////////////////////////////////////
+      // Call the block solver
+      //////////////////////////////////////////////////////////////
+      std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // A2A boolean behavioural control & reconstruct other checkerboard
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++) {
+
+        if (subGuess)   sol_o[b] = sol_o[b] - guess_save[b];
+
+        ///////// Needs even source //////////////
+        pickCheckerboard(Even,tmp,in[b]);
+        RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
+
+        /////////////////////////////////////////////////
+        // Check unprec residual if possible
+        /////////////////////////////////////////////////
+        if ( ! subGuess ) {
+          _Matrix.M(out[b],resid);
+          resid = resid-in[b];
+          RealD ns = norm2(in[b]);
+          RealD nr = norm2(resid);
+
+          std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+        } else {
+          std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
+        }
+
+      }
+    }
+    template<class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
@@ -335,52 +203,39 @@ namespace Grid {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();

-      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
      Field resid(fgrid);
+      Field src_o(grid);
+      Field src_e(grid);
+      Field sol_o(grid);

-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-    
-      /////////////////////////////////////////////////////
-      // src_o = Mdag * (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+      ////////////////////////////////////////////////
+      // RedBlack source
+      ////////////////////////////////////////////////
+      RedBlackSource(_Matrix,in,src_e,src_o);

-      // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
+      ////////////////////////////////
+      // Construct the guess
+      ////////////////////////////////
+      Field   tmp(grid);
+      guess(src_o,sol_o);
+
+      Field  guess_save(grid);
+      guess_save = sol_o;

      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      guess(src_o,tmp);
-      Mtmp = tmp;
-      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
      // Fionn A2A boolean behavioural control
-      if (subGuess)      tmp = tmp-Mtmp;
-      _Matrix.MooeeInv(tmp,sol_o);       assert(  sol_o.checkerboard   ==Odd);
+      ////////////////////////////////////////////////
+      if (subGuess)      sol_o= sol_o-guess_save;

      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      // RedBlack solution needs the even source
      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
+      RedBlackSolution(_Matrix,sol_o,src_e,out);

      // Verify the unprec residual
      if ( ! subGuess ) {
@@ -389,68 +244,182 @@ namespace Grid {
        RealD ns = norm2(in);
        RealD nr = norm2(resid);

-        std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
+        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
+        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
    }     
+    
+    /////////////////////////////////////////////////////////////
+    // Override in derived. Pure virtual: each Schur variant supplies its own source, solve and solution steps.
+    /////////////////////////////////////////////////////////////
+    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)=0;
+
  };
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackDiagTwoMixed {
-  private:
-    LinearFunction<Field> & _HermitianRBSolver;
-    int CBfactorise;
-    bool subGuess;
+
+  template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) 
+      :    SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) 
+    {
+    }
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+
+      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+      Field   src_e(grid);
+
+      src_e = src_e_c; // Const correctness
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
+      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal has Mooee on it.
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) {};
+
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+
+      // get the right MpcDag
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
+
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field  sol_e(grid);
+      Field  src_e_i(grid);
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.checkerboard   ==Even);
+      src_e_i = src_e-tmp;               assert(  src_e_i.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.checkerboard ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, right preconditioned by Mee^inv
+  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi = ( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = ( Mee - Meo Moo^inv Moe ) psi = eta
+  //=> psi = MeeInv phi
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;

    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
-    { 
-      CBfactorise=0;
-      subtractGuess(initSubGuess);
-    };
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
+  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess) {};

-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();

      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
+      
      Field   tmp(grid);
      Field  Mtmp(grid);
-      Field resid(fgrid);

-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
    
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
@@ -461,43 +430,44 @@ namespace Grid {

      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
+    }

-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-//      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      guess(src_o,tmp);
-      Mtmp = tmp;
-      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      // Fionn A2A boolean behavioural control
-      if (subGuess)      tmp = tmp-Mtmp;
-      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    { // Builds the full solution 'sol' from the preconditioned odd solve result sol_o and the even source src_e.
+      GridBase *grid = _Matrix.RedBlackGrid();
+      // All work fields below live on the red-black (checkerboarded) grid.
+
+      Field   sol_o_i(grid);
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+      ////////////////////////////////////////////////
+      // MooeeInv due to precond: sol_o_i = Mooee^-1 sol_o
+      ////////////////////////////////////////////////
+      _Matrix.MooeeInv(sol_o,tmp);
+      sol_o_i = tmp;

       ///////////////////////////////////////////////////
       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
       ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
+      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.checkerboard   ==Even);
+      tmp = src_e-tmp;               assert(  tmp.checkerboard   ==Even); // check the computed result, not the const input
+      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.checkerboard ==Even);
      
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
+      setCheckerboard(sol,sol_e);    assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.checkerboard ==Odd );
+    };

-      // Verify the unprec residual
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-
-        std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
  };
-
 }
 #endif
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -50,8 +50,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
      assert(0);
  }

-  Grid_quiesce_nodes();
-
  // Never clean up as done once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);

@@ -124,10 +122,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ; 
-  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
-  //  std::cout << " Parent size  "<<Nparent <<std::endl;

  int childsize=1;
  for(int d=0;d<processors.size();d++) {
@@ -136,8 +132,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);

-  //  std::cout << " child size  "<<childsize <<std::endl;
-
  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // coor of split within parent
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -455,7 +455,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -499,7 +499,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      
-      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
 	perror("failed mmap");     
 	assert(0);    
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -392,14 +392,10 @@ namespace Grid {

    void SeedUniqueString(const std::string &s){
      std::vector<int> seeds;
-      std::stringstream sha;
      seeds = GridChecksum::sha256_seeds(s);
-      for(int i=0;i<seeds.size();i++) { 
-        sha << std::hex << seeds[i];
-      }
      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" 
                << s << "'" << std::endl;
-      std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
      SeedFixedIntegers(seeds);
    }
    void SeedFixedIntegers(const std::vector<int> &seeds){
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -464,8 +464,10 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
  assert(orthog>=0);

  for(int d=0;d<nh;d++){
-    assert(lg->_processors[d]  == hg->_processors[d]);
-    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    if ( d!=orthog ) {
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    }
  }

  // the above should guarantee that the operations are local
@@ -485,7 +487,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int


 template<class vobj>
-void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  typedef typename vobj::scalar_object sobj;

@@ -499,8 +501,10 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
  assert(orthog>=0);

  for(int d=0;d<nh;d++){
-    assert(lg->_processors[d]  == hg->_processors[d]);
-    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    if ( d!=orthog ) {
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    }
  }

  // the above should guarantee that the operations are local
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -59,6 +59,7 @@ void GridLogTimestamp(int on){
 }

 Colours GridLogColours(0);
+GridLogger GridLogMG     (1, "MG"    , GridLogColours, "NORMAL");
 GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
 GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -146,9 +146,11 @@ public:
      if ( log.timestamp ) {
 	log.StopWatch->Stop();
 	GridTime now = log.StopWatch->Elapsed();
+	
 	if ( log.timing_mode==1 ) log.StopWatch->Reset();
 	log.StopWatch->Start();
-	stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
+	stream << log.evidence()
+	       << now	       << log.background() << " : " ;
      }
      stream << log.colour();
      return stream;
@@ -167,6 +169,7 @@ public:

 void GridLogConfigure(std::vector<std::string> &logstreams);

+extern GridLogger GridLogMG;
 extern GridLogger GridLogIRL;
 extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
--- a/Grid/parallelIO/BinaryIO.cc
+++ b/Grid/parallelIO/BinaryIO.cc
@@ -0,0 +1,3 @@
+#include <Grid/GridCore.h>
+
+int Grid::BinaryIO::latticeWriteMaxRetry = -1;
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
+  static int latticeWriteMaxRetry;

  /////////////////////////////////////////////////////////////////////////////
  // more byte manipulation helpers
@@ -370,7 +371,7 @@ PARALLEL_CRITICAL
 #endif
      } else {
 	std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
        std::ifstream fin;
 	fin.open(file, std::ios::binary | std::ios::in);
        if (control & BINARYIO_MASTER_APPEND)
@@ -582,7 +583,9 @@ PARALLEL_CRITICAL
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
    GridBase *grid = Umu._grid;
-    uint64_t lsites = grid->lSites();
+    uint64_t lsites = grid->lSites(), offsetCopy = offset;
+    int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
+    bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);

    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@@ -597,9 +600,35 @@ PARALLEL_CRITICAL

    grid->Barrier();
    timer.Stop();
+    while (attemptsLeft >= 0) // one write, plus up to latticeWriteMaxRetry re-writes when read-back checking is enabled
+    {
+      grid->Barrier();
+      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+	             nersc_csum,scidac_csuma,scidac_csumb);
+      if (checkWrite)
+      {
+        std::vector<fobj> ckiodata(lsites);
+        uint32_t          cknersc_csum, ckscidac_csuma, ckscidac_csumb;
+        uint64_t          ckoffset = offsetCopy; // read back from where this object was written

-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
-	     nersc_csum,scidac_csuma,scidac_csumb);
+        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
+        grid->Barrier();
+        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
+        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
+        {
+          std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
+          offset = offsetCopy; // rewind so a re-write lands on the same file position
+        }
+        else
+        {
+          std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
+          break;
+        }
+      }
+      attemptsLeft--;
+    }
+    if (checkWrite && (attemptsLeft < 0)) std::cout << GridLogError << "writeLatticeObject: read test checksum failure persists, no write attempts left" << std::endl; // previously silent

    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
  }
@@ -725,5 +754,6 @@ PARALLEL_CRITICAL
    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
  }
 };
+
 }
 #endif
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -233,7 +233,8 @@ class GridLimeReader : public BinaryIO {
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
 	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-
+  std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
+  std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
 	// Insist checksum is next record
 	/////////////////////////////////////////////
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -49,21 +49,39 @@ inline double usecond(void) {

 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
-typedef  std::chrono::milliseconds          GridMillisecs;
-typedef  std::chrono::microseconds          GridTime;
-typedef  std::chrono::microseconds          GridUsecs;

-inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
+typedef  std::chrono::seconds               GridSecs;
+typedef  std::chrono::milliseconds          GridMillisecs;
+typedef  std::chrono::microseconds          GridUsecs;
+typedef  std::chrono::microseconds          GridTime;
+
+inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 {
-  stream << time.count()<<" ms";
+  stream << time.count()<<" s";
  return stream;
 }
-inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
+inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
 {
-  stream << time.count()<<" usec";
+  GridSecs second(1);
+  auto     secs       = now/second ; 
+  auto     subseconds = now%second ;
+  auto     fill       = stream.fill();
+  stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
+  stream.fill(fill);
  return stream;
 }
- 
+inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
+{
+  GridSecs second(1);
+  auto     seconds    = now/second ; 
+  auto     subseconds = now%second ;
+  auto     fill       = stream.fill();
+  stream << seconds<<"."<<std::setw(6)<<std::setfill('0')<<subseconds.count()<<" s";
+  stream.fill(fill);
+  return stream;
+}
+
+
 class GridStopWatch {
 private:
  bool running;
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -90,17 +90,20 @@ namespace QCD {
    // That probably makes for GridRedBlack4dCartesian grid.

    // s,sp,c,spc,lc
-    template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
-    template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
-    template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
-    template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-    template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
-    template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
-    template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
-    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
-    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+
+    template<typename vtype> using iSinglet                     = iScalar<iScalar<iScalar<vtype> > >;
+    template<typename vtype> using iSpinMatrix                  = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+    template<typename vtype> using iColourMatrix                = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
+    template<typename vtype> using iSpinColourMatrix            = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
+    template<typename vtype> using iLorentzColourMatrix         = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+    template<typename vtype> using iDoubleStoredColourMatrix    = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
+    template<typename vtype> using iSpinVector                  = iScalar<iVector<iScalar<vtype>, Ns> >;
+    template<typename vtype> using iColourVector                = iScalar<iScalar<iVector<vtype, Nc> > >;
+    template<typename vtype> using iSpinColourVector            = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
+    template<typename vtype> using iHalfSpinVector              = iScalar<iVector<iScalar<vtype>, Nhs> >;
+    template<typename vtype> using iHalfSpinColourVector        = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
+

    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
@@ -127,10 +130,19 @@ namespace QCD {
     typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
     typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
     typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
-
+    
     typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
     typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
     typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
+    
+    // SpinColourSpinColour matrix
+    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+
+    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;

    // LorentzColour
    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
@@ -229,6 +250,9 @@ namespace QCD {
    typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
    typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;

+    typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
+    typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
+    typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;

    typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
    typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -44,12 +44,15 @@ namespace QCD {
  
  struct WilsonImplParams {
    bool overlapCommsCompute;
+    std::vector<Real> twist_n_2pi_L;
    std::vector<Complex> boundary_phases;
    WilsonImplParams() : overlapCommsCompute(false) {
      boundary_phases.resize(Nd, 1.0);
+      twist_n_2pi_L.resize(Nd, 0.0);
    };
-    WilsonImplParams(const std::vector<Complex> phi)
-      : boundary_phases(phi), overlapCommsCompute(false) {}
+    WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) {
+      twist_n_2pi_L.resize(Nd, 0.0);
+    }
  };

  struct StaggeredImplParams {
--- a/Grid/qcd/action/fermion/CayleyFermion5D.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc
@@ -485,9 +485,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    
  double bpc = b+c;
  double bmc = b-c;
+  _b = b;
+  _c = c;
+  _gamma  = gamma; // Save the parameters so we can change mass later.
+  _zolo_hi= zolo_hi;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
-    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
    assert(omega[i]!=Coeff_t(0.0));
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -97,7 +97,10 @@ namespace Grid {
      // Support for MADWF tricks
      ///////////////////////////////////////////////////////////////
      RealD Mass(void) { return mass; };
-      void  SetMass(RealD _mass) { mass=_mass; } ;
+      void  SetMass(RealD _mass) { 
+	mass=_mass; 
+	SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs: re-run SetCoefficientsInternal with the arguments it saved on its last call
+      } ;
      void  P(const FermionField &psi, FermionField &chi);
      void  Pdag(const FermionField &psi, FermionField &chi);

@@ -147,6 +150,12 @@ namespace Grid {
      //    protected:
      RealD mass;

+      // Save arguments to SetCoefficientsInternal
+      std::vector<Coeff_t> _gamma;
+      RealD                _zolo_hi;
+      RealD                _b;
+      RealD                _c;
+
      // Cayley form Moebius (tanh and zolotarev)
      std::vector<Coeff_t> omega;
      std::vector<Coeff_t> bs;    // S dependent coeffs
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -80,12 +80,24 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ///////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/g5HermitianLinop.h>

+///////////////////////////////////////////////////////////////////////////////
+// Fourier accelerated Pauli Villars inverse support
+///////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/WilsonTMFermion5D.h>   
+
+////////////////////////////////////////////////////////////////////////////////
+// Move this group to a DWF specific tools/algorithms subdir? 
+////////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/FourierAcceleratedPV.h>
+#include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
+#include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
+#include <Grid/qcd/action/fermion/MADWF.h>
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
 // are added, (e.g. extension for gparity, half precision project in comms etc..)
 ////////////////////////////////////////////////////////////////////////////////////////////////////

-
 // Cayley 5d
 namespace Grid {
  namespace QCD {
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -64,12 +64,6 @@ namespace Grid {
      virtual RealD  M    (const FermionField &in, FermionField &out)=0;
      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;

-      // Query the even even properties to make algorithmic decisions
-      virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
-      virtual int    isTrivialEE(void) { return 0; };
-      virtual RealD  Mass(void) {return 0.0;};
-      virtual void SetMass(RealD _mass) { return; };
-
      // half checkerboard operaions
      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@@ -141,6 +141,7 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////
  
 #define INHERIT_FIMPL_TYPES(Impl)\
+  typedef Impl Impl_t;							\
  typedef typename Impl::FermionField           FermionField;		\
  typedef typename Impl::PropagatorField     PropagatorField;		\
  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
@@ -239,16 +240,30 @@ namespace QCD {
      GaugeLinkField tmp(GaugeGrid);

      Lattice<iScalar<vInteger> > coor(GaugeGrid);
+      ////////////////////////////////////////////////////
+      // apply any boundary phase or twists
+      ////////////////////////////////////////////////////
      for (int mu = 0; mu < Nd; mu++) {

-	      auto pha = Params.boundary_phases[mu];
-	      scalar_type phase( real(pha),imag(pha) );
+	////////// boundary phase /////////////
+	auto pha = Params.boundary_phases[mu];
+	scalar_type phase( real(pha),imag(pha) );

-        int Lmu = GaugeGrid->GlobalDimensions()[mu] - 1;
+	int L   = GaugeGrid->GlobalDimensions()[mu];
+        int Lmu = L - 1;

        LatticeCoordinate(coor, mu);

        U = PeekIndex<LorentzIndex>(Umu, mu);
+
+	// apply any twists: multiply the links in direction mu by exp(i*theta), theta = twist_n_2pi_L[mu] * 2*pi / L
+	RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
+	if ( theta != 0.0) { 
+	  scalar_type twphase(::cos(theta),::sin(theta));
+	  U = twphase*U;
+	  std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<twphase <<std::endl; // log the twist phase just applied, not the boundary phase
+	}
+
        tmp = where(coor == Lmu, phase * U, U);
        PokeIndex<LorentzIndex>(Uds, tmp, mu);

--- a/Grid/qcd/action/fermion/FourierAcceleratedPV.h
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@@ -0,0 +1,237 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/FourierAcceleratedPV.h
+
+    Copyright (C) 2015
+
+Author: Christoph Lehner (lifted with permission by Peter Boyle, brought back to Grid)
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+namespace Grid {
+namespace QCD {
+
+  template<typename M>
+    void get_real_const_bc(M& m, RealD& _b, RealD& _c) { // Writes to _b,_c the single real value shared by all entries of m.bs and m.cs
+    ComplexD b,c;
+    b=m.bs[0];
+    c=m.cs[0];
+    std::cout << GridLogMessage << "b=" << b << ", c=" << c << std::endl;
+    for (size_t i=1;i<m.bs.size();i++) { // every entry must be exactly equal to the first one
+      assert(m.bs[i] == b);
+      assert(m.cs[i] == c);
+    }
+    assert(b.imag() == 0.0); // the coefficients must be purely real
+    assert(c.imag() == 0.0);
+    _b = b.real();
+    _c = c.real();
+  }
+
+
+template<typename Vi, typename M, typename G>
+class FourierAcceleratedPV {
+ public:
+
+  ConjugateGradient<Vi> &cg;
+  M& dwfPV;
+  G& Umu;
+  GridCartesian* grid5D;
+  GridRedBlackCartesian* gridRB5D;
+  int group_in_s;
+
+  FourierAcceleratedPV(M& _dwfPV, G& _Umu, ConjugateGradient<Vi> &_cg, int _group_in_s = 2) 
+   : cg(_cg), dwfPV(_dwfPV), Umu(_Umu), group_in_s(_group_in_s) // listed in member declaration order
+  {
+    assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0); // Ls must be a multiple of 2*group_in_s
+    grid5D = QCD::SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu._grid); // NOTE(review): never released by this class - confirm ownership
+    gridRB5D = QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu._grid); // NOTE(review): same ownership question as grid5D
+  }
+
+  void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
+    // Combines an s-dependent phase exp(-/+ i*pi*s/Ls) on each slice with an FFT_dim call on dimension 0; writes dst
+    GridStopWatch gsw1, gsw2;
+
+    typedef typename Vi::scalar_type Coeff_t;
+    int Ls = dst._grid->_fdimensions[0];
+
+    Vi _tmp(dst._grid);
+    double phase = M_PI / (double)Ls;
+    Coeff_t bzero(0.0,0.0);
+
+    FFT theFFT((GridCartesian*)dst._grid);
+
+    if (!forward) {
+      gsw1.Start(); // gsw1 times the phase loop, gsw2 times the FFT
+      for (int s=0;s<Ls;s++) {
+	Coeff_t a(::cos(phase*s),-::sin(phase*s)); // a = exp(-i*phase*s)
+	axpby_ssp(_tmp,a,_src,bzero,_src,s,s); // presumably _tmp[s] = a*_src[s] (second term has zero coefficient) - confirm against axpby_ssp
+      }
+      gsw1.Stop();
+
+      gsw2.Start();
+      theFFT.FFT_dim(dst,_tmp,0,FFT::forward);
+      gsw2.Stop();
+
+    } else {
+      // forward == true: FFT::backward first, then the conjugate phase
+      gsw2.Start();
+      theFFT.FFT_dim(_tmp,_src,0,FFT::backward);
+      gsw2.Stop();
+
+      gsw1.Start();
+      for (int s=0;s<Ls;s++) {
+	Coeff_t a(::cos(phase*s),::sin(phase*s)); // a = exp(+i*phase*s)
+	axpby_ssp(dst,a,_tmp,bzero,_tmp,s,s);
+      }
+      gsw1.Stop();
+    }
+
+    std::cout << GridLogMessage << "Timing rotatePV: " << gsw1.Elapsed() << ", " << gsw2.Elapsed() << std::endl;
+
+  }
+
+  void pvInv(const Vi& _src, Vi& _dst) const {
+    // Computes _dst from _src as rotatePV(true) * per-slice twisted-mass solves and G5 rotation * rotatePV(false)
+    std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;
+
+    typedef typename Vi::scalar_type Coeff_t;
+    int Ls = _dst._grid->_fdimensions[0];
+
+    GridStopWatch gswT;
+    gswT.Start();
+
+    RealD b,c;
+    get_real_const_bc(dwfPV,b,c);
+    RealD M5 = dwfPV.M5;
+    
+    // U(true) Rightinv TMinv U(false) = Minv
+
+    Vi _src_diag(_dst._grid);
+    Vi _src_diag_slice(dwfPV.GaugeGrid());
+    Vi _dst_diag_slice(dwfPV.GaugeGrid());
+    Vi _src_diag_slices(grid5D);
+    Vi _dst_diag_slices(grid5D);
+    Vi _dst_diag(_dst._grid);
+
+    rotatePV(_src,_src_diag,false);
+
+    // now do TM solves
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    GridStopWatch gswA, gswB;
+
+    gswA.Start();
+
+    typedef typename M::Impl_t Impl;
+    //WilsonTMFermion<Impl> tm(x.Umu,*x.UGridF,*x.UrbGridF,0.0,0.0,solver_outer.parent.par.wparams_f);
+    std::vector<RealD> vmass(grid5D->_fdimensions[0],0.0);
+    std::vector<RealD> vmu(grid5D->_fdimensions[0],0.0);
+
+    WilsonTMFermion5D<Impl> tm(Umu,*grid5D,*gridRB5D,
+			   *(GridCartesian*)dwfPV.GaugeGrid(),
+			   *(GridRedBlackCartesian*)dwfPV.GaugeRedBlackGrid(),
+			   vmass,vmu);
+    
+    //SchurRedBlackDiagTwoSolve<Vi> sol(cg);
+    SchurRedBlackDiagMooeeSolve<Vi> sol(cg); // same performance as DiagTwo
+    gswA.Stop();
+
+    gswB.Start();
+
+    for (int sgroup=0;sgroup<Ls/2/group_in_s;sgroup++) {
+      // Each pass handles group_in_s slices s together with their partners sprime = Ls-s-1
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	// slot 2*sidx holds slice s; slot 2*sidx+1 holds its partner, with the same mass and opposite mu
+
+	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
+	RealD cosp = ::cos(phase);
+	RealD sinp = ::sin(phase);
+	RealD denom = b*b + c*c + 2.0*b*c*cosp;
+	RealD mass = -(b*b*M5 + c*(1.0 - cosp + c*M5) + b*(-1.0 + cosp + 2.0*c*cosp*M5))/denom;
+	RealD mu = (b+c)*sinp/denom;
+
+	vmass[2*sidx + 0] = mass;
+	vmass[2*sidx + 1] = mass;
+	vmu[2*sidx + 0] = mu;
+	vmu[2*sidx + 1] = -mu;
+
+      }
+
+      tm.update(vmass,vmu);
+      // Gather the source slices of this group into the small 5d field
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	int sprime = Ls-s-1;
+
+	ExtractSlice(_src_diag_slice,_src_diag,s,0);
+	InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 0,0);
+
+	ExtractSlice(_src_diag_slice,_src_diag,sprime,0);
+	InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 1,0);
+
+      }
+
+      GridStopWatch gsw;
+      gsw.Start();
+      _dst_diag_slices = zero; // zero guess
+      sol(tm,_src_diag_slices,_dst_diag_slices);
+      gsw.Stop();
+      std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
+      // Scatter the solved slices back, applying (pA -/+ pB*G5)/(pA^2 - pB^2) to slice s / sprime
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	int sprime = Ls-s-1;
+
+	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
+	RealD cosp = ::cos(phase);
+	RealD sinp = ::sin(phase);
+
+	// now rotate with inverse of
+	Coeff_t pA = b + c*cosp;
+	Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
+	Coeff_t pABden = pA*pA - pB*pB;
+	// (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
+      
+	ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 0,0);
+	_dst_diag_slice = (pA/pABden) * _dst_diag_slice - (pB/pABden) * (G5 * _dst_diag_slice);
+	InsertSlice(_dst_diag_slice,_dst_diag,s,0);
+	
+	ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 1,0);
+	_dst_diag_slice = (pA/pABden) * _dst_diag_slice + (pB/pABden) * (G5 * _dst_diag_slice);
+	InsertSlice(_dst_diag_slice,_dst_diag,sprime,0);
+      }
+    }
+    gswB.Stop();
+
+    rotatePV(_dst_diag,_dst,true);
+
+    gswT.Stop();
+    std::cout << GridLogMessage << "PV completed in " << gswT.Elapsed() << " (Setup: " << gswA.Elapsed() << ", s-loop: " << gswB.Elapsed() << ")" << std::endl;
+  }
+
+};
+}}
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -0,0 +1,193 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/MADWF.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+namespace Grid {
+namespace QCD {
+
+template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0> // presumably enabled only when the two field types differ - confirm IfNotSame
+inline void convert(const Fieldi &from,Fieldo &to) 
+{
+  precisionChange(to,from); // different field types: go through precisionChange
+}
+template <class Fieldi, class Fieldo,IfSame<Fieldi,Fieldo> X=0> // presumably enabled only when the two field types are identical - confirm IfSame
+inline void convert(const Fieldi &from,Fieldo &to) 
+{
+  to=from; // same field type: plain assignment
+}
+
+template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser> 
+class MADWF 
+{
+ private:
+  typedef typename Matrixo::FermionField FermionFieldo;
+  typedef typename Matrixi::FermionField FermionFieldi;
+
+  PVinverter  & PauliVillarsSolvero;// For the outer field
+  SchurSolver & SchurSolveri;       // For the inner approx field
+  Guesser     & Guesseri;           // To deflate the inner approx solves
+
+  Matrixo & Mato;                   // Action object for outer
+  Matrixi & Mati;                   // Action object for inner
+
+  RealD target_resid;
+  int   maxiter;
+ public:
+
+  MADWF(Matrixo &_Mato,
+	Matrixi &_Mati, 
+	PVinverter &_PauliVillarsSolvero, 
+	SchurSolver &_SchurSolveri,
+	Guesser & _Guesseri,
+	RealD resid,
+	int _maxiter) :
+    // Initialisers follow the member declaration order, which is the order in which they run
+    PauliVillarsSolvero(_PauliVillarsSolvero),
+    SchurSolveri(_SchurSolveri),Guesseri(_Guesseri),
+    Mato(_Mato),Mati(_Mati),
+    target_resid(resid),  // operator() returns once the relative residual drops below this
+    maxiter(_maxiter)     // upper bound on the number of iterations in operator()
+  {   
+  };
+
+  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
+  { // Iterates sol5 += correction(defect), defect = b - Mato*sol5, until sqrt(|defect|^2/|b|^2) < target_resid
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+    // Work fields: suffix i lives on the inner operator's grids, no suffix on the outer operator's grids
+    FermionFieldi    c0i(Mati.GaugeGrid()); // 4d 
+    FermionFieldi    y0i(Mati.GaugeGrid()); // 4d
+    FermionFieldo    c0 (Mato.GaugeGrid()); // 4d 
+    FermionFieldo    y0 (Mato.GaugeGrid()); // 4d
+
+    FermionFieldo    A(Mato.FermionGrid()); // Temporary outer
+    FermionFieldo    B(Mato.FermionGrid()); // Temporary outer
+    FermionFieldo    b(Mato.FermionGrid()); // 5d source
+
+    FermionFieldo    c(Mato.FermionGrid()); // PVinv source; reused so store
+    FermionFieldo    defect(Mato.FermionGrid()); // current defect b - M sol5
+
+    FermionFieldi   ci(Mati.FermionGrid()); 
+    FermionFieldi   yi(Mati.FermionGrid()); 
+    FermionFieldi   xi(Mati.FermionGrid()); 
+    FermionFieldi srci(Mati.FermionGrid()); 
+    FermionFieldi   Ai(Mati.FermionGrid()); 
+
+    RealD m=Mati.Mass(); // remembered so the inner mass can be restored after the SetMass(1.0) below
+
+    ///////////////////////////////////////
+    //Import source, include Dminus factors
+    ///////////////////////////////////////
+    Mato.ImportPhysicalFermionSource(src4,b); 
+    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
+    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;
+
+    defect = b;
+    sol5=zero;
+    for (int i=0;i<maxiter;i++) {
+
+      ///////////////////////////////////////
+      // Set up c0 from current defect
+      ///////////////////////////////////////
+      PauliVillarsSolvero(Mato,defect,A);
+      Mato.Pdag(A,c);
+      ExtractSlice(c0, c, 0 , 0);
+
+      ////////////////////////////////////////////////
+      // Solve the inner system with surface term c0
+      ////////////////////////////////////////////////
+      ci = zero;  
+      convert(c0,c0i); // Possible precision change
+      InsertSlice(c0i,ci,0, 0);
+
+      // Dwm P y = Dwm x = D(1) P (c0,0,0,0)^T
+      Mati.P(ci,Ai);
+      Mati.SetMass(1.0);      Mati.M(Ai,srci);      Mati.SetMass(m);
+      SchurSolveri(Mati,srci,xi,Guesseri); 
+      Mati.Pdag(xi,yi);
+      ExtractSlice(y0i, yi, 0 , 0);
+      convert(y0i,y0); // Possible precision change
+
+      //////////////////////////////////////
+      // Propagate solution back to outer system
+      // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
+      //////////////////////////////////////
+      c0 = - y0;
+      InsertSlice(c0, c, 0   , 0);
+
+      /////////////////////////////
+      // Reconstruct the bulk solution Pdag PV^-1 Dm P 
+      /////////////////////////////
+      Mato.P(c,B);
+      Mato.M(B,A);
+      PauliVillarsSolvero(Mato,A,B);
+      Mato.Pdag(B,A);
+
+      //////////////////////////////
+      // Reinsert surface prop
+      //////////////////////////////
+      InsertSlice(y0,A,0,0);
+
+      //////////////////////////////
+      // Convert from y back to x 
+      //////////////////////////////
+      Mato.P(A,B);
+
+      //         sol5' = sol5 + M^-1 defect
+      //               = sol5 + M^-1 src - M^-1 M sol5  ...
+      sol5 = sol5 + B;
+      std::cout << GridLogMessage << "***************************************" <<std::endl;
+      std::cout << GridLogMessage << " Sol5 update "<<std::endl;
+      std::cout << GridLogMessage << "***************************************" <<std::endl;
+      std::cout << GridLogMessage << " Sol5 now "<<norm2(sol5)<<std::endl;
+      std::cout << GridLogMessage << " delta    "<<norm2(B)<<std::endl;
+
+       // New defect  = b - M sol5
+       Mato.M(sol5,A);
+       defect = b - A;
+
+       std::cout << GridLogMessage << " defect   "<<norm2(defect)<<std::endl;
+
+       double resid = ::sqrt(norm2(defect) / norm2(b));
+       std::cout << GridLogMessage << "Residual " << i << ": " << resid  << std::endl;
+       std::cout << GridLogMessage << "***************************************" <<std::endl;
+
+       if (resid < target_resid) {
+	 return;
+       }
+    }
+
+    std::cout << GridLogMessage << "MADWF : Exceeded maxiter "<<std::endl;
+    assert(0);
+    // NOTE(review): if NDEBUG is defined the assert above is a no-op and sol5 is returned unconverged
+  }
+
+};
+
+}}
--- a/Grid/qcd/action/fermion/PauliVillarsInverters.h
+++ b/Grid/qcd/action/fermion/PauliVillarsInverters.h
@@ -0,0 +1,95 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/PauliVillarsInverters.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+namespace Grid {
+namespace QCD {
+
+template<class Field>
+class PauliVillarsSolverUnprec
+{
+ public:
+  ConjugateGradient<Field> & CG;
+  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
+  // Runs CG on MdagMLinearOperator(_Matrix) with source Mdag*src, with the mass temporarily set to 1.0
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    RealD m = _Matrix.Mass(); // saved so the caller's mass can be restored
+    Field A  (_Matrix.FermionGrid());
+
+    MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
+
+    _Matrix.SetMass(1.0);
+    _Matrix.Mdag(src,A);
+    CG(HermOp,A,sol);
+    _Matrix.SetMass(m); // restore the original mass
+  };
+};
+
+template<class Field,class SchurSolverType>
+class PauliVillarsSolverRBprec
+{
+ public:
+  SchurSolverType & SchurSolver;
+  PauliVillarsSolverRBprec( SchurSolverType &_SchurSolver) : SchurSolver(_SchurSolver){};
+  // Calls the supplied SchurSolver on (_Matrix,src,sol) with the mass temporarily set to 1.0
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    RealD m = _Matrix.Mass(); // saved so the caller's mass can be restored
+    // (no temporary field is needed here: the solver reads src and writes sol directly)
+
+    _Matrix.SetMass(1.0);
+    SchurSolver(_Matrix,src,sol);
+    _Matrix.SetMass(m); // restore the original mass
+  };
+};
+
+template<class Field,class GaugeField>
+class PauliVillarsSolverFourierAccel
+{
+ public:
+  GaugeField      & Umu;
+  ConjugateGradient<Field> & CG;
+  // Stores references to the gauge field and CG solver; both are handed to FourierAcceleratedPV on each call
+  PauliVillarsSolverFourierAccel(GaugeField &_Umu,ConjugateGradient<Field> &_CG) :  Umu(_Umu), CG(_CG)
+  {
+  };
+  // Builds a fresh FourierAcceleratedPV (default group_in_s) on every call and delegates to its pvInv
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    FourierAcceleratedPV<Field, Matrix, typename Matrix::GaugeField > faPV(_Matrix,Umu,CG) ;
+    faPV.pvInv(src,sol);
+  };
+};
+
+
+}
+}
--- a/Grid/algorithms/iterative/Reconstruct5Dprop.h
+++ b/Grid/algorithms/iterative/Reconstruct5Dprop.h
@@ -30,49 +30,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 namespace QCD {

-
-template<class Field>
-class PauliVillarsSolverUnprec
-{
- public:
-  ConjugateGradient<Field> & CG;
-  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
-
-  template<class Matrix>
-  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
-  {
-    RealD m = _Matrix.Mass();
-    Field A  (_Matrix.FermionGrid());
-
-    MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
-
-    _Matrix.SetMass(1.0);
-    _Matrix.Mdag(src,A);
-    CG(HermOp,A,sol);
-    _Matrix.SetMass(m);
-  };
-};
-
-template<class Field>
-class PauliVillarsSolverRBprec
-{
- public:
-  ConjugateGradient<Field> & CG;
-  PauliVillarsSolverRBprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
-
-  template<class Matrix>
-  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
-  {
-    RealD m = _Matrix.Mass();
-    Field A  (_Matrix.FermionGrid());
-
-    _Matrix.SetMass(1.0);
-    SchurRedBlackDiagMooeeSolve<Field> SchurSolver(CG);
-    SchurSolver(_Matrix,src,sol);
-    _Matrix.SetMass(m);
-  };
-};
-
 template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 private:
  PVinverter & PauliVillarsSolver;
@@ -85,20 +42,12 @@ template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 // of the Mobius exact AMA corrections.
 //
 // TODO : understand absence of contact term in eqns in Hantao's thesis
- //        sol4 is contact term subtracted.
+ //        sol4 is contact term subtracted, but thesis & Brower's paper suggests not.
 //
- // Options
- // a) Defect correction approach:
- //    1) Compute defect from current soln (initially guess).
- //       This is ...... outerToInner check !!!!
- //    2) Deflated Zmobius solve to get 4d soln
- //       Ensure deflation is working
- //    3) Refine 5d Outer using the inner 4d delta soln
- // 
- // Step 1: localise PV inverse in a routine. [DONE]
+ // Step 1: Localise PV inverse in a routine. [DONE]
 // Step 2: Schur based PV inverse            [DONE]
- // Step 3: Fourier accelerated PV inverse
- // Step 4: 
+ // Step 3: Fourier accelerated PV inverse    [DONE]
+ //
 /////////////////////////////////////////////////////
 
  Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver) 
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -0,0 +1,155 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion5D.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once 
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonFermion.h>
+
+
+namespace Grid {
+
+  namespace QCD {
+    
+    template<class Impl>
+      class WilsonTMFermion5D : public WilsonFermion5D<Impl>
+      {
+      public:
+	INHERIT_IMPL_TYPES(Impl);
+      public:
+	// WilsonFermion5D with a site-diagonal term (4+mass[s]) + i*mu[s]*G5 chosen independently for each s-slice
+	virtual void   Instantiatable(void) {};
+
+	// Constructors
+        WilsonTMFermion5D(GaugeField &_Umu,
+			  GridCartesian         &Fgrid,
+			  GridRedBlackCartesian &Frbgrid, 
+			  GridCartesian         &Ugrid,
+			  GridRedBlackCartesian &Urbgrid, 
+			  const std::vector<RealD> _mass,
+			  const std::vector<RealD> _mu,
+			  const ImplParams &p= ImplParams()
+			  ) :
+	WilsonFermion5D<Impl>(_Umu,
+			      Fgrid,
+			      Frbgrid,
+			      Ugrid,
+			      Urbgrid,
+			      4.0,p)
+	
+	  {
+	    update(_mass,_mu); // checks the sizes against the 5d grid and stores the per-slice parameters
+	  }
+
+	virtual void Meooe(const FermionField &in, FermionField &out) { // hopping term between checkerboards, chosen from in.checkerboard
+	  if (in.checkerboard == Odd) {
+	    this->DhopEO(in, out, DaggerNo);
+	  } else {
+	    this->DhopOE(in, out, DaggerNo);
+	  }
+	}
+
+	virtual void MeooeDag(const FermionField &in, FermionField &out) { // as Meooe but with DaggerYes
+	  if (in.checkerboard == Odd) {
+	    this->DhopEO(in, out, DaggerYes);
+	  } else {
+	    this->DhopOE(in, out, DaggerYes);
+	  }
+	}	
+	
+	// allow override for twisted mass and clover
+	virtual void Mooee(const FermionField &in, FermionField &out) {
+	  out.checkerboard = in.checkerboard;
+	  //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    ComplexD a = 4.0+this->mass[s];
+	    ComplexD b(0.0,this->mu[s]);
+	    axpbg5y_ssp(out,a,in,b,in,s,s); // presumably out[s] = a*in[s] + b*G5*in[s] - confirm against axpbg5y_ssp
+	  }
+	}
+
+	virtual void MooeeDag(const FermionField &in, FermionField &out) { // as Mooee with the sign of mu[s] flipped
+	  out.checkerboard = in.checkerboard;
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    ComplexD a = 4.0+this->mass[s];
+	    ComplexD b(0.0,-this->mu[s]);
+	    axpbg5y_ssp(out,a,in,b,in,s,s);
+	  }
+	}
+	virtual void MooeeInv(const FermionField &in, FermionField &out) {
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    // coefficients used: a = mtil/(mtil^2+tm^2), b = -i*tm/(mtil^2+tm^2)
+	    RealD tm   = this->mu[s];
+	    RealD mtil = 4.0+this->mass[s];
+	    RealD sq   = mtil*mtil+tm*tm;
+	    ComplexD a    = mtil/sq;
+	    ComplexD b(0.0, -tm /sq);
+	    axpbg5y_ssp(out,a,in,b,in,s,s);
+	  }
+	}
+	virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    // coefficients used: a = mtil/(mtil^2+tm^2), b = +i*tm/(mtil^2+tm^2)
+	    RealD tm   = this->mu[s];
+	    RealD mtil = 4.0+this->mass[s];
+	    RealD sq   = mtil*mtil+tm*tm;
+	    ComplexD a    = mtil/sq;
+	    ComplexD b(0.0,tm /sq);
+	    axpbg5y_ssp(out,a,in,b,in,s,s);
+	  }
+	}
+
+	virtual RealD M(const FermionField &in, FermionField &out) { // out = Dhop(in) + per-slice diagonal term; returns the axpy_norm result
+	  out.checkerboard = in.checkerboard;
+	  this->Dhop(in, out, DaggerNo);
+	  FermionField tmp(out._grid);
+	  for (int s=0;s<(int)this->mass.size();s++) {
+	    ComplexD a = 4.0+this->mass[s];
+	    ComplexD b(0.0,this->mu[s]);
+	    axpbg5y_ssp(tmp,a,in,b,in,s,s);
+	  }
+	  return axpy_norm(out, 1.0, tmp, out);
+	}
+	
+	// needed for fast PV
+	void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
+	  assert(_mass.size() == _mu.size());
+	  assert(_mass.size() == this->FermionGrid()->_fdimensions[0]); // one (mass,mu) pair per s-slice
+	  this->mass = _mass;
+	  this->mu = _mu;
+	}
+	
+      private:
+	std::vector<RealD> mu;
+	std::vector<RealD> mass;
+	
+      };
+   
+    typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
+    typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 
+
+}}
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@@ -4,9 +4,11 @@
 
 Source file: ./lib/qcd/action/gauge/Photon.h
 
- Copyright (C) 2015
+Copyright (C) 2015-2018
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: James Harrison <J.Harrison@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,8 +32,9 @@

 namespace Grid{
 namespace QCD{
+
  template <class S>
-  class QedGimpl
+  class QedGImpl
  {
  public:
    typedef S Simd;
@@ -43,27 +46,27 @@ namespace QCD{
    
    typedef iImplGaugeLink<Simd>  SiteLink;
    typedef iImplGaugeField<Simd> SiteField;
-    typedef SiteField             SiteComplex;
+    typedef SiteLink              SiteComplex;
    
    typedef Lattice<SiteLink>  LinkField;
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
  };
  
-  typedef QedGimpl<vComplex> QedGimplR;
+  typedef QedGImpl<vComplex> QedGImplR;
  
-  template<class Gimpl>
+  template <class GImpl>
  class Photon
  {
  public:
-    INHERIT_GIMPL_TYPES(Gimpl);
+    INHERIT_GIMPL_TYPES(GImpl);
+    typedef typename SiteGaugeLink::scalar_object ScalarSite;
+    typedef typename ScalarSite::scalar_type      ScalarComplex;
    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
-    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
  public:
-    Photon(Gauge gauge, ZmScheme zmScheme);
-    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);
-    Photon(Gauge gauge, ZmScheme zmScheme, Real G0);
-    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
+    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvement);
+    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme);
    virtual ~Photon(void) = default;
    void FreePropagator(const GaugeField &in, GaugeField &out);
    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
@@ -73,345 +76,255 @@ namespace QCD{
                         const GaugeLinkField &weight);
    void UnitField(GaugeField &out);
  private:
-    void infVolPropagator(GaugeLinkField &out);
-    void invKHatSquared(GaugeLinkField &out);
+    void makeSpatialNorm(LatticeInteger &spNrm);
+    void makeKHat(std::vector<GaugeLinkField> &khat);
+    void makeInvKHatSquared(GaugeLinkField &out);
    void zmSub(GaugeLinkField &out);
+    void transverseProjectSpatial(GaugeField &out);
+    void gaugeTransform(GaugeField &out);
  private:
-    Gauge    gauge_;
-    ZmScheme zmScheme_;
-    std::vector<Real>  improvement_;
-    Real     G0_;
+    GridBase          *grid_;
+    Gauge             gauge_;
+    ZmScheme          zmScheme_;
+    std::vector<Real> improvement_;
  };

-  typedef Photon<QedGimplR>  PhotonR;
+  typedef Photon<QedGImplR>  PhotonR;
  
-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()),
-    G0_(0.15493339023106021408483720810737508876916113364521)
-  {}
-
-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
+  template<class GImpl>
+  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme,
                        std::vector<Real> improvements)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements),
-    G0_(0.15493339023106021408483720810737508876916113364521)
+  : grid_(grid), gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements)
  {}

-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()), G0_(G0)
+  template<class GImpl>
+  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme)
+  : Photon(grid, gauge, zmScheme, std::vector<Real>())
  {}

-  template<class Gimpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
-                        std::vector<Real> improvements, Real G0)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements), G0_(G0)
-  {}
-
-  template<class Gimpl>
-  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
+  template<class GImpl>
+  void Photon<GImpl>::FreePropagator(const GaugeField &in, GaugeField &out)
  {
-    FFT theFFT(in._grid);
+    FFT        theFFT(dynamic_cast<GridCartesian *>(grid_));
+    GaugeField in_k(grid_);
+    GaugeField prop_k(grid_);
    
-    GaugeField in_k(in._grid);
-    GaugeField prop_k(in._grid);
-    
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    MomentumSpacePropagator(prop_k,in_k);
-    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    theFFT.FFT_all_dim(in_k, in, FFT::forward);
+    MomentumSpacePropagator(prop_k, in_k);
+    theFFT.FFT_all_dim(out, prop_k, FFT::backward);
  }

-  template<class Gimpl>
-  void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
+  template<class GImpl>
+  void Photon<GImpl>::makeSpatialNorm(LatticeInteger &spNrm)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    LatticeReal        xmu(grid);
-    GaugeLinkField     one(grid);
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    std::vector<int>   x0(nd,0);
-    TComplex           Tone  = Complex(1.0,0.0);
-    TComplex           Tzero = Complex(G0_,0.0);
-    FFT                fft(grid);
+    LatticeInteger   coor(grid_);
+    std::vector<int> l = grid_->FullDimensions();
+
+    spNrm = zero;
+    for(int mu = 0; mu < grid_->Nd() - 1; mu++)
+    {
+      LatticeCoordinate(coor, mu);
+      coor  = where(coor < Integer(l[mu]/2), coor, coor - Integer(l[mu]));
+      spNrm = spNrm + coor*coor;
+    }
+  }
+
+  template<class GImpl> // builds, for every direction mu, khat[mu] = exp(i*pi*n_mu/L_mu) * 2*sin(pi*n_mu/L_mu), n_mu being the site coordinate
+  void Photon<GImpl>::makeKHat(std::vector<GaugeLinkField> &khat)
+  {
+    const unsigned int nd = grid_->Nd();
+    std::vector<int>   l  = grid_->FullDimensions(); // L_mu: lattice extent used for direction mu
+    Complex            ci(0., 1.); // imaginary unit
+
+    khat.resize(nd, grid_); // ensure nd entries; newly created ones are constructed on grid_
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    {
+      Real piL = M_PI/l[mu]; // pi / L_mu
+
+      LatticeCoordinate(khat[mu], mu); // khat[mu] <- coordinate n_mu along direction mu
+      khat[mu] = exp(piL*ci*khat[mu])*2.*sin(piL*khat[mu]); // algebraically equal to -i*(exp(2*i*pi*n_mu/L_mu) - 1)
+    }
+  }
+
+  template<class GImpl>
+  void Photon<GImpl>::makeInvKHatSquared(GaugeLinkField &out)
+  {
+    std::vector<GaugeLinkField> khat;
+    GaugeLinkField              lone(grid_);
+    const unsigned int          nd = grid_->Nd();
+    std::vector<int>            zm(nd, 0);
+    ScalarSite                  one = ScalarComplex(1., 0.), z = ScalarComplex(0., 0.);
    
-    one = Complex(1.0,0.0);
    out = zero;
+    makeKHat(khat);
    for(int mu = 0; mu < nd; mu++)
    {
-      LatticeCoordinate(xmu,mu);
-      Real lo2 = l[mu]/2.0;
-      xmu = where(xmu < lo2, xmu, xmu-double(l[mu]));
-      out = out + toComplex(4*M_PI*M_PI*xmu*xmu);
+      out = out + khat[mu]*conjugate(khat[mu]);
    }
-    pokeSite(Tone, out, x0);
-    out = one/out;
-    pokeSite(Tzero, out, x0);
-    fft.FFT_all_dim(out, out, FFT::forward);
+    lone = ScalarComplex(1., 0.);
+    pokeSite(one, out, zm);
+    out = lone/out;
+    pokeSite(z, out, zm);
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
+  template<class GImpl>
+  void Photon<GImpl>::zmSub(GaugeLinkField &out)
  {
-    GridBase           *grid = out._grid;
-    GaugeLinkField     kmu(grid), one(grid);
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    std::vector<int>   zm(nd,0);
-    TComplex           Tone = Complex(1.0,0.0);
-    TComplex           Tzero= Complex(0.0,0.0);
-    
-    one = Complex(1.0,0.0);
-    out = zero;
-    for(int mu = 0; mu < nd; mu++)
-    {
-      Real twoPiL = M_PI*2./l[mu];
-      
-      LatticeCoordinate(kmu,mu);
-      kmu = 2.*sin(.5*twoPiL*kmu);
-      out = out + kmu*kmu;
-    }
-    pokeSite(Tone, out, zm);
-    out = one/out;
-    pokeSite(Tzero, out, zm);
-  }
-  
-  template<class Gimpl>
-  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
-  {
-    GridBase           *grid = out._grid;
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    
    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      {
-        std::vector<int> zm(nd,0);
-        TComplex         Tzero = Complex(0.0,0.0);
-        
-        pokeSite(Tzero, out, zm);
+        std::vector<int> zm(grid_->Nd(), 0);
+        ScalarSite       z = ScalarComplex(0., 0.);
        
+        pokeSite(z, out, zm);
        break;
      }
      case ZmScheme::qedL:
      {
-        LatticeInteger spNrm(grid), coor(grid);
-        GaugeLinkField z(grid);
-        
-        spNrm = zero;
-        for(int d = 0; d < grid->_ndimension - 1; d++)
-        {
-          LatticeCoordinate(coor,d);
-          coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
-          spNrm = spNrm + coor*coor;
-        }
-        out = where(spNrm == Integer(0), 0.*out, out);
+        LatticeInteger spNrm(grid_);

-        // IR improvement
+        makeSpatialNorm(spNrm);
+        out = where(spNrm == Integer(0), 0.*out, out);
        for(int i = 0; i < improvement_.size(); i++)
        {
-          Real f = sqrt(improvement_[i]+1);
-          out = where(spNrm == Integer(i+1), f*out, out);
+          Real f = sqrt(improvement_[i] + 1);
+          out = where(spNrm == Integer(i + 1), f*out, out);
        }
+        break;
      }
      default:
+        assert(0);
        break;
    }
  }

-  template<class Gimpl>
-  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
-                                               GaugeField &out)
+  template<class GImpl>
+  void Photon<GImpl>::transverseProjectSpatial(GaugeField &out)
  {
-  GridBase           *grid = out._grid;
-    LatticeComplex     momProp(grid);
-    
-    switch (zmScheme_)
+    const unsigned int          nd = grid_->Nd();
+    GaugeLinkField              invKHat(grid_), cst(grid_), spdiv(grid_);
+    LatticeInteger              spNrm(grid_);
+    std::vector<GaugeLinkField> khat, a(nd, grid_), aProj(nd, grid_);
+
+    invKHat = zero;
+    makeSpatialNorm(spNrm);
+    makeKHat(khat);
+    for (unsigned int mu = 0; mu < nd; ++mu)
    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
+      a[mu] = peekLorentz(out, mu);
+      if (mu < nd - 1)
      {
-        invKHatSquared(momProp);
-        zmSub(momProp);
-        break;
+        invKHat += khat[mu]*conjugate(khat[mu]);
      }
-      case ZmScheme::qedInf:
-      {
-        infVolPropagator(momProp);
+    }
+    cst     = ScalarComplex(1., 0.);
+    invKHat = where(spNrm == Integer(0), cst, invKHat);
+    invKHat = cst/invKHat;
+    cst     = zero;
+    invKHat = where(spNrm == Integer(0), cst, invKHat);
+    spdiv   = zero;
+    for (unsigned int nu = 0; nu < nd - 1; ++nu)
+    {
+      spdiv += conjugate(khat[nu])*a[nu];
+    }
+    spdiv *= invKHat;
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    {
+      aProj[mu] = a[mu] - khat[mu]*spdiv;
+      pokeLorentz(out, aProj[mu], mu);
+    }
+  }
+
+  template<class GImpl>
+  void Photon<GImpl>::gaugeTransform(GaugeField &out)
+  {
+    switch (gauge_)
+    {
+      case Gauge::feynman:
+        break;
+      case Gauge::coulomb:
+        transverseProjectSpatial(out);
+        break;
+      case Gauge::landau:
+        assert(0);
        break;
-      }
      default:
+        assert(0);
        break;
    }
+  }
+
+  template<class GImpl>
+  void Photon<GImpl>::MomentumSpacePropagator(const GaugeField &in,
+                                              GaugeField &out)
+  {
+    LatticeComplex momProp(grid_);
+    
+    makeInvKHatSquared(momProp);
+    zmSub(momProp);
    
    out = in*momProp;
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
+  template<class GImpl>
+  void Photon<GImpl>::StochasticWeight(GaugeLinkField &weight)
  {
-    auto               *grid     = dynamic_cast<GridCartesian *>(weight._grid);
-    const unsigned int nd        = grid->_ndimension;
-    std::vector<int>   latt_size = grid->_fdimensions;
-    
-    switch (zmScheme_)
+    const unsigned int nd  = grid_->Nd();
+    std::vector<int>   l   = grid_->FullDimensions();
+    Integer            vol = 1;
+
+    for(unsigned int mu = 0; mu < nd; mu++)
    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
-      {
-        Integer vol = 1;
-        for(int d = 0; d < nd; d++)
-        {
-          vol = vol * latt_size[d];
-        }
-        invKHatSquared(weight);
-        weight = sqrt(vol)*sqrt(weight);
-        zmSub(weight);
-        break;
-      }
-      case ZmScheme::qedInf:
-      {
-        infVolPropagator(weight);
-        weight = sqrt(real(weight));
-        break;
-      }
-      default:
-        break;
+      vol = vol*l[mu];
    }
+    makeInvKHatSquared(weight);
+    weight = sqrt(vol)*sqrt(weight);
+    zmSub(weight);
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  template<class GImpl>
+  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
  {
-    auto           *grid = dynamic_cast<GridCartesian *>(out._grid);
-    GaugeLinkField weight(grid);
+    GaugeLinkField weight(grid_);
    
    StochasticWeight(weight);
    StochasticField(out, rng, weight);
  }
  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+  template<class GImpl>
+  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
                                      const GaugeLinkField &weight)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    const unsigned int nd = grid->_ndimension;
-    GaugeLinkField     r(grid);
-    GaugeField         aTilde(grid);
-    FFT                fft(grid);
+    const unsigned int nd = grid_->Nd();
+    GaugeLinkField     r(grid_);
+    GaugeField         aTilde(grid_);
+    FFT                fft(dynamic_cast<GridCartesian *>(grid_));
    
-    switch (zmScheme_)
+    for(unsigned int mu = 0; mu < nd; mu++)
    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
-      {
-        for(int mu = 0; mu < nd; mu++)
-        {
-          gaussian(rng, r);
-          r = weight*r;
-          pokeLorentz(aTilde, r, mu);
-        }
-        break;
-      }
-      case ZmScheme::qedInf:
-      {
-        Complex                    shift(1., 1.); // This needs to be a GaugeLink element?
-        for(int mu = 0; mu < nd; mu++)
-        {
-          bernoulli(rng, r);
-          r = weight*(2.*r - shift);
-          pokeLorentz(aTilde, r, mu);
-        }
-        break;
-      }
-      default:
-        break;
+      gaussian(rng, r);
+      r = weight*r;
+      pokeLorentz(aTilde, r, mu);
    }
-
+    gaugeTransform(aTilde);
    fft.FFT_all_dim(out, aTilde, FFT::backward);
-    
    out = real(out);
  }

-  template<class Gimpl>
-  void Photon<Gimpl>::UnitField(GaugeField &out)
+  template<class GImpl>
+  void Photon<GImpl>::UnitField(GaugeField &out)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    const unsigned int nd = grid->_ndimension;
-    GaugeLinkField     r(grid);
+    const unsigned int nd = grid_->Nd();
+    GaugeLinkField     r(grid_);
    
-    r = Complex(1.0,0.0);
-
-    for(int mu = 0; mu < nd; mu++)
+    r = ScalarComplex(1., 0.);
+    for(unsigned int mu = 0; mu < nd; mu++)
    {
      pokeLorentz(out, r, mu);
    }
-    
    out = real(out);
  }
-//  template<class Gimpl>
-//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
-//                                                            const GaugeField &in)
-//  {
-//    
-//    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
-//    
-//    GridBase *grid = out._grid;
-//    LatticeInteger     coor(grid);
-//    GaugeField zz(grid); zz=zero;
-//    
-//    // xyzt
-//    for(int d = 0; d < grid->_ndimension-1;d++){
-//      LatticeCoordinate(coor,d);
-//      out = where(coor==Integer(0),zz,out);
-//    }
-//  }
-//  
-//  template<class Gimpl>
-//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
-//                                                             const GaugeField &in)
-//  {
-//    
-//    // what type LatticeComplex
-//    GridBase *grid = out._grid;
-//    int nd = grid->_ndimension;
-//    
-//    typedef typename GaugeField::vector_type vector_type;
-//    typedef typename GaugeField::scalar_type ScalComplex;
-//    typedef Lattice<iSinglet<vector_type> > LatComplex;
-//    
-//    std::vector<int> latt_size   = grid->_fdimensions;
-//    
-//    LatComplex denom(grid); denom= zero;
-//    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
-//    LatComplex   kmu(grid);
-//    
-//    ScalComplex ci(0.0,1.0);
-//    // momphase = n * 2pi / L
-//    for(int mu=0;mu<Nd;mu++) {
-//      
-//      LatticeCoordinate(kmu,mu);
-//      
-//      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-//      
-//      kmu = TwoPiL * kmu ;
-//      
-//      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
-//    }
-//    std::vector<int> zero_mode(nd,0);
-//    TComplexD Tone = ComplexD(1.0,0.0);
-//    TComplexD Tzero= ComplexD(0.0,0.0);
-//    
-//    pokeSite(Tone,denom,zero_mode);
-//    
-//    denom= one/denom;
-//    
-//    pokeSite(Tzero,denom,zero_mode);
-//    
-//    out = zero;
-//    out = in*denom;
-//  };
  
 }}
 #endif
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -27,12 +27,13 @@ public:

  typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;

-  static void MesonField(Eigen::Tensor<ComplexD,5> &mat, 
+  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void MesonField(TensorType &mat, 
 			 const FermionField *lhs_wi,
 			 const FermionField *rhs_vj,
 			 std::vector<Gamma::Algebra> gammas,
 			 const std::vector<ComplexField > &mom,
-			 int orthogdim);
+			 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);

  static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat, 
 			     const FermionField *wi,
@@ -59,6 +60,14 @@ public:
 			  const FermionField *vj,
 			  int orthogdim);

+  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void AslashField(TensorType &mat, 
+        const FermionField *lhs_wi,
+        const FermionField *rhs_vj,
+        const std::vector<ComplexField> &emB0,
+        const std::vector<ComplexField> &emB1,
+        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+
  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
 			   const Eigen::Tensor<ComplexD,3> &WW_sd,
 			   const FermionField *vs,
@@ -92,13 +101,14 @@ public:
 #endif
 };

-template<class FImpl>
-void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat, 
+template <class FImpl>
+template <typename TensorType>
+void A2Autils<FImpl>::MesonField(TensorType &mat, 
 				 const FermionField *lhs_wi,
 				 const FermionField *rhs_vj,
 				 std::vector<Gamma::Algebra> gammas,
 				 const std::vector<ComplexField > &mom,
-				 int orthogdim) 
+				 int orthogdim, double *t_kernel, double *t_gsum) 
 {
  typedef typename FImpl::SiteSpinor vobj;

@@ -146,6 +156,7 @@ void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
  int stride=grid->_slice_stride[orthogdim];

  // potentially wasting cores here if local time extent too small
+  if (t_kernel) *t_kernel = -usecond();
  parallel_for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
@@ -212,7 +223,7 @@ void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
      }
    }}}
  }
-
+  if (t_kernel) *t_kernel += usecond();
  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Ngamma);
  assert(mat.dimension(2) == Nt);
@@ -256,9 +267,9 @@ void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
  // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
  // Healthy size that should suffice
  ////////////////////////////////////////////////////////////////////
-
+  if (t_gsum) *t_gsum = -usecond();
  grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-
+  if (t_gsum) *t_gsum += usecond();
 }


@@ -614,6 +625,189 @@ void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
  PionFieldXX(mat,vi,vj,orthogdim,nog5);
 }

+// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
+// summed over all sites with coordinate t along orthogdim and stored in mat(m,0,t,i,j).
+// With:
+//
+// B_0 = A_0 + i A_1
+// B_1 = A_2 + i A_3
+// (B_0 and B_1 of EM field m are read from emB0[m] and emB1[m])
+// then in spin space
+// 
+//                 ( 0          0          -conj(B_1) -B_0 )
+// i * A_mu g_mu = ( 0          0          -conj(B_0)  B_1 )
+//                 ( B_1        B_0        0          0    )
+//                 ( conj(B_0)  -conj(B_1) 0          0    )
+template <class FImpl>
+template <typename TensorType>
+void A2Autils<FImpl>::AslashField(TensorType &mat, // output, written as mat(m,0,t,i,j); dimension(3)/(4) give the block sizes
+          const FermionField *lhs_wi, // left vectors w_i, read for i < mat.dimension(3)
+          const FermionField *rhs_vj, // right vectors v_j, read for j < mat.dimension(4)
+          const std::vector<ComplexField> &emB0, // B_0, one entry per EM field m
+          const std::vector<ComplexField> &emB1, // B_1, same length as emB0 (asserted below)
+          int orthogdim, double *t_kernel, double *t_gsum) // non-null timers receive the elapsed usecond() of the local kernel / the global sum
+{
+    typedef typename FermionField::vector_object vobj;
+    typedef typename vobj::scalar_object         sobj; // not referenced below
+    typedef typename vobj::scalar_type           scalar_type;
+    typedef typename vobj::vector_type           vector_type;
+
+    typedef iSpinMatrix<vector_type> SpinMatrix_v;
+    typedef iSpinMatrix<scalar_type> SpinMatrix_s; // not referenced below
+    typedef iSinglet<vector_type>    Singlet_v; // not referenced below
+    typedef iSinglet<scalar_type>    Singlet_s; // not referenced below
+    
+    int Lblock = mat.dimension(3); // number of left vectors processed
+    int Rblock = mat.dimension(4); // number of right vectors processed
+
+    GridBase *grid = lhs_wi[0]._grid; // NOTE(review): rhs_vj, emB0 and emB1 are indexed with this grid's offsets; presumably they all live on it
+    
+    const int    Nd = grid->_ndimension;
+    const int Nsimd = grid->Nsimd();
+
+    int Nt  = grid->GlobalDimensions()[orthogdim]; // global extent along orthogdim
+    int Nem = emB0.size(); // number of EM fields
+    assert(emB1.size() == Nem);
+
+    int fd=grid->_fdimensions[orthogdim]; // not referenced below
+    int ld=grid->_ldimensions[orthogdim]; // planes held locally: global t = lt + pt*ld further down
+    int rd=grid->_rdimensions[orthogdim]; // planes per SIMD lane: ldx = rt + lane coordinate * rd further down
+
+    // Stage 1: accumulate SIMD vectors, one per (m,i,j) and per each of the rd planes (lvSum).
+    // Stage 2: split the SIMD lanes and reduce them to scalars,
+    // one per (m,i,j) and per each of the ld local planes (lsSum).
+    int MFrvol = rd*Lblock*Rblock*Nem;
+    int MFlvol = ld*Lblock*Rblock*Nem;
+
+    Vector<vector_type> lvSum(MFrvol);
+    parallel_for (int r = 0; r < MFrvol; r++)
+    {
+        lvSum[r] = zero;
+    }
+
+    Vector<scalar_type> lsSum(MFlvol);             
+    parallel_for (int r = 0; r < MFlvol; r++)
+    {
+        lsSum[r] = scalar_type(0.0);
+    }
+
+    int e1=    grid->_slice_nblock[orthogdim];
+    int e2=    grid->_slice_block [orthogdim];
+    int stride=grid->_slice_stride[orthogdim];
+
+    // Threads are split over the rd planes only, so cores are wasted when rd is small;
+    // nested parallelism over the inner loops would be acceptable here.
+    if (t_kernel) *t_kernel = -usecond();
+    parallel_for(int r=0;r<rd;r++)
+    {
+        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+        for(int n=0;n<e1;n++)
+        for(int b=0;b<e2;b++)
+        {
+            int ss= so+n*stride+b; // site offset inside plane r
+
+            for(int i=0;i<Lblock;i++)
+            {
+                auto left = conjugate(lhs_wi[i]._odata[ss]);
+
+                for(int j=0;j<Rblock;j++)
+                {
+                    SpinMatrix_v vv;
+                    auto right = rhs_vj[j]._odata[ss];
+
+                    for(int s1=0;s1<Ns;s1++)
+                    for(int s2=0;s2<Ns;s2++)
+                    {
+                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0) // vv(s1,s2) = conj(w)(s2) . v(s1), colour sum written out for 3 colours
+                                        + left()(s2)(1) * right()(s1)(1)
+                                        + left()(s2)(2) * right()(s1)(2);
+                    }
+                    
+                    // After getting the sitewise spin matrix, loop over the EM fields
+                    int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r; // layout: m fastest, then i, then j, then plane r
+
+                    for ( int m=0;m<Nem;m++)
+                    {
+                        int idx  = m+base;
+                        auto b0  = emB0[m]._odata[ss];
+                        auto b1  = emB1[m]._odata[ss];
+                        auto cb0 = conjugate(b0);
+                        auto cb1 = conjugate(b1);
+
+                        lvSum[idx] += - vv()(3,0)()*b0()()()  - vv()(2,0)()*cb1()()() // vv(c,r) is multiplied by entry (row r, column c) of the matrix in the header; this line is row 0
+                                      + vv()(3,1)()*b1()()()  - vv()(2,1)()*cb0()()() // row 1
+                                      + vv()(0,2)()*b1()()()  + vv()(1,2)()*b0()()() // row 2
+                                      + vv()(0,3)()*cb0()()() - vv()(1,3)()*cb1()()(); // row 3
+                    }
+                }
+            }
+        }
+    }
+
+    // Sum across simd lanes in the plane, breaking out orthog dir.
+    parallel_for(int rt=0;rt<rd;rt++)
+    {
+        std::vector<int> icoor(Nd);
+        std::vector<scalar_type> extracted(Nsimd);               
+
+        for(int i=0;i<Lblock;i++)
+        for(int j=0;j<Rblock;j++)
+        for(int m=0;m<Nem;m++)
+        {
+
+            int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
+
+            extract<vector_type,scalar_type>(lvSum[ij_rdx],extracted);
+            for(int idx=0;idx<Nsimd;idx++)
+            {
+                grid->iCoorFromIindex(icoor,idx);
+
+                int ldx    = rt+icoor[orthogdim]*rd; // local plane this SIMD lane contributes to
+                int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
+
+                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
+            }
+        }
+    }
+    if (t_kernel) *t_kernel += usecond(); // kernel time covers both the contraction and the SIMD-lane reduction
+
+    // Fill mat: planes at this processor coordinate (pt == pc) get the local sums, all others are zeroed before the global sum.
+    int pd = grid->_processors[orthogdim];
+    int pc = grid->_processor_coor[orthogdim];
+    parallel_for_nest2(int lt=0;lt<ld;lt++)
+    {
+        for(int pt=0;pt<pd;pt++)
+        {
+            int t = lt + pt*ld; // global coordinate along orthogdim
+            if (pt == pc)
+            {
+                for(int i=0;i<Lblock;i++)
+                for(int j=0;j<Rblock;j++)
+                for(int m=0;m<Nem;m++)
+                {
+                    int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
+
+                    mat(m,0,t,i,j) = lsSum[ij_dx];
+                }
+            } 
+            else 
+            { 
+                const scalar_type zz(0.0);
+
+                for(int i=0;i<Lblock;i++)
+                for(int j=0;j<Rblock;j++)
+                for(int m=0;m<Nem;m++)
+                {
+                    mat(m,0,t,i,j) = zz;
+                }
+            }
+        }
+    }
+    if (t_gsum) *t_gsum = -usecond();
+    grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock); // NOTE(review): count assumes mat holds exactly Nem*1*Nt*Lblock*Rblock contiguous elements — not asserted here, confirm with callers
+    if (t_gsum) *t_gsum += usecond();
+}

 ////////////////////////////////////////////
 // Schematic thoughts about more generalised four quark insertion
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@@ -173,6 +173,39 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
    }
  }
 }
+}

-}}
+// I explicitly need these outside the QCD namespace. Generic overload: z = Gamma5 * x.
+template<typename vobj>
+void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
+{
+  GridBase *grid = x._grid; // not referenced in this overload
+  z.checkerboard = x.checkerboard; // output inherits the input's checkerboard
+  conformable(x, z);
+
+  QCD::Gamma G5(QCD::Gamma::Algebra::Gamma5);
+  z = G5 * x;
+}
+
+template<class CComplex, int nbasis> // overload for iVector<CComplex, nbasis> sites: copies the first nbasis/2 components and negates the remaining ones
+void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
+{
+  GridBase *grid = x._grid;
+  z.checkerboard = x.checkerboard; // output inherits the input's checkerboard
+  conformable(x, z);
+
+  static_assert(nbasis % 2 == 0, ""); // the half/half split below needs an even number of components
+  int nb = nbasis / 2;
+
+  parallel_for(int ss = 0; ss < grid->oSites(); ss++) {
+    for(int n = 0; n < nb; ++n) {
+      z._odata[ss](n) = x._odata[ss](n); // first half: unchanged
+    }
+    for(int n = nb; n < nbasis; ++n) {
+      z._odata[ss](n) = -x._odata[ss](n); // second half: sign flipped
+    }
+  }
+}
+
+}
 #endif 
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -6,10 +6,12 @@

    Copyright (C) 2015

-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: neo <cossu@post.kek.jp>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: James Harrison <J.Harrison@soton.ac.uk>
+    Author: Antonin Portelli <antonin.portelli@me.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -645,6 +647,184 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      }
    }
  }
+
+  //////////////////////////////////////////////////
+  // Wilson loop of size (Rmu, Rnu), oriented in mu,nu plane; result (untraced) is written to wl
+  //////////////////////////////////////////////////
+  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
+                           const int Rmu, const int Rnu,
+                           const int mu, const int nu) {
+    wl = U[nu]; // first link along nu; NOTE(review): taken even when Rnu == 0, so Rnu >= 1 appears to be assumed
+
+    for(int i = 0; i < Rnu-1; i++){ // Rnu-1 further forward shifts along nu (Rnu links with the initial one)
+      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
+    }
+
+    for(int i = 0; i < Rmu; i++){ // Rmu forward shifts along mu
+      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
+    }
+
+    for(int i = 0; i < Rnu; i++){ // Rnu backward shifts along nu
+      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
+    }
+
+    for(int i = 0; i < Rmu; i++){ // Rmu backward shifts along mu
+      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
+    }
+  }
+  //////////////////////////////////////////////////
+  // trace of the (Rmu, Rnu) Wilson loop oriented in mu,nu plane, written to wl
+  //////////////////////////////////////////////////
+  static void traceWilsonLoop(LatticeComplex &wl,
+                                const std::vector<GaugeMat> &U,
+                                const int Rmu, const int Rnu,
+                                const int mu, const int nu) {
+    GaugeMat sp(U[0]._grid); // temporary for the untraced loop, on the grid of U[0]
+    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
+    wl = trace(sp);
+  }
+  //////////////////////////////////////////////////
+  // sum over all planes (mu > nu) of the traced Wilson loop, both (R1,R2) and (R2,R1) orientations
+  //////////////////////////////////////////////////
+  static void siteWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    LatticeComplex siteWl(U[0]._grid); // scratch field for one traced loop
+    Wl = zero;
+    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
+      for (int nu = 0; nu < mu; nu++) { // each unordered pair of directions visited once
+        traceWilsonLoop(siteWl, U, R1, R2, mu, nu); // R1 along mu, R2 along nu
+        Wl = Wl + siteWl;
+        traceWilsonLoop(siteWl, U, R2, R1, mu, nu); // R2 along mu, R1 along nu
+        Wl = Wl + siteWl;
+      }
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum over planes of Wilson loop with length R1
+  // in the time direction (taken to be the last dimension, ndim-1) and length R2 in direction nu
+  //////////////////////////////////////////////////
+  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    LatticeComplex siteWl(U[0]._grid); // scratch field for one traced loop
+
+    int ndim = U[0]._grid->_ndimension;
+
+    Wl = zero;
+    for (int nu = 0; nu < ndim - 1; nu++) { // every direction except the last one
+      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
+      Wl = Wl + siteWl;
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum Wilson loop over all planes orthogonal to the time direction (last dimension excluded), both orientations
+  //////////////////////////////////////////////////
+  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    LatticeComplex siteWl(U[0]._grid); // scratch field for one traced loop
+
+    Wl = zero;
+    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) { // last dimension is skipped
+      for (int nu = 0; nu < mu; nu++) {
+        traceWilsonLoop(siteWl, U, R1, R2, mu, nu); // R1 along mu, R2 along nu
+        Wl = Wl + siteWl;
+        traceWilsonLoop(siteWl, U, R2, R1, mu, nu); // R2 along mu, R1 along nu
+        Wl = Wl + siteWl;
+      }
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of Wilson loop; returns the real part of the sum
+  //////////////////////////////////////////////////
+  static Real sumWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    std::vector<GaugeMat> U(4, Umu._grid); // NOTE(review): sized 4 while the loop below runs to _ndimension — only safe when _ndimension <= 4
+
+    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu); // split the Lorentz-indexed field into one link field per direction
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteWilsonLoop(Wl, U, R1, R2);
+
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real(); // imaginary part is discarded
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of timelike Wilson loop; returns the real part of the sum
+  //////////////////////////////////////////////////
+  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    std::vector<GaugeMat> U(4, Umu._grid); // NOTE(review): sized 4 while the loop below runs to _ndimension — only safe when _ndimension <= 4
+
+    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu); // split the Lorentz-indexed field into one link field per direction
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteTimelikeWilsonLoop(Wl, U, R1, R2);
+
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real(); // imaginary part is discarded
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of spatial Wilson loop; returns the real part of the sum
+  //////////////////////////////////////////////////
+  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    std::vector<GaugeMat> U(4, Umu._grid); // NOTE(review): sized 4 while the loop below runs to _ndimension — only safe when _ndimension <= 4
+
+    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu); // split the Lorentz-indexed field into one link field per direction
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteSpatialWilsonLoop(Wl, U, R1, R2);
+
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real(); // imaginary part is discarded
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of Wilson loop: sumWilsonLoop / (sites * loops per site * Nc)
+  //////////////////////////////////////////////////
+  static Real avgWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    int ndim = Umu._grid->_ndimension;
+    Real sumWl = sumWilsonLoop(Umu, R1, R2);
+    Real vol = Umu._grid->gSites();
+    Real faces = 1.0 * ndim * (ndim - 1); // traces added per site by siteWilsonLoop: ndim*(ndim-1)/2 planes, two orientations each
+    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of timelike Wilson loop: sumTimelikeWilsonLoop / (sites * loops per site * Nc)
+  //////////////////////////////////////////////////
+  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    int ndim = Umu._grid->_ndimension;
+    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
+    Real vol = Umu._grid->gSites();
+    Real faces = 1.0 * (ndim - 1); // traces added per site by siteTimelikeWilsonLoop: one per direction other than the last
+    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of spatial Wilson loop: sumSpatialWilsonLoop / (sites * loops per site * Nc)
+  //////////////////////////////////////////////////
+  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    int ndim = Umu._grid->_ndimension;
+    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
+    Real vol = Umu._grid->gSites();
+    Real faces = 1.0 * (ndim - 1) * (ndim - 2); // traces added per site by siteSpatialWilsonLoop: (ndim-1)*(ndim-2)/2 planes, two orientations each
+    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
+  }
 };

 typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
--- a/Grid/serialisation/Hdf5IO.cc
+++ b/Grid/serialisation/Hdf5IO.cc
@@ -61,9 +61,9 @@ Group & Hdf5Writer::getGroup(void)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-Hdf5Reader::Hdf5Reader(const std::string &fileName)
+Hdf5Reader::Hdf5Reader(const std::string &fileName, const bool readOnly)
 : fileName_(fileName)
-, file_(fileName.c_str(), H5F_ACC_RDWR)
+, file_(fileName.c_str(), readOnly ? H5F_ACC_RDONLY : H5F_ACC_RDWR)
 {
  group_ = file_.openGroup("/");
  readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
--- a/Grid/serialisation/Hdf5IO.h
+++ b/Grid/serialisation/Hdf5IO.h
@@ -54,7 +54,7 @@ namespace Grid
  class Hdf5Reader: public Reader<Hdf5Reader>
  {
  public:
-    Hdf5Reader(const std::string &fileName);
+    Hdf5Reader(const std::string &fileName, const bool readOnly = true);
    virtual ~Hdf5Reader(void) = default;
    bool push(const std::string &s);
    void pop(void);
@@ -123,9 +123,12 @@ namespace Grid
    
    if (flatx.size() > dataSetThres_)
    {
-      H5NS::DataSet dataSet;
+      H5NS::DataSet           dataSet;
+      H5NS::DSetCreatPropList plist;
      
-      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace);
+      plist.setChunk(dim.size(), dim.data());
+      plist.setFletcher32();
+      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace, plist);
      dataSet.write(flatx.data(), Hdf5Type<Element>::type());
    }
    else
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -47,6 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
+#define PARALLEL_FOR_LOOP_REDUCE(op, var)
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_NESTED_LOOP5
 #define PARALLEL_REGION
@@ -58,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
+#define parallel_critical PARALLEL_CRITICAL

 namespace Grid {

--- a/Grid/util/Sha.h
+++ b/Grid/util/Sha.h
@@ -28,17 +28,46 @@
 extern "C" {
 #include <openssl/sha.h>
 }
+#ifdef USE_IPP
+#include "ipp.h"
+#endif

 #pragma once

 class GridChecksum
 {
 public:
-  static inline uint32_t crc32(void *data,size_t bytes)
+  static inline uint32_t crc32(const void *data, size_t bytes)
  {
    return ::crc32(0L,(unsigned char *)data,bytes);
  }
-  static inline std::vector<unsigned char> sha256(void *data,size_t bytes)
+
+#ifdef USE_IPP
+  static inline uint32_t crc32c(const void* data, size_t bytes) // only available when USE_IPP is defined
+  {
+      uint32_t crc32c = ~(uint32_t)0; // running value starts with all bits set
+      ippsCRC32C_8u(reinterpret_cast<const unsigned char *>(data), bytes, &crc32c);
+      ippsSwapBytes_32u_I(&crc32c, 1); // operates in place on the single 32-bit result
+      // NOTE(review): `bytes` is a size_t -- confirm the IPP length parameter cannot truncate very large buffers
+      return ~crc32c; // the returned value is the bitwise complement of the swapped result
+  }
+#endif
+
+  // Hex-encode a digest as two zero-padded lower-case digits per element (unpadded, {0x01,0x10} and {0x11,0x00} would both give "110")
+  template <typename T>
+  static inline std::string sha256_string(const std::vector<T> &hash)
+  {
+    static const char digits[] = "0123456789abcdef";
+    std::string       s;
+    for (const T &c: hash)
+    { // mask to one byte so signed element types are not printed sign-extended
+        const unsigned int byte = static_cast<unsigned int>(c) & 0xffu;
+        s.push_back(digits[byte >> 4]);
+        s.push_back(digits[byte & 0xfu]);
+    }
+    return s;
+  }
+  static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
  {
    std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
    SHA256_CTX sha256;
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -7,6 +7,7 @@ Source file: Hadrons/A2AMatrix.hpp
 Copyright (C) 2015-2018

 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,38 +30,397 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define A2A_Matrix_hpp_

 #include <Hadrons/Global.hpp>
+#include <Hadrons/TimerArray.hpp>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
+
+#ifndef HADRONS_A2AM_NAME 
+#define HADRONS_A2AM_NAME "a2aMatrix"
+#endif
+
+#ifndef HADRONS_A2AM_IO_TYPE
+#define HADRONS_A2AM_IO_TYPE ComplexF
+#endif
+
+#define HADRONS_A2AM_PARALLEL_IO

 BEGIN_HADRONS_NAMESPACE

-template <typename T, typename MetadataType>
+// general A2A matrix set: rank-5 row-major Eigen tensor map over Grid-allocated memory
+// Dimensions:
+//   0 - ext - external field (momentum, EM field, ...)
+//   1 - str - spin-color structure
+//   2 - t   - timeslice
+//   3 - i   - left  A2A mode index
+//   4 - j   - right A2A mode index
+template <typename T>
+using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
+// single A2A matrix, row-major, both dimensions set at run time (-1 is Eigen::Dynamic)
+template <typename T>
+using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
+// Eigen::Map of an A2AMatrix, i.e. a row-major matrix view over an existing buffer
+template <typename T>
+using A2AMatrixMap = Eigen::Map<A2AMatrix<T>>;
+// column-major counterpart of A2AMatrix
+template <typename T>
+using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
+
+/******************************************************************************
+ *                      Abstract class for A2A kernels                        *
+ ******************************************************************************/
+template <typename T, typename Field>
+class A2AKernel
+{
+public:
+    A2AKernel(void) = default;
+    virtual ~A2AKernel(void) = default; // virtual: implementations are used through this interface
+    virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
+                          const unsigned int orthogDim, double &time) = 0; // fills m from the left/right vector arrays; time is an output
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0; // flop count for one blockSizei x blockSizej call
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0; // byte count for one blockSizei x blockSizej call
+};
+
+/******************************************************************************
+ *                  Class to handle A2A matrix block HDF5 I/O                 *
+ ******************************************************************************/
+template <typename T>
 class A2AMatrixIo
 {
 public:
+    // constructors (ni = nj = 0: load() takes both mode dimensions from the file)
     A2AMatrixIo(void) = default;
     A2AMatrixIo(std::string filename, std::string dataname, 
-                const unsigned int nt, const unsigned int ni,
-                const unsigned int nj);
+                const unsigned int nt, const unsigned int ni = 0,
+                const unsigned int nj = 0);
+    // destructor
     ~A2AMatrixIo(void) = default;
+    // access (getSize() is nt*ni*nj*sizeof(T), i.e. a size in bytes)
+    unsigned int getNi(void) const;
+    unsigned int getNj(void) const;
+    unsigned int getNt(void) const;
+    size_t       getSize(void) const;
+    // file allocation
+    template <typename MetadataType>
     void initFile(const MetadataType &d, const unsigned int chunkSize);
+    // block I/O: write a blockSizei x blockSizej block, for all nt timeslices, at offset (i, j)
     void saveBlock(const T *data, const unsigned int i, const unsigned int j,
                    const unsigned int blockSizei, const unsigned int blockSizej);
+    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
+                   const unsigned int i, const unsigned int j); // same, for the (ext, str) slice of m
+    template <template <class> class Vec, typename VecT>
+    void load(Vec<VecT> &v, double *tRead = nullptr, const bool useCache = true); // v[t] receives timeslice t cast to VecT
 private:
-    std::string  filename_, dataname_;
-    unsigned int nt_, ni_, nj_;
+    std::string  filename_{""}, dataname_{""};
+    unsigned int nt_{0}, ni_{0}, nj_{0};
 };

-template <typename T, typename MetadataType>
-A2AMatrixIo<T, MetadataType>::A2AMatrixIo(std::string filename, 
-                                          std::string dataname, 
-                                          const unsigned int nt, 
-                                          const unsigned int ni,
-                                          const unsigned int nj)
+/******************************************************************************
+ *                  Wrapper for A2A matrix block computation                  *
+ ******************************************************************************/
+template <typename T, typename Field, typename MetadataType, typename TIo = T>
+class A2AMatrixBlockComputation
+{
+private:
+    struct IoHelper // one write task: target file, its metadata and the block coordinates
+    {
+        A2AMatrixIo<TIo> io;
+        MetadataType     md;
+        unsigned int     e, s, i, j; // ext index, str index, block offsets in i and j
+    };
+    typedef std::function<std::string(const unsigned int, const unsigned int)>  FilenameFn; // called with (ext, str)
+    typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn; // called with (ext, str)
+public:
+    // constructor (T is the kernel element type, TIo the element type written to disk)
+    A2AMatrixBlockComputation(GridBase *grid,
+                              const unsigned int orthogDim,
+                              const unsigned int next,
+                              const unsigned int nstr,
+                              const unsigned int blockSize,
+                              const unsigned int cacheBlockSize,
+                              TimerArray *tArray = nullptr);
+    // execution: compute all blocks of the matrix with `kernel` and write each one to disk
+    void execute(const std::vector<Field> &left, 
+                 const std::vector<Field> &right,
+                 A2AKernel<T, Field> &kernel,
+                 const FilenameFn &ionameFn,
+                 const FilenameFn &filenameFn,
+                 const MetadataFn &metadataFn);
+private:
+    // I/O handler
+    void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
+private:
+    TimerArray            *tArray_; // optional, may be null (timers are skipped in that case)
+    GridBase              *grid_;
+    unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
+    Vector<T>             mCache_; // storage behind the cache-block tensor handed to the kernel
+    Vector<TIo>           mBuf_;   // storage behind the full block tensor written to disk
+    std::vector<IoHelper> nodeIo_; // write tasks assigned to this rank for the current block
+};
+
+/******************************************************************************
+ *                       A2A matrix contraction kernels                       *
+ ******************************************************************************/
+class A2AContraction
+{
+public:
+    // accTrMul(acc, a, b): acc += tr(a*b), accumulated one row (or column) of a at a time
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
+    {
+        if ((MatLeft::Options == Eigen::RowMajor) and
+            (MatRight::Options == Eigen::ColMajor))
+        { // row-major a with column-major b: pair row r of a with column r of b
+            parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+            {
+                C tmp;
+#ifdef USE_MKL
+                dotuRow(tmp, r, a, b);
+#else
+                tmp = a.row(r).conjugate().dot(b.col(r)); // conjugate() offsets the conjugation Eigen's dot() applies to its left operand (matches the dotu MKL path)
+#endif
+                parallel_critical
+                { // the shared accumulator is only updated inside parallel_critical
+                    acc += tmp;
+                }
+            }
+        }
+        else
+        { // any other storage combination: pair column c of a with row c of b
+            parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+            {
+                C tmp;
+#ifdef USE_MKL 
+                dotuCol(tmp, c, a, b);
+#else
+                tmp = a.col(c).conjugate().dot(b.row(c)); // same unconjugated product as above, column against row
+#endif
+                parallel_critical
+                { // the shared accumulator is only updated inside parallel_critical
+                    acc += tmp;
+                }
+            }
+        }
+    }
+    // accTrMulFlops(a, b): flop estimate for accTrMul, 8 per element of a (b is not used)
+    template <typename MatLeft, typename MatRight>
+    static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
+    {
+        double n = a.rows()*a.cols();
+
+        return 8.*n;
+    }
+
+    // mul(res, a, b): res = a*b (cblas gemm when USE_MKL is defined, Eigen product otherwise)
+#ifdef USE_MKL
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexD, Opts...> &res, 
+                           const Mat<ComplexD, Opts...> &a, 
+                           const Mat<ComplexD, Opts...> &b)
+    {
+        static const ComplexD one(1., 0.), zero(0., 0.); // gemm scaling factors: res = one*a*b + zero*res
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols()); // res must have the product shape before its buffer is handed to gemm
+        }
+        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
+        { // row-major storage: leading dimensions are the column counts
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
+        { // column-major storage: leading dimensions are the row counts
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+    // single-precision overload, identical apart from calling cblas_cgemm
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexF, Opts...> &res, 
+                           const Mat<ComplexF, Opts...> &a, 
+                           const Mat<ComplexF, Opts...> &b)
+    {
+        static const ComplexF one(1., 0.), zero(0., 0.); // gemm scaling factors: res = one*a*b + zero*res
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols()); // res must have the product shape before its buffer is handed to gemm
+        }
+        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
+        { // row-major storage: leading dimensions are the column counts
+            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
+        { // column-major storage: leading dimensions are the row counts
+            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+#else
+    template <typename Mat>
+    static inline void mul(Mat &res, const Mat &a, const Mat &b)
+    {
+        res = a*b;
+    }
+#endif
+    template <typename Mat>
+    static inline double mulFlops(const Mat &a, const Mat &b)
+    {
+        double nr = a.rows(), nc = a.cols();
+
+        return nr*nr*(6.*nc + 2.*(nc - 1.)); // uses only a's shape (b is not read): nr*nr entries, each 6*nc + 2*(nc-1) flops
+    }
+private:
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aRow, 
+                                    const MatLeft &a, const MatRight &b)
+    { // outputs: pointer and stride walking row aRow of a, and column aRow of b
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aRow*a.cols();
+            aInc = 1;
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aRow;
+            aInc = a.rows();
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aRow;
+            bInc = b.cols();
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aRow*b.rows();
+            bInc = 1;
+        }
+    }
+    // NOTE(review): makeDotRowPt above sits outside the USE_MKL guard although only the guarded dotuRow overloads call it in this class
+#ifdef USE_MKL
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aCol, 
+                                    const MatLeft &a, const MatRight &b)
+    { // outputs: pointer and stride walking column aCol of a, and row aCol of b
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aCol;
+            aInc = a.cols();
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aCol*a.rows();
+            aInc = 1;
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aCol*b.cols();
+            bInc = 1;
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aCol;
+            bInc = b.rows();
+        }
+    }
+    // res = (row aRow of a) . (column aRow of b), via cblas_cdotu_sub over a.cols() elements
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+    // res = (column aCol of a) . (row aCol of b), via cblas_cdotu_sub over a.rows() elements
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+    // double-precision overload of dotuRow (cblas_zdotu_sub)
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+    // double-precision overload of dotuCol (cblas_zdotu_sub)
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+#endif
+};
+
+/******************************************************************************
+ *                     A2AMatrixIo template implementation                    *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename T>
+A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname, 
+                            const unsigned int nt, const unsigned int ni,
+                            const unsigned int nj)
 : filename_(filename), dataname_(dataname)
 , nt_(nt), ni_(ni), nj_(nj)
 {}

-template <typename T, typename MetadataType>
-void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d, const unsigned int chunkSize)
+// access //////////////////////////////////////////////////////////////////////
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNt(void) const
+{
+    return nt_; // extent of dataset dimension 0 (timeslices)
+}
+
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNi(void) const
+{
+    return ni_; // extent of dataset dimension 1; 0 until load() has read it when not given at construction
+}
+
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNj(void) const
+{
+    return nj_; // extent of dataset dimension 2; 0 until load() has read it when not given at construction
+}
+// size in bytes of the full nt x ni x nj matrix
+template <typename T>
+size_t A2AMatrixIo<T>::getSize(void) const
+{
+    return static_cast<size_t>(nt_)*ni_*nj_*sizeof(T); // widen first: the product of three unsigned ints can exceed 32 bits
+}
+
+// file allocation /////////////////////////////////////////////////////////////
+template <typename T>
+template <typename MetadataType>
+void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
 {
 #ifdef HAVE_HDF5
    std::vector<hsize_t>    dim = {static_cast<hsize_t>(nt_), 
@@ -80,26 +440,28 @@ void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d, const unsigne
    }

    // create the dataset
-    Hdf5Reader reader(filename_);
+    Hdf5Reader reader(filename_, false);

    push(reader, dataname_);
    auto &group = reader.getGroup();
    plist.setChunk(chunk.size(), chunk.data());
-    dataset = group.createDataSet("data", Hdf5Type<T>::type(), dataspace, plist);
+    plist.setFletcher32();
+    dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }

-template <typename T, typename MetadataType>
-void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data, 
-                                             const unsigned int i, 
-                                             const unsigned int j,
-                                             const unsigned int blockSizei,
-                                             const unsigned int blockSizej)
+// block I/O ///////////////////////////////////////////////////////////////////
+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const T *data, 
+                               const unsigned int i, 
+                               const unsigned int j,
+                               const unsigned int blockSizei,
+                               const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
-    Hdf5Reader           reader(filename_);
+    Hdf5Reader           reader(filename_, false);
    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
@@ -111,7 +473,7 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,

    push(reader, dataname_);
    auto &group = reader.getGroup();
-    dataset     = group.openDataSet("data");
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    dataspace   = dataset.getSpace();
    dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                              stride.data(), block.data());
@@ -121,6 +483,286 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,
 #endif
 }

+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
+                               const unsigned int ext, const unsigned int str,
+                               const unsigned int i, const unsigned int j)
+{
+    unsigned int blockSizei = m.dimension(3);
+    unsigned int blockSizej = m.dimension(4);
+    unsigned int nstr       = m.dimension(1);
+    size_t       offset     = (static_cast<size_t>(ext)*nstr + str)*nt_*blockSizei*blockSizej;
+    // offset of the (ext, str) slice in m's row-major storage, computed in size_t so it cannot wrap at 32 bits
+    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
+}
+// load the whole matrix: v[t] receives timeslice t cast to VecT; *tRead (if non-null) receives the HDF5 read time
+template <typename T>
+template <template <class> class Vec, typename VecT>
+void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, const bool useCache)
+{
+#ifdef HAVE_HDF5
+    Hdf5Reader           reader(filename_);
+    std::vector<hsize_t> hdim;
+    H5NS::DataSet        dataset;
+    H5NS::DataSpace      dataspace;
+    H5NS::CompType       datatype;
+    // open the dataset and read its extents
+    push(reader, dataname_);
+    auto &group = reader.getGroup();
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
+    datatype    = dataset.getCompType();
+    dataspace   = dataset.getSpace();
+    hdim.resize(dataspace.getSimpleExtentNdims());
+    dataspace.getSimpleExtentDims(hdim.data());
+    if ((nt_ != 0) and (ni_ != 0) and (nj_ != 0) and // tested one by one: a 32-bit product could wrap to 0
+        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
+    {
+        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+            + std::to_string(hdim[2]) + ", expected "
+            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+            + std::to_string(nj_) + ")");
+    }
+    else if ((ni_ == 0) or (nj_ == 0)) // mode dimensions not given at construction: take them from the file
+    {
+        if (hdim[0] != nt_)
+        {
+            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+                + std::to_string(hdim[0]) + ", expected "
+                + std::to_string(nt_) + ")");
+        }
+        ni_ = hdim[1];
+        nj_ = hdim[2];
+    }
+
+    if (useCache)
+    {
+        std::vector<T> buf(static_cast<size_t>(nt_)*ni_*nj_); // size_t arithmetic: nt*ni*nj can exceed 32 bits
+        T              *pt = buf.data();
+        if (tRead) *tRead = -usecond(); // time the single bulk read so *tRead is also set in this branch
+        dataset.read(buf.data(), datatype);
+        if (tRead) *tRead += usecond();
+        for (unsigned int t = 0; t < nt_; ++t)
+        {
+            A2AMatrixMap<T> bufMap(pt, ni_, nj_);
+
+            v[t]  = bufMap.template cast<VecT>();
+            pt   += static_cast<size_t>(ni_)*nj_;
+        }
+    }
+    // if useCache = false, do I/O timeslice per timeslice (much slower)
+    else
+    {
+        A2AMatrix<T>         buf(ni_, nj_);
+        std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
+                                        static_cast<hsize_t>(nj_)},
+                             stride   = {1, 1, 1},
+                             block    = {1, 1, 1},
+                             memCount = {static_cast<hsize_t>(ni_),
+                                         static_cast<hsize_t>(nj_)};
+        H5NS::DataSpace      memspace(memCount.size(), memCount.data());
+
+        std::cout << "Loading timeslice";
+        std::cout.flush();
+        if (tRead) *tRead = 0.; // tRead defaults to nullptr, so it must be tested before being written
+        for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
+        {
+            unsigned int         t      = tp1 - 1;
+            std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
+            
+            if (t % 10 == 0)
+            {
+                std::cout << " " << t;
+                std::cout.flush();
+            }
+            dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
+                                      stride.data(), block.data());
+            if (tRead) *tRead -= usecond();    
+            dataset.read(buf.data(), datatype, memspace, dataspace);
+            if (tRead) *tRead += usecond();
+            v[t] = buf.template cast<VecT>();
+        }
+        std::cout << std::endl;
+    }
+#else
+    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
+#endif
+}
+
+/******************************************************************************
+ *               A2AMatrixBlockComputation template implementation            *
+ ******************************************************************************/
+// constructor: stores the parameters (nt_ is the global extent along orthogDim) and allocates both buffers
+template <typename T, typename Field, typename MetadataType, typename TIo>
+A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::A2AMatrixBlockComputation(GridBase *grid,
+                            const unsigned int orthogDim,
+                            const unsigned int next, 
+                            const unsigned int nstr,
+                            const unsigned int blockSize, 
+                            const unsigned int cacheBlockSize,
+                            TimerArray *tArray)
+: tArray_(tArray), grid_(grid), orthogDim_(orthogDim) // initialisers listed in member declaration order
+, nt_(grid->GlobalDimensions()[orthogDim]), next_(next), nstr_(nstr)
+, blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
+{
+    mCache_.resize(static_cast<size_t>(nt_)*next_*nstr_*cacheBlockSize_*cacheBlockSize_); // size_t: the product can exceed 32 bits
+    mBuf_.resize(static_cast<size_t>(nt_)*next_*nstr_*blockSize_*blockSize_);
+}
+
+#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
+#define STOP_TIMER(name)  if (tArray_) tArray_->stopTimer(name)
+#define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
+
+// execution: compute every blockSize_ x blockSize_ block with `kernel`, then write it to disk
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::execute(const std::vector<Field> &left, const std::vector<Field> &right,
+          A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
+          const FilenameFn &filenameFn, const MetadataFn &metadataFn)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // i,j   is first  loop over blockSize_ factors
+    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
+    // iii,jjj are loops within cacheBlock
+    // Total index is sum of these  i+ii+iii etc...
+    //////////////////////////////////////////////////////////////////////////
+    int    N_i = left.size();
+    int    N_j = right.size();
+    double flops, bytes, t_kernel;
+    double nodes = grid_->NodeCount();
+    
+    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
+    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
+
+    for(int i=0;i<N_i;i+=blockSize_)
+    for(int j=0;j<N_j;j+=blockSize_)
+    {
+        // Get the W and V vectors for this block^2 set of terms
+        int N_ii = MIN(N_i-i,blockSize_);
+        int N_jj = MIN(N_j-j,blockSize_);
+        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
+
+        LOG(Message) << "All-to-all matrix block " 
+                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
+                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
+                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
+                     << std::endl;
+        // Series of cache blocked chunks of the contractions within this block
+        flops    = 0.0;
+        bytes    = 0.0;
+        t_kernel = 0.0;
+        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
+        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
+        {
+            double t;
+            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
+            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
+            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
+
+            START_TIMER("kernel");
+            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
+            STOP_TIMER("kernel");
+            t_kernel += t;
+            flops    += kernel.flops(N_iii, N_jjj);
+            bytes    += kernel.bytes(N_iii, N_jjj);
+
+            START_TIMER("cache copy");
+            parallel_for_nest5(int e =0;e<next_;e++)
+            for(int s =0;s< nstr_;s++)
+            for(int t =0;t< nt_;t++)
+            for(int iii=0;iii< N_iii;iii++)
+            for(int jjj=0;jjj< N_jjj;jjj++)
+            {
+                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
+            }
+            STOP_TIMER("cache copy");
+        }
+
+        // perf
+        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
+                     << " Gflop/s/node " << std::endl;
+        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
+                     << " GB/s/node "  << std::endl;
+
+        // IO
+        double       blockSize, ioTime;
+        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
+    
+        LOG(Message) << "Writing block to disk" << std::endl;
+        ioTime = -GET_TIMER("IO: write block");
+        START_TIMER("IO: total");
+        makeFileDir(filenameFn(0, 0), grid_);
+#ifdef HADRONS_A2AM_PARALLEL_IO
+        grid_->Barrier();
+        // make task list for current node
+        nodeIo_.clear();
+        for(int f = myRank; f < next_*nstr_; f += nRank)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = f/nstr_;
+            h.s  = f % nstr_;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            nodeIo_.push_back(h);
+        }
+        // parallel IO
+        for (auto &h: nodeIo_)
+        {
+            saveBlock(mBlock, h);
+        }
+        grid_->Barrier();
+#else
+        // serial IO, for testing purposes only
+        for(int e = 0; e < next_; e++)
+        for(int s = 0; s < nstr_; s++)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = e;
+            h.s  = s;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            saveBlock(mBlock, h); // the block tensor declared above is mBlock
+        }
+#endif
+        STOP_TIMER("IO: total");
+        blockSize  = static_cast<double>(next_)*nstr_*nt_*N_ii*N_jj*sizeof(TIo); // evaluated in double: the integer product can exceed 32 bits
+        ioTime    += GET_TIMER("IO: write block");
+        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
+                     << ioTime  << " us (" 
+                     << blockSize/ioTime*1.0e6/1024/1024
+                     << " MB/s)" << std::endl;
+    }
+}
+
+// I/O handler: write the (h.e, h.s) slice of m at block offset (h.i, h.j) through h.io
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
+{
+    if ((h.i == 0) and (h.j == 0)) // block at offset (0, 0): the file is initialised before the write
+    {
+        START_TIMER("IO: file creation");
+        h.io.initFile(h.md, blockSize_); // blockSize_ is passed as the chunk size
+        STOP_TIMER("IO: file creation");
+    }
+    START_TIMER("IO: write block");
+    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
+    STOP_TIMER("IO: write block");
+}
+
+#undef START_TIMER
+#undef STOP_TIMER
+#undef GET_TIMER
+
 END_HADRONS_NAMESPACE

 #endif // A2A_Matrix_hpp_
--- a/Hadrons/A2AVectors.hpp
+++ b/Hadrons/A2AVectors.hpp
@@ -36,7 +36,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
- *               Classes to generate V & W all-to-all vectors                 *
+ *                 Class to generate V & W all-to-all vectors                 *
 ******************************************************************************/
 template <typename FImpl>
 class A2AVectorsSchurDiagTwo
@@ -70,6 +70,42 @@ private:
    SchurDiagTwoOperator<FMat, FermionField> op_;
 };

+/******************************************************************************
+ *                  Methods for V & W all-to-all vectors I/O                  *
+ ******************************************************************************/
+class A2AVectorsIo
+{
+public:
+    // metadata stored with each vector record: position of the vector in the set
+    struct Record: Serializable
+    {
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
+                                        unsigned int, index);
+        Record(void): index(0) {}
+    };
+public:
+    // write/read all elements of vec; multiFile selects one file per element
+    template <typename Field>
+    static void write(const std::string fileStem, std::vector<Field> &vec, 
+                      const bool multiFile, const int trajectory = -1);
+    template <typename Field>
+    static void read(std::vector<Field> &vec, const std::string fileStem,
+                     const bool multiFile, const int trajectory = -1);
+private:
+    // "<stem>[.<traj>]" in multi-file mode, "<stem>[.<traj>].bin" otherwise;
+    // the ".<traj>" part is left out when traj is negative
+    static inline std::string vecFilename(const std::string stem, const int traj, 
+                                          const bool multiFile)
+    {
+        std::string name = stem;
+
+        if (traj >= 0)
+        {
+            name += "." + std::to_string(traj);
+        }
+        return multiFile ? name : (name + ".bin");
+    }
+};
+
 /******************************************************************************
 *               A2AVectorsSchurDiagTwo template implementation               *
 ******************************************************************************/
@@ -217,6 +253,90 @@ void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d,
    }
 }

+/******************************************************************************
+ *               all-to-all vectors I/O template implementation               *
+ ******************************************************************************/
+template <typename Field>
+void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec, 
+                         const bool multiFile, const int trajectory)
+{ // writes every element of vec as a SciDAC field record tagged with its index
+    if (vec.empty()) // the grid is taken from vec[0]: an empty set cannot be written
+    {
+        HADRONS_ERROR(Size, "no vector to write");
+    }
+    Record       record;
+    GridBase     *grid = vec[0]._grid;
+    ScidacWriter binWriter(grid->IsBoss());
+    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
+    if (multiFile) // one file per vector: <filename>/elem<i>.bin
+    {
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            const std::string fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
+            LOG(Message) << "Writing vector " << i << std::endl;
+            makeFileDir(fullFilename, grid);
+            binWriter.open(fullFilename);
+            record.index = i;
+            binWriter.writeScidacFieldRecord(vec[i], record);
+            binWriter.close();
+        }
+    }
+    else // single file: the vectors are consecutive records of <filename>
+    {
+        makeFileDir(filename, grid);
+        binWriter.open(filename);
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            LOG(Message) << "Writing vector " << i << std::endl;
+            record.index = i;
+            binWriter.writeScidacFieldRecord(vec[i], record);
+        }
+        binWriter.close();
+    }
+}
+
+template <typename Field>
+void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem, 
+                        const bool multiFile, const int trajectory)
+{
+    Record       record;
+    ScidacReader binReader;
+    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
+    // vec is not resized here: exactly vec.size() records are read into it
+    if (multiFile)
+    {
+        std::string fullFilename;
+
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
+
+            LOG(Message) << "Reading vector " << i << std::endl;
+            binReader.open(fullFilename);
+            binReader.readScidacFieldRecord(vec[i], record);
+            binReader.close();
+            if (record.index != i) // the index stored in the record must match the position read
+            {
+                HADRONS_ERROR(Io, "vector index mismatch");
+            }
+        }
+    }
+    else
+    {
+        binReader.open(filename); // single file: records are read in sequence
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            LOG(Message) << "Reading vector " << i << std::endl;
+            binReader.readScidacFieldRecord(vec[i], record);
+            if (record.index != i) // same consistency check as in multi-file mode
+            {
+                HADRONS_ERROR(Io, "vector index mismatch");
+            }
+        }
+        binReader.close();
+    }
+}
+
 END_HADRONS_NAMESPACE

 #endif // A2A_Vectors_hpp_
--- a/Hadrons/Application.cc
+++ b/Hadrons/Application.cc
@@ -108,6 +108,9 @@ void Application::run(void)
        HADRONS_ERROR(Definition, "run id is empty");
    }
    LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
+    BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
+    LOG(Message) << "Attempt(s) for resilient parallel I/O: " 
+                 << BinaryIO::latticeWriteMaxRetry << std::endl;
    vm().setRunId(getPar().runId);
    vm().printContent();
    env().printContent();
--- a/Hadrons/Application.hpp
+++ b/Hadrons/Application.hpp
@@ -41,14 +41,6 @@ BEGIN_HADRONS_NAMESPACE
 class Application
 {
 public:
-    class TrajRange: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
-                                        unsigned int, start,
-                                        unsigned int, end,
-                                        unsigned int, step);
-    };
    class GlobalPar: Serializable
    {
    public:
@@ -56,7 +48,9 @@ public:
                                        TrajRange,                  trajCounter,
                                        VirtualMachine::GeneticPar, genetic,
                                        std::string,                runId,
-                                        std::string,                graphFile);
+                                        std::string,                graphFile,
+                                        int,                        parallelWriteMaxRetry);
+        GlobalPar(void): parallelWriteMaxRetry{-1} {}
    };
 public:
    // constructors
--- a/Hadrons/DilutedNoise.hpp
+++ b/Hadrons/DilutedNoise.hpp
@@ -7,6 +7,7 @@ Source file: Hadrons/DilutedNoise.hpp
 Copyright (C) 2015-2018

 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -76,6 +77,22 @@ private:
    unsigned int nt_;
 };

+template <typename FImpl>
+class FullVolumeSpinColorDiagonalNoise: public DilutedNoise<FImpl>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    // constructor/destructor
+    FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src); // n_src is kept in nSrc_
+    virtual ~FullVolumeSpinColorDiagonalNoise(void) = default;
+    // generate noise
+    virtual void generateNoise(GridParallelRNG &rng); // fills one vector per (source, spin, colour)
+private:
+    unsigned int nSrc_; // number of sources, as passed to the constructor
+};
+
+
 /******************************************************************************
 *                    DilutedNoise template implementation                    *
 ******************************************************************************/
@@ -186,6 +203,47 @@ void TimeDilutedSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rn
    }
 }

+/******************************************************************************
+ *        FullVolumeSpinColorDiagonalNoise template implementation           *
+ ******************************************************************************/
+template <typename FImpl>
+FullVolumeSpinColorDiagonalNoise<FImpl>::
+FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc)
+: DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc) // base size: presumably the vector count - confirm
+{}
+
+template <typename FImpl>
+void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
+{
+    typedef decltype(peekColour((*this)[0], 0)) SpinField;
+    // fills nSrc_*Ns*nc noise vectors, ordered by source, then spin, then colour
+    auto                       &noise = *this;
+    auto                       g      = this->getGrid();
+    auto                       nc     = FImpl::Dimension;
+    Complex                    shift(1., 1.);
+    LatticeComplex             eta(g);
+    SpinField                  etas(g);
+    unsigned int               i = 0;
+
+    for (unsigned int n = 0; n < nSrc_; ++n)
+    {
+        // a fresh draw per source: a single draw would make all sources identical
+        bernoulli(rng, eta);
+        eta = (2.*eta - shift)*(1./::sqrt(2.));
+        for (unsigned int s = 0; s < Ns; ++s)
+        {
+            etas = zero;
+            pokeSpin(etas, eta, s);
+            for (unsigned int c = 0; c < nc; ++c)
+            {
+                noise[i] = zero;
+                pokeColour(noise[i], etas, c);
+                i++;
+            }
+        }
+    }
+}
+
 END_HADRONS_NAMESPACE

 #endif // Hadrons_DilutedNoise_hpp_
--- a/Hadrons/DiskVector.hpp
+++ b/Hadrons/DiskVector.hpp
@@ -29,11 +29,18 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_DiskVector_hpp_

 #include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
 #include <deque>
 #include <sys/stat.h>
 #include <ftw.h>
 #include <unistd.h>

+#ifdef DV_DEBUG
+#define DV_DEBUG_MSG(dv, stream) LOG(Debug) << "diskvector " << (dv) << ": " << stream << std::endl
+#else // DV_DEBUG not defined: debug messages expand to nothing
+#define DV_DEBUG_MSG(dv, stream)
+#endif // DV_DEBUG
+
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
@@ -53,16 +60,18 @@ public:
        : master_(master), cmaster_(master), i_(i) {}

        // operator=: somebody is trying to store a vector element
-        // write to disk and cache
+        // write to cache and tag as modified
         T &operator=(const T &obj) const
         {
-#ifdef DV_DEBUG
-            LOG(Debug) << "diskvector " << &master_ << ": writing to " << i_ << std::endl;
-#endif
+            auto &cache    = *master_.cachePtr_;
+            auto &modified = *master_.modifiedPtr_;
+            auto &index    = *master_.indexPtr_;
+            // NOTE(review): cacheInsert() does not test whether i_ is already cached; a repeated assignment looks like it takes a second slot and queues i_ twice - confirm
+            DV_DEBUG_MSG(&master_, "writing to " << i_);
             master_.cacheInsert(i_, obj);
-            master_.save(master_.filename(i_), obj);
+            modified[index.at(i_)] = true; // tagged so that evict() saves the element when it leaves the cache
             
-            return master_.cachePtr_->at(i_);
+            return cache[index.at(i_)]; // reference to the cached copy, not to obj
         }

        // implicit cast to const object reference and redirection
@@ -79,9 +88,12 @@ public:
 public:
    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
                   const unsigned int cacheSize = 1, const bool clean = true);
+    DiskVectorBase(DiskVectorBase<T> &&v) = default;
    virtual ~DiskVectorBase(void);
    const T & operator[](const unsigned int i) const;
    RwAccessHelper operator[](const unsigned int i);
+    double hitRatio(void) const;
+    void resetStat(void);
 private:
    virtual void load(T &obj, const std::string filename) const = 0;
    virtual void save(const std::string filename, const T &obj) const = 0;
@@ -91,13 +103,17 @@ private:
    void cacheInsert(const unsigned int i, const T &obj) const;
    void clean(void);
 private:
-    std::string                                dirname_;
-    unsigned int                               size_, cacheSize_;
-    bool                                       clean_;
+    std::string                                           dirname_;
+    unsigned int                                          size_, cacheSize_;
+    double                                                access_{0.}, hit_{0.};
+    bool                                                  clean_;
    // using pointers to allow modifications when class is const
    // semantic: const means data unmodified, but cache modification allowed
-    std::unique_ptr<std::map<unsigned int, T>> cachePtr_;
-    std::unique_ptr<std::deque<unsigned int>>  loadsPtr_;                
+    std::unique_ptr<std::vector<T>>                       cachePtr_;
+    std::unique_ptr<std::vector<bool>>                    modifiedPtr_;
+    std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
+    std::unique_ptr<std::stack<unsigned int>>             freePtr_;
+    std::unique_ptr<std::deque<unsigned int>>             loadsPtr_;                
 };

 /******************************************************************************
@@ -115,6 +131,7 @@ private:

        read(reader, basename(filename), obj);
    }
+
    virtual void save(const std::string filename, const T &obj) const
    {
        Writer writer(filename);
@@ -123,20 +140,100 @@ private:
    }
 };

+/******************************************************************************
+ *                      Specialisation for Eigen matrices                     *
+ ******************************************************************************/
+template <typename T>
+using EigenDiskVectorMat = A2AMatrix<T>;
+
+// Disk vector of Eigen matrices; each file holds: crc32 | nRow | nCol | coefficients
+template <typename T>
+class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
+{
+public:
+    using DiskVectorBase<EigenDiskVectorMat<T>>::DiskVectorBase;
+    typedef EigenDiskVectorMat<T> Matrix;
+    // coefficient (j, k) of element i, obtained through the const operator[]
+    T operator()(const unsigned int i, const Eigen::Index j, const Eigen::Index k) const
+    {
+        return (*this)[i](j, k);
+    }
+private:
+    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
+    {
+        std::ifstream f(filename, std::ios::binary);
+        uint32_t      crc = 0, check;
+        Eigen::Index  nRow = 0, nCol = 0;
+        double        tRead, tHash;
+
+        f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
+        f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+        f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+        if (!f) // unopened or truncated file: never size obj from a header that was not read
+        {
+            HADRONS_ERROR(Io, "cannot read header of '" + filename + "'");
+        }
+        obj.resize(nRow, nCol);
+        const size_t matSize = nRow*nCol*sizeof(T);
+        tRead  = -usecond();
+        f.read(reinterpret_cast<char *>(obj.data()), matSize);
+        tRead += usecond();
+        tHash  = -usecond();
+#ifdef USE_IPP
+        check  = GridChecksum::crc32c(obj.data(), matSize);
+#else
+        check  = GridChecksum::crc32(obj.data(), matSize);
+#endif
+        tHash += usecond();
+        DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
+        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
+                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+        if (crc != check) // stored checksum against the checksum of the data just read
+        {
+            HADRONS_ERROR(Io, "checksum failed")
+        }
+    }
+
+    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
+    {
+        std::ofstream      f(filename, std::ios::binary);
+        uint32_t           crc;
+        const Eigen::Index nRow = obj.rows(), nCol = obj.cols();
+        const size_t       matSize = nRow*nCol*sizeof(T);
+        double             tWrite, tHash;
+
+        tHash   = -usecond();
+#ifdef USE_IPP
+        crc     = GridChecksum::crc32c(obj.data(), matSize);
+#else
+        crc     = GridChecksum::crc32(obj.data(), matSize);
+#endif
+        tHash  += usecond();
+        f.write(reinterpret_cast<const char *>(&crc), sizeof(crc)); // header, in the order load() reads it
+        f.write(reinterpret_cast<const char *>(&nRow), sizeof(nRow));
+        f.write(reinterpret_cast<const char *>(&nCol), sizeof(nCol));
+        tWrite = -usecond();
+        f.write(reinterpret_cast<const char *>(obj.data()), matSize);
+        tWrite += usecond();
+        DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
+        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
+                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+    }
+};
+
 /******************************************************************************
 *                       DiskVectorBase implementation                         *
 ******************************************************************************/
-#ifdef DV_DEBUG
-#define DV_DEBUG_MSG(stream) LOG(Debug) << "diskvector " << this << ": " << stream << std::endl
-#endif
-
 template <typename T>
 DiskVectorBase<T>::DiskVectorBase(const std::string dirname, 
                                  const unsigned int size,
                                  const unsigned int cacheSize,
                                  const bool clean)
 : dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
-, cachePtr_(new std::map<unsigned int, T>())
+, cachePtr_(new std::vector<T>(size))
+, modifiedPtr_(new std::vector<bool>(size, false))
+, indexPtr_(new std::map<unsigned int, unsigned int>())
+, freePtr_(new std::stack<unsigned int>)
 , loadsPtr_(new std::deque<unsigned int>())
 {
    struct stat s;
@@ -146,6 +243,10 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
        HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
    }
    mkdir(dirname);
+    for (unsigned int i = 0; i < cacheSize_; ++i)
+    {
+        freePtr_->push(i);
+    }
 }

 template <typename T>
@@ -160,28 +261,31 @@ DiskVectorBase<T>::~DiskVectorBase(void)
 template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
-    auto &cache = *cachePtr_;
-    auto &loads = *loadsPtr_;
+    auto &cache   = *cachePtr_;
+    auto &index   = *indexPtr_;
+    auto &freeInd = *freePtr_;
+    auto &loads   = *loadsPtr_;

-    DV_DEBUG_MSG("accessing " << i << " (RO)");
+    DV_DEBUG_MSG(this, "accessing " << i << " (RO)");

    if (i >= size_)
    {
        HADRONS_ERROR(Size, "index out of range");
    }
-
-    if (cache.find(i) == cache.end())
+    const_cast<double &>(access_)++;
+    if (index.find(i) == index.end())
    {
        // cache miss
-        DV_DEBUG_MSG("cache miss");
+        DV_DEBUG_MSG(this, "cache miss");
        fetch(i);
    }
    else
    {
-        DV_DEBUG_MSG("cache hit");
+        DV_DEBUG_MSG(this, "cache hit");

        auto pos = std::find(loads.begin(), loads.end(), i);

+        const_cast<double &>(hit_)++;
        loads.erase(pos);
        loads.push_back(i);
    }
@@ -193,16 +297,16 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
    {
        msg += std::to_string(p) + " ";
    }
-    DV_DEBUG_MSG("in cache: " << msg);
+    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif

-    return cache.at(i);
+    return cache[index.at(i)];
 }

 template <typename T>
 typename DiskVectorBase<T>::RwAccessHelper DiskVectorBase<T>::operator[](const unsigned int i)
 {
-    DV_DEBUG_MSG("accessing " << i << " (RW)");
+    DV_DEBUG_MSG(this, "accessing " << i << " (RW)");

    if (i >= size_)
    {
@@ -212,6 +316,19 @@ typename DiskVectorBase<T>::RwAccessHelper DiskVectorBase<T>::operator[](const u
    return RwAccessHelper(*this, i);
 }

+template <typename T>
+double DiskVectorBase<T>::hitRatio(void) const
+{
+    return hit_/access_; // hits over accesses counted by the const operator[]; 0/0 if none was counted yet
+}
+
+template <typename T>
+void DiskVectorBase<T>::resetStat(void)
+{
+    access_ = 0.; // restart both counters used by hitRatio()
+    hit_    = 0.;
+}
+
 template <typename T>
 std::string DiskVectorBase<T>::filename(const unsigned int i) const
 {
@@ -221,13 +338,24 @@ std::string DiskVectorBase<T>::filename(const unsigned int i) const
 template <typename T>
 void DiskVectorBase<T>::evict(void) const
 {
-    auto &cache = *cachePtr_;
-    auto &loads = *loadsPtr_;
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;

-    if (cache.size() >= cacheSize_)
+    if (index.size() >= cacheSize_)
    {
-        DV_DEBUG_MSG("evicting " << loads.front());
-        cache.erase(loads.front());
+        unsigned int i = loads.front();
+        
+        DV_DEBUG_MSG(this, "evicting " << i);
+        if (modified[index.at(i)])
+        {
+            DV_DEBUG_MSG(this, "element " << i << " modified, saving to disk");
+            save(filename(i), cache[index.at(i)]);
+        }
+        freeInd.push(index.at(i));
+        index.erase(i);
        loads.pop_front();
    }
 }
@@ -235,30 +363,44 @@ void DiskVectorBase<T>::evict(void) const
 template <typename T>
 void DiskVectorBase<T>::fetch(const unsigned int i) const
 {
-    auto &cache = *cachePtr_;
-    auto &loads = *loadsPtr_;
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;
+
    struct stat s;

-    DV_DEBUG_MSG("loading " << i << " from disk");
+    DV_DEBUG_MSG(this, "loading " << i << " from disk");

    evict();
+    
    if(stat(filename(i).c_str(), &s) != 0)
    {
        HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
    }
-    load(cache[i], filename(i));
+    index[i] = freeInd.top();
+    freeInd.pop();
+    load(cache[index.at(i)], filename(i));
    loads.push_back(i);
+    modified[index.at(i)] = false;
 }

 template <typename T>
 void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 {
-    auto &cache = *cachePtr_;
-    auto &loads = *loadsPtr_;
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;

    evict();
-    cache[i] = obj;
+    index[i] = freeInd.top();
+    freeInd.pop();
+    cache[index.at(i)] = obj;
    loads.push_back(i);
+    modified[index.at(i)] = false;

 #ifdef DV_DEBUG
    std::string msg;
@@ -267,7 +409,7 @@ void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
    {
        msg += std::to_string(p) + " ";
    }
-    DV_DEBUG_MSG("in cache: " << msg);
+    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
 }

--- a/Hadrons/EigenPack.hpp
+++ b/Hadrons/EigenPack.hpp
@@ -39,12 +39,12 @@ BEGIN_HADRONS_NAMESPACE
 #define HADRONS_DEFAULT_LANCZOS_NBASIS 60
 #endif

-#define HADRONS_DUMP_EP_METADATA \
+#define HADRONS_DUMP_EP_METADATA(record) \
 LOG(Message) << "Eigenpack metadata:" << std::endl;\
 LOG(Message) << "* operator" << std::endl;\
-LOG(Message) << record.operatorXml << std::endl;\
+LOG(Message) << (record).operatorXml << std::endl;\
 LOG(Message) << "* solver" << std::endl;\
-LOG(Message) << record.solverXml << std::endl;
+LOG(Message) << (record).solverXml << std::endl;

 struct PackRecord
 {
@@ -59,66 +59,9 @@ struct VecRecord: Serializable
    VecRecord(void): index(0), eval(0.) {}
 };

-template <typename F>
-class EigenPack
+namespace EigenPackIo
 {
-public:
-    typedef F Field;
-public:
-    std::vector<RealD> eval;
-    std::vector<F>     evec;
-    PackRecord         record;
-public:
-    EigenPack(void)          = default;
-    virtual ~EigenPack(void) = default;
-
-    EigenPack(const size_t size, GridBase *grid)
-    {
-        resize(size, grid);
-    }
-
-    void resize(const size_t size, GridBase *grid)
-    {
-        eval.resize(size);
-        evec.resize(size, grid);
-    }
-
-    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
-    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evec.size(); ++k)
-            {
-                basicReadSingle(evec[k], eval[k], evecFilename(fileStem, k, traj), k);
-                if (k == 0)
-                {
-                    HADRONS_DUMP_EP_METADATA;
-                }
-            }
-        }
-        else
-        {
-            basicRead(evec, eval, evecFilename(fileStem, -1, traj), evec.size());
-            HADRONS_DUMP_EP_METADATA;
-        }
-    }
-
-    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
-    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evec.size(); ++k)
-            {
-                basicWriteSingle(evecFilename(fileStem, k, traj), evec[k], eval[k], k);
-            }
-        }
-        else
-        {
-            basicWrite(evecFilename(fileStem, -1, traj), evec, eval, evec.size());
-        }
-    }
-
-    static void readHeader(PackRecord &record, ScidacReader &binReader)
+    inline void readHeader(PackRecord &record, ScidacReader &binReader)
    {
        std::string recordXml;

@@ -130,13 +73,75 @@ public:
        xmlReader.readCurrentSubtree(record.solverXml);
    }

-    template <typename T>
-    static void readElement(T &evec, VecRecord &vecRecord, ScidacReader &binReader)
+    template <typename T, typename TIo = T>
+    void readElement(T &evec, RealD &eval, const unsigned int index,
+                     ScidacReader &binReader, TIo *ioBuf = nullptr)
     {
-        binReader.readScidacFieldRecord(evec, vecRecord);
+        VecRecord vecRecord;
+        // reads the next record into evec and eval; index is the position expected in the record
+        LOG(Message) << "Reading eigenvector " << index << std::endl;
+        if (ioBuf == nullptr) // no I/O buffer: the record is read directly into evec
+        {
+            binReader.readScidacFieldRecord(evec, vecRecord);
+        }
+        else // read in the I/O type, then convert into evec
+        {
+            binReader.readScidacFieldRecord(*ioBuf, vecRecord);
+            precisionChange(evec, *ioBuf);
+        }
+        if (vecRecord.index != index) // report the index found in the file and the one expected
+        {
+            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(vecRecord.index) + " has a"
+                            + " wrong index (expected " + std::to_string(index) 
+                            + ")");
+        }
+        eval = vecRecord.eval;
     }

-    static void writeHeader(ScidacWriter &binWriter, PackRecord &record)
+    template <typename T, typename TIo = T>
+    static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
+                         PackRecord &record, const std::string filename, 
+                         const unsigned int size, bool multiFile, 
+                         GridBase *gridIo = nullptr)
+    {
+        std::unique_ptr<TIo> ioBuf{nullptr};
+        ScidacReader         binReader;
+        // reads `size` vectors into evec/eval (not resized here); a buffer on gridIo is used when TIo differs from T
+        if (typeHash<T>() != typeHash<TIo>())
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+            ioBuf.reset(new TIo(gridIo));
+        }
+        if (multiFile) // one file <filename>/v<k>.bin per vector; record is re-read from each header
+        {
+            std::string fullFilename;
+
+            for (unsigned int k = 0; k < size; ++k) // unsigned, like size
+            {
+                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
+                binReader.open(fullFilename);
+                readHeader(record, binReader);
+                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
+                binReader.close();
+            }
+        }
+        else // single file: one header followed by `size` vector records
+        {
+            binReader.open(filename);
+            readHeader(record, binReader);
+            for (unsigned int k = 0; k < size; ++k)
+            {
+                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
+            }
+            binReader.close();
+        }
+    }
+
+    inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
    {
        XmlWriter xmlWriter("", "eigenPackPar");

@@ -145,165 +150,217 @@ public:
        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
    }

-    template <typename T>
-    static void writeElement(ScidacWriter &binWriter, T &evec, VecRecord &vecRecord)
+    template <typename T, typename TIo = T> // T: in-memory vector type; TIo: type written to file
+    void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval, 
+                      const unsigned int index, TIo *ioBuf, 
+                      T *testBuf = nullptr)
    {
-        binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
-    }
-protected:
-    std::string evecFilename(const std::string stem, const int vec, const int traj)
-    {
-        std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
+        VecRecord vecRecord; // per-vector metadata (eigenvalue and index) written with the field

-        if (vec == -1)
+        LOG(Message) << "Writing eigenvector " << index << std::endl;
+        vecRecord.eval  = eval;
+        vecRecord.index = index;
+        if ((ioBuf == nullptr) || (testBuf == nullptr)) // either buffer missing: evec is written unconverted
        {
-            return stem + t + ".bin";
+            binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
        }
        else
        {
-            return stem + t + "/v" + std::to_string(vec) + ".bin";
-        }
+            precisionChange(*ioBuf, evec);     // fill the I/O buffer from evec
+            precisionChange(*testBuf, *ioBuf); // convert back so the round trip can be compared
+            *testBuf -= evec;                  // testBuf now holds (round trip - original)
+            LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
+            binWriter.writeScidacFieldRecord(*ioBuf, vecRecord, DEFAULT_ASCII_PREC);
+        }   
    }
-
-    template <typename T>
-    void basicRead(std::vector<T> &evec, std::vector<RealD> &eval,
-                   const std::string filename, const unsigned int size)
+    
+    template <typename T, typename TIo = T> // T: vector type held in memory; TIo: type used for file I/O
+    static void writePack(const std::string filename, std::vector<T> &evec, 
+                          std::vector<RealD> &eval, PackRecord &record, 
+                          const unsigned int size, bool multiFile, 
+                          GridBase *gridIo = nullptr)
    {
-        ScidacReader binReader;
+        GridBase             *grid = evec[0]._grid; // NOTE(review): reads evec[0], so evec must not be empty
+        std::unique_ptr<TIo> ioBuf{nullptr}; 
+        std::unique_ptr<T>   testBuf{nullptr};
+        ScidacWriter         binWriter(grid->IsBoss());

-        binReader.open(filename);
-        readHeader(record, binReader);
-        for(int k = 0; k < size; ++k) 
+        if (typeHash<T>() != typeHash<TIo>()) // distinct I/O type: conversion and check buffers are required
        {
-            VecRecord vecRecord;
-
-            LOG(Message) << "Reading eigenvector " << k << std::endl;
-            readElement(evec[k], vecRecord, binReader);
-            if (vecRecord.index != k)
+            if (gridIo == nullptr)
            {
-                HADRONS_ERROR(Io, "Eigenvector " + std::to_string(k) + " has a"
-                              + " wrong index (expected " + std::to_string(vecRecord.index) 
-                              + ") in file '" + filename + "'");
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
            }
-            eval[k] = vecRecord.eval;
+            ioBuf.reset(new TIo(gridIo)); // buffer of the on-disk type, built on the I/O grid
+            testBuf.reset(new T(grid));   // buffer of the in-memory type, built on the vectors' grid
        }
-        binReader.close();
-    }
-
-    template <typename T>
-    void basicReadSingle(T &evec, RealD &eval, const std::string filename, 
-                         const unsigned int index)
-    {
-        ScidacReader binReader;
-        VecRecord    vecRecord;
-
-        binReader.open(filename);
-        readHeader(record, binReader);
-        LOG(Message) << "Reading eigenvector " << index << std::endl;
-        readElement(evec, vecRecord, binReader);
-        if (vecRecord.index != index)
+        if (multiFile) // one file "<filename>/v<k>.bin" per vector, each with its own header
        {
-            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
-                          + " wrong index (expected " + std::to_string(vecRecord.index) 
-                          + ") in file '" + filename + "'");
+            std::string fullFilename;
+
+            for(unsigned int k = 0; k < size; ++k) // unsigned index: same type as size
+            {
+                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
+
+                makeFileDir(fullFilename, grid);
+                binWriter.open(fullFilename);
+                writeHeader(binWriter, record);
+                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
+                binWriter.close();
+            }
        }
-        eval = vecRecord.eval;
-        binReader.close();
-    }
-
-    template <typename T>
-    void basicWrite(const std::string filename, std::vector<T> &evec, 
-                    const std::vector<RealD> &eval, const unsigned int size)
-    {
-        ScidacWriter binWriter(evec[0]._grid->IsBoss());
-
-        makeFileDir(filename, evec[0]._grid);
-        binWriter.open(filename);
-        writeHeader(binWriter, record);
-        for(int k = 0; k < size; ++k) 
+        else // single file: one header, then size vector records
        {
-            VecRecord vecRecord;
-
-            vecRecord.index = k;
-            vecRecord.eval  = eval[k];
-            LOG(Message) << "Writing eigenvector " << k << std::endl;
-            writeElement(binWriter, evec[k], vecRecord);
+            makeFileDir(filename, grid);
+            binWriter.open(filename);
+            writeHeader(binWriter, record);
+            for(unsigned int k = 0; k < size; ++k) // unsigned index: same type as size
+            {
+                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
+            }
+            binWriter.close();
        }
-        binWriter.close();
    }
+}

-    template <typename T>
-    void basicWriteSingle(const std::string filename, T &evec, 
-                          const RealD eval, const unsigned int index)
+template <typename F>
+class BaseEigenPack // storage only: eigenvalues, eigenvectors and a metadata record; no read/write here
+{
+public:
+    typedef F Field;
+public:
+    std::vector<RealD> eval;   // resized together with evec by resize()
+    std::vector<F>     evec;
+    PackRecord         record; // metadata record handed to the pack I/O routines
+public:
+    BaseEigenPack(void)          = default;
+    BaseEigenPack(const size_t size, GridBase *grid)
    {
-        ScidacWriter binWriter(evec._grid->IsBoss());
-        VecRecord    vecRecord;
-
-        makeFileDir(filename, evec._grid);
-        binWriter.open(filename);
-        writeHeader(binWriter, record);
-        vecRecord.index = index;
-        vecRecord.eval  = eval;
-        LOG(Message) << "Writing eigenvector " << index << std::endl;
-        writeElement(binWriter, evec, vecRecord);
-        binWriter.close();
+        resize(size, grid);
+    }
+    virtual ~BaseEigenPack(void) = default;
+    void resize(const size_t size, GridBase *grid)
+    {
+        eval.resize(size);
+        evec.resize(size, grid); // new vectors are constructed from grid
    }
 };

-template <typename FineF, typename CoarseF>
-class CoarseEigenPack: public EigenPack<FineF>
+template <typename F, typename FIo = F> // F: in-memory field type; FIo: field type used for file I/O
+class EigenPack: public BaseEigenPack<F>
 {
 public:
-    typedef CoarseF CoarseField;
+    typedef F   Field;
+    typedef FIo FieldIo;
 public:
-    std::vector<RealD>   evalCoarse;
+    EigenPack(void)          = default; // gridIo_ stays null through its default member initializer
+    virtual ~EigenPack(void) = default;
+
+    EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
+    : BaseEigenPack<F>(size, grid)
+    {
+        if (typeHash<F>() != typeHash<FIo>()) // an I/O grid is mandatory only when the I/O type differs
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+        }
+        gridIo_ = gridIo;
+    }
+
+    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record, 
+                                      evecFilename(fileStem, traj, multiFile), 
+                                      this->evec.size(), multiFile, gridIo_);
+        HADRONS_DUMP_EP_METADATA(this->record);
+    }
+
+    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile), 
+                                       this->evec, this->eval, this->record, 
+                                       this->evec.size(), multiFile, gridIo_);
+    }
+protected:
+    std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
+    {
+        std::string t = (traj < 0) ? "" : ("." + std::to_string(traj)); // negative traj: no trajectory suffix
+
+        if (multiFile)
+        {
+            return stem + t; // directory name; the per-vector file names are appended by the I/O routines
+        }
+        else
+        {
+            return stem + t + ".bin";
+        }
+    }
+protected:
+    GridBase *gridIo_{nullptr}; // was indeterminate after default construction, then passed to read/write
+};
+
+template <typename FineF, typename CoarseF, 
+          typename FineFIo = FineF, typename CoarseFIo = CoarseF>
+class CoarseEigenPack: public EigenPack<FineF, FineFIo>
+{
+public:
+    typedef CoarseF CoarseField;         
    std::vector<CoarseF> evecCoarse;
+    std::vector<RealD>   evalCoarse;
 public:
    CoarseEigenPack(void)          = default;
    virtual ~CoarseEigenPack(void) = default;

    CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse, 
-                    GridBase *gridFine, GridBase *gridCoarse)
+                    GridBase *gridFine, GridBase *gridCoarse,
+                    GridBase *gridFineIo = nullptr, 
+                    GridBase *gridCoarseIo = nullptr)
    {
+        if (typeHash<FineF>() != typeHash<FineFIo>())
+        {
+            if (gridFineIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "Fine I/O type different from vector type but null fine I/O grid passed");
+            }
+        }
+        if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
+        {
+            if (gridCoarseIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "Coarse I/O type different from vector type but null coarse I/O grid passed");
+            }
+        }
+        this->gridIo_ = gridFineIo;
+        gridCoarseIo_ = gridCoarseIo;
        resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
    }

    void resize(const size_t sizeFine, const size_t sizeCoarse, 
                GridBase *gridFine, GridBase *gridCoarse)
    {
-        EigenPack<FineF>::resize(sizeFine, gridFine);
+        EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
        evalCoarse.resize(sizeCoarse);
        evecCoarse.resize(sizeCoarse, gridCoarse);
    }

    void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < this->evec.size(); ++k)
-            {
-                this->basicReadSingle(this->evec[k], this->eval[k], this->evecFilename(fileStem + "_fine", k, traj), k);
-            }
-        }
-        else
-        {
-            this->basicRead(this->evec, this->eval, this->evecFilename(fileStem + "_fine", -1, traj), this->evec.size());
-        }
+        EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
    }

    void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evecCoarse.size(); ++k)
-            {
-                this->basicReadSingle(evecCoarse[k], evalCoarse[k], this->evecFilename(fileStem + "_coarse", k, traj), k);
-            }
-        }
-        else
-        {
-            this->basicRead(evecCoarse, evalCoarse, this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse.size());
-        }
+        PackRecord dummy;
+
+        EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy, 
+                              this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+                              evecCoarse.size(), multiFile, gridCoarseIo_);
    }

    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -314,32 +371,14 @@ public:

    void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < this->evec.size(); ++k)
-            {
-                this->basicWriteSingle(this->evecFilename(fileStem + "_fine", k, traj), this->evec[k], this->eval[k], k);
-            }
-        }
-        else
-        {
-            this->basicWrite(this->evecFilename(fileStem + "_fine", -1, traj), this->evec, this->eval, this->evec.size());
-        }
+        EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
    }

    void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evecCoarse.size(); ++k)
-            {
-                this->basicWriteSingle(this->evecFilename(fileStem + "_coarse", k, traj), evecCoarse[k], evalCoarse[k], k);
-            }
-        }
-        else
-        {
-            this->basicWrite(this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse, evalCoarse, evecCoarse.size());
-        }
+        EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+                                                   evecCoarse, evalCoarse, this->record, 
+                                                   evecCoarse.size(), multiFile, gridCoarseIo_);
    }
    
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -347,16 +386,25 @@ public:
        writeFine(fileStem, multiFile, traj);
        writeCoarse(fileStem, multiFile, traj);
    }
+private:
+    GridBase *gridCoarseIo_{nullptr}; // null by default so the defaulted constructor never leaves it indeterminate
 };

 template <typename FImpl>
-using FermionEigenPack = EigenPack<typename FImpl::FermionField>;
+using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>; // storage-only pack of FImpl fermion fields

-template <typename FImpl, int nBasis>
+template <typename FImpl, typename FImplIo = FImpl> // FImplIo supplies the field type used for file I/O
+using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
+
+template <typename FImpl, int nBasis, typename FImplIo = FImpl> // I/O types default to the in-memory ones
 using CoarseFermionEigenPack = CoarseEigenPack<
    typename FImpl::FermionField,
    typename LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                   typename FImpl::SiteComplex, 
+                                   nBasis>::CoarseField,
+    typename FImplIo::FermionField,
+    typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor, 
+                                   typename FImplIo::SiteComplex, 
                                   nBasis>::CoarseField>;

 #undef HADRONS_DUMP_EP_METADATA
--- a/Hadrons/Global.cc
+++ b/Hadrons/Global.cc
@@ -166,7 +166,13 @@ std::string Hadrons::dirname(const std::string &s)

 void Hadrons::makeFileDir(const std::string filename, GridBase *g)
 {
-    if (g->IsBoss())
+    bool doIt = true;
+
+    if (g)
+    {
+        doIt = g->IsBoss();
+    }
+    if (doIt)
    {
        std::string dir    = dirname(filename);
        int         status = mkdir(dir);
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -32,6 +32,7 @@ See the full license in the file "LICENSE" in the top level distribution directo

 #include <set>
 #include <stack>
+#include <regex>
 #include <Grid/Grid.h>
 #include <cxxabi.h>

@@ -217,15 +218,15 @@ typedef XmlReader ResultReader;
 typedef XmlWriter ResultWriter;
 #endif

-#define RESULT_FILE_NAME(name) \
-name + "." + std::to_string(vm().getTrajectory()) + "." + resultFileExt
+#define RESULT_FILE_NAME(name, traj) \
+name + "." + std::to_string(traj) + "." + resultFileExt

 // recursive mkdir
 #define MAX_PATH_LENGTH 512u
 int         mkdir(const std::string dirName);
 std::string basename(const std::string &s);
 std::string dirname(const std::string &s);
-void        makeFileDir(const std::string filename, GridBase *g);
+void        makeFileDir(const std::string filename, GridBase *g = nullptr);

 // default Schur convention
 #ifndef HADRONS_DEFAULT_SCHUR 
@@ -248,6 +249,47 @@ void        makeFileDir(const std::string filename, GridBase *g);
 // pretty print time profile
 void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);

+// token replacement utility: swaps the first "<mark><token><mark>" found in str for std::to_string(x)
+template <typename T>
+void tokenReplace(std::string &str, const std::string token,
+                  const T &x, const std::string mark = "@")
+{
+    const std::string needle = mark + token + mark;
+    const auto        where  = str.find(needle);
+    if (where == std::string::npos)
+    {
+        return; // token absent: str is left untouched
+    }
+    str.replace(where, needle.size(), std::to_string(x));
+}
+
+// trajectory range: start (included), end (excluded), step, and a list of trajectories to exclude
+class TrajRange: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
+                                    unsigned int, start,
+                                    unsigned int, end,
+                                    unsigned int, step,
+                                    std::string,  exclude);
+
+    inline std::vector<unsigned int> getTrajectoryList(void) // returns an empty list when step is 0
+    {
+        std::vector<unsigned int> excVec = strToVec<unsigned int>(exclude);
+        std::vector<unsigned int> list;
+
+        for (unsigned int t = start; (step != 0) && (t < end); t += step) // a zero step would never reach end
+        {
+            if (std::find(excVec.begin(), excVec.end(), t) == excVec.end()) // keep t unless listed in exclude
+            {
+                list.push_back(t);
+            }
+        }
+
+        return list;
+    }
+};
+
 END_HADRONS_NAMESPACE

 #include <Hadrons/Exceptions.hpp>
--- a/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -5,16 +5,17 @@ lib_LIBRARIES = libHadrons.a
 include modules.inc

 libHadrons_a_SOURCES = \
-    $(modules_cc)      \
    Application.cc     \
    Environment.cc     \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
-	VirtualMachine.cc
+	TimerArray.cc      \
+	VirtualMachine.cc  \
+	$(modules_cc)
+	
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
-	$(modules_hpp)            \
 	A2AVectors.hpp            \
 	A2AMatrix.hpp             \
 	Application.hpp           \
@@ -31,4 +32,7 @@ nobase_libHadrons_a_HEADERS = \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
 	Solver.hpp                \
-	VirtualMachine.hpp
+	TimerArray.hpp            \
+	VirtualMachine.hpp        \
+	Utilities/Contractor.hpp  \
+	$(modules_hpp)
--- a/Hadrons/Module.cc
+++ b/Hadrons/Module.cc
@@ -66,101 +66,6 @@ void ModuleBase::operator()(void)
    stopAllTimers();
 }

-// timers //////////////////////////////////////////////////////////////////////
-void ModuleBase::startTimer(const std::string &name)
-{
-    if (!name.empty())
-    {
-        timer_[name].Start();
-    }
-}
-
-GridTime ModuleBase::getTimer(const std::string &name)
-{
-    GridTime t;
-    
-    if (!name.empty())
-    {
-        try
-        {
-            bool running = timer_.at(name).isRunning();
-
-            if (running) stopTimer(name);
-            t = timer_.at(name).Elapsed();
-            if (running) startTimer(name);
-        }
-        catch (std::out_of_range &)
-        {
-            t = GridTime::zero();
-        }
-    }
-    else
-    {
-        t = GridTime::zero();
-    }
-
-    return t;
-}
-
-double ModuleBase::getDTimer(const std::string &name)
-{
-    return static_cast<double>(getTimer(name).count());
-}
-
-void ModuleBase::startCurrentTimer(const std::string &name)
-{
-    if (!name.empty())
-    {
-        stopCurrentTimer();
-        startTimer(name);
-        currentTimer_ = name;
-    }
-}
-
-void ModuleBase::stopTimer(const std::string &name)
-{
-    if (timer_.at(name).isRunning())
-    {
-        timer_.at(name).Stop();
-    }
-}
-
-void ModuleBase::stopCurrentTimer(void)
-{
-    if (!currentTimer_.empty())
-    {
-        stopTimer(currentTimer_);
-        currentTimer_ = "";
-    }
-}
-
-void ModuleBase::stopAllTimers(void)
-{
-    for (auto &t: timer_)
-    {
-        stopTimer(t.first);
-    }
-    currentTimer_ = "";
-}
-
-void ModuleBase::resetTimers(void)
-{
-    timer_.clear();
-    currentTimer_ = "";
-}
-
-std::map<std::string, GridTime> ModuleBase::getTimings(void)
-{
-    std::map<std::string, GridTime> timing;
-
-    for (auto &t: timer_)
-    {
-        timing[t.first] = t.second.Elapsed();
-    }
-
-    return timing;
-}
-
 std::string ModuleBase::makeSeedString(void)
 {
    std::string seed;
--- a/Hadrons/Module.hpp
+++ b/Hadrons/Module.hpp
@@ -30,6 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_Module_hpp_

 #include <Hadrons/Global.hpp>
+#include <Hadrons/TimerArray.hpp>
 #include <Hadrons/VirtualMachine.hpp>

 BEGIN_HADRONS_NAMESPACE
@@ -143,7 +144,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 {\
    makeFileDir(ioStem, env().getGrid());\
    {\
-        ResultWriter _writer(RESULT_FILE_NAME(ioStem));\
+        ResultWriter _writer(RESULT_FILE_NAME(ioStem, vm().getTrajectory()));\
        write(_writer, name, result);\
    }\
 }
@@ -152,7 +153,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 *                            Module class                                    *
 ******************************************************************************/
 // base class
-class ModuleBase
+class ModuleBase: public TimerArray
 {
 public:
    // constructor
@@ -180,16 +181,6 @@ public:
    virtual void execute(void) = 0;
    // execution
    void operator()(void);
-    // timers
-    void                            startTimer(const std::string &name);
-    GridTime                        getTimer(const std::string &name);
-    double                          getDTimer(const std::string &name);
-    void                            startCurrentTimer(const std::string &name);
-    void                            stopTimer(const std::string &name);
-    void                            stopCurrentTimer(void);
-    void                            stopAllTimers(void);
-    void                            resetTimers(void);
-    std::map<std::string, GridTime> getTimings(void);
 protected:
    // environment shortcut
    DEFINE_ENV_ALIAS;
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,6 +1,6 @@
 #include <Hadrons/Modules/MContraction/Baryon.hpp>
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
-#include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>
 #include <Hadrons/Modules/MContraction/Meson.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
@@ -16,6 +16,7 @@
 #include <Hadrons/Modules/MSource/Wall.hpp>
 #include <Hadrons/Modules/MSource/Z2.hpp>
 #include <Hadrons/Modules/MSource/SeqConserved.hpp>
+#include <Hadrons/Modules/MSource/Momentum.hpp>
 #include <Hadrons/Modules/MSink/Smear.hpp>
 #include <Hadrons/Modules/MSink/Point.hpp>
 #include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
@@ -23,13 +24,17 @@
 #include <Hadrons/Modules/MSolver/Guesser.hpp>
 #include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 #include <Hadrons/Modules/MGauge/UnitEm.hpp>
 #include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
 #include <Hadrons/Modules/MGauge/Unit.hpp>
+#include <Hadrons/Modules/MGauge/Electrify.hpp>
 #include <Hadrons/Modules/MGauge/Random.hpp>
+#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
 #include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
 #include <Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
 #include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
 #include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
@@ -40,6 +45,9 @@
 #include <Hadrons/Modules/MScalar/ScalarVP.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 #include <Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Hadrons/Modules/MNPR/Bilinear.hpp>
+#include <Hadrons/Modules/MNPR/Amputate.hpp>
+#include <Hadrons/Modules/MNPR/FourQuark.hpp>
 #include <Hadrons/Modules/MAction/DWF.hpp>
 #include <Hadrons/Modules/MAction/MobiusDWF.hpp>
 #include <Hadrons/Modules/MAction/Wilson.hpp>
@@ -50,7 +58,6 @@
 #include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
 #include <Hadrons/Modules/MScalarSUN/ShiftProbe.hpp>
 #include <Hadrons/Modules/MScalarSUN/Div.hpp>
-#include <Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
 #include <Hadrons/Modules/MScalarSUN/EMT.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
@@ -61,6 +68,7 @@
 #include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
 #include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadNersc.hpp>
+#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
 #include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
 #include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadBinary.hpp>
--- a/Hadrons/Modules/MAction/DWF.cc
+++ b/Hadrons/Modules/MAction/DWF.cc
@@ -32,4 +32,6 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TDWF<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
+#endif
--- a/Hadrons/Modules/MAction/DWF.hpp
+++ b/Hadrons/Modules/MAction/DWF.hpp
@@ -49,7 +49,8 @@ public:
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
-                                    std::string , boundary);
+                                    std::string , boundary,
+                                    std::string , twist);
 };

 template <typename FImpl>
@@ -73,7 +74,9 @@ protected:
 };

 MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);
+#endif

 /******************************************************************************
 *                        DWF template implementation                         *
@@ -117,8 +120,9 @@ void TDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
+    typename DomainWallFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, implParams);
 }
--- a/Hadrons/Modules/MAction/MobiusDWF.cc
+++ b/Hadrons/Modules/MAction/MobiusDWF.cc
@@ -32,4 +32,6 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
+#endif
--- a/Hadrons/Modules/MAction/MobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/MobiusDWF.hpp
@@ -49,7 +49,8 @@ public:
                                    double      , M5,
                                    double      , b,
                                    double      , c,
-                                    std::string , boundary);
+                                    std::string , boundary,
+                                    std::string , twist);
 };

 template <typename FImpl>
@@ -72,7 +73,9 @@ public:
 };

 MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
+#endif

 /******************************************************************************
 *                      TMobiusDWF implementation                             *
@@ -117,8 +120,9 @@ void TMobiusDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
+    typename MobiusFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, par().b, par().c,
                     implParams);
--- a/Hadrons/Modules/MAction/ScaledDWF.cc
+++ b/Hadrons/Modules/MAction/ScaledDWF.cc
@@ -32,4 +32,6 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
+#endif
--- a/Hadrons/Modules/MAction/ScaledDWF.hpp
+++ b/Hadrons/Modules/MAction/ScaledDWF.hpp
@@ -48,7 +48,8 @@ public:
                                    double      , mass,
                                    double      , M5,
                                    double      , scale,
-                                    std::string , boundary);
+                                    std::string , boundary,
+                                    std::string , twist);
 };

 template <typename FImpl>
@@ -71,7 +72,9 @@ public:
 };

 MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
+#endif

 /******************************************************************************
 *                      TScaledDWF implementation                             *
@@ -116,8 +119,9 @@ void TScaledDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
+    typename ScaledShamirFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, par().scale,
                     implParams);
--- a/Hadrons/Modules/MAction/Wilson.cc
+++ b/Hadrons/Modules/MAction/Wilson.cc
@@ -32,4 +32,6 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TWilson<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
+#endif
--- a/Hadrons/Modules/MAction/Wilson.hpp
+++ b/Hadrons/Modules/MAction/Wilson.hpp
@@ -47,7 +47,9 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
                                    std::string, gauge,
                                    double     , mass,
-                                    std::string, boundary);
+                                    std::string, boundary,
+                                    std::string, string,
+                                    std::string, twist);
 };

 template <typename FImpl>
@@ -71,7 +73,9 @@ protected:
 };

 MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);
+#endif

 /******************************************************************************
 *                     TWilson template implementation                        *
@@ -111,8 +115,9 @@ void TWilson<FImpl>::setup(void)
    auto &U      = envGet(GaugeField, par().gauge);
    auto &grid   = *envGetGrid(FermionField);
    auto &gridRb = *envGetRbGrid(FermionField);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
+    typename WilsonFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
                     par().mass, implParams);
 }
--- a/Hadrons/Modules/MAction/WilsonClover.cc
+++ b/Hadrons/Modules/MAction/WilsonClover.cc
@@ -32,4 +32,6 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
+#endif
--- a/Hadrons/Modules/MAction/WilsonClover.hpp
+++ b/Hadrons/Modules/MAction/WilsonClover.hpp
@@ -51,7 +51,8 @@ public:
 				                    double     , csw_r,
 				                    double     , csw_t,
 				                    WilsonAnisotropyCoefficients ,clover_anisotropy,
-                                    std::string, boundary
+                                    std::string, boundary,
+                                    std::string, twist
 				    );
 };

@@ -75,7 +76,9 @@ public:
 };

 MODULE_REGISTER_TMP(WilsonClover, TWilsonClover<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(WilsonCloverF, TWilsonClover<FIMPLF>, MAction);
+#endif

 /******************************************************************************
 *                    TWilsonClover template implementation                   *
@@ -117,8 +120,9 @@ void TWilsonClover<FImpl>::setup(void)
    auto &U      = envGet(GaugeField, par().gauge);
    auto &grid   = *envGetGrid(FermionField);
    auto &gridRb = *envGetRbGrid(FermionField);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename WilsonCloverFermion<FImpl>::ImplParams implParams(boundary);
+    typename WilsonCloverFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid,
                     gridRb, par().mass, par().csw_r, par().csw_t, 
                     par().clover_anisotropy, implParams); 
--- a/Hadrons/Modules/MAction/ZMobiusDWF.cc
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.cc
@@ -32,4 +32,6 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPLF>;
+#endif
--- a/Hadrons/Modules/MAction/ZMobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.hpp
@@ -50,7 +50,8 @@ public:
                                    double                           , b,
                                    double                           , c,
                                    std::vector<std::complex<double>>, omega,
-                                    std::string                      , boundary);
+                                    std::string                      , boundary,
+                                    std::string                      , twist);
 };

 template <typename FImpl>
@@ -73,7 +74,9 @@ public:
 };

 MODULE_REGISTER_TMP(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(ZMobiusDWFF, TZMobiusDWF<ZFIMPLF>, MAction);
+#endif

 /******************************************************************************
 *                     TZMobiusDWF implementation                             *
@@ -125,8 +128,9 @@ void TZMobiusDWF<FImpl>::setup(void)
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto omega = par().omega;
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename ZMobiusFermion<FImpl>::ImplParams implParams(boundary);
+    typename ZMobiusFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, omega,
                     par().b, par().c, implParams);
--- a/Hadrons/Modules/MContraction/A2AAslashField.cc
+++ b/Hadrons/Modules/MContraction/A2AAslashField.cc
@@ -2,7 +2,7 @@

 Grid physics library, www.github.com/paboyle/Grid 

-Source file: Hadrons/Modules/MScalarSUN/TimeMomProbe.cc
+Source file: Hadrons/Modules/MContraction/A2AAslashField.cc

 Copyright (C) 2015-2018

@@ -25,14 +25,10 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp>
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>

 using namespace Grid;
 using namespace Hadrons;
-using namespace MScalarSUN;
+using namespace MContraction;

-template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<2>>;
-template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<3>>;
-template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<4>>;
-template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<5>>;
-template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<6>>;
+template class Grid::Hadrons::MContraction::TA2AAslashField<FIMPL, PhotonR>;
--- a/Hadrons/Modules/MContraction/A2AAslashField.hpp
+++ b/Hadrons/Modules/MContraction/A2AAslashField.hpp
@@ -0,0 +1,246 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/A2AAslashField.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MContraction_A2AAslashField_hpp_
+#define Hadrons_MContraction_A2AAslashField_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                               A2AAslashField                               *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class A2AAslashFieldPar: Serializable // parameter set of the TA2AAslashField module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldPar,
+                                    int, cacheBlock,                    // forwarded to the block computation in setup()
+                                    int, block,                         // forwarded to the block computation in setup()
+                                    std::string, left,                  // name of the left std::vector<FermionField> object
+                                    std::string, right,                 // name of the right std::vector<FermionField> object
+                                    std::string, output,                // prefix of the output path built in execute()
+                                    std::vector<std::string>, emField); // names of the EM field objects, one result per entry
+};
+
+class A2AAslashFieldMetadata: Serializable // metadata record produced by execute()'s metadataFn
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldMetadata,
+                                    std::string, emFieldName); // set to par().emField[em] for the em-th field
+};
+
+template <typename T, typename FImpl> // kernel adapter: operator() forwards to A2Autils<FImpl>::AslashField
+class AslashFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    AslashFieldKernel(const std::vector<LatticeComplex> &emB0, // emB0 and emB1 are stored as references,
+                      const std::vector<LatticeComplex> &emB1, // so they must outlive the kernel
+                      GridBase *grid)
+    : emB0_(emB0), emB1_(emB1), grid_(grid)
+    {
+        vol_ = 1.; // accumulates the product of the global lattice dimensions
+        for (auto &d: grid_->GlobalDimensions())
+        {
+            vol_ *= d;
+        }
+    }
+
+    virtual ~AslashFieldKernel(void) = default;
+    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
+                            const FermionField *right,
+                            const unsigned int orthogDim, double &t) // t is handed on by address
+    {
+        A2Autils<FImpl>::AslashField(m, left, right, emB0_, emB1_, orthogDim, &t);
+    }
+
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return 0.; // no flop estimate for this kernel: always reports zero
+    }
+
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return 0.; // no memory-traffic estimate for this kernel: always reports zero
+    }
+private:
+    const std::vector<LatticeComplex> &emB0_, &emB1_;
+    GridBase                          *grid_;
+    double                            vol_; // set in the constructor; not read by flops()/bytes() above
+};
+
+template <typename FImpl, typename PhotonImpl> // module feeding two fermion-vector sets and complexified EM fields to an A2A block computation
+class TA2AAslashField: public Module<A2AAslashFieldPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    typedef typename PhotonImpl::GaugeField EmField; // type of the objects named in par().emField
+    typedef A2AMatrixBlockComputation<Complex, 
+                                      FermionField, 
+                                      A2AAslashFieldMetadata, 
+                                      HADRONS_A2AM_IO_TYPE> Computation; // created as a temporary in setup(), run in execute()
+    typedef AslashFieldKernel<Complex, FImpl> Kernel; // kernel object handed to Computation::execute
+public:
+    // constructor
+    TA2AAslashField(const std::string name);
+    // destructor
+    virtual ~TA2AAslashField(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(A2AAslashField, ARG(TA2AAslashField<FIMPL, PhotonR>), MContraction); // same <FIMPL, PhotonR> pair as the explicit instantiation in A2AAslashField.cc
+
+/******************************************************************************
+ *                 TA2AAslashField implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+TA2AAslashField<FImpl, PhotonImpl>::TA2AAslashField(const std::string name)
+: Module<A2AAslashFieldPar>(name) // the name is only forwarded to the Module base
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getInput(void)
+{
+    // every EM field first, then the left and right vector sets
+    std::vector<std::string> deps(par().emField);
+
+    deps.insert(deps.end(), {par().left, par().right});
+
+    return deps;
+}
+
+template <typename FImpl, typename PhotonImpl>
+std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getOutput(void)
+{
+    // this module declares no output object: the list is always empty
+
+    return std::vector<std::string>();
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+void TA2AAslashField<FImpl, PhotonImpl>::setup(void)
+{
+    envTmp(Computation, "computation", 1, envGetGrid(FermionField), // constructor arguments after the "1": grid, Nd-1, number of EM fields,
+           env().getNd() - 1, par().emField.size(), 1, par().block, // 1, block, cacheBlock, this -- their meaning is defined by
+           par().cacheBlock, this);                                 // A2AMatrixBlockComputation, which is not shown here
+    envTmp(std::vector<ComplexField>, "B0", 1, 
+           par().emField.size(), envGetGrid(ComplexField)); // B0[i]: Lorentz components 0 and 1 of EM field i, combined in execute()
+    envTmp(std::vector<ComplexField>, "B1", 1, 
+           par().emField.size(), envGetGrid(ComplexField)); // B1[i]: Lorentz components 2 and 3 of EM field i, combined in execute()
+    envTmpLat(ComplexField, "Amu"); // NOTE(review): execute() never reads or writes this temporary -- candidate for removal
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Builds B0 = A_0 + i*A_1 and B1 = A_2 + i*A_3 for each EM field, then passes
+// them with the left/right vector sets to the block computation.
+template <typename FImpl, typename PhotonImpl>
+void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
+{
+    auto &left  = envGet(std::vector<FermionField>, par().left);
+    auto &right = envGet(std::vector<FermionField>, par().right);
+
+    int nt  = env().getDim().back();
+    int N_i = left.size();
+    int N_j = right.size();
+
+    LOG(Message) << "Computing all-to-all A-slash fields" << std::endl;
+    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
+    LOG(Message) << "EM fields:" << std::endl;
+    for (auto &name: par().emField)
+    {
+        LOG(Message) << "  " << name << std::endl;
+    }
+    // sizeof() comes first so the whole product is evaluated in size_t, not int
+    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
+                 << " (filesize " << sizeString(sizeof(HADRONS_A2AM_IO_TYPE)*nt*N_i*N_j) 
+                 << "/EM field)" << std::endl;
+    
+    // preparing "B" complexified fields
+    startTimer("Complexify EM fields");
+    envGetTmp(std::vector<ComplexField>, B0);
+    envGetTmp(std::vector<ComplexField>, B1);
+    for (unsigned int i = 0; i < par().emField.size(); ++i)
+    {
+        auto &A = envGet(EmField, par().emField[i]);
+
+        B0[i]  = peekLorentz(A, 0);
+        B0[i] += timesI(peekLorentz(A, 1));
+        B1[i]  = peekLorentz(A, 2);
+        B1[i] += timesI(peekLorentz(A, 3));
+    }
+    stopTimer("Complexify EM fields");
+
+    // I/O name & metadata lambdas
+    auto ionameFn = [this](const unsigned int em, const unsigned int dummy)
+    {
+        return par().emField[em];
+    };
+
+    auto filenameFn = [this, &ionameFn](const unsigned int em, const unsigned int dummy)
+    {
+        return par().output + "." + std::to_string(vm().getTrajectory()) 
+               + "/" + ionameFn(em, dummy) + ".h5";
+    };
+
+    auto metadataFn = [this](const unsigned int em, const unsigned int dummy)
+    {
+        A2AAslashFieldMetadata md;
+
+        md.emFieldName = par().emField[em];
+        
+        return md;
+    };
+
+    // executing computation
+    // (the kernel only stores references to B0 and B1; both outlive it here)
+    Kernel kernel(B0, B1, envGetGrid(FermionField));
+
+    envGetTmp(Computation, computation);
+    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_A2AAslashField_hpp_
--- a/Hadrons/Modules/MContraction/A2AMesonField.cc
+++ b/Hadrons/Modules/MContraction/A2AMesonField.cc
@@ -33,4 +33,3 @@ using namespace Hadrons;
 using namespace MContraction;

 template class Grid::Hadrons::MContraction::TA2AMesonField<FIMPL>;
-template class Grid::Hadrons::MContraction::TA2AMesonField<ZFIMPL>;
--- a/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -33,15 +33,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
-#include <Hadrons/A2AVectors.hpp>
 #include <Hadrons/A2AMatrix.hpp>
-#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
-#include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>
-
-#define MF_PARALLEL_IO
-#ifndef MF_IO_TYPE
-#define MF_IO_TYPE ComplexF
-#endif

 BEGIN_HADRONS_NAMESPACE

@@ -56,8 +48,8 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                    int, cacheBlock,
                                    int, block,
-                                    std::string, v,
-                                    std::string, w,
+                                    std::string, left,
+                                    std::string, right,
                                    std::string, output,
                                    std::string, gammas,
                                    std::vector<std::string>, mom);
@@ -71,22 +63,59 @@ public:
                                    Gamma::Algebra, gamma);
 };

+template <typename T, typename FImpl> // kernel adapter: operator() forwards to A2Autils<FImpl>::MesonField
+class MesonFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma, // gamma and mom are stored as references,
+                     const std::vector<LatticeComplex> &mom,   // so they must outlive the kernel
+                     GridBase *grid)
+    : gamma_(gamma), mom_(mom), grid_(grid)
+    {
+        vol_ = 1.; // accumulates the product of the global lattice dimensions
+        for (auto &d: grid_->GlobalDimensions())
+        {
+            vol_ *= d;
+        }
+    }
+
+    virtual ~MesonFieldKernel(void) = default;
+    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
+                            const FermionField *right,
+                            const unsigned int orthogDim, double &t) // t is handed on by address
+    {
+        A2Autils<FImpl>::MesonField(m, left, right, gamma_, mom_, orthogDim, &t);
+    }
+
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return vol_*(2*8.0+6.0+8.0*mom_.size())*blockSizei*blockSizej*gamma_.size(); // same formula as the inline "flops +=" this patch removes from execute()
+    }
+
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return vol_*(12.0*sizeof(T))*blockSizei*blockSizej // same formula as the inline "bytes +=" this patch removes from execute(),
+               +  vol_*(2.0*sizeof(T)*mom_.size())*blockSizei*blockSizej*gamma_.size(); // with sizeof(T) in place of sizeof(Complex)
+    }
+private:
+    const std::vector<Gamma::Algebra> &gamma_;
+    const std::vector<LatticeComplex> &mom_;
+    GridBase                          *grid_;
+    double                            vol_; // volume factor set in the constructor, used by flops() and bytes()
+};
+
 template <typename FImpl>
 class TA2AMesonField : public Module<A2AMesonFieldPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
-    SOLVER_TYPE_ALIASES(FImpl,);
-    typedef Eigen::TensorMap<Eigen::Tensor<Complex, 5, Eigen::RowMajor>>    MesonField;
-    typedef Eigen::TensorMap<Eigen::Tensor<MF_IO_TYPE, 5, Eigen::RowMajor>> MesonFieldIo;
-    typedef A2AMatrixIo<MF_IO_TYPE, A2AMesonFieldMetadata>                  MatrixIo;
-    struct IoHelper
-    {
-        MatrixIo              io;
-        A2AMesonFieldMetadata metadata;
-        size_t                offset;
-        unsigned int          i, j, blockSizei, blockSizej;
-    };
+    typedef A2AMatrixBlockComputation<Complex, 
+                                      FermionField, 
+                                      A2AMesonFieldMetadata, 
+                                      HADRONS_A2AM_IO_TYPE> Computation;
+    typedef MesonFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
    TA2AMesonField(const std::string name);
@@ -100,20 +129,13 @@ public:
    // execution
    virtual void execute(void);
 private:
-    // IO
-    std::string ioname(const unsigned int m, const unsigned int g) const;
-    std::string filename(const unsigned int m, const unsigned int g) const;
-    void saveBlock(const MF_IO_TYPE *data, IoHelper &h);
-private:
-    bool                                               hasPhase_{false};
-    std::string                                        momphName_;
-    std::vector<Gamma::Algebra>                        gamma_;
-    std::vector<std::vector<Real>>                     mom_;
-    std::vector<IoHelper>                              nodeIo_;
+    bool                               hasPhase_{false};
+    std::string                        momphName_;
+    std::vector<Gamma::Algebra>        gamma_;
+    std::vector<std::vector<Real>>     mom_;
 };

 MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
-MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField<ZFIMPL>), MContraction);

 /******************************************************************************
 *                  TA2AMesonField implementation                             *
@@ -130,7 +152,7 @@ TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
 template <typename FImpl>
 std::vector<std::string> TA2AMesonField<FImpl>::getInput(void)
 {
-    std::vector<std::string> in = {par().v, par().w};
+    std::vector<std::string> in = {par().left, par().right};

    return in;
 }
@@ -186,34 +208,31 @@ void TA2AMesonField<FImpl>::setup(void)
        }
        mom_.push_back(p);
    }
-    
    envCache(std::vector<ComplexField>, momphName_, 1, 
             par().mom.size(), envGetGrid(ComplexField));
    envTmpLat(ComplexField, "coor");
-    // preallocate memory for meson field block
-    auto tgp = env().getDim().back()*gamma_.size()*mom_.size();
-
-    envTmp(Vector<MF_IO_TYPE>, "mfBuf", 1, tgp*par().block*par().block);
-    envTmp(Vector<Complex>, "mfCache", 1, tgp*par().cacheBlock*par().cacheBlock);
+    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
+           env().getNd() - 1, mom_.size(), gamma_.size(), par().block, 
+           par().cacheBlock, this);
 }

 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TA2AMesonField<FImpl>::execute(void)
 {
-    auto &v = envGet(std::vector<FermionField>, par().v);
-    auto &w = envGet(std::vector<FermionField>, par().w);
+    auto &left  = envGet(std::vector<FermionField>, par().left);
+    auto &right = envGet(std::vector<FermionField>, par().right);

    int nt         = env().getDim().back();
-    int N_i        = w.size();
-    int N_j        = v.size();
+    int N_i        = left.size();
+    int N_j        = right.size();
    int ngamma     = gamma_.size();
    int nmom       = mom_.size();
    int block      = par().block;
    int cacheBlock = par().cacheBlock;

    LOG(Message) << "Computing all-to-all meson fields" << std::endl;
-    LOG(Message) << "W: '" << par().w << "' V: '" << par().v << "'" << std::endl;
+    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
    LOG(Message) << "Momenta:" << std::endl;
    for (auto &p: mom_)
    {
@@ -225,12 +244,9 @@ void TA2AMesonField<FImpl>::execute(void)
        LOG(Message) << "  " << g << std::endl;
    }
    LOG(Message) << "Meson field size: " << nt << "*" << N_i << "*" << N_j 
-                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(MF_IO_TYPE)) 
+                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
                 << "/momentum/bilinear)" << std::endl;

-    ///////////////////////////////////////////////
-    // Momentum setup
-    ///////////////////////////////////////////////
    auto &ph = envGet(std::vector<ComplexField>, momphName_);

    if (!hasPhase_)
@@ -253,189 +269,43 @@ void TA2AMesonField<FImpl>::execute(void)
        hasPhase_ = true;
        stopTimer("Momentum phases");
    }
-    
-    //////////////////////////////////////////////////////////////////////////
-    // i,j   is first  loop over SchurBlock factors reusing 5D matrices
-    // ii,jj is second loop over cacheBlock factors for high perf contractoin
-    // iii,jjj are loops within cacheBlock
-    // Total index is sum of these  i+ii+iii etc...
-    //////////////////////////////////////////////////////////////////////////
-    
-    double flops;
-    double bytes;
-    double vol      = env().getVolume();
-    double t_kernel = 0.0;
-    double nodes    = env().getGrid()->NodeCount();
-    double tot_kernel;

-    envGetTmp(Vector<MF_IO_TYPE>, mfBuf);
-    envGetTmp(Vector<Complex>, mfCache);
-    
-    double t0    = usecond();
-    int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0);
-    int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0);
-
-    for(int i=0;i<N_i;i+=block)
-    for(int j=0;j<N_j;j+=block)
+    auto ionameFn = [this](const unsigned int m, const unsigned int g)
    {
-        // Get the W and V vectors for this block^2 set of terms
-        int N_ii = MIN(N_i-i,block);
-        int N_jj = MIN(N_j-j,block);
+        std::stringstream ss;

-        LOG(Message) << "Meson field block " 
-                    << j/block + NBlock_j*i/block + 1 
-                    << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
-                    << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
-                    << std::endl;
-
-        MesonFieldIo mfBlock(mfBuf.data(),nmom,ngamma,nt,N_ii,N_jj);
-
-        // Series of cache blocked chunks of the contractions within this block
-        flops = 0.0;
-        bytes = 0.0;
-        for(int ii=0;ii<N_ii;ii+=cacheBlock)
-        for(int jj=0;jj<N_jj;jj+=cacheBlock)
+        ss << gamma_[g] << "_";
+        for (unsigned int mu = 0; mu < mom_[m].size(); ++mu)
        {
-            int N_iii = MIN(N_ii-ii,cacheBlock);
-            int N_jjj = MIN(N_jj-jj,cacheBlock);
-            MesonField mfCacheBlock(mfCache.data(),nmom,ngamma,nt,N_iii,N_jjj);    
-
-            startTimer("contraction: total");
-            makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, 
-                                env().getNd() - 1, this);
-            stopTimer("contraction: total");
-            
-            // flops for general N_c & N_s
-            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
-            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
-                     +  vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
-
-            startTimer("cache copy");
-            parallel_for_nest5(int m =0;m< nmom;m++)
-            for(int g =0;g< ngamma;g++)
-            for(int t =0;t< nt;t++)
-            for(int iii=0;iii< N_iii;iii++)
-            for(int jjj=0;jjj< N_jjj;jjj++)
-            {
-                mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj);
-            }
-            stopTimer("cache copy");
+            ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_");
        }

-        // perf
-        tot_kernel = getDTimer("contraction: colour trace & mom.")
-                     + getDTimer("contraction: local space sum");
-        t_kernel   = tot_kernel - t_kernel;
-        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
-                     << " Gflop/s/node " << std::endl;
-        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
-                     << " GB/s/node "  << std::endl;
-        t_kernel = tot_kernel;
+        return ss.str();
+    };

-        // IO
-        if (!par().output.empty())
+    auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g)
+    {
+        return par().output + "." + std::to_string(vm().getTrajectory()) 
+               + "/" + ionameFn(m, g) + ".h5";
+    };
+
+    auto metadataFn = [this](const unsigned int m, const unsigned int g)
+    {
+        A2AMesonFieldMetadata md;
+
+        for (auto pmu: mom_[m])
        {
-            double       blockSize, ioTime;
-            unsigned int myRank = env().getGrid()->ThisRank(),
-                         nRank  = env().getGrid()->RankCount();
+            md.momentum.push_back(pmu);
+        }
+        md.gamma = gamma_[g];
        
-            LOG(Message) << "Writing block to disk" << std::endl;
-            ioTime = -getDTimer("IO: write block");
-            startTimer("IO: total");
-            makeFileDir(filename(0, 0), env().getGrid());
-#ifdef MF_PARALLEL_IO
-            env().getGrid()->Barrier();
-            nodeIo_.clear();
-            for(int f = myRank; f < nmom*ngamma; f += nRank)
-            {
-                const unsigned int    m = f/ngamma, g = f % ngamma;
-                IoHelper              h;
+        return md;
+    };

-                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j);
-                for (auto pmu: mom_[m])
-                {
-                    h.metadata.momentum.push_back(pmu);
-                }
-                h.metadata.gamma = gamma_[g];
-                h.i              = i;
-                h.j              = j;
-                h.blockSizei     = mfBlock.dimension(3);
-                h.blockSizej     = mfBlock.dimension(4);
-                h.offset         = (m*ngamma + g)*nt*h.blockSizei*h.blockSizej;
-                nodeIo_.push_back(h);
-            }
-            // parallel IO
-            for (auto &h: nodeIo_)
-            {
-                saveBlock(mfBlock.data(), h);
-            }
-            env().getGrid()->Barrier();
-#else
-            // serial IO
-            for(int m = 0; m < nmom; m++)
-            for(int g = 0; g < ngamma; g++)
-            {
-                IoHelper h;
+    Kernel      kernel(gamma_, ph, envGetGrid(FermionField));

-                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j);
-                for (auto pmu: mom_[m])
-                {
-                    h.metadata.momentum.push_back(pmu);
-                }
-                h.metadata.gamma = gamma_[g];
-                h.i              = i;
-                h.j              = j;
-                h.blockSizei     = mfBlock.dimension(3);
-                h.blockSizej     = mfBlock.dimension(4);
-                h.offset         = (m*ngamma + g)*nt*h.blockSizei*h.blockSizej;
-                saveBlock(mfBlock.data(), h);
-            }
-#endif
-            stopTimer("IO: total");
-            blockSize  = static_cast<double>(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE));
-            ioTime    += getDTimer("IO: write block");
-            LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
-                         << ioTime  << " us (" 
-                         << blockSize/ioTime*1.0e6/1024/1024
-                         << " MB/s)" << std::endl;
-        }
-    }
-}
-
-// IO
-template <typename FImpl>
-std::string TA2AMesonField<FImpl>::ioname(unsigned int m, unsigned int g) const
-{
-    std::stringstream ss;
-
-    ss << gamma_[g] << "_";
-    for (unsigned int mu = 0; mu < mom_[m].size(); ++mu)
-    {
-        ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_");
-    }
-
-    return ss.str();
-}
-
-template <typename FImpl>
-std::string TA2AMesonField<FImpl>::filename(unsigned int m, unsigned int g) const
-{
-    return par().output + "." + std::to_string(vm().getTrajectory()) 
-           + "/" + ioname(m, g) + ".h5";
-}
-
-template <typename FImpl>
-void TA2AMesonField<FImpl>::saveBlock(const MF_IO_TYPE *data, IoHelper &h)
-{
-    if ((h.i == 0) and (h.j == 0))
-    {
-        startTimer("IO: file creation");
-        h.io.initFile(h.metadata, par().block);
-        stopTimer("IO: file creation");
-    }
-    startTimer("IO: write block");
-    h.io.saveBlock(data + h.offset, h.i, h.j, h.blockSizei, h.blockSizej);
-    stopTimer("IO: write block");
+    envGetTmp(Computation, computation);
+    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
 }

 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
@@ -1,224 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
-
-Copyright (C) 2015-2018
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef Hadrons_MContraction_A2AMesonFieldKernels_hpp_
-#define Hadrons_MContraction_A2AMesonFieldKernels_hpp_
-
-#include <Hadrons/Global.hpp>
-#include <Hadrons/Module.hpp>
-#include <Grid/Eigen/unsupported/CXX11/Tensor>
-
-BEGIN_HADRONS_NAMESPACE
-
-BEGIN_MODULE_NAMESPACE(MContraction)
-
-////////////////////////////////////////////////////////////////////////////////
-// Cache blocked arithmetic routine
-// Could move to Grid ???
-////////////////////////////////////////////////////////////////////////////////
-template <typename Field, typename MesonField>
-void makeMesonFieldBlock(MesonField &mat, 
-                         const Field *lhs_wi,
-                         const Field *rhs_vj,
-                         std::vector<Gamma::Algebra> gamma,
-                         const std::vector<LatticeComplex> &mom,
-                         int orthogdim,
-                         ModuleBase *caller = nullptr) 
-{
-    typedef typename Field::vector_object vobj;
-    typedef typename vobj::scalar_object  sobj;
-    typedef typename vobj::scalar_type    scalar_type;
-    typedef typename vobj::vector_type    vector_type;
-
-    typedef iSpinMatrix<vector_type> SpinMatrix_v;
-    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-    
-    int Lblock = mat.dimension(3); 
-    int Rblock = mat.dimension(4);
-
-    GridBase *grid = lhs_wi[0]._grid;
-    
-    const int    Nd = grid->_ndimension;
-    const int Nsimd = grid->Nsimd();
-
-    int Nt     = grid->GlobalDimensions()[orthogdim];
-    int Ngamma = gamma.size();
-    int Nmom   = mom.size();
-
-    int fd=grid->_fdimensions[orthogdim];
-    int ld=grid->_ldimensions[orthogdim];
-    int rd=grid->_rdimensions[orthogdim];
-
-    // will locally sum vectors first
-    // sum across these down to scalars
-    // splitting the SIMD
-    int MFrvol = rd*Lblock*Rblock*Nmom;
-    int MFlvol = ld*Lblock*Rblock*Nmom;
-
-    Vector<SpinMatrix_v > lvSum(MFrvol);
-    parallel_for (int r = 0; r < MFrvol; r++)
-    {
-        lvSum[r] = zero;
-    }
-
-    Vector<SpinMatrix_s > lsSum(MFlvol);             
-    parallel_for (int r = 0; r < MFlvol; r++)
-    {
-        lsSum[r]=scalar_type(0.0);
-    }
-
-    int e1=    grid->_slice_nblock[orthogdim];
-    int e2=    grid->_slice_block [orthogdim];
-    int stride=grid->_slice_stride[orthogdim];
-
-    if (caller) caller->startTimer("contraction: colour trace & mom.");
-    // Nested parallelism would be ok
-    // Wasting cores here. Test case r
-    parallel_for(int r=0;r<rd;r++)
-    {
-        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-        for(int n=0;n<e1;n++)
-        for(int b=0;b<e2;b++)
-        {
-            int ss= so+n*stride+b;
-
-            for(int i=0;i<Lblock;i++)
-            {
-                auto left = conjugate(lhs_wi[i]._odata[ss]);
-
-                for(int j=0;j<Rblock;j++)
-                {
-                    SpinMatrix_v vv;
-                    auto right = rhs_vj[j]._odata[ss];
-
-                    for(int s1=0;s1<Ns;s1++)
-                    for(int s2=0;s2<Ns;s2++)
-                    {
-                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
-                                        + left()(s2)(1) * right()(s1)(1)
-                                        + left()(s2)(2) * right()(s1)(2);
-                    }
-                    
-                    // After getting the sitewise product do the mom phase loop
-                    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
-
-                    for ( int m=0;m<Nmom;m++)
-                    {
-                        int idx = m+base;
-                        auto phase = mom[m]._odata[ss];
-                        mac(&lvSum[idx],&vv,&phase);
-                    }
-                }
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: colour trace & mom.");
-
-    // Sum across simd lanes in the plane, breaking out orthog dir.
-    if (caller) caller->startTimer("contraction: local space sum");
-    parallel_for(int rt=0;rt<rd;rt++)
-    {
-        std::vector<int> icoor(Nd);
-        std::vector<SpinMatrix_s> extracted(Nsimd);               
-
-        for(int i=0;i<Lblock;i++)
-        for(int j=0;j<Rblock;j++)
-        for(int m=0;m<Nmom;m++)
-        {
-
-            int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
-
-            extract(lvSum[ij_rdx],extracted);
-            for(int idx=0;idx<Nsimd;idx++)
-            {
-                grid->iCoorFromIindex(icoor,idx);
-
-                int ldx    = rt+icoor[orthogdim]*rd;
-                int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
-
-                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: local space sum");
-
-    // ld loop and local only??
-    if (caller) caller->startTimer("contraction: spin trace");
-    int pd = grid->_processors[orthogdim];
-    int pc = grid->_processor_coor[orthogdim];
-    parallel_for_nest2(int lt=0;lt<ld;lt++)
-    {
-        for(int pt=0;pt<pd;pt++)
-        {
-            int t = lt + pt*ld;
-            if (pt == pc)
-            {
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int m=0;m<Nmom;m++)
-                {
-                    int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
-
-                    for(int mu=0;mu<Ngamma;mu++)
-                    {
-                        // this is a bit slow
-                        mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gamma[mu]));
-                    }
-                }
-            } 
-            else 
-            { 
-                const scalar_type zz(0.0);
-
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int mu=0;mu<Ngamma;mu++)
-                for(int m=0;m<Nmom;m++)
-                {
-                    mat(m,mu,t,i,j) =zz;
-                }
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: spin trace");
-    ////////////////////////////////////////////////////////////////////
-    // This global sum is taking as much as 50% of time on 16 nodes
-    // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
-    // Healthy size that should suffice
-    ////////////////////////////////////////////////////////////////////
-    if (caller) caller->startTimer("contraction: global sum");
-    grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-    if (caller) caller->stopTimer("contraction: global sum");
-}
-
-END_MODULE_NAMESPACE
-
-END_HADRONS_NAMESPACE
-
-#endif //Hadrons_MContraction_A2AMesonField_hpp_
--- a/Hadrons/Modules/MGauge/Electrify.cc
+++ b/Hadrons/Modules/MGauge/Electrify.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Electrify.cc
+
+Copyright (C) 2015-2018
+
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MGauge/Electrify.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+template class Grid::Hadrons::MGauge::TElectrify<GIMPL>;
--- a/Hadrons/Modules/MGauge/Electrify.hpp
+++ b/Hadrons/Modules/MGauge/Electrify.hpp
@@ -0,0 +1,151 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Electrify.hpp
+
+Copyright (C) 2015-2018
+
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MGauge_Electrify_hpp_
+#define Hadrons_MGauge_Electrify_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                              Electrify gauge                               *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+/****************************************************************************
+*  Electrify a gauge field:
+*
+*  Ue_mu(x) = U_mu(x)*exp(ieqA_mu(x))
+*
+*  with
+*
+*  - gauge: U_mu(x): gauge field
+*  - emField: A_mu(x): electromagnetic photon field
+*  - e: value for the elementary charge
+*  - q: charge in units of e
+*
+*****************************************************************************/
+
+// Module parameters: input field names, elementary charge e, charge q (in units of e).
+class ElectrifyPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ElectrifyPar,
+                                    std::string, gauge,
+                                    std::string, emField,
+                                    double,      e,
+                                    double,      charge);
+};
+
+template <typename GImpl>
+class TElectrify: public Module<ElectrifyPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+public:
+    typedef PhotonR::GaugeField     EmField;
+public:
+    // constructor
+    TElectrify(const std::string name);
+    // destructor
+    virtual ~TElectrify(void) {};
+    // dependencies/products
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(Electrify, TElectrify<GIMPL>, MGauge);
+
+/******************************************************************************
+*                            TElectrify implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TElectrify<GImpl>::TElectrify(const std::string name)
+: Module<ElectrifyPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TElectrify<GImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().gauge, par().emField};
+
+    return in;
+}
+
+template <typename GImpl>
+std::vector<std::string> TElectrify<GImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TElectrify<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName());
+    envTmpLat(LatticeComplex, "eiAmu");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TElectrify<GImpl>::execute(void)
+{
+    LOG(Message) << "Electrify the gauge field " << par().gauge << " using the photon field "
+                 << par().emField << " with charge e*q= " << par().e << "*" << par().charge << std::endl;
+
+    auto &Ue = envGet(GaugeField, getName());
+    auto &U  = envGet(GaugeField, par().gauge);
+    auto &A  = envGet(EmField, par().emField);
+    envGetTmp(LatticeComplex, eiAmu);
+
+    // Constant prefactor i*e*q of A_mu(x) in the exponent, built once outside the loop.
+    const Complex ieq = Complex(0.0, 1.0) * static_cast<Real>(par().e * par().charge);
+    for (unsigned int mu = 0; mu < env().getNd(); mu++)
+    {
+        eiAmu = exp(ieq * PeekIndex<LorentzIndex>(A, mu));
+        PokeIndex<LorentzIndex>(Ue, PeekIndex<LorentzIndex>(U, mu) * eiAmu, mu);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MGauge_Electrify_hpp_
--- a/Hadrons/Modules/MGauge/GaugeFix.cc
+++ b/Hadrons/Modules/MGauge/GaugeFix.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/GaugeFix.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+template class Grid::Hadrons::MGauge::TGaugeFix<GIMPL>;
--- a/Hadrons/Modules/MGauge/GaugeFix.hpp
+++ b/Hadrons/Modules/MGauge/GaugeFix.hpp
@@ -0,0 +1,135 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/GaugeFix.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MGaugeFix_hpp_
+#define Hadrons_MGaugeFix_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/qcd/utils/GaugeFix.h>
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                              Fix gauge                                    *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+class GaugeFixPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugeFixPar,
+                                    std::string, gauge,
+                                    Real,  alpha,
+                                    int, maxiter, 
+                                    Real, Omega_tol, 
+                                    Real, Phi_tol,
+                                    bool, Fourier);
+};
+
+template <typename GImpl>
+class TGaugeFix: public Module<GaugeFixPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+public:
+    // constructor
+    TGaugeFix(const std::string name);
+    // destructor
+    virtual ~TGaugeFix(void) {};
+    // dependencies/products
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(GaugeFix, TGaugeFix<GIMPL>, MGauge);
+
+/******************************************************************************
+*                            TGaugeFix implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TGaugeFix<GImpl>::TGaugeFix(const std::string name)
+: Module<GaugeFixPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TGaugeFix<GImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().gauge};
+    return in;
+}
+
+template <typename GImpl>
+std::vector<std::string> TGaugeFix<GImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TGaugeFix<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName());
+}
+
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TGaugeFix<GImpl>::execute(void)
+// Gauge-fixes a copy of the input field; the input object itself is left untouched.
+{
+    LOG(Message) << "Fixing the Gauge" << std::endl;
+    LOG(Message) << par().gauge << std::endl;
+    auto &U     = envGet(GaugeField, par().gauge);
+    auto &Umu   = envGet(GaugeField, getName());
+    LOG(Message) << "Gauge Field fetched" << std::endl;
+    // All gauge-fixing controls come from the module parameters.
+    Real alpha     = par().alpha;
+    int  maxiter   = par().maxiter;
+    Real Omega_tol = par().Omega_tol;
+    Real Phi_tol   = par().Phi_tol;
+    bool Fourier   = par().Fourier;
+    // The fixer updates its argument in place: run it on the output so readers of par().gauge still see the unfixed field.
+    Umu = U;
+    FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,maxiter,Omega_tol,Phi_tol,Fourier);
+    LOG(Message) << "Gauge Fixed" << std::endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MGaugeFix_hpp_
--- a/Hadrons/Modules/MGauge/StochEm.cc
+++ b/Hadrons/Modules/MGauge/StochEm.cc
@@ -70,7 +70,7 @@ void TStochEm::execute(void)
    LOG(Message) << "Generating stochastic EM potential..." << std::endl;

    std::vector<Real> improvements = strToVec<Real>(par().improvement);
-    PhotonR photon(par().gauge, par().zmScheme, improvements, par().G0_qedInf);
+    PhotonR photon(envGetGrid(EmField), par().gauge, par().zmScheme, improvements);
    auto    &a = envGet(EmField, getName());
    auto    &w = envGet(EmComp, "_" + getName() + "_weight");
    
--- a/Hadrons/Modules/MGauge/StochEm.hpp
+++ b/Hadrons/Modules/MGauge/StochEm.hpp
@@ -47,8 +47,7 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
                                    PhotonR::Gauge,    gauge,
                                    PhotonR::ZmScheme, zmScheme,
-                                    std::string,       improvement,
-                                    Real,              G0_qedInf);
+                                    std::string,       improvement);
 };

 class TStochEm: public Module<StochEmPar>
--- a/Hadrons/Modules/MGauge/UnitEm.cc
+++ b/Hadrons/Modules/MGauge/UnitEm.cc
@@ -62,7 +62,7 @@ void TUnitEm::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TUnitEm::execute(void)
 {
-    PhotonR photon(0, 0); // Just chose arbitrary input values here
+    PhotonR photon(envGetGrid(EmField), 0, 0); // Just chose arbitrary input values here
    auto    &a = envGet(EmField, getName());
    LOG(Message) << "Generating unit EM potential..." << std::endl;
    photon.UnitField(a);
--- a/Hadrons/Modules/MIO/LoadA2AVectors.cc
+++ b/Hadrons/Modules/MIO/LoadA2AVectors.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadA2AVectors.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+
+template class Grid::Hadrons::MIO::TLoadA2AVectors<FIMPL>;
--- a/Hadrons/Modules/MIO/LoadA2AVectors.hpp
+++ b/Hadrons/Modules/MIO/LoadA2AVectors.hpp
@@ -0,0 +1,120 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadA2AVectors.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MIO_LoadA2AVectors_hpp_
+#define Hadrons_MIO_LoadA2AVectors_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/A2AVectors.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                    Module to load all-to-all vectors                       *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MIO)
+
+class LoadA2AVectorsPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadA2AVectorsPar,
+                                    std::string,  filestem,
+                                    bool,         multiFile,
+                                    unsigned int, size);
+};
+
+template <typename FImpl>
+class TLoadA2AVectors: public Module<LoadA2AVectorsPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+public:
+    // constructor
+    TLoadA2AVectors(const std::string name);
+    // destructor
+    virtual ~TLoadA2AVectors(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(LoadA2AVectors, TLoadA2AVectors<FIMPL>, MIO);
+
+/******************************************************************************
+ *                      TLoadA2AVectors implementation                        *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TLoadA2AVectors<FImpl>::TLoadA2AVectors(const std::string name)
+: Module<LoadA2AVectorsPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TLoadA2AVectors<FImpl>::getInput(void)
+{
+    std::vector<std::string> in;
+    
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TLoadA2AVectors<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TLoadA2AVectors<FImpl>::setup(void)
+{
+    envCreate(std::vector<FermionField>, getName(), 1, par().size, 
+              envGetGrid(FermionField));
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TLoadA2AVectors<FImpl>::execute(void)
+{
+    auto &vec = envGet(std::vector<FermionField>, getName());
+
+    A2AVectorsIo::read(vec, par().filestem, par().multiFile, vm().getTrajectory());
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MIO_LoadA2AVectors_hpp_
--- a/Hadrons/Modules/MIO/LoadEigenPack.cc
+++ b/Hadrons/Modules/MIO/LoadEigenPack.cc
@@ -32,4 +32,6 @@ using namespace Hadrons;
 using namespace MIO;

 template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL>>;
-
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>;
+#endif
--- a/Hadrons/Modules/MIO/LoadEigenPack.hpp
+++ b/Hadrons/Modules/MIO/LoadEigenPack.hpp
@@ -54,7 +54,9 @@ template <typename Pack>
 class TLoadEigenPack: public Module<LoadEigenPackPar>
 {
 public:
-    typedef EigenPack<typename Pack::Field> BasePack;
+    typedef typename Pack::Field   Field;
+    typedef typename Pack::FieldIo FieldIo;
+    typedef BaseEigenPack<Field>   BasePack;
 public:
    // constructor
    TLoadEigenPack(const std::string name);
@@ -70,6 +72,9 @@ public:
 };

 MODULE_REGISTER_TMP(LoadFermionEigenPack, TLoadEigenPack<FermionEigenPack<FIMPL>>, MIO);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+MODULE_REGISTER_TMP(LoadFermionEigenPackIo32, ARG(TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>), MIO);
+#endif

 /******************************************************************************
 *                    TLoadEigenPack implementation                           *
@@ -101,9 +106,14 @@ std::vector<std::string> TLoadEigenPack<Pack>::getOutput(void)
 template <typename Pack>
 void TLoadEigenPack<Pack>::setup(void)
 {
-    env().createGrid(par().Ls);
+    GridBase *gridIo = nullptr;
+
+    if (typeHash<Field>() != typeHash<FieldIo>())
+    {
+        gridIo = envGetRbGrid(FieldIo, par().Ls);
+    }
    envCreateDerived(BasePack, Pack, getName(), par().Ls, par().size, 
-                     env().getRbGrid(par().Ls));
+                     envGetRbGrid(Field, par().Ls), gridIo);
 }

 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MNPR/Amputate.cc
+++ b/Hadrons/Modules/MNPR/Amputate.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Amputate.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNPR/Amputate.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNPR;
+
+template class Grid::Hadrons::MNPR::TAmputate<FIMPL,FIMPL>;
+
--- a/Hadrons/Modules/MNPR/Amputate.hpp
+++ b/Hadrons/Modules/MNPR/Amputate.hpp
@@ -0,0 +1,200 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Amputate.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_Amputate_hpp_
+#define Hadrons_Amputate_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/Eigen/LU>
+//#include <Grid/qcd/utils/PropagatorUtils.h>
+//#include <Grid/serialisation/Serialisation.h>
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                TAmputate                                       *
+        Amputates bilinear vertices read from file:
+        Vamp = tr[adj(G)*g5*inv(Sout)*g5*V*inv(Sin)]/12, non exceptional momenta
+******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNPR)
+// Parameters of the TAmputate module; every member is a string.
+class AmputatePar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(AmputatePar,
+                                    std::string,    Sin,    // name of the "in" propagator object (TODO: make this a propagator type?)
+                                    std::string,    Sout,   // name of the "out" propagator object (same TODO)
+                                    std::string,    vertex, // listed as a dependency by getInput()
+                                    std::string,    pin,    // momentum used to phase Sin, parsed with strToVec<int>
+                                    std::string,    pout,   // momentum used to phase Sout, parsed with strToVec<int>
+                                    std::string,    output, // file name handed to BinaryWriter in execute()
+                                    std::string,    input); // file name handed to BinaryReader; the vertices are read from it
+};
+// Module computing amputated vertex traces from two propagators and a vertex file.
+template <typename FImpl1, typename FImpl2>
+class TAmputate: public Module<AmputatePar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl1, 1); // presumably provides PropagatorField1 used in execute() - TODO confirm
+    FERM_TYPE_ALIASES(FImpl2, 2); // presumably provides PropagatorField2 used in execute() - TODO confirm
+    class Result: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<Complex>, Vamp, // one amputated trace per gamma structure
+                                        ); 
+    };
+public:
+    // constructor: takes the module name
+    TAmputate(const std::string name);
+    // destructor
+    virtual ~TAmputate(void) {};
+    // dependencies/products (names of environment objects)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    virtual SpinColourMatrix invertspincolmat(SpinColourMatrix &scmat); // inverse of a spin-colour matrix (Eigen)
+    // execution
+    virtual void execute(void);
+};
+// register the FIMPL/FIMPL specialisation under the name Amputate in MNPR
+MODULE_REGISTER_TMP(Amputate, ARG(TAmputate<FIMPL, FIMPL>), MNPR);
+
+/******************************************************************************
+ *                           TAmputate implementation                            *
+ ******************************************************************************/
+// constructor: only forwards the module name to the Module<AmputatePar> base //
+template <typename FImpl1, typename FImpl2>
+TAmputate<FImpl1, FImpl2>::TAmputate(const std::string name)
+: Module<AmputatePar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Dependencies: both propagators plus the vertex object named in the parameters.
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TAmputate<FImpl1, FImpl2>::getInput(void)
+{
+    // same three names, in the same order, returned without a named temporary
+    return {par().Sin, par().Sout, par().vertex};
+}
+
+// Products: a single entry, the name of this module.
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TAmputate<FImpl1, FImpl2>::getOutput(void)
+{
+    std::vector<std::string> products;
+    products.push_back(getName());
+    return products;
+}
+
+// Invert a spin colour matrix using Eigen. The matrix is flattened to a dense
+// (Ns*Nc)x(Ns*Nc) matrix with index Ns*colour+spin, inverted in double precision
+// (MatrixXcd instead of the single precision MatrixXcf) and unpacked again.
+// NOTE(review): no check is made that scmat is invertible.
+template <typename Fimpl1, typename Fimpl2>
+SpinColourMatrix TAmputate<Fimpl1, Fimpl2>::invertspincolmat(SpinColourMatrix &scmat)
+{
+    Eigen::MatrixXcd flat(Ns*Nc,Ns*Nc);
+    // pack: (spin,colour) pairs -> flat row/column index
+    for(int ic=0; ic<Nc; ic++) for(int jc=0; jc<Nc; jc++){
+    for(int is=0; is<Ns; is++) for(int js=0; js<Ns; js++){
+        flat(Ns*ic+is,Ns*jc+js) = scmat()(is,js)(ic,jc);
+    }}
+
+    const Eigen::MatrixXcd flat_inv = flat.inverse();
+    // unpack with the same index convention
+    SpinColourMatrix scmat_inv;
+    for(int ic=0; ic<Nc; ic++) for(int jc=0; jc<Nc; jc++){
+    for(int is=0; is<Ns; is++) for(int js=0; js<Ns; js++){
+        scmat_inv()(is,js)(ic,jc) = flat_inv(Ns*ic+is,Ns*jc+js);
+    }}
+    return scmat_inv;
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Vamp[mu] = tr[adj(Gamma(2mu+1))*g5*inv(Sout)*g5*vertex[mu]*inv(Sin)]/12, where
+// Sin/Sout are the phased, lattice-summed propagators; stored in par().output.
+template <typename FImpl1, typename FImpl2>
+void TAmputate<FImpl1, FImpl2>::execute(void)
+{
+    LOG(Message) << "Computing bilinear amputations '" << getName() << "' using"
+                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
+                 << std::endl;
+    BinaryWriter                    writer(par().output);
+    // local copies: the phases applied below must not accumulate on the stored propagators
+    PropagatorField1                Sin = *env().template getObject<PropagatorField1>(par().Sin);
+    PropagatorField2                Sout = *env().template getObject<PropagatorField2>(par().Sout);
+    std::vector<int>                pin  = strToVec<int>(par().pin), pout = strToVec<int>(par().pout);
+    // lattice extents come from the grid (they used to be copied from pin by mistake)
+    std::vector<Real>               latt_size(env().getGrid()->_fdimensions.begin(), env().getGrid()->_fdimensions.end());
+    LatticeComplex                  pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
+    LOG(Message) << "Propagators set up " << std::endl;
+    std::vector<SpinColourMatrix>   vertex; // filled from par().input below
+    Gamma                           g5(Gamma::Algebra::Gamma5);
+    Result                          result;
+    LOG(Message) << "reading file - "  << par().input << std::endl;
+    BinaryReader                    reader(par().input); 
+    Complex                         Ci(0.0,1.0);
+
+    read(reader,"vertex", vertex);
+    LOG(Message) << "vertex read" << std::endl;
+    pdotxin=zero;
+    pdotxout=zero;
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+        LatticeCoordinate(coor,mu);
+        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
+        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
+    }
+    Sin = Sin*exp(-Ci*pdotxin); //phase corrections
+    Sout = Sout*exp(-Ci*pdotxout);
+
+    SpinColourMatrix Sin_mom = sum(Sin);
+    SpinColourMatrix Sout_mom = sum(Sout);
+    LOG(Message) << "summed over lattice" << std::endl;
+    LOG(Message) << "Lattice -> spincolourmatrix conversion" << std::endl;
+    SpinColourMatrix Sin_inv = invertspincolmat(Sin_mom);
+    SpinColourMatrix Sout_inv = invertspincolmat(Sout_mom);
+    LOG(Message) << "Inversions done" << std::endl;
+    // NOTE(review): vertex[mu] is paired with Gamma(2*mu+1) and its size is not checked
+    result.Vamp.resize(Gamma::nGamma/2);
+    for( int mu=0; mu < Gamma::nGamma/2; mu++){
+        result.Vamp[mu] = 1/12.0*trace(adj(Gamma(mu*2+1))*g5*Sout_inv*g5*vertex[mu]*Sin_inv);
+        LOG(Message) << "Vamp[" << mu << "] - " << result.Vamp[mu] << std::endl;
+    }
+    write(writer, "Vamp", result.Vamp); // the result used to be computed but never stored
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Amputate_hpp_
--- a/Hadrons/Modules/MNPR/Bilinear.cc
+++ b/Hadrons/Modules/MNPR/Bilinear.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Bilinear.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNPR/Bilinear.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNPR;
+// explicit instantiation of the specialisation registered in Bilinear.hpp
+template class Grid::Hadrons::MNPR::TBilinear<FIMPL,FIMPL>;
+
--- a/Hadrons/Modules/MNPR/Bilinear.hpp
+++ b/Hadrons/Modules/MNPR/Bilinear.hpp
@@ -0,0 +1,225 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Bilinear.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_Bilinear_hpp_
+#define Hadrons_Bilinear_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+//#include <Grid/qcd/utils/PropagatorUtils.h>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                TBilinear                                       *
+        Performs bilinear contractions sum_x [g5*adj(Sout)*g5*G*Sin], spin and
+        colour indices left open. For non exceptional momenta in Rome-Southampton NPR
+******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNPR)
+// Parameters of the TBilinear module; every member is a string.
+class BilinearPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(BilinearPar,
+                                    std::string,    Sin,     // name of the "in" propagator object
+                                    std::string,    Sout,    // name of the "out" propagator object
+                                    std::string,    pin,     // momentum used to phase Sin, parsed with strToVec<Real>
+                                    std::string,    pout,    // momentum used to phase Sout, parsed with strToVec<Real>
+                                    std::string,    output); // BinaryWriter file name, also used as the write tag
+};
+// Module computing the unamputated bilinear vertices from two propagators.
+template <typename FImpl1, typename FImpl2>
+class TBilinear: public Module<BilinearPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl1, 1); // type aliases with suffix 1 - expansion not visible here
+    FERM_TYPE_ALIASES(FImpl2, 2); // type aliases with suffix 2 - expansion not visible here
+    class Result: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result, 
+                                        std::vector<SpinColourMatrix>, bilinear); // one lattice-summed vertex per gamma
+    };
+public:
+    // constructor: takes the module name
+    TBilinear(const std::string name);
+    // destructor
+    virtual ~TBilinear(void) {};
+    // dependencies/products (names of environment objects)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    //LatticeSpinColourMatrix PhaseProps(LatticeSpinColourMatrix S, std::vector<Real> p);
+    // setup (currently creates nothing)
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// register the FIMPL/FIMPL specialisation under the name Bilinear in MNPR
+MODULE_REGISTER_TMP(Bilinear, ARG(TBilinear<FIMPL, FIMPL>), MNPR);
+
+/******************************************************************************
+ *                           TBilinear implementation                            *
+ ******************************************************************************/
+// constructor: only forwards the module name to the Module<BilinearPar> base //
+template <typename FImpl1, typename FImpl2>
+TBilinear<FImpl1, FImpl2>::TBilinear(const std::string name)
+: Module<BilinearPar>(name)
+{}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TBilinear<FImpl1, FImpl2>::setup(void)
+{
+    // empty: no object is created in the environment by this module;
+    // execute() only writes its result to the file par().output
+}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Dependencies: the two propagators named in the parameters.
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TBilinear<FImpl1, FImpl2>::getInput(void)
+{
+    // same two names, in the same order, returned without a named temporary
+    return {par().Sin, par().Sout};
+}
+
+// Products: a single entry, the name of this module.
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TBilinear<FImpl1, FImpl2>::getOutput(void)
+{
+    // one-element list built directly in the return statement
+    return {getName()};
+}
+
+/*
+/////Phase propagators//////////////////////////
+template <typename FImpl1, typename FImpl2>
+LatticeSpinColourMatrix TBilinear<FImpl1, FImpl2>::PhaseProps(LatticeSpinColourMatrix S, std::vector<Real> p)
+{
+    GridBase *grid = S._grid;
+    LatticeComplex      pdotx(grid),  coor(grid);
+    std::vector<int>   latt_size = grid->_fdimensions; 
+    Complex             Ci(0.0,1.0);
+    pdotx=zero;
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+        LatticeCoordinate(coor,mu);
+        pdotx = pdotx +(TwoPiL * p[mu]) * coor;
+    }
+    S = S*exp(-Ci*pdotx);
+    return S;
+}
+*/
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TBilinear<FImpl1, FImpl2>::execute(void)
+{
+/**************************************************************************
+
+Compute the bilinear vertex needed for the NPR.
+V(G) = sum_x  [ g5 * adj(S'(x,p2)) * g5 * G * S'(x,p1) ]_{si,sj,ci,cj}
+G runs over every Gamma::Algebra index below Gamma::nGamma
+
+        * G
+       / \
+    p1/   \p2
+     /     \
+    /       \
+
+Writes one spin-colour matrix per G, with indices si,sj, ci,cj
+
+Conventions:
+p1 - incoming momenta
+p2 - outgoing momenta
+q = (p1-p2)
+**************************************************************************/
+
+    LOG(Message) << "Computing bilinear contractions '" << getName() << "' using"
+                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
+                 << std::endl;
+
+    BinaryWriter             writer(par().output);
+
+    // Propagators: local copies, so that the phases applied below do not
+    // accumulate on the objects stored in the environment
+    LatticeSpinColourMatrix     Sin = *env().template getObject<LatticeSpinColourMatrix>(par().Sin);
+    LatticeSpinColourMatrix     Sout = *env().template getObject<LatticeSpinColourMatrix>(par().Sout);
+    LatticeComplex              pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
+    // momentum on legs
+    std::vector<Real>           pin  = strToVec<Real>(par().pin), pout = strToVec<Real>(par().pout);
+    // lattice extents come from the grid (they used to be copied from pin by mistake)
+    std::vector<Real>           latt_size(env().getGrid()->_fdimensions.begin(), env().getGrid()->_fdimensions.end());
+    //bilinears
+    LatticeSpinColourMatrix     bilinear_x(env().getGrid());
+    Gamma                       g5(Gamma::Algebra::Gamma5);
+    Result                      result;
+    Complex                     Ci(0.0,1.0);
+
+    // accumulate p.x for both legs, with p[mu] = 2*pi*n[mu]/L[mu]
+
+    pdotxin=zero;
+    pdotxout=zero;
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+        LatticeCoordinate(coor,mu);
+        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
+        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
+    }
+    Sin = Sin*exp(-Ci*pdotxin); //phase corrections
+    Sout = Sout*exp(-Ci*pdotxout);
+    
+    ////Set up gamma vector//////////////////////////
+    std::vector<Gamma> gammavector;
+    for( int i=0; i<Gamma::nGamma; i++){
+        Gamma::Algebra gam = i;
+        gammavector.push_back(Gamma(gam));
+    }
+    result.bilinear.resize(Gamma::nGamma);
+    /////////////////////////////////////////////////
+    // result.bilinear[i] corresponds to the gamma with Algebra index i
+    ////////Form Vertex//////////////////////////////
+    for (int i=0; i < Gamma::nGamma; i++){
+        bilinear_x = g5*adj(Sout)*g5*gammavector[i]*Sin; 
+        result.bilinear[i] = sum(bilinear_x); //sum over lattice sites
+    }
+    //////////////////////////////////////////////////
+    write(writer, par().output, result.bilinear);
+    LOG(Message) << "Complete. Writing results to " << par().output << std:: endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Bilinear_hpp_
--- a/Hadrons/Modules/MNPR/FourQuark.cc
+++ b/Hadrons/Modules/MNPR/FourQuark.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/FourQuark.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNPR/FourQuark.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNPR;
+// explicit instantiation of the specialisation registered in FourQuark.hpp
+template class Grid::Hadrons::MNPR::TFourQuark<FIMPL,FIMPL>;
+
--- a/Hadrons/Modules/MNPR/FourQuark.hpp
+++ b/Hadrons/Modules/MNPR/FourQuark.hpp
@@ -0,0 +1,274 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/FourQuark.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_FourQuark_hpp_
+#define Hadrons_FourQuark_hpp_
+
+#include <typeinfo>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/serialisation/Serialisation.h>
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                TFourQuark                                       *
+        Performs fourquark contractions: sum_x of the outer product of two
+        bilinears g5*adj(Sout)*g5*G*Sin; suitable for non exceptional momenta
+******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNPR)
+// Parameters of the TFourQuark module.
+class FourQuarkPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FourQuarkPar,
+                                    std::string,    Sin,       // name of the "in" propagator object (TODO: make this a propagator type?)
+                                    std::string,    Sout,      // name of the "out" propagator object (same TODO)
+                                    std::string,    pin,       // momentum used to phase Sin, parsed with strToVec<Real>
+                                    std::string,    pout,      // momentum used to phase Sout, parsed with strToVec<Real>
+                                    bool,           fullbasis, // true: all (mu,nu) gamma pairs; false: the mu == nu branch
+                                    std::string,    output);   // file name handed to BinaryWriter in execute()
+};
+// Module computing unamputated four-quark vertices from two propagators.
+template <typename FImpl1, typename FImpl2>
+class TFourQuark: public Module<FourQuarkPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl1, 1); // presumably provides PropagatorField1 used in execute() - TODO confirm
+    FERM_TYPE_ALIASES(FImpl2, 2); // presumably provides PropagatorField2 used in execute() - TODO confirm
+    class Result: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<SpinColourSpinColourMatrix>, fourquark); // lattice-summed outer products
+    };
+public:
+    // constructor: takes the module name
+    TFourQuark(const std::string name);
+    // destructor
+    virtual ~TFourQuark(void) {};
+    // dependencies/products (names of environment objects)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // site-wise outer product of a and b into lret (a and b are passed by value), then setup
+    virtual void tensorprod(LatticeSpinColourSpinColourMatrix &lret, LatticeSpinColourMatrix a, LatticeSpinColourMatrix b);
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// register the FIMPL/FIMPL specialisation under the name FourQuark in MNPR
+MODULE_REGISTER_TMP(FourQuark, ARG(TFourQuark<FIMPL, FIMPL>), MNPR);
+
+/******************************************************************************
+ *                           TFourQuark implementation                            *
+ ******************************************************************************/
+// constructor: only forwards the module name to the Module<FourQuarkPar> base /
+template <typename FImpl1, typename FImpl2>
+TFourQuark<FImpl1, FImpl2>::TFourQuark(const std::string name)
+: Module<FourQuarkPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Dependencies: the two propagators named in the parameters.
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TFourQuark<FImpl1, FImpl2>::getInput(void)
+{
+    std::vector<std::string> deps{par().Sin, par().Sout};
+    return deps;
+}
+
+// Products: a single entry, the name of this module.
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TFourQuark<FImpl1, FImpl2>::getOutput(void)
+{
+    // one-element list built directly in the return statement
+    return {getName()};
+}
+
+// Site-wise outer product: lret(si,sj)(ci,cj) = a(si,sj)(ci,cj) * b, with b's spin-colour matrix as inner indices.
+template <typename FImpl1, typename FImpl2>
+void TFourQuark<FImpl1, FImpl2>::tensorprod(LatticeSpinColourSpinColourMatrix &lret, LatticeSpinColourMatrix a, LatticeSpinColourMatrix b)
+{
+#if 0 // disabled element-by-element version; its loop counters are never initialised
+            parallel_for(auto site=lret.begin();site<lret.end();site++) {
+                for (int si; si < 4; ++si){
+                for(int sj; sj <4; ++sj){
+                    for (int ci; ci < 3; ++ci){
+                    for (int cj; cj < 3; ++cj){
+                        for (int sk; sk < 4; ++sk){
+                        for(int sl; sl <4; ++sl){
+                            for (int ck; ck < 3; ++ck){
+                            for (int cl; cl < 3; ++cl){
+                        lret[site]()(si,sj)(ci,cj)(sk,sl)(ck,cl)=a[site]()(si,sj)(ci,cj)*b[site]()(sk,sl)(ck,cl);
+                            }}
+                        }}
+                    }}
+                }}
+        }
+#else 
+            // FIXME ; is there a general need for this construct ? In which case we should encapsulate the
+            //         below loops in a helper function.
+            // NOTE(review): a and b are taken by value, so both lattices are copied on every call
+            // each (si,sj,ci,cj) element of a multiplies the whole spin-colour matrix of b at that site
+            parallel_for(auto site=lret.begin();site<lret.end();site++) {
+            vTComplex left;
+                for(int si=0; si < Ns; ++si){
+                for(int sj=0; sj < Ns; ++sj){
+                    for (int ci=0; ci < Nc; ++ci){
+                    for (int cj=0; cj < Nc; ++cj){
+                      // wrap the single element of a so it can multiply b's matrix
+                      left()()() = a[site]()(si,sj)(ci,cj);
+                      // fill the (si,sj)(ci,cj) slot of the result with left * b
+                      lret[site]()(si,sj)(ci,cj)=left()*b[site]();
+                    }}
+                }}
+            }
+#endif      
+}
+
+
+
+
+
+// setup: envCreateLat of a LatticeSpinColourMatrix named after the module /////
+template <typename FImpl1, typename FImpl2>
+void TFourQuark<FImpl1, FImpl2>::setup(void)
+{
+    envCreateLat(LatticeSpinColourMatrix, getName()); // NOTE(review): execute() never accesses this object
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TFourQuark<FImpl1, FImpl2>::execute(void)
+{
+
+/*********************************************************************************
+
+TFourQuark : Creates the four quark vertex required for the NPR of four-quark ops
+
+V_{Gamma_1,Gamma_2} = sum_x [ ( g5 * adj(S'(x,p2)) * g5 * G1 * S'(x,p1) )_ci,cj;si,sj x ( g5 * adj(S'(x,p2)) * g5 * G2 * S'(x,p1) )_ck,cl;sk,sl ]
+
+Create a bilinear vertex for G1 and G2  the spin and colour indices are kept free. Where there are 16 potential Gs.
+We then find the outer product of V1 and V2, keeping the spin and colour indices uncontracted
+Then this is summed over the lattice coordinate
+Result is a SpinColourSpinColourMatrix - with 4 colour and 4 spin indices. 
+We have up to 256 of these including the offdiag (G1 != G2).
+
+        \         /
+         \p1   p1/
+          \     /
+           \   /
+         G1 * * G2
+           /   \
+          /     \
+         /p2   p2\
+        /         \
+
+*********************************************************************************/
+
+    LOG(Message) << "Computing fourquark contractions '" << getName() << "' using"
+                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
+                 << std::endl;
+    
+    BinaryWriter             writer(par().output);
+    
+    // Propagators: local copies, so that the phases applied below do not
+    // accumulate on the objects stored in the environment
+    PropagatorField1                            Sin = *env().template getObject<PropagatorField1>(par().Sin);
+    PropagatorField2                            Sout = *env().template getObject<PropagatorField2>(par().Sout);
+    std::vector<Real>                           pin  = strToVec<Real>(par().pin), pout = strToVec<Real>(par().pout);
+    bool                                        fullbasis = par().fullbasis;
+    Gamma                                       g5(Gamma::Algebra::Gamma5);
+    Result                                      result;
+    // lattice extents come from the grid
+    // (they used to be copied from pin by mistake)
+    std::vector<Real>                           latt_size(env().getGrid()->_fdimensions.begin(), env().getGrid()->_fdimensions.end());
+    LatticeComplex                              pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
+    LatticeSpinColourMatrix                     bilinear_mu(env().getGrid()), bilinear_nu(env().getGrid());
+    LatticeSpinColourSpinColourMatrix           lret(env().getGrid()); 
+    Complex                         Ci(0.0,1.0);
+
+    //find p.x for in and out so phase can be accounted for in propagators
+    pdotxin=zero;
+    pdotxout=zero;
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+        LatticeCoordinate(coor,mu);
+        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
+        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
+    }
+    Sin = Sin*exp(-Ci*pdotxin); //phase corrections
+    Sout = Sout*exp(-Ci*pdotxout);
+
+
+    //Set up Gammas: every second Gamma::Algebra index, starting from 1
+    std::vector<Gamma> gammavector;
+     for( int i=1; i<Gamma::nGamma; i+=2){
+         Gamma::Algebra gam = i;
+         gammavector.push_back(Gamma(gam));
+       }
+    // number of gamma structures actually available for the loops below
+    const int nBasis = static_cast<int>(gammavector.size());
+
+    lret = zero;
+    if (fullbasis){ // all combinations of mu and nu
+        // dense row-major layout: entry (mu,nu) is stored at mu*nBasis + nu
+        result.fourquark.resize(nBasis*nBasis);
+        for( int mu=0; mu<nBasis; mu++){ 
+            bilinear_mu = g5*adj(Sout)*g5*gammavector[mu]*Sin;
+            // nu is bounded by the basis size; it used to run up to Gamma::nGamma,
+            // indexing past the end of gammavector and of result.fourquark
+            for ( int nu=0; nu<nBasis; nu++){
+                bilinear_nu = g5*adj(Sout)*g5*gammavector[nu]*Sin;
+                tensorprod(lret,bilinear_mu,bilinear_nu);
+                result.fourquark[mu*nBasis + nu] = sum(lret);
+            }
+        }
+    } else {
+        // diagonal entries only (same gamma on both bilinears); the loop covers
+        // the whole basis instead of stopping after the first element
+        result.fourquark.resize(nBasis);
+        for( int mu=0; mu<nBasis; mu++ ){
+            bilinear_mu = g5*adj(Sout)*g5*gammavector[mu]*Sin;
+            tensorprod(lret,bilinear_mu,bilinear_mu); //tensor outer product
+            result.fourquark[mu] = sum(lret);
+        }
+    }
+
+    // store every computed vertex in the binary output file
+    write(writer, "fourquark", result.fourquark);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_FourQuark_hpp_
--- a/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
+++ b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNoise;
+// explicit instantiations of the module for the FIMPL and ZFIMPL implementations
+template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<FIMPL>;
+template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<ZFIMPL>;
--- a/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
+++ b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
@@ -0,0 +1,121 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
+#define Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/DilutedNoise.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *             Generate full volume spin-color diagonal noise                *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNoise)
+
+class FullVolumeSpinColorDiagonalPar: Serializable // parameter set of TFullVolumeSpinColorDiagonal
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FullVolumeSpinColorDiagonalPar,
+                                    unsigned int, nsrc); // nsrc is read in setup() through par().nsrc
+};
+
+template <typename FImpl>
+class TFullVolumeSpinColorDiagonal: public Module<FullVolumeSpinColorDiagonalPar> // instantiated for FIMPL and ZFIMPL
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // presumably declares the FermionField alias used in setup() -- TODO confirm
+public:
+    // constructor: takes the name of the module instance
+    TFullVolumeSpinColorDiagonal(const std::string name);
+    // destructor: empty, nothing to release here
+    virtual ~TFullVolumeSpinColorDiagonal(void) {};
+    // dependency relation: getInput() returns an empty list, getOutput() returns {getName()}
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the noise object under the name getName()
+    virtual void setup(void);
+    // execution: fetches that object and calls generateNoise() on it
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(FullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<FIMPL>, MNoise); // FIMPL variant
+MODULE_REGISTER_TMP(ZFullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<ZFIMPL>, MNoise); // ZFIMPL variant, "Z"-prefixed name
+
+/******************************************************************************
+ *              TFullVolumeSpinColorDiagonal implementation                  *
+ ******************************************************************************/
+// constructor: only forwards the name to the Module base //////////////////////
+template <typename FImpl>
+TFullVolumeSpinColorDiagonal<FImpl>::TFullVolumeSpinColorDiagonal(const std::string name)
+: Module<FullVolumeSpinColorDiagonalPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getInput(void)
+{
+    std::vector<std::string> in; // stays empty: no input names are declared
+    
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()}; // single output name: the module's own name, as used in setup()
+    
+    return out;
+}
+
+// setup: create the noise object that execute() later fills ///////////////////
+template <typename FImpl>
+void TFullVolumeSpinColorDiagonal<FImpl>::setup(void)
+{
+    envCreateDerived(DilutedNoise<FImpl>, // same type execute() passes to envGet()
+                     FullVolumeSpinColorDiagonalNoise<FImpl>, // presumably the concrete type that gets constructed -- confirm
+                     getName(), 1, envGetGrid(FermionField), par().nsrc); // NOTE(review): literal 1 is presumably Ls; grid and nsrc look like constructor arguments -- confirm
+}
+
+// execution: fetch the noise object and generate the noise ////////////////////
+template <typename FImpl>
+void TFullVolumeSpinColorDiagonal<FImpl>::execute(void)
+{
+    auto &noise = envGet(DilutedNoise<FImpl>, getName()); // the object registered under getName() in setup()
+    LOG(Message) << "Generating full volume, spin-color diagonal noise" << std::endl;
+    noise.generateNoise(rng4d()); // rng4d() is handed to the noise object as its random number source
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
--- a/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -146,7 +146,7 @@ void TChargedProp::execute(void)
        std::vector<int>    siteCoor;

        LOG(Message) << "Saving momentum-projected propagator to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        result.projection.resize(par().outputMom.size());
        result.lattice_size = env().getGrid()->_fdimensions;
--- a/Hadrons/Modules/MScalar/ScalarVP.cc
+++ b/Hadrons/Modules/MScalar/ScalarVP.cc
@@ -462,7 +462,7 @@ void TScalarVP::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected HVP to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "HVP", outputData);
    }
--- a/Hadrons/Modules/MScalar/VPCounterTerms.cc
+++ b/Hadrons/Modules/MScalar/VPCounterTerms.cc
@@ -239,7 +239,7 @@ void TVPCounterTerms::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected correlators to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "scalar_loops", outputData);
    }
--- a/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
+++ b/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
@@ -1,268 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
-
-Copyright (C) 2015-2018
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef Hadrons_MScalarSUN_TimeMomProbe_hpp_
-#define Hadrons_MScalarSUN_TimeMomProbe_hpp_
-
-#include <Hadrons/Global.hpp>
-#include <Hadrons/Module.hpp>
-#include <Hadrons/ModuleFactory.hpp>
-#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-/******************************************************************************
- *          n-point functions O(t,p)*tr(phi(t_1,p_1)*...*phi(t_n,p_n))        *
- ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MScalarSUN)
-
-class TimeMomProbePar: Serializable
-{
-public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(TimeMomProbePar,
-                                    std::string,              field,
-                                    std::vector<std::string>, op,
-                                    std::vector<std::vector<std::string>>, timeMom,
-                                    std::string,              output);
-};
-
-class TimeMomProbeResult: Serializable
-{
-public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(TimeMomProbeResult,
-                                    std::string,                   op,
-                                    std::vector<std::vector<int>>, timeMom,
-                                    std::vector<Complex>,          data);
-};
-
-template <typename SImpl>
-class TTimeMomProbe: public Module<TimeMomProbePar>
-{
-public:
-    typedef typename SImpl::Field                    Field;
-    typedef typename SImpl::SiteField::scalar_object Site;
-    typedef typename SImpl::ComplexField             ComplexField;
-    typedef          std::vector<Complex>            SlicedOp;
-public:
-    // constructor
-    TTimeMomProbe(const std::string name);
-    // destructor
-    virtual ~TTimeMomProbe(void) {};
-    // dependency relation
-    virtual std::vector<std::string> getInput(void);
-    virtual std::vector<std::string> getOutput(void);
-    // setup
-    virtual void setup(void);
-    // execution
-    virtual void execute(void);
-private:
-    void vectorModulo(std::vector<int> &v);
-};
-
-MODULE_REGISTER_TMP(TimeMomProbeSU2, TTimeMomProbe<ScalarNxNAdjImplR<2>>, MScalarSUN);
-MODULE_REGISTER_TMP(TimeMomProbeSU3, TTimeMomProbe<ScalarNxNAdjImplR<3>>, MScalarSUN);
-MODULE_REGISTER_TMP(TimeMomProbeSU4, TTimeMomProbe<ScalarNxNAdjImplR<4>>, MScalarSUN);
-MODULE_REGISTER_TMP(TimeMomProbeSU5, TTimeMomProbe<ScalarNxNAdjImplR<5>>, MScalarSUN);
-MODULE_REGISTER_TMP(TimeMomProbeSU6, TTimeMomProbe<ScalarNxNAdjImplR<6>>, MScalarSUN);
-
-/******************************************************************************
- *                        TTimeMomProbe implementation                        *
- ******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-template <typename SImpl>
-TTimeMomProbe<SImpl>::TTimeMomProbe(const std::string name)
-: Module<TimeMomProbePar>(name)
-{}
-
-// dependencies/products ///////////////////////////////////////////////////////
-template <typename SImpl>
-std::vector<std::string> TTimeMomProbe<SImpl>::getInput(void)
-{
-    std::vector<std::string> in = par().op;
-    
-    in.push_back(par().field);
-
-    return in;
-}
-
-template <typename SImpl>
-std::vector<std::string> TTimeMomProbe<SImpl>::getOutput(void)
-{
-    std::vector<std::string> out;
-    
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-template <typename SImpl>
-void TTimeMomProbe<SImpl>::setup(void)
-{
-    envTmpLat(ComplexField, "ftBuf");
-    envTmpLat(Field, "ftMatBuf");
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-// NB: time is direction 0
-template <typename SImpl>
-void TTimeMomProbe<SImpl>::vectorModulo(std::vector<int> &v)
-{
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        auto d = env().getDim(mu);
-        v[mu] = ((v[mu] % d) + d) % d;
-    }
-}
-
-template <typename SImpl>
-void TTimeMomProbe<SImpl>::execute(void)
-{
-    const unsigned int                           nd = env().getNd();
-    const unsigned int                           nt = env().getDim(0);
-    double                                       partVol = 1.;
-    std::set<std::vector<int>>                   timeMomSet;
-    std::vector<std::vector<std::vector<int>>>   timeMom;
-    std::vector<std::vector<int>>                transferMom;
-    FFT                                          fft(envGetGrid(Field));
-    std::vector<int>                             dMask(nd, 1);
-    std::vector<TimeMomProbeResult>              result;
-    std::map<std::string, std::vector<SlicedOp>> slicedOp;
-    std::vector<SlicedOp>                        slicedProbe;
-    auto                                         &phi = envGet(Field, par().field);
-
-    envGetTmp(ComplexField, ftBuf);
-    envGetTmp(Field, ftMatBuf);
-    dMask[0] = 0;
-    for (unsigned int mu = 1; mu < nd; ++mu)
-    {
-        partVol *= env().getDim(mu);
-    }
-    timeMom.resize(par().timeMom.size());
-    for (unsigned int p = 0; p < timeMom.size(); ++p)
-    {
-        for (auto &tms: par().timeMom[p])
-        {
-            std::vector<int> tm = strToVec<int>(tms);
-            
-            timeMom[p].push_back(tm);
-            timeMomSet.insert(tm);
-        }
-        transferMom.push_back(std::vector<int>(nd - 1, 0));
-        for (auto &tm: timeMom[p])
-        {
-            for (unsigned int j = 1; j < nd; ++j)
-            {
-                transferMom[p][j - 1] -= tm[j];
-            }
-        }
-        LOG(Message) << "Probe " << p << " (" << timeMom[p].size() << " points) : " << std::endl;
-        LOG(Message) << "  phi(t_i, p_i) for (t_i, p_i) in " << timeMom[p] << std::endl;
-        LOG(Message) << "  operator with momentum " << transferMom[p] << std::endl;
-    }
-    LOG(Message) << "FFT: field '" << par().field << "'" << std::endl;
-    fft.FFT_dim_mask(ftMatBuf, phi, dMask, FFT::forward);
-    slicedProbe.resize(timeMom.size());
-    for (unsigned int p = 0; p < timeMom.size(); ++p)
-    {
-        std::vector<int> qt;
-
-        LOG(Message) << "Making probe " << p << std::endl;
-        slicedProbe[p].resize(nt);
-        for (unsigned int t = 0; t < nt; ++t)
-        {
-            Site acc;
-            
-            for (unsigned int i = 0; i < timeMom[p].size(); ++i)
-            {
-                Site buf;
-
-                qt     = timeMom[p][i];
-                qt[0] += t;
-                vectorModulo(qt);
-                peekSite(buf, ftMatBuf, qt);
-                if (i == 0)
-                {
-                    acc = buf;
-                }
-                else
-                {
-                    acc *= buf;
-                }
-            }
-            slicedProbe[p][t] = TensorRemove(trace(acc));
-        }
-        //std::cout << slicedProbe[p]<< std::endl;
-    }
-    for (auto &o: par().op)
-    {
-        auto &op = envGet(ComplexField, o);
-
-        slicedOp[o].resize(transferMom.size());
-        LOG(Message) << "FFT: operator '" << o << "'" << std::endl;
-        fft.FFT_dim_mask(ftBuf, op, dMask, FFT::forward);
-        //std::cout << ftBuf << std::endl;
-        for (unsigned int p = 0; p < transferMom.size(); ++p)
-        {
-            std::vector<int> qt(nd, 0);
-
-            for (unsigned int j = 1; j < nd; ++j)
-            {
-                qt[j] = transferMom[p][j - 1];
-            }
-            slicedOp[o][p].resize(nt);
-            for (unsigned int t = 0; t < nt; ++t)
-            {
-                TComplex buf;
-
-                qt[0] = t;
-                vectorModulo(qt);
-                peekSite(buf, ftBuf, qt);
-                slicedOp[o][p][t] = TensorRemove(buf);
-            }
-            //std::cout << ftBuf << std::endl;
-            //std::cout << slicedOp[o][p] << std::endl;
-        }
-    }
-    LOG(Message) << "Making correlators" << std::endl;
-    for (auto &o: par().op)
-    for (unsigned int p = 0; p < timeMom.size(); ++p)
-    {
-        TimeMomProbeResult r;
-
-        LOG(Message) << "  <" << o << " probe_" << p << ">" << std::endl;
-        r.op      = o;
-        r.timeMom = timeMom[p];
-        r.data    = makeTwoPoint(slicedOp[o][p], slicedProbe[p], 1./partVol);
-        result.push_back(r);
-    }
-    saveResult(par().output, "timemomprobe", result);
-}
-
-END_MODULE_NAMESPACE
-
-END_HADRONS_NAMESPACE
-
-#endif // Hadrons_MScalarSUN_TimeMomProbe_hpp_
--- a/Hadrons/Modules/MScalarSUN/TrMag.hpp
+++ b/Hadrons/Modules/MScalarSUN/TrMag.hpp
@@ -124,7 +124,8 @@ void TTrMag<SImpl>::execute(void)
    std::vector<TrMagResult> result;
    auto                     &phi = envGet(Field, par().field);

-    auto m2 = sum(phi), mn = m2;
+    auto m2 = sum(phi);
+    auto mn = m2;

    m2 = -m2*m2;
    mn = 1.;
--- a/Hadrons/Modules/MScalarSUN/Utils.hpp
+++ b/Hadrons/Modules/MScalarSUN/Utils.hpp
@@ -103,7 +103,7 @@ std::vector<Complex> makeTwoPoint(const std::vector<SinkSite>   &sink,
    {
        for (unsigned int t  = 0; t < nt; ++t)
        {
-            res[dt] += trace(sink[(t+dt)%nt]*source[t]);
+            res[dt] += trace(sink[(t+dt)%nt]*adj(source[t]));
        }
        res[dt] *= factor/static_cast<double>(nt);
    }
--- a/Hadrons/Modules/MSolver/A2AAslashVectors.cc
+++ b/Hadrons/Modules/MSolver/A2AAslashVectors.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/A2AAslashVectors.cc
+
+Copyright (C) 2015-2018
+
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSolver;
+
+template class Grid::Hadrons::MSolver::TA2AAslashVectors<FIMPL>; // explicit instantiation for FIMPL
+template class Grid::Hadrons::MSolver::TA2AAslashVectors<ZFIMPL>; // explicit instantiation for ZFIMPL
--- a/Show More
+++ b/Show More