mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 13:57:07 +01:00

Compare commits


83 Commits

Author SHA1 Message Date
49b934310b resilient I/O fix 2018-11-27 20:17:09 +00:00
01e8cf5017 Merge branch 'develop' into feature/resilient-io 2018-11-27 19:09:59 +00:00
12f4499502 HDF5 serialiser fix 2018-11-27 19:09:50 +00:00
05aec72887 Hadrons: application parameter for resilient I/O 2018-11-27 18:46:43 +00:00
136d3802cb binary parallel IO can do read tests and eventually re-write in case of failure 2018-11-27 18:38:24 +00:00
a4c55406ed checksummed HDF5 IO 2018-11-27 17:43:19 +00:00
c7f33ca2a8 Revert "Hadrons: A2A vector write can fail and retry"
This reverts commit 10fc263675.
2018-11-27 17:27:26 +00:00
0e3035c51d Revert "optional non-fatal checksum fail in Lime lattice read (with error codes)"
This reverts commit bccfd4cbb3.
2018-11-27 17:27:20 +00:00
10fc263675 Hadrons: A2A vector write can fail and retry 2018-11-26 19:47:03 +00:00
bccfd4cbb3 optional non-fatal checksum fail in Lime lattice read (with error codes) 2018-11-26 19:45:51 +00:00
0b50d4a328 log time fix 2018-11-23 15:51:27 +00:00
e232257cb6 Hadrons: A2AAslashVector modul cleaning and renaming 2018-11-22 19:43:49 +00:00
09451b5e48 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-11-22 15:45:24 +00:00
6364aa8acf Merge branch 'feature/contractor' into develop 2018-11-22 15:44:46 +00:00
b9e84ecab7 Hadrons: minor code cleaning 2018-11-22 15:44:30 +00:00
41032fef44 Optional RW mode for Hdf5Reader 2018-11-21 18:36:50 +00:00
d77bc88170 Optional support for faster CRC32C checksum through Intel IPP 2018-11-19 17:21:53 +00:00
494b3c9e57 Hadrons: contractor more IO fix 2018-11-19 16:26:53 +00:00
2ba19a9e07 Hadrons: contractor IO fix 2018-11-19 16:17:51 +00:00
5d7cc29eaf Hadrons: contractor token @traj@ for trajectory number in input file 2018-11-19 16:04:01 +00:00
f22a27d7f9 Hadrons: contractor trajectory loop and file output 2018-11-19 15:45:04 +00:00
33a0bbb17b Const correctness 2018-11-19 11:27:57 +00:00
f592ec8baa Hadrons: contractor performance fix 2018-11-16 20:59:49 +00:00
8b007b5c24 Hadrons: remove the use of OpenMP reductions 2018-11-16 20:00:29 +00:00
9bb170576d Merge pull request #177 from guelpers/develop
Hadrons module to electrify a gauge
2018-11-14 16:04:09 +00:00
a7e3977b75 Merge remote-tracking branch 'upstream/develop' into develop 2018-11-13 14:56:23 +00:00
995f20e45d Hadrons: some renamings 2018-11-13 14:54:48 +00:00
d058b4e681 Merge branch 'feature/seqA2A' into develop 2018-11-13 13:27:24 +00:00
8e0d2f3402 Hadrons: support for twisted boundary conditions 2018-11-12 17:16:18 +00:00
2ac57370f1 Hadrons: contractor translation average normalisation 2018-11-12 16:04:35 +00:00
344e832a4e Hadrons: contractor faster transpose and finer timings 2018-11-12 15:59:54 +00:00
cfe281f1a4 Hadrons: diskvectors measure hash performance in debug output 2018-11-12 15:59:11 +00:00
f5422c7334 Hadrons: more contractor instrumentation 2018-11-09 16:23:53 +00:00
68c76a410d Hadrons: more contractor improvements 2018-11-08 19:24:29 +00:00
69b6ba0a73 Hadrons: contractor fixes and improvements 2018-11-08 18:46:28 +00:00
65349b07a7 Hadrons: simpler A2A perf functions 2018-11-08 18:44:44 +00:00
7cd9914f0e Hadrons: automatically resize output in MKL A2A matrix kernels 2018-11-08 17:40:57 +00:00
f3f24b3017 Optional Twisted BC's added, in "DoubleStore" for WilsonImpl.
Untested but doesn't affect answers when twists are all zero. The zero is the default behaviour
for ImplParams.
2018-11-08 12:55:25 +00:00
8ef4657805 Merge remote-tracking branch 'upstream/develop' into feature/seqA2A 2018-11-08 09:00:06 +00:00
78c1086f8b Hadrons: sequential Aslash insertion and propagator on A2A vector 2018-11-08 08:58:09 +00:00
68c13045d6 Added a test for Felix and Michael to look at 2018-11-07 23:40:15 +00:00
e9b6f58fdc Allow shrinking machine in orthog direction for extract slice local 2018-11-07 23:39:18 +00:00
839605c45c Verbose reduce 2018-11-07 23:38:46 +00:00
1ff1422e07 Hadrons: contractor lighter output 2018-11-07 20:02:53 +00:00
32376f0437 Hadrons: contractor performances 2018-11-07 19:59:11 +00:00
0c6e581336 Hadrons: first stab at general contraction code, needs serious testing 2018-11-07 19:16:55 +00:00
e0a79a5bbf Hadrons: PR#177: Electrify gauge: Single Precision fix 2018-11-07 15:01:22 +00:00
4c016cc1a4 Merge remote-tracking branch 'upstream/develop' into develop 2018-11-07 14:03:12 +00:00
2205b1e63e Add CXX to grid-config 2018-11-07 13:32:46 +00:00
6f421c7a6f Block solver in the SchurRedBlack plus timing report cleaner 2018-11-07 12:26:56 +00:00
b62b9ac214 Patch to broken assertion 2018-11-06 22:18:17 +00:00
88d9922e4f Hadrons: fast A2A matrix contraction kernels 2018-11-06 19:49:09 +00:00
9734e3ee58 Hadrons: (somewhat) faster build 2018-11-06 19:47:41 +00:00
8c3a599148 Block solver test 2018-11-06 16:44:58 +00:00
4a47b11876 Block CG improvements to develop 2018-11-06 12:49:05 +00:00
f1382cf81d Merge remote-tracking branch 'upstream/develop' into develop 2018-11-06 10:29:52 +00:00
85699daef2 Hadrons: Module to electrify a gauge field 2018-11-06 10:27:18 +00:00
1651111d18 Hadrons: final, portable form of the contractor benchmark 2018-11-05 21:29:13 +00:00
1ed4ea344d Merge branch 'develop' into feature/contractor 2018-11-05 11:42:02 +00:00
8f514ae550 Hadrons: Lanczos 32bit IO 2018-11-05 11:41:10 +00:00
4a7415e83c Hadrons: contractor benchmark update 2018-10-23 21:00:54 +01:00
0ffcfea724 Hadrons: contractor benchmark 2018-10-23 17:08:16 +01:00
febe41cc1d Hadrons: improvement on PR #176 2018-10-23 12:48:15 +01:00
62173395b8 Merge pull request #176 from guelpers/develop
Hadrons: full volume noise source for A2A
2018-10-23 12:29:35 +01:00
b48611b80f Merge branch 'develop' into feature/contractor 2018-10-22 18:27:18 +01:00
6b559d68aa Hadrons: eigenpack converter can do test reads 2018-10-22 11:10:18 +01:00
1982cc58dd Hadrons: A2A vectors I/O filename fix 2018-10-21 01:20:05 +01:00
2e2e5ce596 SciDAC I/O print data checksums 2018-10-19 20:36:32 +01:00
7d84dca8e9 Merge branch 'develop' into feature/contractor 2018-10-18 23:46:58 +01:00
2d3916418e Hadrons: more precision fix 2018-10-18 23:45:13 +01:00
21304e2139 Hadrons: fix to allow single-prec build again 2018-10-18 19:58:50 +01:00
7b850eb48b Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-10-18 19:46:25 +01:00
a3ace57e01 Hadrons copyright update 2018-10-18 19:46:11 +01:00
b1c3cbe35e Hadrons: A2A vectors I/O 2018-10-18 19:44:58 +01:00
f31d6bfec2 Hadrons: contractor cleaning and better error check 2018-10-18 17:50:35 +01:00
a7cfa26901 Hadrons: reverse A2A matrix load for better DiskVector cache reuse 2018-10-18 17:50:16 +01:00
f333f3e575 Hadrons: DiskVector save-on-eviction and faster CRC32 for Eigen matrices 2018-10-18 17:48:25 +01:00
2b4e253473 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-10-17 20:28:20 +01:00
0ba3d469c7 Benchmark IO in single and double precision 2018-10-17 20:27:34 +01:00
f709329d96 Hadrons: first version of a contractor utility 2018-10-17 20:26:48 +01:00
f05b25dae4 Hadrons: A2AMatrix load 2018-10-17 20:26:26 +01:00
3e1d268fa3 Hadrons: DiskVector optimisation 2018-10-17 20:25:32 +01:00
109c74bed8 Hadrons: full volume noise source for A2A 2018-10-16 14:56:12 +01:00
84 changed files with 4062 additions and 695 deletions

View File

@@ -380,6 +380,12 @@ namespace Grid {
template<class Field> class OperatorFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
assert(in.size()==out.size());
for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]);
}
};
};
template<class Field> class LinearFunction {
@@ -421,7 +427,7 @@ namespace Grid {
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
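
The vector overload added above gives every OperatorFunction a default multi-right-hand-side entry point that simply loops the single-field solve, so only genuinely blocked solvers need to override it. A minimal standalone sketch of the dispatch pattern (Field, LinOp, OpFunction, ApplyOp and TwoTimes are illustrative stand-ins, not Grid types):

#include <cassert>
#include <iostream>
#include <vector>

using Field = std::vector<double>;   // stand-in for a lattice field

struct LinOp {                       // stand-in for LinearOperatorBase<Field>
  virtual void HermOp(const Field &in, Field &out) = 0;
  virtual ~LinOp() {}
};

struct OpFunction {                  // stand-in for OperatorFunction<Field>
  // single right-hand side: must be supplied by the derived solver
  virtual void operator()(LinOp &linop, const Field &in, Field &out) = 0;
  // multi right-hand side: default implementation loops the scalar solve
  virtual void operator()(LinOp &linop, const std::vector<Field> &in,
                          std::vector<Field> &out) {
    assert(in.size() == out.size());
    for (size_t k = 0; k < in.size(); k++) (*this)(linop, in[k], out[k]);
  }
  virtual ~OpFunction() {}
};

struct ApplyOp : OpFunction {        // trivial "solver": out = A in
  using OpFunction::operator();      // keep the inherited vector overload visible
  void operator()(LinOp &linop, const Field &in, Field &out) override {
    linop.HermOp(in, out);
  }
};

struct TwoTimes : LinOp {            // toy Hermitian operator A = 2*I
  void HermOp(const Field &in, Field &out) override {
    out.assign(in.size(), 0.0);
    for (size_t i = 0; i < in.size(); i++) out[i] = 2.0 * in[i];
  }
};

int main() {
  TwoTimes A;
  ApplyOp solve;
  std::vector<Field> src(3, Field{1.0, 2.0}), sol(3);
  solve(A, src, sol);                // dispatches through the default loop
  std::cout << sol[2][1] << std::endl;  // prints 4
}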

View File

@@ -55,6 +55,14 @@ namespace Grid {
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public:
virtual GridBase *RedBlackGrid(void)=0;
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 0; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
// half checkerboard operations
virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0;
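
The Mass/ConstEE/isTrivialEE queries let a red-black solver pick cheaper code paths when the even-even block has known structure; the base-class defaults disable every such assumption until a derived class overrides them. A standalone sketch of the pattern (CheckerBoardedBase and MassDiagonalOp are hypothetical stand-ins, not Grid classes):

#include <iostream>

// The base class disables all even-even assumptions; a derived operator
// that knows better overrides the queries.
struct CheckerBoardedBase {
  virtual double Mass()        { return 0.0; }  // default: no mass information
  virtual int    ConstEE()     { return 0; }    // default: assume Mee varies with x
  virtual int    isTrivialEE() { return 0; }    // default: assume Mee != 1
  virtual ~CheckerBoardedBase() {}
};

// Hypothetical operator whose even-even block is (4 + m) * identity:
// constant across the lattice, but not trivial.
struct MassDiagonalOp : CheckerBoardedBase {
  double m;
  explicit MassDiagonalOp(double m_) : m(m_) {}
  double Mass() override    { return m; }
  int    ConstEE() override { return 1; }
};

int main() {
  MassDiagonalOp op(0.1);
  CheckerBoardedBase &M = op;
  // A solver can now branch on the advertised structure:
  if (M.ConstEE()) std::cout << "Mee constant, mass = " << M.Mass() << std::endl;
}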

View File

@@ -33,7 +33,7 @@ directory
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
@@ -42,7 +42,6 @@ template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
public:
typedef typename Field::scalar_type scomplex;
int blockDim ;
@@ -54,21 +53,15 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer PrintInterval; //GridLogMessages or Iterative
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
{};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
@@ -85,22 +78,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest Hermitian to avoid rounding-related asymmetry
m_rr = 0.5*(m_rr+m_rr.adjoint());
#if 0
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
auto D_ldlt = m_rr.ldlt().vectorD();
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
#endif
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -112,6 +103,25 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceMulMatrix(Q,Cinv,R,Orthog);
}
// see comments above
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
const std::vector<Field> & R)
{
InnerProductMatrix(m_rr,R,R);
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
MulMatrix(Q,Cinv,R);
}
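
Both ThinQRfact overloads implement the same "thin" QR by Cholesky: form the small Gram matrix R^dag R, factor it as L L^dag, set C = L^dag, and normalise Q = R C^{-1}. A standalone Eigen sketch with the dimensions from the comment above (Nferm >> Nblock); an illustration, not the Grid kernel:

#include <Eigen/Dense>
#include <iostream>

int main() {
  const int Nferm = 500, Nblock = 4;            // tall-and-skinny R
  Eigen::MatrixXcd R = Eigen::MatrixXcd::Random(Nferm, Nblock);

  // Gram matrix R^dag R, forced manifestly Hermitian against rounding,
  // exactly as ThinQRfact does before the Cholesky
  Eigen::MatrixXcd m_rr = R.adjoint() * R;
  m_rr = 0.5 * (m_rr + m_rr.adjoint());

  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); // m_rr = L L^dag
  Eigen::MatrixXcd C    = L.adjoint();          // R = Q C, C upper triangular
  Eigen::MatrixXcd Cinv = C.inverse();
  Eigen::MatrixXcd Q    = R * Cinv;             // MulMatrix / sliceMulMatrix analogue

  // The checks quoted above: Cdag C = Rdag R and Qdag Q = 1
  std::cout << "|Cdag C - Rdag R| = " << (C.adjoint() * C - m_rr).norm() << std::endl
            << "|Qdag Q - 1|      = "
            << (Q.adjoint() * Q
                - Eigen::MatrixXcd::Identity(Nblock, Nblock)).norm() << std::endl;
}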
////////////////////////////////////////////////////////////////////////////////////////////////////
// Call one of several implementations
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -119,14 +129,20 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
if ( CGtype == BlockCGrQ ) {
BlockCGrQsolve(Linop,Src,Psi);
} else if (CGtype == BlockCG ) {
BlockCGsolve(Linop,Src,Psi);
} else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi);
} else {
assert(0);
}
}
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
{
if ( CGtype == BlockCGrQVec ) {
BlockCGrQsolveVec(Linop,Src,Psi);
} else {
assert(0);
}
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQ implementation:
@@ -139,7 +155,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard;
@@ -202,15 +219,10 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
Linop.HermOp(X, AD);
tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -232,14 +244,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC
m_tmp = m_M * m_C;
@@ -257,6 +267,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
sliceMaddTimer.Stop();
@@ -317,152 +328,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient; original O'Leary. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_alpha = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_beta = Eigen::MatrixXcd::Zero(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
/************************************************************************
* Block conjugate gradient (Stephen Pickles, thesis 1995, p. 71; O'Leary 1980)
************************************************************************
* O'Leary : R = B - A X
* O'Leary : P = M R ; preconditioner M = 1
* O'Leary : alpha = PAP^{-1} RMR
* O'Leary : beta = RMR^{-1}_old RMR_new
* O'Leary : X=X+Palpha
* O'Leary : R_new=R_old-AP alpha
* O'Leary : P=MR_new+P beta
*/
R = Src - AP;
P = R;
sliceInnerProductMatrix(m_rr,R,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
m_pAp_inv = m_pAp.inverse();
m_alpha = m_pAp_inv * m_rr ;
// Psi, R update
sliceMaddTimer.Start();
sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddMatrix(R ,m_alpha,AP, R,Orthog,-1.0);// subtract alpha * AP from resid
sliceMaddTimer.Stop();
// Beta
m_rr_inv = m_rr.inverse();
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_rr,R,R,Orthog);
sliceInnerTimer.Stop();
m_beta = m_rr_inv *m_rr;
// Search update
sliceMaddTimer.Start();
sliceMaddMatrix(AP,m_beta,P,R,Orthog);
sliceMaddTimer.Stop();
P= AP;
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
RealD rr;
for(int b=0;b<Nblock;b++){
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction
// Use this for spread out across nodes
//////////////////////////////////////////////////////////////////////////
@@ -600,6 +465,233 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + (scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = zero;
for(int bp=0;bp<Nblock;bp++) {
AP[b] += (m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{
Nblock = B.size();
assert(Nblock == X.size());
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
for(int b=0;b<Nblock;b++){
X[b].checkerboard = B[b].checkerboard;
conformable(X[b], B[b]);
conformable(X[b], X[0]);
}
Field Fake(B[0]);
std::vector<Field> tmp(Nblock,Fake);
std::vector<Field> Q(Nblock,Fake);
std::vector<Field> D(Nblock,Fake);
std::vector<Field> Z(Nblock,Fake);
std::vector<Field> AD(Nblock,Fake);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b];
}
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
for(int b=0;b<Nblock;b++) D[b]=Q[b];
std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);
MatrixTimer.Stop();
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
InnerProductMatrix(m_DZ,D,Z);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
MaddMatrix(X,m_tmp, D,X);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
MaddMatrix(tmp,m_M,Z,Q,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
MaddMatrix(D,m_tmp,D,Q);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
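
For reference, the O'Leary recurrences quoted in BlockCGsolve can be exercised on a small dense system. The following standalone Eigen sketch (random well-conditioned Hermitian positive-definite A, identity preconditioner) follows the same alpha/beta bookkeeping as the solver above; it is an illustration, not Grid code:

#include <Eigen/Dense>
#include <cmath>
#include <iostream>

int main() {
  const int N = 64, Nblock = 4;
  // Random Hermitian positive-definite A and a block of right-hand sides B
  Eigen::MatrixXcd G = Eigen::MatrixXcd::Random(N, N);
  Eigen::MatrixXcd A = G.adjoint() * G + N * Eigen::MatrixXcd::Identity(N, N);
  Eigen::MatrixXcd B = Eigen::MatrixXcd::Random(N, Nblock);

  Eigen::MatrixXcd X = Eigen::MatrixXcd::Zero(N, Nblock);
  Eigen::MatrixXcd R = B;                    // R = B - A X with X = 0
  Eigen::MatrixXcd P = R;                    // P = M R with preconditioner M = 1
  Eigen::MatrixXcd m_rr = R.adjoint() * R;   // RMR

  const double tol = 1e-10;
  for (int k = 1; k <= 200; k++) {
    Eigen::MatrixXcd AP      = A * P;
    Eigen::MatrixXcd m_alpha = (P.adjoint() * AP).inverse() * m_rr; // PAP^{-1} RMR
    X += P * m_alpha;                        // X = X + P alpha
    R -= AP * m_alpha;                       // R_new = R_old - AP alpha
    Eigen::MatrixXcd m_rr_new = R.adjoint() * R;
    Eigen::MatrixXcd m_beta   = m_rr.inverse() * m_rr_new; // RMR^{-1}_old RMR_new
    P = R + P * m_beta;                      // P = M R_new + P beta
    m_rr = m_rr_new;
    if (std::sqrt(m_rr.trace().real() / B.squaredNorm()) < tol) {
      std::cout << "converged in " << k << " iterations, true resid "
                << (A * X - B).norm() / B.norm() << std::endl;
      break;
    }
  }
}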

View File

@@ -133,7 +133,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
<< " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
// Stopping condition
if (cp <= rsq) {
@@ -150,13 +150,13 @@ class ConjugateGradient : public OperatorFunction<Field> {
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
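
The stopping test compares the residual norm squared cp against rsq = Tolerance^2 * norm2(src), i.e. convergence is declared on the relative residual, and the true residual is recomputed afterwards as a sanity check. A scalar CG sketch of that bookkeeping in Eigen (illustrative only, not the Grid implementation):

#include <Eigen/Dense>
#include <cmath>
#include <iostream>

int main() {
  const int N = 100;
  Eigen::MatrixXd G = Eigen::MatrixXd::Random(N, N);
  Eigen::MatrixXd A = G.transpose() * G + N * Eigen::MatrixXd::Identity(N, N);
  Eigen::VectorXd src = Eigen::VectorXd::Random(N);
  Eigen::VectorXd psi = Eigen::VectorXd::Zero(N);

  const double Tolerance = 1e-8;
  double ssq = src.squaredNorm();
  double rsq = Tolerance * Tolerance * ssq;  // target: |r|^2 <= Tol^2 |src|^2

  Eigen::VectorXd r = src - A * psi, p = r;
  double cp = r.squaredNorm();
  for (int k = 1; k <= 10 * N; k++) {
    Eigen::VectorXd Ap = A * p;
    double alpha = cp / p.dot(Ap);
    psi += alpha * p;
    r -= alpha * Ap;
    double cp_new = r.squaredNorm();
    std::cout << "Iteration " << k << " residual " << std::sqrt(cp_new / ssq)
              << " target " << Tolerance << std::endl;
    if (cp_new <= rsq) {                     // the stopping condition above
      double true_residual = std::sqrt((A * psi - src).squaredNorm() / ssq);
      std::cout << "True residual " << true_residual << std::endl;
      break;
    }
    p = r + (cp_new / cp) * p;
    cp = cp_new;
  }
}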

View File

@@ -86,229 +86,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*/
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Use base class to share code
///////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Now make the norm reflect extra factor of Mee
template<class Field> class SchurRedBlackStaggeredSolve {
private:
template<class Field> class SchurRedBlackBase {
protected:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
//src_o = tmp; assert(src_o.checkerboard ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o, sol_o);
Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
// Fionn A2A boolean behavioural control
if (subGuess) sol_o = sol_o-Mtmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false) : _HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=cb;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) sol_o = sol_o-Mtmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise = 0;
subtractGuess(initSubGuess);
@@ -322,12 +116,86 @@ namespace Grid {
return subGuess;
}
template<class Matrix>
/////////////////////////////////////////////////////////////
// Shared code
/////////////////////////////////////////////////////////////
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out)
{
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
int nblock = in.size();
std::vector<Field> src_o(nblock,grid);
std::vector<Field> sol_o(nblock,grid);
std::vector<Field> guess_save;
Field resid(fgrid);
Field tmp(grid);
////////////////////////////////////////////////
// Prepare RedBlack source
////////////////////////////////////////////////
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
////////////////////////////////////////////////
// Make the guesses
////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
guess(src_o[b],sol_o[b]);
if ( subGuess ) {
guess_save[b] = sol_o[b];
}
}
//////////////////////////////////////////////////////////////
// Call the block solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
RedBlackSolve(_Matrix,src_o,sol_o);
////////////////////////////////////////////////
// A2A boolean behavioural control & reconstruct other checkerboard
////////////////////////////////////////////////
for(int b=0;b<nblock;b++) {
if (subGuess) sol_o[b] = sol_o[b] - guess_save[b];
///////// Needs even source //////////////
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
/////////////////////////////////////////////////
// Check unprec residual if possible
/////////////////////////////////////////////////
if ( ! subGuess ) {
_Matrix.M(out[b],resid);
resid = resid-in[b];
RealD ns = norm2(in[b]);
RealD nr = norm2(resid);
std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
} else {
std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
}
}
}
template<class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
@@ -335,52 +203,39 @@ namespace Grid {
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
Field src_o(grid);
Field src_e(grid);
Field sol_o(grid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
////////////////////////////////////////////////
// RedBlack source
////////////////////////////////////////////////
RedBlackSource(_Matrix,in,src_e,src_o);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
////////////////////////////////
// Construct the guess
////////////////////////////////
Field tmp(grid);
guess(src_o,sol_o);
Field guess_save(grid);
guess_save = sol_o;
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
guess(src_o,tmp);
Mtmp = tmp;
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
RedBlackSolve(_Matrix,src_o,sol_o);
////////////////////////////////////////////////
// Fionn A2A boolean behavioural control
if (subGuess) tmp = tmp-Mtmp;
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
////////////////////////////////////////////////
if (subGuess) sol_o= sol_o-guess_save;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
// RedBlack solution needs the even source
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
RedBlackSolution(_Matrix,sol_o,src_e,out);
// Verify the unprec residual
if ( ! subGuess ) {
@@ -389,68 +244,182 @@ namespace Grid {
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
}
}
/////////////////////////////////////////////////////////////
// Override in derived. Not virtual as template methods
/////////////////////////////////////////////////////////////
virtual void RedBlackSource (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) =0;
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) =0;
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) =0;
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)=0;
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoMixed {
private:
LinearFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess)
{
}
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e(grid);
src_e = src_e_c; // Const correctness
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal has Mooee on it.
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) {};
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e_i(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e_i = src_e-tmp; assert( src_e_i.checkerboard ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi = ( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = eta
// => psi = MeeInv phi
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess) {};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
@@ -461,43 +430,44 @@ namespace Grid {
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
}
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
Mtmp = tmp;
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) tmp = tmp-Mtmp;
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field sol_o_i(grid);
Field tmp(grid);
Field sol_e(grid);
////////////////////////////////////////////////
// MooeeInv due to precond
////////////////////////////////////////////////
_Matrix.MooeeInv(sol_o,tmp);
sol_o_i = tmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.checkerboard ==Even);
tmp = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.checkerboard ==Odd );
};
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
}
#endif
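
The three steps that SchurRedBlackBase now factors out (RedBlackSource, RedBlackSolve, RedBlackSolution) are ordinary 2x2 block elimination on the even/odd decomposition. A dense Eigen sketch of the flow, with checkerboards replaced by explicit blocks; the real solvers additionally wrap the odd-odd operator in the normal equations (MpcDag Mpc) or the DiagTwo right preconditioner, which this sketch omits by solving the Schur complement directly:

#include <Eigen/Dense>
#include <iostream>

int main() {
  const int n = 40;                              // sites per checkerboard
  using Mat = Eigen::MatrixXd;
  using Vec = Eigen::VectorXd;
  Mat Mee = 4.0 * Mat::Identity(n, n), Moo = 4.0 * Mat::Identity(n, n);
  Mat Meo = 0.1 * Mat::Random(n, n),   Moe = 0.1 * Mat::Random(n, n);
  Vec eta_e = Vec::Random(n), eta_o = Vec::Random(n);

  // RedBlackSource: src_o = eta_o - Moe Mee^{-1} eta_e
  Vec src_o = eta_o - Moe * Mee.inverse() * eta_e;

  // RedBlackSolve: invert the odd-odd Schur complement Moo - Moe Mee^{-1} Meo
  Mat S = Moo - Moe * Mee.inverse() * Meo;
  Vec sol_o = S.colPivHouseholderQr().solve(src_o);

  // RedBlackSolution: sol_e = Mee^{-1} ( eta_e - Meo sol_o ), then reassemble
  Vec sol_e = Mee.inverse() * (eta_e - Meo * sol_o);

  // Verify the unprec residual on the full system, as the base class does
  Mat M(2 * n, 2 * n);
  M << Mee, Meo,
       Moe, Moo;
  Vec eta(2 * n), sol(2 * n);
  eta << eta_e, eta_o;
  sol << sol_e, sol_o;
  std::cout << "true unprec resid " << (M * sol - eta).norm() / eta.norm() << std::endl;
}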

View File

@@ -50,8 +50,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
assert(0);
}
Grid_quiesce_nodes();
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
@@ -124,10 +122,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ;
// std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
// std::cout << " Parent size "<<Nparent <<std::endl;
int childsize=1;
for(int d=0;d<processors.size();d++) {
@@ -136,8 +132,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
// std::cout << " child size "<<childsize <<std::endl;
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent

View File

@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@@ -455,7 +455,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@@ -499,7 +499,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);

View File

@@ -464,8 +464,10 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
assert(orthog>=0);
for(int d=0;d<nh;d++){
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
// the above should guarantee that the operations are local
@@ -485,7 +487,7 @@
template<class vobj>
void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
typedef typename vobj::scalar_object sobj;
@ -499,8 +501,10 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, const Lattice<vobj> & higherDim,in
assert(orthog>=0);
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
// the above should guarantee that the operations are local
@ -520,7 +524,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, const Lattice<vobj> & higherDim,in
template<class vobj>
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
{
typedef typename vobj::scalar_object sobj;

View File

@ -146,9 +146,11 @@ public:
if ( log.timestamp ) {
log.StopWatch->Stop();
GridTime now = log.StopWatch->Elapsed();
if ( log.timing_mode==1 ) log.StopWatch->Reset();
log.StopWatch->Start();
stream << log.evidence()
<< now << log.background() << " : " ;
}
stream << log.colour();
return stream;

View File

@ -0,0 +1,3 @@
#include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1;

View File

@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key)
///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO {
public:
static int latticeWriteMaxRetry;
/////////////////////////////////////////////////////////////////////////////
// more byte manipulation helpers
@ -370,7 +371,7 @@ PARALLEL_CRITICAL
#endif
} else {
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
@ -582,7 +583,9 @@ PARALLEL_CRITICAL
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites(), offsetCopy = offset;
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@ -597,9 +600,35 @@ PARALLEL_CRITICAL
grid->Barrier();
timer.Stop();
while (attemptsLeft >= 0)
{
grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite)
{
std::vector<fobj> ckiodata(lsites);
uint32_t cknersc_csum, ckscidac_csuma, ckscidac_csumb;
uint64_t ckoffset = offsetCopy;
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
offset = offsetCopy;
}
else
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
break;
}
}
attemptsLeft--;
}
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
}
@ -725,5 +754,6 @@ PARALLEL_CRITICAL
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
}
};
}
#endif
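The retry machinery above is controlled by the BinaryIO::latticeWriteMaxRetry sentinel introduced in the new BinaryIO.cc: -1 (the default) skips the read-back test entirely, 0 writes once and verifies the checksums without re-writing, and n > 0 allows up to n re-writes after a failed verification. A minimal standalone sketch of the same write/verify/retry pattern, assuming only zlib's crc32 (the file handling here is illustrative, not Grid's parallel I/O path):

#include <zlib.h>
#include <cstdint>
#include <cstdio>
#include <vector>

bool writeVerified(const char *file, const std::vector<char> &buf, int maxRetry)
{
    int  attemptsLeft = (maxRetry < 0) ? 0 : maxRetry;
    bool check        = (maxRetry >= 0);
    uint32_t csum     = crc32(0L, reinterpret_cast<const Bytef *>(buf.data()), buf.size());

    while (attemptsLeft >= 0)
    {
        std::FILE *f = std::fopen(file, "wb");
        if (!f) return false;
        std::fwrite(buf.data(), 1, buf.size(), f);
        std::fclose(f);
        if (!check) return true;                   // maxRetry == -1: trust the write

        std::vector<char> ck(buf.size());          // read the data back ...
        f = std::fopen(file, "rb");
        if (!f) return false;
        std::fread(ck.data(), 1, ck.size(), f);
        std::fclose(f);
        uint32_t ckcsum = crc32(0L, reinterpret_cast<const Bytef *>(ck.data()), ck.size());
        if (ckcsum == csum) return true;           // ... and compare checksums
        attemptsLeft--;                            // corrupted write: try again
    }
    return false;
}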

View File

@ -233,7 +233,8 @@ class GridLimeReader : public BinaryIO {
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
/////////////////////////////////////////////
// Insist checksum is next record
/////////////////////////////////////////////

View File

@ -49,21 +49,39 @@ inline double usecond(void) {
typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime;
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
{
stream << time.count()<<" s";
return stream;
}
inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
{
GridSecs second(1);
auto secs = now/second ;
auto subseconds = now%second ;
auto fill = stream.fill();
stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
stream.fill(fill);
return stream;
}
inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
{
GridSecs second(1);
auto seconds = now/second ;
auto subseconds = now%second ;
auto fill = stream.fill();
stream << seconds<<"."<<std::setw(6)<<std::setfill('0')<<subseconds.count()<<" s";
stream.fill(fill);
return stream;
}
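The two new operators print a duration as whole seconds plus a zero-padded fractional remainder, obtained by integer division and modulo against a one-second duration. A standalone check of that split, assuming nothing beyond standard <chrono> semantics:

#include <chrono>
#include <iomanip>
#include <iostream>

int main(void)
{
    std::chrono::microseconds now(3140015);  // 3.140015 s
    std::chrono::seconds      second(1);
    auto secs       = now / second;          // 3 (dimensionless count)
    auto subseconds = now % second;          // 140015 us remainder
    std::cout << secs << "." << std::setw(6) << std::setfill('0')
              << subseconds.count() << " s" << std::endl;  // prints "3.140015 s"
}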
class GridStopWatch {
private:
bool running;

View File

@ -44,12 +44,15 @@ namespace QCD {
struct WilsonImplParams {
bool overlapCommsCompute;
std::vector<Real> twist_n_2pi_L;
std::vector<Complex> boundary_phases;
WilsonImplParams() : overlapCommsCompute(false) {
boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0);
};
WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
}
};
struct StaggeredImplParams {

View File

@ -64,11 +64,6 @@ namespace Grid {
virtual RealD M (const FermionField &in, FermionField &out)=0;
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
// Query the even even properties to make algorithmic decisions
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
virtual int isTrivialEE(void) { return 0; };
virtual RealD Mass(void) {return 0.0;};
// half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out)=0;
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;

View File

@ -240,16 +240,30 @@ namespace QCD {
GaugeLinkField tmp(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
////////////////////////////////////////////////////
// apply any boundary phase or twists
////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
////////// boundary phase /////////////
auto pha = Params.boundary_phases[mu];
scalar_type phase( real(pha),imag(pha) );
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu);
// apply any twists
RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
if ( theta != 0.0) {
scalar_type twphase(::cos(theta),::sin(theta));
U = twphase*U;
std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
}
tmp = where(coor == Lmu, phase * U, U);
PokeIndex<LorentzIndex>(Uds, tmp, mu);
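Each double-stored link therefore carries a uniform twist phase exp(i*theta), theta = 2*pi*n_mu/L_mu, on every site, while the boundary phase only multiplies links crossing the last slice in direction mu. A scalar sketch of the combined phases (the twist value, extent and antiperiodic boundary are illustrative):

#include <cmath>
#include <complex>
#include <iostream>

int main(void)
{
    const double n_mu  = 0.5;                // twist_n_2pi_L[mu]
    const int    L     = 16;                 // global extent in direction mu
    const double theta = n_mu * 2. * M_PI / L;
    std::complex<double> twphase(std::cos(theta), std::sin(theta));
    std::complex<double> boundary(-1., 0.);  // boundary_phases[mu], antiperiodic
    std::complex<double> link(1., 0.);       // stand-in for U_mu(x)

    link *= twphase;                         // applied on every site
    bool lastSlice = true;                   // i.e. coor == L-1 in direction mu
    if (lastSlice) link *= boundary;         // applied on the boundary only
    std::cout << link << std::endl;
}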

View File

@ -61,9 +61,9 @@ Group & Hdf5Writer::getGroup(void)
}
// Reader implementation ///////////////////////////////////////////////////////
Hdf5Reader::Hdf5Reader(const std::string &fileName, const bool readOnly)
: fileName_(fileName)
, file_(fileName.c_str(), readOnly ? H5F_ACC_RDONLY : H5F_ACC_RDWR)
{
group_ = file_.openGroup("/");
readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",

View File

@ -54,7 +54,7 @@ namespace Grid
class Hdf5Reader: public Reader<Hdf5Reader>
{
public:
Hdf5Reader(const std::string &fileName, const bool readOnly = true);
virtual ~Hdf5Reader(void) = default;
bool push(const std::string &s);
void pop(void);
@ -123,9 +123,12 @@ namespace Grid
if (flatx.size() > dataSetThres_)
{
H5NS::DataSet dataSet;
H5NS::DSetCreatPropList plist;
plist.setChunk(dim.size(), dim.data());
plist.setFletcher32();
dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace, plist);
dataSet.write(flatx.data(), Hdf5Type<Element>::type());
}
else
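With the Fletcher32 filter attached to the (now chunked) dataset, HDF5 stores a checksum per chunk and verifies it on every read, so silent corruption surfaces as a read error instead of bad data. A minimal sketch of the same property-list setup against the plain HDF5 C++ API (file and dataset names are illustrative):

#include <H5Cpp.h>
#include <vector>

int main(void)
{
    std::vector<double> x(1024, 1.);
    hsize_t dim[1] = {x.size()};

    H5::H5File            file("example.h5", H5F_ACC_TRUNC);
    H5::DataSpace         dataspace(1, dim);
    H5::DSetCreatPropList plist;
    plist.setChunk(1, dim);   // the checksum filter requires a chunked layout
    plist.setFletcher32();    // per-chunk checksum, verified on read
    H5::DataSet dataset = file.createDataSet("data", H5::PredType::NATIVE_DOUBLE,
                                             dataspace, plist);
    dataset.write(x.data(), H5::PredType::NATIVE_DOUBLE);
}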

View File

@ -47,6 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#else
#define PARALLEL_FOR_LOOP
#define PARALLEL_FOR_LOOP_INTERN
#define PARALLEL_FOR_LOOP_REDUCE(op, var)
#define PARALLEL_NESTED_LOOP2
#define PARALLEL_NESTED_LOOP5
#define PARALLEL_REGION
@ -58,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
#define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
#define parallel_critical PARALLEL_CRITICAL
namespace Grid {

View File

@ -28,16 +28,31 @@
extern "C" {
#include <openssl/sha.h>
}
#ifdef USE_IPP
#include "ipp.h"
#endif
#pragma once
class GridChecksum
{
public:
static inline uint32_t crc32(void *data,size_t bytes)
static inline uint32_t crc32(const void *data, size_t bytes)
{
return ::crc32(0L,(unsigned char *)data,bytes);
}
#ifdef USE_IPP
static inline uint32_t crc32c(const void* data, size_t bytes)
{
uint32_t crc32c = ~(uint32_t)0;
ippsCRC32C_8u(reinterpret_cast<const unsigned char *>(data), bytes, &crc32c);
ippsSwapBytes_32u_I(&crc32c, 1);
return ~crc32c;
}
#endif
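When built with IPP support (USE_IPP), GridChecksum thus offers a hardware-friendly CRC32C next to the zlib CRC32. The two use different polynomials, so their digests are not interchangeable; only the speed differs. A usage sketch (assuming this header is included):

#include <cstdint>
#include <iostream>
#include <vector>

void hashBuffer(void)
{
    std::vector<char> buf(4096, 'x');
    uint32_t c = GridChecksum::crc32(buf.data(), buf.size());
    std::cout << std::hex << c << std::dec << std::endl;
#ifdef USE_IPP
    // Castagnoli polynomial, SSE4.2-accelerated through IPP
    uint32_t cc = GridChecksum::crc32c(buf.data(), buf.size());
    std::cout << std::hex << cc << std::dec << std::endl;
#endif
}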
template <typename T>
static inline std::string sha256_string(const std::vector<T> &hash)
{

View File

@ -32,11 +32,19 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Hadrons/Global.hpp>
#include <Hadrons/TimerArray.hpp>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
#ifdef USE_MKL
#include "mkl.h"
#include "mkl_cblas.h"
#endif
#ifndef HADRONS_A2AM_NAME
#define HADRONS_A2AM_NAME "a2aMatrix"
#endif
#ifndef HADRONS_A2AM_IO_TYPE
#define HADRONS_A2AM_IO_TYPE ComplexF
#endif
#define HADRONS_A2AM_PARALLEL_IO
BEGIN_HADRONS_NAMESPACE
@ -51,6 +59,12 @@ BEGIN_HADRONS_NAMESPACE
template <typename T>
using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
template <typename T>
using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
template <typename T>
using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
/******************************************************************************
* Abstract class for A2A kernels *
******************************************************************************/
@ -76,10 +90,15 @@ public:
// constructors
A2AMatrixIo(void) = default;
A2AMatrixIo(std::string filename, std::string dataname,
const unsigned int nt, const unsigned int ni = 0,
const unsigned int nj = 0);
// destructor
~A2AMatrixIo(void) = default;
// access
unsigned int getNi(void) const;
unsigned int getNj(void) const;
unsigned int getNt(void) const;
size_t getSize(void) const;
// file allocation
template <typename MetadataType>
void initFile(const MetadataType &d, const unsigned int chunkSize);
@ -88,9 +107,11 @@ public:
const unsigned int blockSizei, const unsigned int blockSizej);
void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
const unsigned int i, const unsigned int j);
template <template <class> class Vec, typename VecT>
void load(Vec<VecT> &v, double *tRead = nullptr);
private:
std::string filename_{""}, dataname_{""};
unsigned int nt_{0}, ni_{0}, nj_{0};
};
/******************************************************************************
@ -136,6 +157,226 @@ private:
std::vector<IoHelper> nodeIo_;
};
/******************************************************************************
* A2A matrix contraction kernels *
******************************************************************************/
class A2AContraction
{
public:
// accTrMul(acc, a, b): acc += tr(a*b)
template <typename C, typename MatLeft, typename MatRight>
static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
{
if ((MatLeft::Options == Eigen::RowMajor) and
(MatRight::Options == Eigen::ColMajor))
{
parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{
C tmp;
#ifdef USE_MKL
dotuRow(tmp, r, a, b);
#else
tmp = a.row(r).conjugate().dot(b.col(r));
#endif
parallel_critical
{
acc += tmp;
}
}
}
else
{
parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{
C tmp;
#ifdef USE_MKL
dotuCol(tmp, c, a, b);
#else
tmp = a.col(c).conjugate().dot(b.row(c));
#endif
parallel_critical
{
acc += tmp;
}
}
}
}
template <typename MatLeft, typename MatRight>
static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
{
double n = a.rows()*a.cols();
return 8.*n;
}
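accTrMul exploits tr(a*b) = sum_r a(r,:).b(:,r): the trace is accumulated one row (or column) at a time without ever forming the product matrix, and the branch simply picks the loop order that walks both matrices contiguously. The conjugate() undoes the implicit conjugation in Eigen's dot(), leaving the plain bilinear sum. A standalone check of the identity in plain Eigen (a sketch, not the kernel itself):

#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main(void)
{
    using Mat = Eigen::Matrix<std::complex<double>, Eigen::Dynamic,
                              Eigen::Dynamic, Eigen::RowMajor>;
    Mat a = Mat::Random(8, 8), b = Mat::Random(8, 8);
    std::complex<double> acc = 0.;
    for (Eigen::Index r = 0; r < a.rows(); ++r)
    {
        // dot() conjugates its left argument, so conjugate first to get
        // the plain sum_k a(r,k)*b(k,r), as in accTrMul
        acc += a.row(r).conjugate().dot(b.col(r));
    }
    std::cout << acc << " == " << (a*b).trace() << std::endl;
}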
// mul(res, a, b): res = a*b
#ifdef USE_MKL
template <template <class, int...> class Mat, int... Opts>
static inline void mul(Mat<ComplexD, Opts...> &res,
const Mat<ComplexD, Opts...> &a,
const Mat<ComplexD, Opts...> &b)
{
static const ComplexD one(1., 0.), zero(0., 0.);
if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
{
res.resize(a.rows(), b.cols());
}
if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
{
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
}
else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
{
cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
}
}
template <template <class, int...> class Mat, int... Opts>
static inline void mul(Mat<ComplexF, Opts...> &res,
const Mat<ComplexF, Opts...> &a,
const Mat<ComplexF, Opts...> &b)
{
static const ComplexF one(1., 0.), zero(0., 0.);
if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
{
res.resize(a.rows(), b.cols());
}
if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
{
cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
}
else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
{
cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
}
}
#else
template <typename Mat>
static inline void mul(Mat &res, const Mat &a, const Mat &b)
{
res = a*b;
}
#endif
template <typename Mat>
static inline double mulFlops(const Mat &a, const Mat &b)
{
double nr = a.rows(), nc = a.cols();
return nr*nr*(6.*nc + 2.*(nc - 1.));
}
private:
template <typename C, typename MatLeft, typename MatRight>
static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt,
unsigned int &bInc, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aRow*a.cols();
aInc = 1;
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aRow;
aInc = a.rows();
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aRow;
bInc = b.cols();
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aRow*b.rows();
bInc = 1;
}
}
#ifdef USE_MKL
template <typename C, typename MatLeft, typename MatRight>
static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt,
unsigned int &bInc, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aCol;
aInc = a.cols();
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aCol*a.rows();
aInc = 1;
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aCol*b.cols();
bInc = 1;
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aCol;
bInc = b.rows();
}
}
template <typename MatLeft, typename MatRight>
static inline void dotuRow(ComplexF &res, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
const ComplexF *aPt, *bPt;
unsigned int aInc, bInc;
makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void dotuCol(ComplexF &res, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const ComplexF *aPt, *bPt;
unsigned int aInc, bInc;
makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void dotuRow(ComplexD &res, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void dotuCol(ComplexD &res, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
}
#endif
};
/******************************************************************************
* A2AMatrixIo template implementation *
******************************************************************************/
@ -148,6 +389,31 @@ A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname,
, nt_(nt), ni_(ni), nj_(nj)
{}
// access //////////////////////////////////////////////////////////////////////
template <typename T>
unsigned int A2AMatrixIo<T>::getNt(void) const
{
return nt_;
}
template <typename T>
unsigned int A2AMatrixIo<T>::getNi(void) const
{
return ni_;
}
template <typename T>
unsigned int A2AMatrixIo<T>::getNj(void) const
{
return nj_;
}
template <typename T>
size_t A2AMatrixIo<T>::getSize(void) const
{
return nt_*ni_*nj_*sizeof(T);
}
// file allocation /////////////////////////////////////////////////////////////
template <typename T>
template <typename MetadataType>
@ -171,11 +437,12 @@ void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSiz
}
// create the dataset
Hdf5Reader reader(filename_, false);
push(reader, dataname_);
auto &group = reader.getGroup();
plist.setChunk(chunk.size(), chunk.data());
plist.setFletcher32();
dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
#else
HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
@ -191,7 +458,7 @@ void A2AMatrixIo<T>::saveBlock(const T *data,
const unsigned int blockSizej)
{
#ifdef HAVE_HDF5
Hdf5Reader reader(filename_, false);
std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
offset = {0, static_cast<hsize_t>(i),
static_cast<hsize_t>(j)},
@ -226,6 +493,80 @@ void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
}
template <typename T>
template <template <class> class Vec, typename VecT>
void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
{
#ifdef HAVE_HDF5
Hdf5Reader reader(filename_);
std::vector<hsize_t> hdim;
H5NS::DataSet dataset;
H5NS::DataSpace dataspace;
H5NS::CompType datatype;
push(reader, dataname_);
auto &group = reader.getGroup();
dataset = group.openDataSet(HADRONS_A2AM_NAME);
datatype = dataset.getCompType();
dataspace = dataset.getSpace();
hdim.resize(dataspace.getSimpleExtentNdims());
dataspace.getSimpleExtentDims(hdim.data());
if ((nt_*ni_*nj_ != 0) and
((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
{
HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+ std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+ std::to_string(hdim[2]) + ", expected "
+ std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+ std::to_string(nj_) + ")");
}
else if (ni_*nj_ == 0)
{
if (hdim[0] != nt_)
{
HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+ std::to_string(hdim[0]) + ", expected "
+ std::to_string(nt_) + ")");
}
ni_ = hdim[1];
nj_ = hdim[2];
}
A2AMatrix<T> buf(ni_, nj_);
std::vector<hsize_t> count = {1, static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)},
stride = {1, 1, 1},
block = {1, 1, 1},
memCount = {static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)};
H5NS::DataSpace memspace(memCount.size(), memCount.data());
std::cout << "Loading timeslice";
std::cout.flush();
if (tRead) *tRead = 0.;
for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
{
unsigned int t = tp1 - 1;
std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
if (t % 10 == 0)
{
std::cout << " " << t;
std::cout.flush();
}
dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
stride.data(), block.data());
if (tRead) *tRead -= usecond();
dataset.read(buf.data(), datatype, memspace, dataspace);
if (tRead) *tRead += usecond();
v[t] = buf.template cast<VecT>();
}
std::cout << std::endl;
#else
HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
#endif
}
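Note that leaving ni and nj at their default of 0 makes load() adopt the i/j dimensions found in the dataset, while non-zero sizes are enforced against the file. A hypothetical use with a disk-backed vector (file name, cache size and the surrounding variables are placeholders, not fixed by the code above):

// sketch only: nt is known, ni/nj are taken from the file
A2AMatrixIo<HADRONS_A2AM_IO_TYPE> io("mesons/pt_ll.1500.h5", "a2aMatrix", nt);
EigenDiskVector<ComplexD>         mat("./dvCache", nt, cacheSize);
double tRead;

io.load(mat, &tRead);   // nt checked against the dataset, read time in tRead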
/******************************************************************************
* A2AMatrixBlockComputation template implementation *
******************************************************************************/

View File

@ -36,7 +36,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Class to generate V & W all-to-all vectors *
******************************************************************************/
template <typename FImpl>
class A2AVectorsSchurDiagTwo
@ -70,6 +70,42 @@ private:
SchurDiagTwoOperator<FMat, FermionField> op_;
};
/******************************************************************************
* Methods for V & W all-to-all vectors I/O *
******************************************************************************/
class A2AVectorsIo
{
public:
struct Record: Serializable
{
GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
unsigned int, index);
Record(void): index(0) {}
};
public:
template <typename Field>
static void write(const std::string fileStem, std::vector<Field> &vec,
const bool multiFile, const int trajectory = -1);
template <typename Field>
static void read(std::vector<Field> &vec, const std::string fileStem,
const bool multiFile, const int trajectory = -1);
private:
static inline std::string vecFilename(const std::string stem, const int traj,
const bool multiFile)
{
std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
if (multiFile)
{
return stem + t;
}
else
{
return stem + t + ".bin";
}
}
};
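The helper encodes one layout per mode, with the trajectory tag optional: with an illustrative stem "a2a/v" and trajectory 1500, the two modes produce

a2a/v.1500.bin                                   (multiFile = false)
a2a/v.1500/elem0.bin, a2a/v.1500/elem1.bin, ...  (multiFile = true)

where in the multi-file case the stem names a directory holding one record per vector, as the write() implementation below shows.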
/******************************************************************************
* A2AVectorsSchurDiagTwo template implementation *
******************************************************************************/
@ -217,6 +253,90 @@ void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d,
}
}
/******************************************************************************
* all-to-all vectors I/O template implementation *
******************************************************************************/
template <typename Field>
void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec,
const bool multiFile, const int trajectory)
{
Record record;
GridBase *grid = vec[0]._grid;
ScidacWriter binWriter(grid->IsBoss());
std::string filename = vecFilename(fileStem, trajectory, multiFile);
if (multiFile)
{
std::string fullFilename;
for (unsigned int i = 0; i < vec.size(); ++i)
{
fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
LOG(Message) << "Writing vector " << i << std::endl;
makeFileDir(fullFilename, grid);
binWriter.open(fullFilename);
record.index = i;
binWriter.writeScidacFieldRecord(vec[i], record);
binWriter.close();
}
}
else
{
makeFileDir(filename, grid);
binWriter.open(filename);
for (unsigned int i = 0; i < vec.size(); ++i)
{
LOG(Message) << "Writing vector " << i << std::endl;
record.index = i;
binWriter.writeScidacFieldRecord(vec[i], record);
}
binWriter.close();
}
}
template <typename Field>
void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem,
const bool multiFile, const int trajectory)
{
Record record;
ScidacReader binReader;
std::string filename = vecFilename(fileStem, trajectory, multiFile);
if (multiFile)
{
std::string fullFilename;
for (unsigned int i = 0; i < vec.size(); ++i)
{
fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
LOG(Message) << "Reading vector " << i << std::endl;
binReader.open(fullFilename);
binReader.readScidacFieldRecord(vec[i], record);
binReader.close();
if (record.index != i)
{
HADRONS_ERROR(Io, "vector index mismatch");
}
}
}
else
{
binReader.open(filename);
for (unsigned int i = 0; i < vec.size(); ++i)
{
LOG(Message) << "Reading vector " << i << std::endl;
binReader.readScidacFieldRecord(vec[i], record);
if (record.index != i)
{
HADRONS_ERROR(Io, "vector index mismatch");
}
}
binReader.close();
}
}
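A hypothetical round trip with these helpers (a sketch: grid, Nl and traj come from the surrounding application, and FermionField stands for any Grid lattice fermion type):

std::vector<FermionField> vec(Nl, FermionField(grid));
// ... fill vec, e.g. with solver output ...
A2AVectorsIo::write("a2a/v", vec, false, traj);  // single file a2a/v.<traj>.bin
A2AVectorsIo::read (vec, "a2a/v", false, traj);  // fails on index mismatch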
END_HADRONS_NAMESPACE
#endif // A2A_Vectors_hpp_

View File

@ -108,6 +108,9 @@ void Application::run(void)
HADRONS_ERROR(Definition, "run id is empty");
}
LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
LOG(Message) << "Attempt(s) for resilient parallel I/O: "
<< BinaryIO::latticeWriteMaxRetry << std::endl;
vm().setRunId(getPar().runId);
vm().printContent();
env().printContent();

View File

@ -56,7 +56,9 @@ public:
TrajRange, trajCounter,
VirtualMachine::GeneticPar, genetic,
std::string, runId,
std::string, graphFile,
int, parallelWriteMaxRetry);
GlobalPar(void): parallelWriteMaxRetry{-1} {}
};
public:
// constructors

View File

@ -7,6 +7,7 @@ Source file: Hadrons/DilutedNoise.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -76,6 +77,22 @@ private:
unsigned int nt_;
};
template <typename FImpl>
class FullVolumeSpinColorDiagonalNoise: public DilutedNoise<FImpl>
{
public:
typedef typename FImpl::FermionField FermionField;
public:
// constructor/destructor
FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src);
virtual ~FullVolumeSpinColorDiagonalNoise(void) = default;
// generate noise
virtual void generateNoise(GridParallelRNG &rng);
private:
unsigned int nSrc_;
};
/******************************************************************************
* DilutedNoise template implementation *
******************************************************************************/
@ -186,6 +203,47 @@ void TimeDilutedSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rn
}
}
/******************************************************************************
* FullVolumeSpinColorDiagonalNoise template implementation *
******************************************************************************/
template <typename FImpl>
FullVolumeSpinColorDiagonalNoise<FImpl>::
FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc)
: DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc)
{}
template <typename FImpl>
void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
{
typedef decltype(peekColour((*this)[0], 0)) SpinField;
auto &noise = *this;
auto g = this->getGrid();
auto nd = g->GlobalDimensions().size();
auto nc = FImpl::Dimension;
Complex shift(1., 1.);
LatticeComplex eta(g);
SpinField etas(g);
unsigned int i = 0;
bernoulli(rng, eta);
eta = (2.*eta - shift)*(1./::sqrt(2.));
for (unsigned int n = 0; n < nSrc_; ++n)
{
for (unsigned int s = 0; s < Ns; ++s)
{
etas = zero;
pokeSpin(etas, eta, s);
for (unsigned int c = 0; c < nc; ++c)
{
noise[i] = zero;
pokeColour(noise[i], etas, c);
i++;
}
}
}
}
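bernoulli fills each real component of eta with 0 or 1, so the map eta -> (2*eta - (1+i))/sqrt(2) sends every site to one of the four unit-modulus values (+-1 +-i)/sqrt(2); the loop then dilutes that single full-volume field exactly in spin and colour. A quick standalone check of the map:

#include <cmath>
#include <complex>
#include <iostream>

int main(void)
{
    const std::complex<double> shift(1., 1.);
    for (double re : {0., 1.})
    for (double im : {0., 1.})
    {
        std::complex<double> eta(re, im);
        eta = (2.*eta - shift) * (1./std::sqrt(2.));
        std::cout << eta << "  |eta| = " << std::abs(eta) << std::endl;  // always 1
    }
}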
END_HADRONS_NAMESPACE
#endif // Hadrons_DilutedNoise_hpp_

View File

@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#define Hadrons_DiskVector_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/A2AMatrix.hpp>
#include <deque>
#include <sys/stat.h>
#include <ftw.h>
@ -59,14 +60,18 @@ public:
: master_(master), cmaster_(master), i_(i) {}
// operator=: somebody is trying to store a vector element
// write to cache and tag as modified
T &operator=(const T &obj) const
{
auto &cache = *master_.cachePtr_;
auto &modified = *master_.modifiedPtr_;
auto &index = *master_.indexPtr_;
DV_DEBUG_MSG(&master_, "writing to " << i_);
master_.cacheInsert(i_, obj);
modified[index.at(i_)] = true;
return cache[index.at(i_)];
}
// implicit cast to const object reference and redirection
@ -83,6 +88,7 @@ public:
public:
DiskVectorBase(const std::string dirname, const unsigned int size = 0,
const unsigned int cacheSize = 1, const bool clean = true);
DiskVectorBase(DiskVectorBase<T> &&v) = default;
virtual ~DiskVectorBase(void);
const T & operator[](const unsigned int i) const;
RwAccessHelper operator[](const unsigned int i);
@ -97,14 +103,17 @@ private:
void cacheInsert(const unsigned int i, const T &obj) const;
void clean(void);
private:
std::string dirname_;
unsigned int size_, cacheSize_;
double access_{0.}, hit_{0.};
bool clean_;
// using pointers to allow modifications when class is const
// semantic: const means data unmodified, but cache modification allowed
std::unique_ptr<std::vector<T>> cachePtr_;
std::unique_ptr<std::vector<bool>> modifiedPtr_;
std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
std::unique_ptr<std::stack<unsigned int>> freePtr_;
std::unique_ptr<std::deque<unsigned int>> loadsPtr_;
};
/******************************************************************************
@ -135,7 +144,7 @@ private:
* Specialisation for Eigen matrices *
******************************************************************************/
template <typename T>
using EigenDiskVectorMat = A2AMatrix<T>;
template <typename T>
class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
@ -152,24 +161,31 @@ public:
private:
virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
{
std::ifstream f(filename, std::ios::binary);
uint32_t crc, check;
Eigen::Index nRow, nCol;
size_t matSize;
double tRead, tHash;
f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
obj.resize(nRow, nCol);
matSize = nRow*nCol*sizeof(T);
tRead = -usecond();
f.read(reinterpret_cast<char *>(obj.data()), matSize);
tRead += usecond();
tHash = -usecond();
#ifdef USE_IPP
check = GridChecksum::crc32c(obj.data(), matSize);
#else
check = GridChecksum::crc32(obj.data(), matSize);
#endif
tHash += usecond();
DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec
<< " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
if (crc != check)
{
HADRONS_ERROR(Io, "checksum failed")
}
@ -177,24 +193,31 @@ private:
virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
{
std::ofstream f(filename, std::ios::binary);
uint32_t crc;
Eigen::Index nRow, nCol;
size_t matSize;
double tWrite, tHash;
nRow = obj.rows();
nCol = obj.cols();
matSize = nRow*nCol*sizeof(T);
tHash = -usecond();
#ifdef USE_IPP
crc = GridChecksum::crc32c(obj.data(), matSize);
#else
crc = GridChecksum::crc32(obj.data(), matSize);
#endif
tHash += usecond();
f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
tWrite = -usecond();
f.write(reinterpret_cast<const char *>(obj.data()), matSize);
tWrite += usecond();
DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
<< " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
}
};
@ -207,7 +230,10 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
const unsigned int cacheSize,
const bool clean)
: dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
, cachePtr_(new std::vector<T>(size))
, modifiedPtr_(new std::vector<bool>(size, false))
, indexPtr_(new std::map<unsigned int, unsigned int>())
, freePtr_(new std::stack<unsigned int>)
, loadsPtr_(new std::deque<unsigned int>())
{
struct stat s;
@ -217,6 +243,10 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
}
mkdir(dirname);
for (unsigned int i = 0; i < cacheSize_; ++i)
{
freePtr_->push(i);
}
}
template <typename T>
@ -231,8 +261,10 @@ DiskVectorBase<T>::~DiskVectorBase(void)
template <typename T>
const T & DiskVectorBase<T>::operator[](const unsigned int i) const
{
auto &cache = *cachePtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
DV_DEBUG_MSG(this, "accessing " << i << " (RO)");
@ -241,7 +273,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
HADRONS_ERROR(Size, "index out of range");
}
const_cast<double &>(access_)++;
if (index.find(i) == index.end())
{
// cache miss
DV_DEBUG_MSG(this, "cache miss");
@ -268,7 +300,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
DV_DEBUG_MSG(this, "in cache: " << msg);
#endif
return cache[index.at(i)];
}
template <typename T>
@ -306,13 +338,24 @@ std::string DiskVectorBase<T>::filename(const unsigned int i) const
template <typename T>
void DiskVectorBase<T>::evict(void) const
{
auto &cache = *cachePtr_;
auto &modified = *modifiedPtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
if (index.size() >= cacheSize_)
{
unsigned int i = loads.front();
DV_DEBUG_MSG(this, "evicting " << i);
if (modified[index.at(i)])
{
DV_DEBUG_MSG(this, "element " << i << " modified, saving to disk");
save(filename(i), cache[index.at(i)]);
}
freeInd.push(index.at(i));
index.erase(i);
loads.pop_front();
}
}
@ -320,30 +363,44 @@ void DiskVectorBase<T>::evict(void) const
template <typename T>
void DiskVectorBase<T>::fetch(const unsigned int i) const
{
auto &cache = *cachePtr_;
auto &modified = *modifiedPtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
struct stat s;
DV_DEBUG_MSG(this, "loading " << i << " from disk");
evict();
if(stat(filename(i).c_str(), &s) != 0)
{
HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
}
index[i] = freeInd.top();
freeInd.pop();
load(cache[index.at(i)], filename(i));
loads.push_back(i);
modified[index.at(i)] = false;
}
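The cache is now a fixed pool of cacheSize slots: indexPtr_ maps element numbers to slots, freePtr_ stacks the unused slots, loadsPtr_ records access order for FIFO eviction, and modifiedPtr_ flags dirty slots so that a stored element only reaches disk when its slot is evicted. A compact standalone sketch of this write-back bookkeeping (the disk backends are stubs, and unlike the Hadrons class it marks every access dirty rather than only assignments):

#include <deque>
#include <map>
#include <stack>
#include <vector>

template <typename T>
class WriteBackCache
{
public:
    explicit WriteBackCache(unsigned int nSlots)
    : cache_(nSlots), modified_(nSlots, false)
    {
        for (unsigned int s = 0; s < nSlots; ++s) free_.push(s);
    }
    T &ref(const unsigned int i)  // load on miss, mark dirty on access
    {
        if (index_.find(i) == index_.end())
        {
            evictIfFull();
            index_[i] = free_.top(); free_.pop();
            load(i, cache_[index_[i]]);            // disk read stub
            loads_.push_back(i);
        }
        modified_[index_[i]] = true;
        return cache_[index_[i]];
    }
private:
    void evictIfFull(void)
    {
        if (index_.size() < cache_.size()) return;
        unsigned int i = loads_.front(); loads_.pop_front();
        if (modified_[index_[i]]) save(i, cache_[index_[i]]);  // write back once
        free_.push(index_[i]); index_.erase(i);
    }
    void load(const unsigned int, T &) {}          // stand-in for the file read
    void save(const unsigned int, const T &) {}    // stand-in for the file write

    std::vector<T>                       cache_;
    std::vector<bool>                    modified_;
    std::map<unsigned int, unsigned int> index_;
    std::stack<unsigned int>             free_;
    std::deque<unsigned int>             loads_;
};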
template <typename T>
void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
{
auto &cache = *cachePtr_;
auto &modified = *modifiedPtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
evict();
index[i] = freeInd.top();
freeInd.pop();
cache[index.at(i)] = obj;
loads.push_back(i);
modified[index.at(i)] = false;
#ifdef DV_DEBUG
std::string msg;

View File

@ -166,7 +166,13 @@ std::string Hadrons::dirname(const std::string &s)
void Hadrons::makeFileDir(const std::string filename, GridBase *g)
{
bool doIt = true;
if (g)
{
doIt = g->IsBoss();
}
if (doIt)
{
std::string dir = dirname(filename);
int status = mkdir(dir);

View File

@ -32,6 +32,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <set>
#include <stack>
#include <regex>
#include <Grid/Grid.h>
#include <cxxabi.h>
@ -217,15 +218,15 @@ typedef XmlReader ResultReader;
typedef XmlWriter ResultWriter;
#endif
#define RESULT_FILE_NAME(name, traj) \
name + "." + std::to_string(traj) + "." + resultFileExt
// recursive mkdir
#define MAX_PATH_LENGTH 512u
int mkdir(const std::string dirName);
std::string basename(const std::string &s);
std::string dirname(const std::string &s);
void makeFileDir(const std::string filename, GridBase *g = nullptr);
// default Schur convention
#ifndef HADRONS_DEFAULT_SCHUR
@ -248,6 +249,20 @@ void makeFileDir(const std::string filename, GridBase *g);
// pretty print time profile
void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
// token replacement utility
template <typename T>
void tokenReplace(std::string &str, const std::string token,
const T &x, const std::string mark = "@")
{
std::string fullToken = mark + token + mark;
auto pos = str.find(fullToken);
if (pos != std::string::npos)
{
str.replace(pos, fullToken.size(), std::to_string(x));
}
}
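tokenReplace substitutes only the first occurrence of the @token@ pattern, which is all trajectory-tagged file stems need. A usage sketch, mirroring the template above on an illustrative output stem:

#include <iostream>
#include <string>

// standalone mirror of tokenReplace, for illustration
template <typename T>
void tokenReplace(std::string &str, const std::string token,
                  const T &x, const std::string mark = "@")
{
    std::string fullToken = mark + token + mark;
    auto pos = str.find(fullToken);
    if (pos != std::string::npos)
    {
        str.replace(pos, fullToken.size(), std::to_string(x));
    }
}

int main(void)
{
    std::string stem = "mesons/pt_ll.@traj@.h5";
    tokenReplace(stem, "traj", 1500);
    std::cout << stem << std::endl;  // mesons/pt_ll.1500.h5
}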
END_HADRONS_NAMESPACE
#include <Hadrons/Exceptions.hpp>

View File

@ -5,17 +5,17 @@ lib_LIBRARIES = libHadrons.a
include modules.inc
libHadrons_a_SOURCES = \
Application.cc \
Environment.cc \
Exceptions.cc \
Global.cc \
Module.cc \
TimerArray.cc \
VirtualMachine.cc \
$(modules_cc)
libHadrons_adir = $(includedir)/Hadrons
nobase_libHadrons_a_HEADERS = \
A2AVectors.hpp \
A2AMatrix.hpp \
Application.hpp \
@ -33,4 +33,5 @@ nobase_libHadrons_a_HEADERS = \
ModuleFactory.hpp \
Solver.hpp \
TimerArray.hpp \
VirtualMachine.hpp \
$(modules_hpp)

View File

@ -144,7 +144,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
{\
makeFileDir(ioStem, env().getGrid());\
{\
ResultWriter _writer(RESULT_FILE_NAME(ioStem, vm().getTrajectory()));\
write(_writer, name, result);\
}\
}

View File

@ -24,14 +24,17 @@
#include <Hadrons/Modules/MSolver/Guesser.hpp>
#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
#include <Hadrons/Modules/MGauge/UnitEm.hpp>
#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
#include <Hadrons/Modules/MGauge/Unit.hpp>
#include <Hadrons/Modules/MGauge/Electrify.hpp>
#include <Hadrons/Modules/MGauge/Random.hpp>
#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
#include <Hadrons/Modules/MGauge/StochEm.hpp>
#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
#include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
@ -65,6 +68,7 @@
#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
#include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
#include <Hadrons/Modules/MIO/LoadNersc.hpp>
#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
#include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
#include <Hadrons/Modules/MIO/LoadBinary.hpp>

View File

@ -32,4 +32,6 @@ using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TDWF<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
#endif

View File

@ -49,7 +49,8 @@ public:
unsigned int, Ls,
double , mass,
double , M5,
std::string , boundary,
std::string , twist);
};
template <typename FImpl>
@ -73,7 +74,9 @@ protected:
};
MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);
#endif
/******************************************************************************
* DWF template implementation *
@ -117,8 +120,9 @@ void TDWF<FImpl>::setup(void)
auto &grb4 = *envGetRbGrid(FermionField);
auto &g5 = *envGetGrid(FermionField, par().Ls);
auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
typename DomainWallFermion<FImpl>::ImplParams implParams;
implParams.boundary_phases = strToVec<Complex>(par().boundary);
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, implParams);
}

View File

@ -32,4 +32,6 @@ using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
#endif

View File

@ -49,7 +49,8 @@ public:
double , M5,
double , b,
double , c,
std::string , boundary,
std::string , twist);
};
template <typename FImpl>
@ -72,7 +73,9 @@ public:
};
MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
#endif
/******************************************************************************
* TMobiusDWF implementation *
@ -117,8 +120,9 @@ void TMobiusDWF<FImpl>::setup(void)
auto &grb4 = *envGetRbGrid(FermionField);
auto &g5 = *envGetGrid(FermionField, par().Ls);
auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
typename MobiusFermion<FImpl>::ImplParams implParams;
implParams.boundary_phases = strToVec<Complex>(par().boundary);
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, par().b, par().c,
implParams);

View File

@ -32,4 +32,6 @@ using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
#endif

View File

@ -48,7 +48,8 @@ public:
double , mass,
double , M5,
double , scale,
std::string , boundary,
std::string , twist);
};
template <typename FImpl>
@ -71,7 +72,9 @@ public:
};
MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
#endif
/******************************************************************************
* TScaledDWF implementation *
@ -116,8 +119,9 @@ void TScaledDWF<FImpl>::setup(void)
auto &grb4 = *envGetRbGrid(FermionField);
auto &g5 = *envGetGrid(FermionField, par().Ls);
auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
typename ScaledShamirFermion<FImpl>::ImplParams implParams;
implParams.boundary_phases = strToVec<Complex>(par().boundary);
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, par().scale,
implParams);

View File

@ -32,4 +32,6 @@ using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TWilson<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
#endif

View File

@ -47,7 +47,9 @@ public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
std::string, gauge,
double , mass,
std::string, boundary,
std::string, twist);
};
template <typename FImpl>
@ -71,7 +73,9 @@ protected:
};
MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);
#endif
/******************************************************************************
* TWilson template implementation *
@ -111,8 +115,9 @@ void TWilson<FImpl>::setup(void)
auto &U = envGet(GaugeField, par().gauge);
auto &grid = *envGetGrid(FermionField);
auto &gridRb = *envGetRbGrid(FermionField);
typename WilsonFermion<FImpl>::ImplParams implParams;
implParams.boundary_phases = strToVec<Complex>(par().boundary);
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
par().mass, implParams);
}

View File

@ -32,4 +32,6 @@ using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
#endif

View File

@ -51,7 +51,8 @@ public:
double , csw_r,
double , csw_t,
WilsonAnisotropyCoefficients ,clover_anisotropy,
std::string, boundary,
std::string, twist
);
};
@ -75,7 +76,9 @@ public:
};
MODULE_REGISTER_TMP(WilsonClover, TWilsonClover<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(WilsonCloverF, TWilsonClover<FIMPLF>, MAction);
#endif
/******************************************************************************
* TWilsonClover template implementation *
@ -117,8 +120,9 @@ void TWilsonClover<FImpl>::setup(void)
auto &U = envGet(GaugeField, par().gauge);
auto &grid = *envGetGrid(FermionField);
auto &gridRb = *envGetRbGrid(FermionField);
typename WilsonCloverFermion<FImpl>::ImplParams implParams;
implParams.boundary_phases = strToVec<Complex>(par().boundary);
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid,
gridRb, par().mass, par().csw_r, par().csw_t,
par().clover_anisotropy, implParams);

View File

@ -32,4 +32,6 @@ using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPLF>;
#endif

View File

@ -50,7 +50,8 @@ public:
double , b,
double , c,
std::vector<std::complex<double>>, omega,
std::string , boundary,
std::string , twist);
};
template <typename FImpl>
@ -73,7 +74,9 @@ public:
};
MODULE_REGISTER_TMP(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(ZMobiusDWFF, TZMobiusDWF<ZFIMPLF>, MAction);
#endif
/******************************************************************************
* TZMobiusDWF implementation *
@ -125,8 +128,9 @@ void TZMobiusDWF<FImpl>::setup(void)
auto &g5 = *envGetGrid(FermionField, par().Ls);
auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
auto omega = par().omega;
typename ZMobiusFermion<FImpl>::ImplParams implParams;
implParams.boundary_phases = strToVec<Complex>(par().boundary);
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, omega,
par().b, par().c, implParams);

View File

@ -33,10 +33,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Hadrons/ModuleFactory.hpp>
#include <Hadrons/A2AMatrix.hpp>
#ifndef ASF_IO_TYPE
#define ASF_IO_TYPE ComplexF
#endif
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
@ -113,7 +109,7 @@ public:
typedef A2AMatrixBlockComputation<Complex,
FermionField,
A2AAslashFieldMetadata,
HADRONS_A2AM_IO_TYPE> Computation;
typedef AslashFieldKernel<Complex, FImpl> Kernel;
public:
// constructor
@ -196,7 +192,7 @@ void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
LOG(Message) << " " << name << std::endl;
}
LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j
<< " (filesize " << sizeString(nt*N_i*N_j*sizeof(ASF_IO_TYPE))
<< " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE))
<< "/EM field)" << std::endl;
// preparing "B" complexified fields

View File

@ -35,10 +35,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Hadrons/ModuleFactory.hpp>
#include <Hadrons/A2AMatrix.hpp>
#ifndef MF_IO_TYPE
#define MF_IO_TYPE ComplexF
#endif
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
@ -118,7 +114,7 @@ public:
typedef A2AMatrixBlockComputation<Complex,
FermionField,
A2AMesonFieldMetadata,
HADRONS_A2AM_IO_TYPE> Computation;
typedef MesonFieldKernel<Complex, FImpl> Kernel;
public:
// constructor
@ -248,7 +244,7 @@ void TA2AMesonField<FImpl>::execute(void)
LOG(Message) << " " << g << std::endl;
}
LOG(Message) << "Meson field size: " << nt << "*" << N_i << "*" << N_j
<< " (filesize " << sizeString(nt*N_i*N_j*sizeof(MF_IO_TYPE))
<< " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE))
<< "/momentum/bilinear)" << std::endl;
auto &ph = envGet(std::vector<ComplexField>, momphName_);

View File

@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MGauge/Electrify.cc
Copyright (C) 2015-2018
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MGauge/Electrify.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MGauge;
template class Grid::Hadrons::MGauge::TElectrify<GIMPL>;

View File

@ -0,0 +1,151 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MGauge/Electrify.hpp
Copyright (C) 2015-2018
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MGauge_Electrify_hpp_
#define Hadrons_MGauge_Electrify_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Electrify gauge *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MGauge)
/****************************************************************************
* Electrify a gauge field:
*
* Ue_mu(x) = U_mu(x)*exp(ieqA_mu(x))
*
* with
*
* - gauge: U_mu(x): gauge field
* - emField: A_mu(x): electromagnetic photon field
* - e: value for the elementary charge
* - q: charge in units of e
*
*****************************************************************************/
class ElectrifyPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ElectrifyPar,
std::string, gauge,
std::string, emField,
double, e,
double, charge);
};
template <typename GImpl>
class TElectrify: public Module<ElectrifyPar>
{
public:
GAUGE_TYPE_ALIASES(GImpl,);
public:
typedef PhotonR::GaugeField EmField;
public:
// constructor
TElectrify(const std::string name);
// destructor
virtual ~TElectrify(void) {};
// dependencies/products
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(Electrify, TElectrify<GIMPL>, MGauge);
/******************************************************************************
* TElectrify implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename GImpl>
TElectrify<GImpl>::TElectrify(const std::string name)
: Module<ElectrifyPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename GImpl>
std::vector<std::string> TElectrify<GImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge, par().emField};
return in;
}
template <typename GImpl>
std::vector<std::string> TElectrify<GImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename GImpl>
void TElectrify<GImpl>::setup(void)
{
envCreateLat(GaugeField, getName());
envTmpLat(LatticeComplex, "eiAmu");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename GImpl>
void TElectrify<GImpl>::execute(void)
{
LOG(Message) << "Electrify the gauge field " << par().gauge << " using the photon field "
<< par().emField << " with charge e*q= " << par().e << "*" << par().charge << std::endl;
auto &Ue = envGet(GaugeField, getName());
auto &U = envGet(GaugeField, par().gauge);
auto &A = envGet(EmField, par().emField);
envGetTmp(LatticeComplex, eiAmu);
Complex i(0.0,1.0);
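// A_mu is real, so exp(i e q A_mu(x)) is a pure U(1) phase; each link is
// multiplied by this phase, embedding the photon field into the gauge links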
for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
eiAmu = exp(i * (Real)(par().e * par().charge) * PeekIndex<LorentzIndex>(A, mu));
PokeIndex<LorentzIndex>(Ue, PeekIndex<LorentzIndex>(U, mu) * eiAmu, mu);
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MGauge_Electrify_hpp_
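Outside of Grid, the update implemented above is nothing more than a per-link multiplication by a U(1) phase. A minimal self-contained sketch with scalar stand-ins for the SU(3) links (all names below are illustrative and not part of Hadrons):

#include <cmath>
#include <complex>
#include <vector>

// Ue_mu(x) = U_mu(x) * exp(i e q A_mu(x)), with one complex number standing
// in for each link; A is the real photon field, e*q the total charge
std::vector<std::complex<double>>
electrify(const std::vector<std::complex<double>> &U,
          const std::vector<double> &A,
          const double e, const double q)
{
    const std::complex<double> i(0., 1.);
    std::vector<std::complex<double>> Ue(U.size());

    for (std::size_t s = 0; s < U.size(); ++s)
    {
        // exp(i*e*q*A[s]) has unit modulus, so |Ue[s]| == |U[s]|
        Ue[s] = U[s]*std::exp(i*e*q*A[s]);
    }

    return Ue;
}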


@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MIO/LoadA2AVectors.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MIO;
template class Grid::Hadrons::MIO::TLoadA2AVectors<FIMPL>;


@ -0,0 +1,120 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MIO/LoadA2AVectors.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MIO_LoadA2AVectors_hpp_
#define Hadrons_MIO_LoadA2AVectors_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
#include <Hadrons/A2AVectors.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Module to load all-to-all vectors *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MIO)
class LoadA2AVectorsPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LoadA2AVectorsPar,
std::string, filestem,
bool, multiFile,
unsigned int, size);
};
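// multiFile: when true each vector is expected in its own file derived from
// filestem rather than in a single bundled file; size must match the number
// of vectors stored on disk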
template <typename FImpl>
class TLoadA2AVectors: public Module<LoadA2AVectorsPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TLoadA2AVectors(const std::string name);
// destructor
virtual ~TLoadA2AVectors(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(LoadA2AVectors, TLoadA2AVectors<FIMPL>, MIO);
/******************************************************************************
* TLoadA2AVectors implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TLoadA2AVectors<FImpl>::TLoadA2AVectors(const std::string name)
: Module<LoadA2AVectorsPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TLoadA2AVectors<FImpl>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <typename FImpl>
std::vector<std::string> TLoadA2AVectors<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TLoadA2AVectors<FImpl>::setup(void)
{
envCreate(std::vector<FermionField>, getName(), 1, par().size,
envGetGrid(FermionField));
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TLoadA2AVectors<FImpl>::execute(void)
{
auto &vec = envGet(std::vector<FermionField>, getName());
A2AVectorsIo::read(vec, par().filestem, par().multiFile, vm().getTrajectory());
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MIO_LoadA2AVectors_hpp_


@ -32,4 +32,6 @@ using namespace Hadrons;
using namespace MIO;
template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL>>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>;
#endif


@ -72,7 +72,9 @@ public:
};
MODULE_REGISTER_TMP(LoadFermionEigenPack, TLoadEigenPack<FermionEigenPack<FIMPL>>, MIO);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(LoadFermionEigenPackIo32, ARG(TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>), MIO);
#endif
/******************************************************************************
* TLoadEigenPack implementation *


@ -6,6 +6,7 @@ Source file: Hadrons/Modules/MNPR/Amputate.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify


@ -6,6 +6,7 @@ Source file: Hadrons/Modules/MNPR/Bilinear.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify


@ -6,6 +6,7 @@ Source file: Hadrons/Modules/MNPR/FourQuark.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify


@ -0,0 +1,36 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MNoise;
template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<FIMPL>;
template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<ZFIMPL>;


@ -0,0 +1,121 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
#define Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
#include <Hadrons/DilutedNoise.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Generate full volume spin-color diagonal noise *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MNoise)
class FullVolumeSpinColorDiagonalPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FullVolumeSpinColorDiagonalPar,
unsigned int, nsrc);
};
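// nsrc: number of stochastic hits; spin-color dilution expands each hit into
// Ns x Nc (12 for QCD) fermion fields filling the whole volume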
template <typename FImpl>
class TFullVolumeSpinColorDiagonal: public Module<FullVolumeSpinColorDiagonalPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TFullVolumeSpinColorDiagonal(const std::string name);
// destructor
virtual ~TFullVolumeSpinColorDiagonal(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(FullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<FIMPL>, MNoise);
MODULE_REGISTER_TMP(ZFullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<ZFIMPL>, MNoise);
/******************************************************************************
* TFullVolumeSpinColorDiagonal implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TFullVolumeSpinColorDiagonal<FImpl>::TFullVolumeSpinColorDiagonal(const std::string name)
: Module<FullVolumeSpinColorDiagonalPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <typename FImpl>
std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TFullVolumeSpinColorDiagonal<FImpl>::setup(void)
{
envCreateDerived(DilutedNoise<FImpl>,
FullVolumeSpinColorDiagonalNoise<FImpl>,
getName(), 1, envGetGrid(FermionField), par().nsrc);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TFullVolumeSpinColorDiagonal<FImpl>::execute(void)
{
auto &noise = envGet(DilutedNoise<FImpl>, getName());
LOG(Message) << "Generating full volume, spin-color diagonal noise" << std::endl;
noise.generateNoise(rng4d());
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_


@ -146,7 +146,7 @@ void TChargedProp::execute(void)
std::vector<int> siteCoor;
LOG(Message) << "Saving momentum-projected propagator to '"
-<< RESULT_FILE_NAME(par().output) << "'..."
+<< RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
<< std::endl;
result.projection.resize(par().outputMom.size());
result.lattice_size = env().getGrid()->_fdimensions;


@ -462,7 +462,7 @@ void TScalarVP::execute(void)
if (!par().output.empty())
{
LOG(Message) << "Saving momentum-projected HVP to '"
-<< RESULT_FILE_NAME(par().output) << "'..."
+<< RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
<< std::endl;
saveResult(par().output, "HVP", outputData);
}


@ -239,7 +239,7 @@ void TVPCounterTerms::execute(void)
if (!par().output.empty())
{
LOG(Message) << "Saving momentum-projected correlators to '"
-<< RESULT_FILE_NAME(par().output) << "'..."
+<< RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
<< std::endl;
saveResult(par().output, "scalar_loops", outputData);
}


@ -0,0 +1,35 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MSolver/A2AAslashVectors.cc
Copyright (C) 2015-2018
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MSolver;
template class Grid::Hadrons::MSolver::TA2AAslashVectors<FIMPL>;
template class Grid::Hadrons::MSolver::TA2AAslashVectors<ZFIMPL>;


@ -0,0 +1,194 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MSolver/A2AAslashVectors.hpp
Copyright (C) 2015-2018
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MSolver_A2AAslashVectors_hpp_
#define Hadrons_MSolver_A2AAslashVectors_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
#include <Hadrons/Solver.hpp>
#include <Hadrons/A2AVectors.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Create all-to-all V & W vectors *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSolver)
/****************************************************************************
* Calculate a sequential propagator through the insertion of i*g_mu*A_mu
* acting on an A2A vector
*
* vv_i(y) = S(y,x) * i * g_mu*A_mu(x) * v_i(x)
*
* with
*
* - vector: A2A vector v_i(x)
* - emField: A_mu(x): electromagnetic photon field
* - solver: the solver for calculating the sequential propagator
*
*****************************************************************************/
class A2AAslashVectorsPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashVectorsPar,
std::string, vector,
std::string, emField,
std::string, solver,
std::string, output,
bool, multiFile);
};
template <typename FImpl>
class TA2AAslashVectors : public Module<A2AAslashVectorsPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
SOLVER_TYPE_ALIASES(FImpl,);
public:
typedef PhotonR::GaugeField EmField;
public:
// constructor
TA2AAslashVectors(const std::string name);
// destructor
virtual ~TA2AAslashVectors(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
unsigned int Ls_;
};
MODULE_REGISTER_TMP(A2AAslashVectors, TA2AAslashVectors<FIMPL>, MSolver);
MODULE_REGISTER_TMP(ZA2AAslashVectors, TA2AAslashVectors<ZFIMPL>, MSolver);
/******************************************************************************
* TA2AAslashVectors implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TA2AAslashVectors<FImpl>::TA2AAslashVectors(const std::string name)
: Module<A2AAslashVectorsPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TA2AAslashVectors<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().vector, par().emField, par().solver};
return in;
}
template <typename FImpl>
std::vector<std::string> TA2AAslashVectors<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2AAslashVectors<FImpl>::setup(void)
{
Ls_ = env().getObjectLs(par().solver);
auto &vvector = envGet(std::vector<FermionField>, par().vector);
unsigned int Nmodes = vvector.size();
envCreate(std::vector<FermionField>, getName(), 1,
Nmodes, envGetGrid(FermionField));
envTmpLat(FermionField, "v4dtmp");
envTmpLat(FermionField, "v5dtmp", Ls_);
envTmpLat(FermionField, "v5dtmp_sol", Ls_);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2AAslashVectors<FImpl>::execute(void)
{
auto &solver = envGet(Solver, par().solver);
auto &stoch_photon = envGet(EmField, par().emField);
auto &vvector = envGet(std::vector<FermionField>, par().vector);
auto &Aslashv = envGet(std::vector<FermionField>, getName());
unsigned int Nmodes = vvector.size();
auto &mat = solver.getFMat();
envGetTmp(FermionField, v4dtmp);
envGetTmp(FermionField, v5dtmp);
envGetTmp(FermionField, v5dtmp_sol);
Complex ci(0.0,1.0);
startTimer("Seq Aslash");
LOG(Message) << "Calculate Sequential propagator on Aslash * v with the A2A vector "
<< par().vector << " and the photon field " << par().emField << std::endl;
for(unsigned int i=0; i<Nmodes; i++)
{
v4dtmp = zero;
startTimer("Multiply Aslash");
for(unsigned int mu=0;mu<=3;mu++)
{
Gamma gmu(Gamma::gmu[mu]);
v4dtmp += ci * PeekIndex<LorentzIndex>(stoch_photon, mu) * (gmu * vvector[i]);
}
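// v4dtmp now holds i * sum_mu A_mu(x) gamma_mu v_i(x): the Aslash insertion
// acting on the A2A vector, used below as the sequential source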
stopTimer("Multiply Aslash");
startTimer("Inversion");
if (Ls_ == 1)
{
solver(Aslashv[i], v4dtmp);
}
else
{
mat.ImportPhysicalFermionSource(v4dtmp, v5dtmp);
solver(v5dtmp_sol, v5dtmp);
mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
Aslashv[i] = v4dtmp;
}
stopTimer("Inversion");
}
stopTimer("Seq Aslash");
if (!par().output.empty())
{
startTimer("I/O");
A2AVectorsIo::write(par().output, Aslashv, par().multiFile, vm().getTrajectory());
stopTimer("I/O");
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MSolver_A2AAslashVectors_hpp_


@ -51,7 +51,9 @@ public:
std::string, noise,
std::string, action,
std::string, eigenPack,
-std::string, solver);
+std::string, solver,
+std::string, output,
+bool, multiFile);
};
template <typename FImpl, typename Pack>
@ -236,6 +238,17 @@ void TA2AVectors<FImpl, Pack>::execute(void)
}
stopTimer("W high mode");
}
// I/O if necessary
if (!par().output.empty())
{
startTimer("V I/O");
A2AVectorsIo::write(par().output + "_v", v, par().multiFile, vm().getTrajectory());
stopTimer("V I/O");
startTimer("W I/O");
A2AVectorsIo::write(par().output + "_w", w, par().multiFile, vm().getTrajectory());
stopTimer("W I/O");
}
}
END_MODULE_NAMESPACE


@ -33,4 +33,7 @@ using namespace MSolver;
template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<FIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS>;
template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<ZFIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<FIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS, FIMPLF>;
template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<ZFIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS, ZFIMPLF>;
#endif


@ -55,17 +55,17 @@ public:
bool, multiFile);
};
-template <typename FImpl, int nBasis>
+template <typename FImpl, int nBasis, typename FImplIo = FImpl>
class TLocalCoherenceLanczos: public Module<LocalCoherenceLanczosPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
typedef LocalCoherenceLanczos<typename FImpl::SiteSpinor,
typename FImpl::SiteComplex,
-nBasis> LCL;
-typedef BaseFermionEigenPack<FImpl> BasePack;
-typedef CoarseFermionEigenPack<FImpl, nBasis> CoarsePack;
-typedef HADRONS_DEFAULT_SCHUR_OP<FMat, FermionField> SchurFMat;
+nBasis> LCL;
+typedef BaseFermionEigenPack<FImpl> BasePack;
+typedef CoarseFermionEigenPack<FImpl, nBasis, FImplIo> CoarsePack;
+typedef HADRONS_DEFAULT_SCHUR_OP<FMat, FermionField> SchurFMat;
public:
// constructor
TLocalCoherenceLanczos(const std::string name);
@ -82,27 +82,31 @@ public:
MODULE_REGISTER_TMP(LocalCoherenceLanczos, ARG(TLocalCoherenceLanczos<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
MODULE_REGISTER_TMP(ZLocalCoherenceLanczos, ARG(TLocalCoherenceLanczos<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(LocalCoherenceLanczosIo32, ARG(TLocalCoherenceLanczos<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS, FIMPLF>), MSolver);
MODULE_REGISTER_TMP(ZLocalCoherenceLanczosIo32, ARG(TLocalCoherenceLanczos<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS, ZFIMPLF>), MSolver);
#endif
/******************************************************************************
* TLocalCoherenceLanczos implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
-template <typename FImpl, int nBasis>
-TLocalCoherenceLanczos<FImpl, nBasis>::TLocalCoherenceLanczos(const std::string name)
+template <typename FImpl, int nBasis, typename FImplIo>
+TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::TLocalCoherenceLanczos(const std::string name)
: Module<LocalCoherenceLanczosPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
-template <typename FImpl, int nBasis>
-std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis>::getInput(void)
+template <typename FImpl, int nBasis, typename FImplIo>
+std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::getInput(void)
{
std::vector<std::string> in = {par().action};
return in;
}
-template <typename FImpl, int nBasis>
-std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis>::getOutput(void)
+template <typename FImpl, int nBasis, typename FImplIo>
+std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::getOutput(void)
{
std::vector<std::string> out = {getName()};
@ -110,8 +114,8 @@ std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis>::getOutput(void)
}
// setup ///////////////////////////////////////////////////////////////////////
-template <typename FImpl, int nBasis>
-void TLocalCoherenceLanczos<FImpl, nBasis>::setup(void)
+template <typename FImpl, int nBasis, typename FImplIo>
+void TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::setup(void)
{
LOG(Message) << "Setting up local coherence Lanczos eigensolver for"
<< " action '" << par().action << "' (" << nBasis
@ -138,8 +142,8 @@ void TLocalCoherenceLanczos<FImpl, nBasis>::setup(void)
}
// execution ///////////////////////////////////////////////////////////////////
-template <typename FImpl, int nBasis>
-void TLocalCoherenceLanczos<FImpl, nBasis>::execute(void)
+template <typename FImpl, int nBasis, typename FImplIo>
+void TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::execute(void)
{
auto &finePar = par().fineParams;
auto &coarsePar = par().coarseParams;


@ -0,0 +1,454 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Utilities/Contractor.cc
Copyright (C) 2015-2018
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Global.hpp>
#include <Hadrons/A2AMatrix.hpp>
#include <Hadrons/DiskVector.hpp>
#include <Hadrons/TimerArray.hpp>
using namespace Grid;
using namespace QCD;
using namespace Hadrons;
#define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt)
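// wraps a possibly negative time offset into [0, nt); adding nt once suffices
// because all offsets used below satisfy |t| < nt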
namespace Contractor
{
class TrajRange: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
unsigned int, start,
unsigned int, end,
unsigned int, step);
};
class GlobalPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
TrajRange, trajCounter,
unsigned int, nt,
std::string, diskVectorDir,
std::string, output);
};
class A2AMatrixPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
std::string, file,
std::string, dataset,
unsigned int, cacheSize,
std::string, name);
};
class ProductPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
std::string, terms,
std::vector<std::string>, times,
std::string, translations,
bool, translationAverage);
};
class CorrelatorResult: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
std::vector<Contractor::A2AMatrixPar>, a2aMatrix,
ProductPar, contraction,
std::vector<unsigned int>, times,
std::vector<ComplexD>, correlator);
};
}
struct ContractorPar
{
Contractor::GlobalPar global;
std::vector<Contractor::A2AMatrixPar> a2aMatrix;
std::vector<Contractor::ProductPar> product;
};
void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
const std::vector<std::set<unsigned int>> &times,
std::vector<unsigned int> &current,
const unsigned int depth)
{
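// depth-first recursion over the time sets: each complete choice of one time
// per set is appended to timeSeq (the Cartesian product of the sets)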
if (depth > 0)
{
for (auto t: times[times.size() - depth])
{
current[times.size() - depth] = t;
makeTimeSeq(timeSeq, times, current, depth - 1);
}
}
else
{
timeSeq.push_back(current);
}
}
void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
const std::vector<std::set<unsigned int>> &times)
{
std::vector<unsigned int> current(times.size());
makeTimeSeq(timeSeq, times, current, times.size());
}
void saveCorrelator(const Contractor::CorrelatorResult &result, const std::string dir,
const unsigned int dt, const unsigned int traj)
{
std::string fileStem = "", filename;
std::vector<std::string> terms = strToVec<std::string>(result.contraction.terms);
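// file stem: <term1>_<t1>_..._<termN-1>_<tN-1>_<termN>, plus _dt_<dt> when
// not averaging over translations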
for (unsigned int i = 0; i < terms.size() - 1; i++)
{
fileStem += terms[i] + "_" + std::to_string(result.times[i]) + "_";
}
fileStem += terms.back();
if (!result.contraction.translationAverage)
{
fileStem += "_dt_" + std::to_string(dt);
}
filename = dir + "/" + RESULT_FILE_NAME(fileStem, traj);
std::cout << "Saving correlator to '" << filename << "'" << std::endl;
makeFileDir(dir);
ResultWriter writer(filename);
write(writer, fileStem, result);
}
std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int nt)
{
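// grammar: whitespace-separated tokens, each either a single time "t" or an
// inclusive range "ta..tb"; tokens matching neither are silently ignored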
std::regex rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
std::smatch sm;
std::vector<std::string> rstr = strToVec<std::string>(str);
std::set<unsigned int> tSet;
for (auto &s: rstr)
{
std::regex_match(s, sm, rex);
if (sm[1].matched)
{
unsigned int t;
t = std::stoi(sm[1].str());
if (t >= nt)
{
HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
}
tSet.insert(t);
}
else if (sm[2].matched)
{
unsigned int ta, tb;
ta = std::stoi(sm[3].str());
tb = std::stoi(sm[4].str());
if ((ta >= nt) or (tb >= nt))
{
HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
}
for (unsigned int ti = ta; ti <= tb; ++ti)
{
tSet.insert(ti);
}
}
}
return tSet;
}
struct Sec
{
Sec(const double usec)
{
seconds = usec/1.0e6;
}
double seconds;
};
inline std::ostream & operator<< (std::ostream& s, const Sec &&sec)
{
s << std::setw(10) << sec.seconds << " sec";
return s;
}
struct Flops
{
Flops(const double flops, const double fusec)
{
gFlopsPerSec = flops/fusec/1.0e3;
}
double gFlopsPerSec;
};
inline std::ostream & operator<< (std::ostream& s, const Flops &&f)
{
s << std::setw(10) << f.gFlopsPerSec << " GFlop/s";
return s;
}
struct Bytes
{
Bytes(const double bytes, const double busec)
{
gBytesPerSec = bytes/busec*1.0e6/1024/1024/1024;
}
double gBytesPerSec;
};
inline std::ostream & operator<< (std::ostream& s, const Bytes &&b)
{
s << std::setw(10) << b.gBytesPerSec << " GB/s";
return s;
}
int main(int argc, char* argv[])
{
// parse command line
std::string parFilename;
if (argc != 2)
{
std::cerr << "usage: " << argv[0] << " <parameter file>";
std::cerr << std::endl;
return EXIT_FAILURE;
}
parFilename = argv[1];
// parse parameter file
ContractorPar par;
unsigned int nMat, nCont;
XmlReader reader(parFilename);
read(reader, "global", par.global);
read(reader, "a2aMatrix", par.a2aMatrix);
read(reader, "product", par.product);
nMat = par.a2aMatrix.size();
nCont = par.product.size();
// create diskvectors
std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
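// one disk-backed vector of nt matrices per A2A matrix set, with an in-memory
// cache of cacheSize time slices (cf. A2AMatrixPar above)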
unsigned int cacheSize;
for (auto &p: par.a2aMatrix)
{
std::string dirName = par.global.diskVectorDir + "/" + p.name;
a2aMat.emplace(p.name, EigenDiskVector<ComplexD>(dirName, par.global.nt, p.cacheSize));
}
// trajectory loop
for (unsigned int traj = par.global.trajCounter.start;
traj < par.global.trajCounter.end; traj += par.global.trajCounter.step)
{
std::cout << ":::::::: Trajectory " << traj << std::endl;
// load data
for (auto &p: par.a2aMatrix)
{
std::string filename = p.file;
double t, size;
tokenReplace(filename, "traj", traj);
std::cout << "======== Loading '" << filename << "'" << std::endl;
A2AMatrixIo<HADRONS_A2AM_IO_TYPE> a2aIo(filename, p.dataset, par.global.nt);
a2aIo.load(a2aMat.at(p.name), &t);
std::cout << "Read " << a2aIo.getSize() << " bytes in " << t/1.0e6
<< " sec, " << a2aIo.getSize()/t*1.0e6/1024/1024 << " MB/s" << std::endl;
}
// contract
EigenDiskVector<ComplexD>::Matrix buf;
for (auto &p: par.product)
{
std::vector<std::string> term = strToVec<std::string>(p.terms);
std::vector<std::set<unsigned int>> times;
std::vector<std::vector<unsigned int>> timeSeq;
std::set<unsigned int> translations;
std::vector<A2AMatrixTr<ComplexD>> lastTerm(par.global.nt);
A2AMatrix<ComplexD> prod, buf, tmp;
TimerArray tAr;
double fusec, busec, flops, bytes, tusec;
Contractor::CorrelatorResult result;
tAr.startTimer("Total");
std::cout << "======== Contraction tr(";
for (unsigned int g = 0; g < term.size(); ++g)
{
std::cout << term[g] << ((g == term.size() - 1) ? ')' : '*');
}
std::cout << std::endl;
if (term.size() != p.times.size() + 1)
{
HADRONS_ERROR(Size, "number of terms (" + std::to_string(term.size())
+ ") different from number of times ("
+ std::to_string(p.times.size() + 1) + ")");
}
for (auto &s: p.times)
{
times.push_back(parseTimeRange(s, par.global.nt));
}
for (auto &m: par.a2aMatrix)
{
if (std::find(result.a2aMatrix.begin(), result.a2aMatrix.end(), m) == result.a2aMatrix.end())
{
result.a2aMatrix.push_back(m);
tokenReplace(result.a2aMatrix.back().file, "traj", traj);
}
}
result.contraction = p;
result.correlator.resize(par.global.nt, 0.);
translations = parseTimeRange(p.translations, par.global.nt);
makeTimeSeq(timeSeq, times);
std::cout << timeSeq.size()*translations.size()*(term.size() - 2) << " A*B, "
<< timeSeq.size()*translations.size()*par.global.nt << " tr(A*B)"
<< std::endl;
std::cout << "* Caching transposed last term" << std::endl;
for (unsigned int t = 0; t < par.global.nt; ++t)
{
tAr.startTimer("Disk vector overhead");
const A2AMatrix<ComplexD> &ref = a2aMat.at(term.back())[t];
tAr.stopTimer("Disk vector overhead");
tAr.startTimer("Transpose caching");
lastTerm[t].resize(ref.rows(), ref.cols());
parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
for (unsigned int i = 0; i < ref.rows(); ++i)
{
lastTerm[t](i, j) = ref(i, j);
}
tAr.stopTimer("Transpose caching");
}
bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
std::cout << Sec(tAr.getDTimer("Transpose caching")) << " "
<< Bytes(bytes, tAr.getDTimer("Transpose caching")) << std::endl;
for (unsigned int i = 0; i < timeSeq.size(); ++i)
{
unsigned int dti = 0;
auto &t = timeSeq[i];
result.times = t;
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
result.correlator[tLast] = 0.;
}
for (auto &dt: translations)
{
std::cout << "* Step " << i*translations.size() + dti + 1
<< "/" << timeSeq.size()*translations.size()
<< " -- positions= " << t << ", dt= " << dt << std::endl;
if (term.size() > 2)
{
std::cout << std::setw(8) << "products";
}
flops = 0.;
bytes = 0.;
fusec = tAr.getDTimer("A*B algebra");
busec = tAr.getDTimer("A*B total");
tAr.startTimer("Linear algebra");
tAr.startTimer("Disk vector overhead");
prod = a2aMat.at(term[0])[TIME_MOD(t[0] + dt)];
tAr.stopTimer("Disk vector overhead");
for (unsigned int j = 1; j < term.size() - 1; ++j)
{
tAr.startTimer("Disk vector overhead");
const A2AMatrix<ComplexD> &ref = a2aMat.at(term[j])[TIME_MOD(t[j] + dt)];
tAr.stopTimer("Disk vector overhead");
tAr.startTimer("A*B total");
tAr.startTimer("A*B algebra");
A2AContraction::mul(tmp, prod, ref);
tAr.stopTimer("A*B algebra");
flops += A2AContraction::mulFlops(prod, ref);
prod = tmp;
tAr.stopTimer("A*B total");
bytes += 3.*tmp.rows()*tmp.cols()*sizeof(ComplexD);
}
if (term.size() > 2)
{
std::cout << Sec(tAr.getDTimer("A*B total") - busec) << " "
<< Flops(flops, tAr.getDTimer("A*B algebra") - fusec) << " "
<< Bytes(bytes, tAr.getDTimer("A*B total") - busec) << std::endl;
}
std::cout << std::setw(8) << "traces";
flops = 0.;
bytes = 0.;
fusec = tAr.getDTimer("tr(A*B)");
busec = tAr.getDTimer("tr(A*B)");
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
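// accumulate tr(prod * lastTerm[tLast]) into the correlator at sink time
// tLast - dt, i.e. measured from the translated source (wrapped by TIME_MOD)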
tAr.startTimer("tr(A*B)");
A2AContraction::accTrMul(result.correlator[TIME_MOD(tLast - dt)], prod, lastTerm[tLast]);
tAr.stopTimer("tr(A*B)");
flops += A2AContraction::accTrMulFlops(prod, lastTerm[tLast]);
bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
}
tAr.stopTimer("Linear algebra");
std::cout << Sec(tAr.getDTimer("tr(A*B)") - busec) << " "
<< Flops(flops, tAr.getDTimer("tr(A*B)") - fusec) << " "
<< Bytes(bytes, tAr.getDTimer("tr(A*B)") - busec) << std::endl;
if (!p.translationAverage)
{
saveCorrelator(result, par.global.output, dt, traj);
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
result.correlator[tLast] = 0.;
}
}
dti++;
}
if (p.translationAverage)
{
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
result.correlator[tLast] /= translations.size();
}
saveCorrelator(result, par.global.output, 0, traj);
}
}
tAr.stopTimer("Total");
printTimeProfile(tAr.getTimings(), tAr.getTimer("Total"));
}
}
return EXIT_SUCCESS;
}
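The TIME_MOD wrap used throughout the contraction loop can be checked in isolation; the following standalone sketch (not part of the utility) assumes |t| < nt, which holds here since all time coordinates are reduced modulo nt:

#include <cassert>

// same arithmetic as the TIME_MOD macro, with nt passed explicitly; adding
// nt once before taking the modulo is enough because |t| < nt
int timeMod(const int t, const int nt)
{
    return (t + nt) % nt;
}

int main(void)
{
    assert(timeMod(5 - 2, 8) == 3); // positive difference is unchanged
    assert(timeMod(1 - 6, 8) == 3); // negative difference wraps: -5 -> 3

    return 0;
}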


@ -0,0 +1,12 @@
#ifndef Hadrons_Contractor_hpp_
#define Hadrons_Contractor_hpp_
#include <Hadrons/Global.hpp>
BEGIN_HADRONS_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Contractor_hpp_


@ -0,0 +1,434 @@
#include <Hadrons/Global.hpp>
#include <Hadrons/A2AMatrix.hpp>
#ifdef USE_MKL
#include "mkl.h"
#include "mkl_cblas.h"
#endif
using namespace Grid;
using namespace Hadrons;
#ifdef GRID_COMMS_MPI3
#define GET_RANK(rank, nMpi) \
MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
#define INIT() MPI_Init(NULL, NULL)
#define FINALIZE() MPI_Finalize()
#else
#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
#define BARRIER()
#define INIT()
#define FINALIZE()
#endif
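// without MPI this is a single-rank run; with MPI, the matrix pairs are split
// evenly across ranks and each rank times its own share between barriers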
template <typename Function, typename MatLeft, typename MatRight>
inline void trBenchmark(const std::string name, const MatLeft &left,
const MatRight &right, const ComplexD ref, Function fn)
{
double t, flops, bytes, n = left[0].rows()*left[0].cols();
unsigned int nMat = left.size();
int nMpi, rank;
ComplexD buf;
t = 0.;
GET_RANK(rank, nMpi);
t = -usecond();
BARRIER();
for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
{
fn(buf, left[i], right[i]);
}
BARRIER();
t += usecond();
flops = nMat*(6.*n + 2.*(n - 1.));
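// tr(A*B) = sum_{i,j} A(i,j)*B(j,i): n complex multiplies (6 flops each)
// and n-1 complex additions (2 flops each) per matrix pair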
bytes = nMat*(2.*n*sizeof(ComplexD));
if (rank == 0)
{
std::cout << std::setw(34) << name << ": diff= "
<< std::setw(12) << std::norm(buf-ref)
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
}
::sleep(1);
}
template <typename Function, typename MatV, typename Mat>
inline void mulBenchmark(const std::string name, const MatV &left,
const MatV &right, const Mat &ref, Function fn)
{
double t, flops, bytes;
double nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
unsigned int nMat = left.size();
int nMpi, rank;
Mat buf(left[0].rows(), left[0].rows());
t = 0.;
GET_RANK(rank, nMpi);
t = -usecond();
BARRIER();
for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
{
fn(buf, left[i], right[i]);
}
BARRIER();
t += usecond();
flops = nMat*(nr*nr*(6.*nc + 2.*(nc - 1.)));
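// each of the nr*nr output entries is a length-nc complex dot product:
// 6 flops per complex multiply, 2 per complex addition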
bytes = nMat*(2*nc*nr*sizeof(ComplexD));
if (rank == 0)
{
std::cout << std::setw(34) << name << ": diff= "
<< std::setw(12) << (buf-ref).squaredNorm()
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
}
::sleep(1);
}
#ifdef USE_MKL
template <typename MatLeft, typename MatRight>
static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
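// choose pointer and stride so row aRow of a and column aRow of b are
// traversed correctly for either storage order (unit stride when contiguous)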
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aRow*a.cols();
aInc = 1;
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aRow;
aInc = a.rows();
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aRow;
bInc = b.cols();
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aRow*b.rows();
bInc = 1;
}
cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aCol;
aInc = a.cols();
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aCol*a.rows();
aInc = 1;
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aCol*b.cols();
bInc = 1;
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aCol;
bInc = b.rows();
}
cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
}
#endif
template <typename MatLeft, typename MatRight>
void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
{
std::vector<MatLeft> left;
std::vector<MatRight> right;
MatRight buf;
ComplexD ref;
int rank, nMpi;
left.resize(nMat, MatLeft::Random(ni, nj));
right.resize(nMat, MatRight::Random(nj, ni));
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "==== tr(A*B) benchmarks" << std::endl;
std::cout << "A matrices use ";
if (MatLeft::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (MatLeft::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << "B matrices use ";
if (MatRight::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (MatRight::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << std::endl;
}
BARRIER();
ref = (left.back()*right.back()).trace();
trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
A2AContraction::accTrMul(res, a, b);
});
trBenchmark("Naive loop rows first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
auto nr = a.rows(), nc = a.cols();
res = 0.;
parallel_for (unsigned int i = 0; i < nr; ++i)
{
ComplexD tmp = 0.;
for (unsigned int j = 0; j < nc; ++j)
{
tmp += a(i, j)*b(j, i);
}
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Naive loop cols first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
auto nr = a.rows(), nc = a.cols();
res = 0.;
parallel_for (unsigned int j = 0; j < nc; ++j)
{
ComplexD tmp = 0.;
for (unsigned int i = 0; i < nr; ++i)
{
tmp += a(i, j)*b(j, i);
}
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Eigen tr(A*B)", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = (a*b).trace();
});
trBenchmark("Eigen row-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{
ComplexD tmp;
tmp = a.row(r).conjugate().dot(b.col(r));
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Eigen col-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{
ComplexD tmp;
tmp = a.col(c).conjugate().dot(b.row(c));
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Eigen Hadamard", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = a.cwiseProduct(b.transpose()).sum();
});
#ifdef USE_MKL
trBenchmark("MKL row-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{
ComplexD tmp;
zdotuRow(tmp, r, a, b);
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("MKL col-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{
ComplexD tmp;
zdotuCol(tmp, c, a, b);
parallel_critical
{
res += tmp;
}
}
});
#endif
BARRIER();
if (rank == 0)
{
std::cout << std::endl;
}
}
template <typename Mat>
void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
{
std::vector<Mat> left, right;
Mat ref;
int rank, nMpi;
left.resize(nMat, Mat::Random(ni, nj));
right.resize(nMat, Mat::Random(nj, ni));
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "==== A*B benchmarks" << std::endl;
std::cout << "all matrices use ";
if (Mat::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (Mat::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << std::endl;
}
BARRIER();
ref = left.back()*right.back();
mulBenchmark("Hadrons A2AContraction::mul", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
A2AContraction::mul(res, a, b);
});
mulBenchmark("Eigen A*B", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
res = a*b;
});
#ifdef USE_MKL
mulBenchmark("MKL A*B", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
const ComplexD one(1., 0.), zero(0., 0.);
if (Mat::Options == Eigen::RowMajor)
{
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
}
else if (Mat::Options == Eigen::ColMajor)
{
cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
}
});
#endif
BARRIER();
if (rank == 0)
{
std::cout << std::endl;
}
}
int main(int argc, char *argv[])
{
// parse command line
Eigen::Index ni, nj, nMat;
int nMpi, rank;
if (argc != 4)
{
std::cerr << "usage: " << argv[0] << " <Ni> <Nj> <#matrices>";
std::cerr << std::endl;
return EXIT_FAILURE;
}
ni = std::stoi(argv[1]);
nj = std::stoi(argv[2]);
nMat = std::stoi(argv[3]);
INIT();
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
std::cout << nMat << " couples of " << ni << "x" << nj << " matrices\n" << std::endl;
std::cout << nMpi << " MPI processes" << std::endl;
#ifdef GRID_OMP
#pragma omp parallel
{
#pragma omp single
std::cout << omp_get_num_threads() << " threads\n" << std::endl;
}
#else
std::cout << "Single-threaded\n" << std::endl;
#endif
#ifdef EIGEN_USE_MKL_ALL
std::cout << "Eigen uses the MKL" << std::endl;
#endif
std::cout << "Eigen uses " << Eigen::nbThreads() << " threads" << std::endl;
#ifdef USE_MKL
std::cout << "MKL uses " << mkl_get_max_threads() << " threads" << std::endl;
#endif
std::cout << std::endl;
}
fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
FINALIZE();
return EXIT_SUCCESS;
}
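For reference, the benchmark takes its dimensions and the number of matrix pairs on the command line, e.g. 'HadronsContractorBenchmark 128 128 32' times contractions of 32 pairs of 128x128 complex matrices (the binary name comes from the Makefile.am change below).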


@ -35,7 +35,7 @@ using namespace Hadrons;
template <typename FOut, typename FIn>
void convert(const std::string outFilename, const std::string inFilename,
const unsigned int Ls, const bool rb, const unsigned int size,
-const bool multiFile)
+const bool multiFile, const bool testRead)
{
assert(outFilename != inFilename);
@ -102,6 +102,7 @@ void convert(const std::string outFilename, const std::string inFilename,
LOG(Message) << "Out type : " << typeName<FOut>() << std::endl;
LOG(Message) << "#vectors : " << size << std::endl;
LOG(Message) << "Multifile : " << (multiFile ? "yes" : "no") << std::endl;
LOG(Message) << "Test read : " << (testRead ? "yes" : "no") << std::endl;
if (multiFile)
{
for(unsigned int k = 0; k < size; ++k)
@ -112,6 +113,8 @@ void convert(const std::string outFilename, const std::string inFilename,
LOG(Message) << "==== Converting vector " << k << std::endl;
LOG(Message) << "In : " << inV << std::endl;
LOG(Message) << "Out: " << outV << std::endl;
// conversion
LOG(Message) << "-- Doing conversion" << std::endl;
makeFileDir(outV, gOut);
binWriter.open(outV);
binReader.open(inV);
@ -121,10 +124,20 @@ void convert(const std::string outFilename, const std::string inFilename,
EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
binWriter.close();
binReader.close();
// read test
if (testRead)
{
LOG(Message) << "-- Test read" << std::endl;
binReader.open(outV);
EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
binReader.close();
}
}
}
else
{
// conversion
LOG(Message) << "-- Doing conversion" << std::endl;
makeFileDir(outFilename, gOut);
binWriter.open(outFilename);
binReader.open(inFilename);
@ -137,6 +150,18 @@ void convert(const std::string outFilename, const std::string inFilename,
}
binWriter.close();
binReader.close();
// read test
if (testRead)
{
LOG(Message) << "-- Test read" << std::endl;
binReader.open(outFilename);
EigenPackIo::readHeader(record, binReader);
for(unsigned int k = 0; k < size; ++k)
{
EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
}
binReader.close();
}
}
}
@ -154,11 +179,11 @@ int main(int argc, char *argv[])
// parse command line
std::string outFilename, inFilename;
unsigned int size, Ls;
-bool rb, multiFile;
+bool rb, multiFile, testRead;
-if (argc < 7)
+if (argc < 8)
{
-std::cerr << "usage: " << argv[0] << " <out eigenpack> <in eigenpack> <Ls> <red-black (0|1)> <#vector> <multifile (0|1)> [Grid options]";
+std::cerr << "usage: " << argv[0] << " <out eigenpack> <in eigenpack> <Ls> <red-black {0|1}> <#vector> <multifile {0|1}> <test read {0|1}> [Grid options]";
std::cerr << std::endl;
std::exit(EXIT_FAILURE);
}
@ -168,6 +193,7 @@ int main(int argc, char *argv[])
rb = (std::string(argv[4]) != "0");
size = std::stoi(std::string(argv[5]));
multiFile = (std::string(argv[6]) != "0");
testRead = (std::string(argv[7]) != "0");
// initialization
Grid_init(&argc, &argv);
@ -176,7 +202,7 @@ int main(int argc, char *argv[])
// execution
try
{
-convert<FOUT, FIN>(outFilename, inFilename, Ls, rb, size, multiFile);
+convert<FOUT, FIN>(outFilename, inFilename, Ls, rb, size, multiFile, testRead);
}
catch (const std::exception& e)
{


@ -1,4 +1,4 @@
-bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32
+bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32 HadronsContractor HadronsContractorBenchmark
HadronsXmlRun_SOURCES = HadronsXmlRun.cc
HadronsXmlRun_LDADD = ../libHadrons.a ../../Grid/libGrid.a
@ -6,3 +6,9 @@ HadronsXmlRun_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsFermionEP64To32_SOURCES = EigenPackCast.cc
HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
HadronsFermionEP64To32_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsContractor_SOURCES = Contractor.cc
HadronsContractor_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc
HadronsContractorBenchmark_LDADD = ../libHadrons.a ../../Grid/libGrid.a


@ -20,17 +20,20 @@ modules_cc =\
Modules/MSink/Point.cc \
Modules/MSink/Smear.cc \
Modules/MSolver/A2AVectors.cc \
Modules/MSolver/A2AAslashVectors.cc \
Modules/MSolver/RBPrecCG.cc \
Modules/MSolver/MixedPrecisionRBPrecCG.cc \
Modules/MSolver/LocalCoherenceLanczos.cc \
Modules/MGauge/StoutSmearing.cc \
Modules/MGauge/Unit.cc \
Modules/MGauge/Electrify.cc \
Modules/MGauge/UnitEm.cc \
Modules/MGauge/StochEm.cc \
Modules/MGauge/Random.cc \
Modules/MGauge/FundtoHirep.cc \
Modules/MGauge/GaugeFix.cc \
Modules/MNoise/TimeDilutedSpinColorDiagonal.cc \
Modules/MNoise/FullVolumeSpinColorDiagonal.cc \
Modules/MUtilities/RandomVectors.cc \
Modules/MUtilities/TestSeqGamma.cc \
Modules/MUtilities/PrecisionCast.cc \
@ -64,7 +67,8 @@ modules_cc =\
Modules/MIO/LoadBinary.cc \
Modules/MIO/LoadNersc.cc \
Modules/MIO/LoadCoarseEigenPack.cc \
-Modules/MIO/LoadCosmHol.cc
+Modules/MIO/LoadCosmHol.cc \
+Modules/MIO/LoadA2AVectors.cc
modules_hpp =\
Modules/MContraction/Baryon.hpp \
@ -93,14 +97,17 @@ modules_hpp =\
Modules/MSolver/Guesser.hpp \
Modules/MSolver/RBPrecCG.hpp \
Modules/MSolver/A2AVectors.hpp \
Modules/MSolver/A2AAslashVectors.hpp \
Modules/MGauge/UnitEm.hpp \
Modules/MGauge/StoutSmearing.hpp \
Modules/MGauge/Unit.hpp \
Modules/MGauge/Electrify.hpp \
Modules/MGauge/Random.hpp \
Modules/MGauge/GaugeFix.hpp \
Modules/MGauge/FundtoHirep.hpp \
Modules/MGauge/StochEm.hpp \
Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp \
Modules/MNoise/FullVolumeSpinColorDiagonal.hpp \
Modules/MUtilities/PrecisionCast.hpp \
Modules/MUtilities/RandomVectors.hpp \
Modules/MUtilities/TestSeqGamma.hpp \
@ -134,6 +141,7 @@ modules_hpp =\
Modules/MScalarSUN/TrKinetic.hpp \
Modules/MIO/LoadEigenPack.hpp \
Modules/MIO/LoadNersc.hpp \
Modules/MIO/LoadA2AVectors.hpp \
Modules/MIO/LoadCosmHol.hpp \
Modules/MIO/LoadCoarseEigenPack.hpp \
Modules/MIO/LoadBinary.hpp


@ -3,9 +3,6 @@
#define MSG std::cout << GridLogMessage
#define SEP \
"============================================================================="
-#ifndef BENCH_IO_LMAX
-#define BENCH_IO_LMAX 40
-#endif
using namespace Grid;
using namespace QCD;
@ -41,7 +38,7 @@ int main (int argc, char ** argv)
int64_t threads = GridThread::GetThreads();
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark Lime write" << std::endl;
MSG << "Benchmark double precision Lime write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
@ -49,7 +46,8 @@ int main (int argc, char ** argv)
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
}
MSG << "Benchmark Lime read" << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark double precision Lime read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
@ -57,6 +55,24 @@ int main (int argc, char ** argv)
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb);
}
MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
}
MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
}
Grid_finalize();
return EXIT_SUCCESS;


@ -123,10 +123,13 @@ case ${ac_SFW_FP16} in
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
esac
-############### MKL
+############### Intel libraries
AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
[ac_MKL=${enable_mkl}], [ac_MKL=no])
AC_ARG_ENABLE([ipp],
[AC_HELP_STRING([--enable-ipp=yes|no|prefix], [enable Intel IPP for fast CRC32C])],
[ac_IPP=${enable_ipp}], [ac_IPP=no])
case ${ac_MKL} in
no)
@ -139,6 +142,17 @@ case ${ac_MKL} in
AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);;
esac
case ${ac_IPP} in
no)
;;
yes)
AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
*)
AM_CXXFLAGS="-I$ac_IPP/include $AM_CXXFLAGS"
AM_LDFLAGS="-L$ac_IPP/lib $AM_LDFLAGS"
AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
esac
############### HDF5
AC_ARG_WITH([hdf5],
[AS_HELP_STRING([--with-hdf5=prefix],
@ -170,7 +184,13 @@ AC_CHECK_FUNCS([gettimeofday])
if test "${ac_MKL}x" != "nox"; then
AC_SEARCH_LIBS([mkl_set_interface_layer], [mkl_rt], [],
[AC_MSG_ERROR("MKL enabled but library not found")])
[AC_MSG_ERROR("Intel MKL enabled but library not found")])
fi
if test "${ac_IPP}x" != "nox"; then
AC_SEARCH_LIBS([ippsCRC32C_8u], [ippdc],
[LIBS="${LIBS} -lippdc -lippvm -lipps -lippcore"],
[AC_MSG_ERROR("Intel IPP enabled but library not found")])
fi
AC_SEARCH_LIBS([__gmpf_init], [gmp],
@@ -485,6 +505,7 @@ DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
############### Output
cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
GRID_CXX="$CXX"
GRID_CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
GRID_LDFLAGS="$AM_LDFLAGS $LDFLAGS"
GRID_LIBS=$LIBS
@@ -497,6 +518,7 @@ AM_LDFLAGS="-L${cwd}/Grid $AM_LDFLAGS"
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_CXXFLAGS])
AC_SUBST([AM_LDFLAGS])
AC_SUBST([GRID_CXX])
AC_SUBST([GRID_CXXFLAGS])
AC_SUBST([GRID_LDFLAGS])
AC_SUBST([GRID_LIBS])
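For context, the USE_IPP macro defined above guards the fast CRC32C path in the I/O code. A minimal sketch of such a guard, assuming the conventional CRC32C seed and final complement; only the ippsCRC32C_8u symbol is taken from the library check above, everything else is illustrative:
#ifdef USE_IPP
#include <ipp.h>
#endif
#include <cstdint>
#include <cstddef>

inline uint32_t crc32c(const void *data, size_t bytes)
{
#ifdef USE_IPP
  // hardware-accelerated path; the seed/final-XOR convention is an assumption
  Ipp32u crc = 0xFFFFFFFFu;
  ippsCRC32C_8u(reinterpret_cast<const Ipp8u *>(data),
                static_cast<Ipp32u>(bytes), &crc);
  return ~crc;
#else
  // portable bitwise fallback, reflected Castagnoli polynomial 0x82F63B78
  uint32_t crc = 0xFFFFFFFFu;
  const uint8_t *p = static_cast<const uint8_t *>(data);
  for (size_t i = 0; i < bytes; ++i)
  {
    crc ^= p[i];
    for (int k = 0; k < 8; ++k)
      crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
  }
  return ~crc;
#endif
}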

View File

@@ -61,6 +61,10 @@ while test $# -gt 0; do
echo @GRID_CXXFLAGS@
;;
--cxx)
echo @GRID_CXX@
;;
--ldflags)
echo @GRID_LDFLAGS@
;;

View File

@@ -0,0 +1,104 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef LatticeComplex ComplexField;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
int nd = latt_size.size();
int ndm1 = nd-1;
std::vector<int> simd_layout = GridDefaultSimd(nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (mpi_layout.size(),1);
std::cout << " Full " << GridCmdVectorIntToString(latt_size) << " subgrid" <<std::endl;
std::cout << " Full " << GridCmdVectorIntToString(mpi_layout) << " sub communicator"<<std::endl;
std::cout << " Full " << GridCmdVectorIntToString(simd_layout)<< " simd layout " <<std::endl;
GridCartesian * GridN = new GridCartesian(latt_size,
simd_layout,
mpi_layout);
std::vector<int> latt_m = latt_size; latt_m[nd-1] = 1;
std::vector<int> mpi_m = mpi_layout; mpi_m [nd-1] = 1;
std::vector<int> simd_m = GridDefaultSimd(ndm1,vComplex::Nsimd()); simd_m.push_back(1);
std::cout << " Requesting " << GridCmdVectorIntToString(latt_m)<< " subgrid" <<std::endl;
std::cout << " Requesting " << GridCmdVectorIntToString(mpi_m) << " sub communicator"<<std::endl;
std::cout << " Requesting " << GridCmdVectorIntToString(simd_m)<< " simd layout " <<std::endl;
GridCartesian * Grid_m = new GridCartesian(latt_m,
simd_m,
mpi_m,
*GridN);
Complex C(1.0);
Complex tmp;
ComplexField Full(GridN); Full = C;
ComplexField Full_cpy(GridN);
ComplexField Split(Grid_m);Split= C;
std::cout << GridLogMessage<< " Full volume "<< norm2(Full) <<std::endl;
std::cout << GridLogMessage<< " Split volume "<< norm2(Split) <<std::endl;
tmp=C;
GridN->GlobalSum(tmp);
std::cout << GridLogMessage<< " Full nodes "<< tmp <<std::endl;
tmp=C;
Grid_m->GlobalSum(tmp);
std::cout << GridLogMessage<< " Split nodes "<< tmp <<std::endl;
GridN->Barrier();
auto local_latt = GridN->LocalDimensions();
Full_cpy = zero;
std::vector<int> seeds({1,2,3,4});
GridParallelRNG RNG(GridN); RNG.SeedFixedIntegers(seeds);
random(RNG,Full);
for(int t=0;t<local_latt[nd-1];t++){
ExtractSliceLocal(Split,Full,0,t,Tp);
InsertSliceLocal (Split,Full_cpy,0,t,Tp);
}
Full_cpy = Full_cpy - Full;
std::cout << " NormFull " << norm2(Full)<<std::endl;
std::cout << " NormDiff " << norm2(Full_cpy)<<std::endl;
Grid_finalize();
}
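The two norms printed at the end are the whole test: if ExtractSliceLocal and InsertSliceLocal are mutually inverse, the difference field must vanish identically. A stricter form of the final check, as a sketch (assert requires <cassert>):
// Sketch: the slice round trip is pure data movement, so the residual
// must be exactly zero, not merely small.
assert(norm2(Full_cpy) == 0.0);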

View File

@@ -72,6 +72,7 @@ int main(int argc, char *argv[])
// set fermion boundary conditions to be periodic space, antiperiodic time.
std::string boundary = "1 1 1 -1";
std::string twist = "0. 0. 0. 0.";
//stochastic photon field
MGauge::StochEm::Par photonPar;
@@ -90,6 +91,7 @@
actionPar.M5 = 1.8;
actionPar.mass = mass[i];
actionPar.boundary = boundary;
actionPar.twist = "0. 0. 0. 0.";
application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);

View File

@@ -126,6 +126,7 @@ inline void makeWilsonAction(Application &application, std::string actionName,
actionPar.gauge = gaugeField;
actionPar.mass = mass;
actionPar.boundary = boundary;
actionPar.twist = "0. 0. 0. 0.";
application.createModule<MAction::Wilson>(actionName, actionPar);
}
}
@@ -154,6 +155,7 @@ inline void makeDWFAction(Application &application, std::string actionName,
actionPar.M5 = M5;
actionPar.mass = mass;
actionPar.boundary = boundary;
actionPar.twist = "0. 0. 0. 0.";
application.createModule<MAction::DWF>(actionName, actionPar);
}
}
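The twist string added in these helpers carries one twist angle per direction, in the same four-component format as the boundary string next to it. A hypothetical nonzero setting (illustrative values only, same helpers as above):
// Hypothetical: spatial twists only, none in time; values are illustrative.
actionPar.twist = "0.5 0.5 0.5 0.";
application.createModule<MAction::DWF>(actionName, actionPar);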

View File

@@ -66,6 +66,7 @@ int main(int argc, char *argv[])
// set fermion boundary conditions to be periodic space, antiperiodic time.
std::string boundary = "1 1 1 -1";
std::string twist = "0. 0. 0. 0.";
// sink
MSink::Point::Par sinkPar;
@@ -80,6 +81,7 @@
actionPar.M5 = 1.8;
actionPar.mass = mass[i];
actionPar.boundary = boundary;
actionPar.twist = twist;
application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
// solvers

View File

@@ -72,6 +72,7 @@ int main(int argc, char *argv[])
// set fermion boundary conditions to be periodic space, antiperiodic time.
std::string boundary = "1 1 1 -1";
std::string twist = "0. 0. 0. 0.";
for (unsigned int i = 0; i < flavour.size(); ++i)
{
@@ -82,6 +83,7 @@
actionPar.M5 = 1.8;
actionPar.mass = mass[i];
actionPar.boundary = boundary;
actionPar.twist = twist;
application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
// solvers

View File

@@ -38,6 +38,7 @@ int main (int argc, char ** argv)
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
double stp=1.0e-5;
const int Ls=4;
Grid_init(&argc,&argv);
@@ -197,7 +198,7 @@ int main (int argc, char ** argv)
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
ConjugateGradient<FermionField> CG((1.0e-2),10000);
ConjugateGradient<FermionField> CG((stp),10000);
s_res = zero;
CG(HermOp,s_src,s_res);
@@ -227,5 +228,11 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<<" resid["<<n<<"] "<< norm2(tmp)/norm2(src[n])<<std::endl;
}
for(int s=0;s<nrhs;s++) result[s]=zero;
int blockDim = 0;//not used for BlockCGVec
BlockConjugateGradient<FermionField> BCGV (BlockCGVec,blockDim,stp,10000);
BCGV.PrintInterval=10;
BCGV(HermOpCk,src,result);
Grid_finalize();
}
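Collected from the code above, the two multi-RHS strategies this test exercises differ only in the call pattern; the comments here are explanatory additions, not from the source:
// trivially parallel: each split communicator solves one right hand side
CG(HermOp, s_src, s_res);
// vectorised block CG: all right hand sides solved together on the full
// grid, sharing a common search space
BlockConjugateGradient<FermionField> BCGV(BlockCGVec, blockDim, stp, 10000);
BCGV(HermOpCk, src, result);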

View File

@@ -0,0 +1,220 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename MobiusFermionR::FermionField FermionField;
typedef typename MobiusFermionR::ComplexField ComplexField;
typename MobiusFermionR::ImplParams params;
const int Ls=12;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (mpi_layout.size(),1);
std::vector<int> split_coor (mpi_layout.size(),1);
std::vector<int> split_dim (mpi_layout.size(),1);
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
/////////////////////////////////////////////
// Split into 1^4 mpi communicators
/////////////////////////////////////////////
for(int i=0;i<argc;i++){
if(std::string(argv[i]) == "--split"){
for(int k=0;k<mpi_layout.size();k++){
std::stringstream ss;
ss << argv[i+1+k];
ss >> mpi_split[k];
}
break;
}
}
double stp = 1.e-8;
int nrhs = 1;
int me;
for(int i=0;i<mpi_layout.size();i++){
// split_dim[i] = (mpi_layout[i]/mpi_split[i]);
nrhs *= (mpi_layout[i]/mpi_split[i]);
// split_coor[i] = FGrid._processor_coor[i]/mpi_split[i];
}
std::cout << GridLogMessage << "Creating split grids " <<std::endl;
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
mpi_split,
*UGrid,me);
std::cout << GridLogMessage <<"Creating split ferm grids " <<std::endl;
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
std::cout << GridLogMessage <<"Creating split rb grids " <<std::endl;
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
std::cout << GridLogMessage <<"Creating split ferm rb grids " <<std::endl;
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);
std::cout << GridLogMessage << "Made the grids"<<std::endl;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG5,src[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
if(0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " "<<file<<" successfully read" <<std::endl;
} else {
GridParallelRNG pRNG(UGrid );
std::cout << GridLogMessage << "Intialising 4D RNG "<<std::endl;
pRNG.SeedFixedIntegers(seeds);
std::cout << GridLogMessage << "Intialised 4D RNG "<<std::endl;
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
std::cout << " Site zero "<< Umu._odata[0] <<std::endl;
}
/////////////////
// MPI only sends
/////////////////
LatticeGaugeField s_Umu(SGrid);
FermionField s_src(SFGrid);
FermionField s_tmp(SFGrid);
FermionField s_res(SFGrid);
std::cout << GridLogMessage << "Made the split grid fields"<<std::endl;
///////////////////////////////////////////////////////////////
// split the source out using MPI instead of I/O
///////////////////////////////////////////////////////////////
Grid_split (Umu,s_Umu);
Grid_split (src,s_src);
std::cout << GridLogMessage << " split rank " <<me << " s_src "<<norm2(s_src)<<std::endl;
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
// RealD mass=0.00107;
RealD mass=0.1;
RealD M5=1.8;
RealD mobius_factor=32./12.;
RealD mobius_b=0.5*(mobius_factor+1.);
RealD mobius_c=0.5*(mobius_factor-1.);
MobiusFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,mobius_b,mobius_c,params);
MobiusFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,mobius_b,mobius_c,params);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<MobiusFermionR,FermionField> HermOp(Ddwf);
MdagMLinearOperator<MobiusFermionR,FermionField> HermOpCk(Dchk);
ConjugateGradient<FermionField> CG((stp),100000);
s_res = zero;
CG(HermOp,s_src,s_res);
std::cout << GridLogMessage << " split residual norm "<<norm2(s_res)<<std::endl;
/////////////////////////////////////////////////////////////
// Report how long they all took
/////////////////////////////////////////////////////////////
std::vector<uint32_t> iterations(nrhs,0);
iterations[me] = CG.IterationsToComplete;
for(int n=0;n<nrhs;n++){
UGrid->GlobalSum(iterations[n]);
std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
}
/////////////////////////////////////////////////////////////
// Gather and residual check on the results
/////////////////////////////////////////////////////////////
std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
Grid_unsplit(result,s_res);
std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
for(int n=0;n<nrhs;n++){
std::cout << GridLogMessage<< " res["<<n<<"] norm "<<norm2(result[n])<<std::endl;
HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
std::cout << GridLogMessage<<" resid["<<n<<"] "<< std::sqrt(norm2(tmp)/norm2(src[n]))<<std::endl;
}
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
{
BCGV(HermOpCk,src,result);
}
Grid_finalize();
}

View File

@@ -0,0 +1,144 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename DomainWallFermionR::FermionField FermionField;
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
const int Ls=16;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
double stp = 1.e-8;
int nrhs = 2;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG5,src[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
int conf = 0;
if(conf==0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
} else if (conf==1){
GridParallelRNG pRNG(UGrid );
pRNG.SeedFixedIntegers(seeds);
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
} else {
SU3::ColdConfiguration(Umu);
std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
}
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
RealD mass=0.01;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
ConjugateGradient<FermionField> CG((stp),100000);
for(int rhs=0;rhs<1;rhs++){
result[rhs] = zero;
CG(HermOp,src[rhs],result[rhs]);
}
for(int rhs=0;rhs<1;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
{
BCGV(HermOp,src,result);
}
for(int rhs=0;rhs<nrhs;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
Grid_finalize();
}

View File

@@ -0,0 +1,148 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename DomainWallFermionR::FermionField FermionField;
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
const int Ls=16;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
double stp = 1.e-8;
int nrhs = 2;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src4(nrhs,UGrid);
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG4(UGrid); pRNG4.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG4,src4[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
int conf = 0;
if(conf==0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
} else if (conf==1){
GridParallelRNG pRNG(UGrid );
pRNG.SeedFixedIntegers(seeds);
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
} else {
SU3::ColdConfiguration(Umu);
std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
}
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
RealD mass=0.01;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
for(int s=0;s<nrhs;s++) {
Ddwf.ImportPhysicalFermionSource(src4[s],src[s]);
}
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
ConjugateGradient<FermionField> CG((stp),100000);
for(int rhs=0;rhs<1;rhs++){
result[rhs] = zero;
// CG(HermOp,src[rhs],result[rhs]);
}
for(int rhs=0;rhs<1;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
{
BCGV(HermOp,src,result);
}
for(int rhs=0;rhs<nrhs;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
Grid_finalize();
}

View File

@@ -0,0 +1,147 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename DomainWallFermionR::FermionField FermionField;
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
const int Ls=16;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
double stp = 1.e-8;
int nrhs = 4;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG5,src[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
int conf = 2;
if(conf==0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
} else if (conf==1){
GridParallelRNG pRNG(UGrid );
pRNG.SeedFixedIntegers(seeds);
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
} else {
SU3::ColdConfiguration(Umu);
std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
}
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
RealD mass=0.01;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
ConjugateGradient<FermionField> CG((stp),100000);
for(int rhs=0;rhs<1;rhs++){
result[rhs] = zero;
CG(HermOp,src[rhs],result[rhs]);
}
for(int rhs=0;rhs<1;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
{
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
SchurRedBlackDiagTwoSolve<FermionField> SchurSolver(BCGV);
SchurSolver(Ddwf,src,result);
}
for(int rhs=0;rhs<nrhs;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
Grid_finalize();
}

View File

@@ -67,7 +67,22 @@ int main (int argc, char ** argv)
GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds);
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
FermionField src(FGrid); random(pRNG5,src);
FermionField src(FGrid);
FermionField tt(FGrid);
#if 1
random(pRNG5,src);
#else
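// debug alternative: load spin-colour component 0 with the x+y site coordinates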
src=zero;
ComplexField coor(FGrid);
LatticeCoordinate(coor,0);
for(int ss=0;ss<FGrid->oSites();ss++){
src._odata[ss]()()(0)=coor._odata[ss]()()();
}
LatticeCoordinate(coor,1);
for(int ss=0;ss<FGrid->oSites();ss++){
src._odata[ss]()()(0)+=coor._odata[ss]()()();
}
#endif
FermionField src_o(FrbGrid); pickCheckerboard(Odd,src_o,src);
FermionField result_o(FrbGrid); result_o=zero;
RealD nrm = norm2(src);
@@ -89,7 +104,8 @@ int main (int argc, char ** argv)
ConjugateGradient<FermionField> CG(1.0e-8,10000);
int blockDim = 0;
BlockConjugateGradient<FermionField> BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> BCG (BlockCG,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> BCG (BlockCGrQ,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> BCGv (BlockCGrQVec,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> mCG (CGmultiRHS,blockDim,1.0e-8,10000);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
@@ -158,7 +174,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << " Calling Block CGrQ for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
@@ -176,6 +192,49 @@ int main (int argc, char ** argv)
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
{
double t1=usecond();
BCG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=BCG.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling BCGvec "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::vector<FermionField> src_v (Ls,UrbGrid);
std::vector<FermionField> result_v(Ls,UrbGrid);
for(int s=0;s<Ls;s++) result_v[s] = zero;
for(int s=0;s<Ls;s++) {
FermionField src4(UGrid);
ExtractSlice(src4,src,s,0);
pickCheckerboard(Odd,src_v[s],src4);
}
{
double t1=usecond();
BCGv(HermOp4d,src_v,result_v);
double t2=usecond();
double ncall=BCGv.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
// HermOp4d.Report();
}
Grid_finalize();
}