Reorganise a little to let the PV inverter be defined outside the Reconstruct class.

This lets the multiple choices for PV inversion be composed without changing the routine and without any if/else case enumeration. Implemented SchurDiagMooee PV inversion (red-black) and Unprec PV inversion. Red-black cuts the iteration count from 190 to 90 at 10^-12 on an 8^4 test system. Will revisit multiple Schur options and add a Fourier-based multishift PV inverse, similar to the one Rudy Arthur did in BFM.
4d 5d reconstruction code & test
2025-09-19 17:51:04 +01:00 · 2018-10-10 13:22:01 +01:00 · 2018-10-09 18:37:20 +01:00 · 2018-10-09 17:41:56 +01:00
157 changed files with 2251 additions and 12203 deletions
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -48,16 +48,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 #include <Grid/algorithms/iterative/MinimalResidual.h>
 #include <Grid/algorithms/iterative/GeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/CoarsenedMatrix.h>
 #include <Grid/algorithms/FFT.h>
 // EigCg
 // Pcg
 // Hdcg
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -211,7 +211,6 @@ namespace Grid {
      for(int b=0;b<nn;b++){
 	subspace[b] = zero;
 	gaussian(RNG,noise);
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;
@@ -297,57 +296,12 @@ namespace Grid {
    };
    RealD Mdag (const CoarseVector &in, CoarseVector &out){ 
-      // // corresponds to Petrov-Galerkin coarsening
+      return M(in,out);
      // return M(in,out);
      // corresponds to Galerkin coarsening
      CoarseVector tmp(Grid());
      G5C(tmp, in);
      M(tmp, out);
      G5C(out, out);
      return norm2(out);
    };
-    void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
+    // Defer support for further coarsening for now
-
+    void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
-      conformable(_grid,in._grid);
+    void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};
      conformable(in._grid,out._grid);
      SimpleCompressor<siteVector> compressor;
      Stencil.HaloExchange(in,compressor);
      auto point = [dir, disp](){
        if(dir == 0 and disp == 0)
          return 8;
        else
          return (4 * dir + 1 - disp) / 2;
      }();
      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
        siteVector nbr;
        int ptype;
        StencilEntry *SE;
        SE=Stencil.GetEntry(ptype,point,ss);
        if(SE->_is_local&&SE->_permute) {
          permute(nbr,in._odata[SE->_offset],ptype);
        } else if(SE->_is_local) {
          nbr = in._odata[SE->_offset];
        } else {
          nbr = Stencil.CommBuf()[SE->_offset];
        }
        res = res + A[point]._odata[ss]*nbr;
        vstream(out._odata[ss],res);
      }
    };
    void Mdiag(const CoarseVector &in, CoarseVector &out){
      Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
    };
    CoarsenedMatrix(GridCartesian &CoarseGrid) 	: 
@@ -463,7 +417,7 @@ namespace Grid {
      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
      //      ForceHermitian();
-      // AssertHermitian();
+      AssertHermitian();
      // ForceDiagonal();
    }
    void ForceDiagonal(void) {
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -380,12 +380,6 @@ namespace Grid {
    // Abstract interface for an algorithm (e.g. an iterative solver) that is applied
    // to a linear operator: given Linop and an input field it fills an output field.
    template<class Field> class OperatorFunction {
    public:
      // Polymorphic base: make destruction through a base pointer/reference well defined.
      virtual ~OperatorFunction() = default;
      // Single-field application; must be supplied by every derived class.
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
      // Multi-field application. The default simply applies the single-field
      // overload to each (in[k], out[k]) pair in order; in and out must have equal size.
      virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
        assert(in.size()==out.size());
        // Use the container's own unsigned index type to avoid a signed/unsigned comparison.
        typedef typename std::vector<Field>::size_type index_type;
        for(index_type k=0;k<in.size();k++){
          (*this)(Linop,in[k],out[k]);
        }
      };
    };
    template<class Field> class LinearFunction {
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -55,14 +55,6 @@ namespace Grid {
    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
    public:
      virtual GridBase *RedBlackGrid(void)=0;
      //////////////////////////////////////////////////////////////////////
      // Query the even even properties to make algorithmic decisions
      //////////////////////////////////////////////////////////////////////
      virtual RealD  Mass(void)        { return 0.0; };
      virtual int    ConstEE(void)     { return 0; }; // Disable assumptions unless overridden
      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
      // half checkerboard operaions
      virtual  void Meooe    (const Field &in, Field &out)=0;
      virtual  void Mooee    (const Field &in, Field &out)=0;
--- a/Grid/algorithms/iterative/BlockConjugateGradient.h
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@@ -33,7 +33,7 @@ directory
 namespace Grid {
-enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
@@ -42,6 +42,7 @@ template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
 public:
  typedef typename Field::scalar_type scomplex;
  int blockDim ;
@@ -53,15 +54,21 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  Integer PrintInterval; //GridLogMessages or Iterative
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
+    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
  {};
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 void ThinQRfact (Eigen::MatrixXcd &m_rr,
 		 Eigen::MatrixXcd &C,
 		 Eigen::MatrixXcd &Cinv,
 		 Field & Q,
 		 const Field & R)
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  //Dimensions
  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
@@ -78,20 +85,22 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  // Cdag C = Rdag R ; passes.
  // QdagQ  = 1      ; passes
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 void ThinQRfact (Eigen::MatrixXcd &m_rr,
 		 Eigen::MatrixXcd &C,
 		 Eigen::MatrixXcd &Cinv,
 		 Field & Q,
 		 const Field & R)
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  sliceInnerProductMatrix(m_rr,R,R,Orthog);
  // Force manifest hermitian to avoid rounding related
  m_rr = 0.5*(m_rr+m_rr.adjoint());
-  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+#if 0
  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
  auto  D_ldlt = m_rr.ldlt().vectorD(); 
  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
 #endif
  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
  C    = L.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -103,25 +112,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceMulMatrix(Q,Cinv,R,Orthog);
 }
 // see comments above
 // Vector-of-fields overload of the thin QR factorisation.
 //   m_rr : on exit, the symmetrised inner-product matrix of R with itself
 //   C    : on exit, adjoint of the lower Cholesky factor of m_rr
 //   Cinv : on exit, inverse of C
 //   Q    : on exit, MulMatrix(Q,Cinv,R), i.e. R combined with the coefficients of Cinv
 //   R    : input block of vectors
 void ThinQRfact (Eigen::MatrixXcd &m_rr, Eigen::MatrixXcd &C, Eigen::MatrixXcd &Cinv,
                  std::vector<Field> & Q, const std::vector<Field> & R)
 {
   // Inner-product matrix of the block with itself, then force it manifestly hermitian
   InnerProductMatrix(m_rr,R,R);
   m_rr = 0.5*(m_rr+m_rr.adjoint());

   // Cholesky (LL^dag) factor of m_rr; C is its adjoint
   Eigen::MatrixXcd lowerFactor = m_rr.llt().matrixL();
   C    = lowerFactor.adjoint();
   Cinv = C.inverse();

   // Q built from R and Cinv
   MulMatrix(Q,Cinv,R);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Call one of several implementations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -129,20 +119,14 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 {
  if ( CGtype == BlockCGrQ ) {
    BlockCGrQsolve(Linop,Src,Psi);
  } else if (CGtype == BlockCG ) {
    BlockCGsolve(Linop,Src,Psi);
  } else if (CGtype == CGmultiRHS ) {
    CGmultiRHSsolve(Linop,Src,Psi);
  } else {
    assert(0);
  }
 }
 // Multi-field entry point: only the BlockCGrQVec solver type is supported here;
 // any other CGtype trips an assert.
 virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) 
 {
   if ( CGtype != BlockCGrQVec ) {
     assert(0);
     return;
   }
   BlockCGrQsolveVec(Linop,Src,Psi);
 }
 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQ implementation:
@@ -155,8 +139,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = B._grid->_fdimensions[Orthog];
-/* FAKE */
+
  Nblock=8;
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
  X.checkerboard = B.checkerboard;
@@ -219,10 +202,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
  Linop.HermOp(X, AD);
  tmp = B - AD;  
-
+  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;
  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -244,12 +232,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
@@ -267,7 +257,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    //7. D  = Q + D S^dag
    m_tmp = m_S.adjoint();
    sliceMaddTimer.Start();
    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
    sliceMaddTimer.Stop();
@@ -328,6 +317,152 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient following the O'Leary recurrences listed in the comment below.
 //   Linop : operator; only Linop.HermOp is applied
 //   Src   : right-hand side(s); the block index is dimension blockDim of the grid
 //   Psi   : on entry the initial guess, updated in place towards the solution
 // Nblock is read from the full grid extent along blockDim.
 // Convergence is declared when, for every block b, real(m_rr(b,b))/ssq[b] < Tolerance^2.
 // IterationsToComplete is set on exit; if the iteration limit is hit a message is
 // printed and, when ErrorOnNoConverge is set, an assert fires.
 void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = Src._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
  Psi.checkerboard = Src.checkerboard;
  conformable(Psi, Src);
  // Work fields are copy-constructed from Src; P and R are assigned before first use below
  Field P(Src);
  Field AP(Src);
  Field R(Src);
  // Nblock x Nblock coefficient matrices of the block recurrences
  Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  // Initial residual computation & set up
  std::vector<RealD> residuals(Nblock);
  std::vector<RealD> ssq(Nblock);
  // ssq[b]: per-block slice norm of the source, used to normalise the residuals; sssum is their total
  sliceNorm(ssq,Src,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  // Sanity checks: neither the source nor the initial guess may contain NaN
  sliceNorm(residuals,Src,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,Psi,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  // Initial search dir is guess
  Linop.HermOp(Psi, AP);
  /************************************************************************
   * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
   ************************************************************************
   * O'Leary : R = B - A X
   * O'Leary : P = M R ; preconditioner M = 1
   * O'Leary : alpha = PAP^{-1} RMR
   * O'Leary : beta  = RMR^{-1}_old RMR_new
   * O'Leary : X=X+Palpha
   * O'Leary : R_new=R_old-AP alpha
   * O'Leary : P=MR_new+P beta
   */
  R = Src - AP;  
  P = R;
  sliceInnerProductMatrix(m_rr,R,R,Orthog);
  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  int k;
  for (k = 1; k <= MaxIterations; k++) {
    // Report the summed diagonal of m_rr relative to the summed source norm
    RealD rrsum=0;
    for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
    MatrixTimer.Start();
    Linop.HermOp(P, AP);
    MatrixTimer.Stop();
    // Alpha
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
    sliceInnerTimer.Stop();
    m_pAp_inv = m_pAp.inverse();
    m_alpha   = m_pAp_inv * m_rr ;
    // Psi, R update
    sliceMaddTimer.Start();
    sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
    sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
    sliceMaddTimer.Stop();
    // Beta
    // The inverse must be taken of the OLD m_rr, before it is recomputed from the new R
    m_rr_inv = m_rr.inverse();
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_rr,R,R,Orthog);
    sliceInnerTimer.Stop();
    m_beta = m_rr_inv *m_rr;
    // Search update
    // AP is reused as scratch for the new search direction before being copied into P
    sliceMaddTimer.Start();
    sliceMaddMatrix(AP,m_beta,P,R,Orthog);
    sliceMaddTimer.Stop();
    P= AP;
    /*********************
     * convergence monitor
     *********************
     */
    // max_resid is the largest per-block squared residual relative to that block's source norm
    RealD max_resid=0;
    RealD rr;
    for(int b=0;b<Nblock;b++){
      rr = real(m_rr(b,b))/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
    if ( max_resid < Tolerance*Tolerance ) { 
      SolverTimer.Stop();
      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
 		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      // Recompute the residual explicitly from the final Psi for the "true residual" report
      Linop.HermOp(Psi, AP);
      AP = AP-Src;
      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
      IterationsToComplete = k;
      return;
    }
  }
  // Fell out of the loop: iteration limit reached without meeting the tolerance
  std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
 // Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
@@ -465,233 +600,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
  IterationsToComplete = k;
 }
 // Fill m with all pairwise inner products of the two blocks:
 // m(i,j) = innerProduct(X[i],Y[j]) for i,j in [0,Nblock).
 void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
   for(int row=0; row<Nblock; row++){
     const Field &left = X[row];
     for(int col=0; col<Nblock; col++){
       m(row,col) = innerProduct(left,Y[col]);
     }
   }
 }
 void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
  // Should make this cache friendly with site outermost, parallel_for
  // Deal with case AP aliases with either Y or X
  std::vector<Field> tmp(Nblock,X[0]);
  for(int b=0;b<Nblock;b++){
    tmp[b]   = Y[b];
    for(int bp=0;bp<Nblock;bp++) {
      tmp[b] = tmp[b] + (scale*m(bp,b))*X[bp]; 
    }
  }
  for(int b=0;b<Nblock;b++){
    AP[b] = tmp[b];
  }
 }
 void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
  // Should make this cache friendly with site outermost, parallel_for
  for(int b=0;b<Nblock;b++){
    AP[b] = zero;
    for(int bp=0;bp<Nblock;bp++) {
      AP[b] += (m(bp,b))*X[bp]; 
    }
  }
 }
 // Sum of norm2 over the first Nblock entries of P.
 double normv(const std::vector<Field> &P){
   double total = 0.0;
   int b = 0;
   while ( b < Nblock ) {
     total += norm2(P[b]);
     ++b;
   }
   return total;
 }
 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQvec implementation:
 //--------------------------
 // X is guess/Solution
 // B is RHS
 // Solve A X_i = B_i    ;        i refers to Nblock index
 ////////////////////////////////////////////////////////////////////////////
 // Block CG "rQ" variant operating on a std::vector of fields (one field per right-hand side).
 //   Linop : operator; only Linop.HermOp is applied, once per block vector
 //   B     : right-hand sides; Nblock is set to B.size()
 //   X     : on entry the initial guesses, updated in place; must have the same size as B
 // Convergence is declared when, for every block b, real(m_rr(b,b))/ssq[b] < Tolerance^2,
 // with m_rr = C^dag C rebuilt each iteration from the accumulated factor C.
 // IterationsToComplete is set on exit; if the iteration limit is hit a message is
 // printed and, when ErrorOnNoConverge is set, an assert fires.
 void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) 
 {
  Nblock = B.size();
  assert(Nblock == X.size());
  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
  // Every guess takes the checkerboard of its source and must be conformable with it and with X[0]
  for(int b=0;b<Nblock;b++){ 
    X[b].checkerboard = B[b].checkerboard;
    conformable(X[b], B[b]);
    conformable(X[b], X[0]); 
  }
  // Work vectors are filled with copies of B[0]
  // NOTE(review): presumably only to obtain correctly shaped fields - confirm each is assigned before being read
  Field Fake(B[0]);
  std::vector<Field> tmp(Nblock,Fake);
  std::vector<Field>   Q(Nblock,Fake);
  std::vector<Field>   D(Nblock,Fake);
  std::vector<Field>   Z(Nblock,Fake);
  std::vector<Field>  AD(Nblock,Fake);
  // Nblock x Nblock coefficient matrices of the recurrences listed below
  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  // Initial residual computation & set up
  std::vector<RealD> residuals(Nblock);
  std::vector<RealD> ssq(Nblock);
  RealD sssum=0;
  // ssq[b]: norm2 of each source, used to normalise the residuals; sssum is their total
  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  // Sanity checks: neither the sources nor the initial guesses may contain NaN
  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  /************************************************************************
   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
   ************************************************************************
   * Dimensions:
   *
   *   X,B==(Nferm x Nblock)
   *   A==(Nferm x Nferm)
   *  
   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
   * 
   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
   * for k: 
   *   Z  = AD
   *   M  = [D^dag Z]^{-1}
   *   X  = X + D MC
   *   QS = Q - ZM
   *   D  = Q + D S^dag
   *   C  = S C
   */
  ///////////////////////////////////////
  // Initial block: initial search dir is guess
  ///////////////////////////////////////
  std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
  for(int b=0;b<Nblock;b++) {
    Linop.HermOp(X[b], AD[b]);
    tmp[b] = B[b] - AD[b];  
  }
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  for(int b=0;b<Nblock;b++) D[b]=Q[b];
  std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch QRTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  int k;
  for (k = 1; k <= MaxIterations; k++) {
    //3. Z  = AD
    MatrixTimer.Start();
    for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);      
    MatrixTimer.Stop();
    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    InnerProductMatrix(m_DZ,D,Z);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
    sliceMaddTimer.Start();
    MaddMatrix(X,m_tmp, D,X);     
    sliceMaddTimer.Stop();
    //6. QS = Q - ZM
    // tmp receives Q - Z M; the QR factorisation below then splits it into the new Q and S
    sliceMaddTimer.Start();
    MaddMatrix(tmp,m_M,Z,Q,-1.0);
    sliceMaddTimer.Stop();
    QRTimer.Start();
    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
    QRTimer.Stop();
    //7. D  = Q + D S^dag
    m_tmp = m_S.adjoint();
    sliceMaddTimer.Start();
    MaddMatrix(D,m_tmp,D,Q);
    sliceMaddTimer.Stop();
    //8. C  = S C
    m_C = m_S*m_C;
    /*********************
     * convergence monitor
     *********************
     */
    // Residual estimate comes from the accumulated factor C rather than from an explicit residual
    m_rr = m_C.adjoint() * m_C;
    RealD max_resid=0;
    RealD rrsum=0;
    RealD rr;
    for(int b=0;b<Nblock;b++) {
      rrsum+=real(m_rr(b,b));
      rr = real(m_rr(b,b))/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
    if ( max_resid < Tolerance*Tolerance ) { 
      SolverTimer.Stop();
      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      // Recompute the residual explicitly from the final X for the "true residual" report
      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
      IterationsToComplete = k;
      return;
    }
  }
  // Fell out of the loop: iteration limit reached without meeting the tolerance
  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 };
 }
--- a/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
@@ -1,244 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted GMRES-style solver exposed through the OperatorFunction interface.
 // As the warning printed by operator() states, this implementation currently does
 // not differ from regular GMRES. Each restart cycle (outerLoopBody) builds a basis
 // with arnoldiStep, keeps the Hessenberg data triangular with Givens rotations
 // (qrUpdate), and adds the correction to psi in computeSolution.
 template<class Field>
 class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;
  Integer MaxIterations;
  Integer RestartLength;
  Integer MaxNumberOfRestarts;
  Integer IterationCount; // Number of iterations the CAGMRES took to finish,
                          // filled in upon completion
  // Timers are reset at the start of every operator() call and reported on convergence
  GridStopWatch MatrixTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  // H is indexed as H(iteration, basis index); see arnoldiStep / qrUpdate
  Eigen::MatrixXcd H;
  // y: solution coefficients, gamma: rotated residual vector, c/s: Givens rotation coefficients
  std::vector<std::complex<double>> y;
  std::vector<std::complex<double>> gamma;
  std::vector<std::complex<double>> c;
  std::vector<std::complex<double>> s;
  // tol            : relative tolerance (the target is tol^2 * norm2(src))
  // maxit          : cap on the total number of iterations
  // restart_length : iterations per restart cycle; MaxNumberOfRestarts is ceil(maxit/restart_length)
  // err_on_no_conv : assert if the solver does not converge
  CommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
                                                  Integer maxit,
                                                  Integer restart_length,
                                                  bool    err_on_no_conv = true)
      : Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , ErrorOnNoConverge(err_on_no_conv)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.) {};
  // Solve with LinOp.Op for right-hand side src; psi is the initial guess on entry and
  // is updated in place. Runs up to MaxNumberOfRestarts restart cycles and returns as
  // soon as a cycle reports cp <= rsq; otherwise logs non-convergence and asserts if
  // ErrorOnNoConverge is set.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    // The initial guess must not contain NaN
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq; // target squared residual
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual explicitly from psi for the "true residual" report
        // NOTE(review): axpy(r,-1.0,src,r) presumably forms r = r - src - confirm argument order
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle. Starts from the residual r = src - Op(psi), then performs up to
  // RestartLength Arnoldi + QR steps. The cycle ends (and psi is updated) when the restart
  // length is exhausted, the global iteration cap is reached, or cp <= rsq.
  // Returns cp = |gamma[i+1]|^2, the squared residual estimate of the last step.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    // this should probably be made a class member so that it is only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r = src - w;
    // First basis vector is the normalised initial residual; gamma[0] holds its norm
    gamma[0] = sqrt(norm2(r));
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]); // std::norm is the squared magnitude
      std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(v, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Extend the basis by one vector: w = Op(v[iter]) is orthogonalised against v[0..iter]
  // one vector at a time (w is updated after each projection), the coefficients are stored
  // in row iter of H, and the normalised remainder becomes v[iter+1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
    MatrixTimer.Start();
    LinOp.Op(v[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Apply the previously stored Givens rotations (c[i], s[i]) to the new row of H, then
  // compute and apply a new rotation that zeroes H(iter, iter+1), and rotate gamma accordingly.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      // tmp keeps the rotated H(iter,i+1) until H(iter,i) has been updated from the old values
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    // gamma[iter+1] must be formed from the old gamma[iter] before that entry is overwritten
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Back-substitute through the triangularised H to obtain the coefficients y[0..iter]
  // from gamma, then add the combination sum_i v[i]*y[i] to psi.
  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + v[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -133,7 +133,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-                << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+                << " residual " << cp << " target " << rsq << std::endl;
      // Stopping condition
      if (cp <= rsq) {
--- a/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
@@ -1,256 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted, preconditioned GMRES-type solver ("FCAGMRES") for LinOp * psi = src.
 //
 // In every Arnoldi step the user-supplied Preconditioner is applied to the
 // current basis vector before the operator, and the solution update is
 // assembled from the preconditioned vectors. As the warning printed by
 // operator() states, this class currently does not differ from regular FGMRES.
 //
 // Convergence is tested on squared norms: the solver stops once the squared
 // residual estimate drops to Tolerance^2 * |src|^2 or below.
 template<class Field>
 class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // Relative tolerance on the residual norm
  Integer MaxIterations;       // Upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // Number of Arnoldi steps between restarts
  Integer MaxNumberOfRestarts; // ceil(MaxIterations / RestartLength)
  Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch PrecTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  Eigen::MatrixXcd H;                      // H(j, i): coefficient of iteration j w.r.t. basis vector i
  std::vector<std::complex<double>> y;     // Solution coefficients of the small triangular system
  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients
  LinearFunction<Field> &Preconditioner;   // Held by reference; must outlive this object

  // restart_length is used as a divisor and must therefore be non-zero.
  // The initialisers are listed in member declaration order, which is the
  // order in which they are executed.
  FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
                                                          Integer maxit,
                                                          LinearFunction<Field> &Prec,
                                                          Integer restart_length,
                                                          bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.)
      , Preconditioner(Prec) {}

  // Solves LinOp * psi = src, using the incoming psi as the initial guess.
  // Runs up to MaxNumberOfRestarts restart cycles; on convergence the true
  // residual is recomputed and logged together with the timers. If no cycle
  // converges, a message is logged and, when ErrorOnNoConverge is set, the
  // solver asserts.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq; // target for the squared residual
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
    PrecTimer.Reset();
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual from scratch to report the true value
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }

  // Runs one restart cycle starting from the current psi and returns the
  // squared residual estimate reached. psi is updated in place. The cycle ends
  // after RestartLength steps, when MaxIterations is hit, or on convergence.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r  = src - w;
    cp = norm2(r);
    LinalgTimer.Stop();
    // The current iterate already meets the target (e.g. zero source with a
    // zero guess, or an exact initial guess). Return before normalising r:
    // dividing by a vanishing norm would fill the basis with NaNs and the
    // solver could never report convergence.
    if (cp <= rsq)
      return cp;
    // these should probably be made class members so that they are only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
    LinalgTimer.Start();
    gamma[0] = sqrt(cp);
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, z, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]); // squared magnitude, comparable to rsq
      std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(z, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }

  // One Arnoldi step: z[iter] = Preconditioner(v[iter]), w = LinOp * z[iter],
  // then w is orthogonalised against v[0..iter] (overlaps stored in
  // H(iter, .)) and its normalised remainder becomes v[iter + 1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
    PrecTimer.Start();
    Preconditioner(v[iter], z[iter]);
    PrecTimer.Stop();
    MatrixTimer.Start();
    LinOp.Op(z[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }

  // Applies the stored Givens rotations to the new entries H(iter, .),
  // computes the rotation that zeroes H(iter, iter + 1), and applies it to
  // gamma as well.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }

  // Back-substitutes for y[0..iter] and adds sum_i z[i] * y[i] to psi.
  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + z[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
@@ -1,254 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted flexible GMRES ("FGMRES") solver for LinOp * psi = src.
 //
 // In every Arnoldi step the user-supplied Preconditioner is applied to the
 // current basis vector before the operator, and the solution update is
 // assembled from the preconditioned vectors.
 //
 // Convergence is tested on squared norms: the solver stops once the squared
 // residual estimate drops to Tolerance^2 * |src|^2 or below.
 template<class Field>
 class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // Relative tolerance on the residual norm
  Integer MaxIterations;       // Upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // Number of Arnoldi steps between restarts
  Integer MaxNumberOfRestarts; // ceil(MaxIterations / RestartLength)
  Integer IterationCount; // Number of iterations the FGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch PrecTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  Eigen::MatrixXcd H;                      // H(j, i): coefficient of iteration j w.r.t. basis vector i
  std::vector<std::complex<double>> y;     // Solution coefficients of the small triangular system
  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients
  LinearFunction<Field> &Preconditioner;   // Held by reference; must outlive this object

  // restart_length is used as a divisor and must therefore be non-zero.
  // The initialisers are listed in member declaration order, which is the
  // order in which they are executed.
  FlexibleGeneralisedMinimalResidual(RealD   tol,
                                     Integer maxit,
                                     LinearFunction<Field> &Prec,
                                     Integer restart_length,
                                     bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.)
      , Preconditioner(Prec) {}

  // Solves LinOp * psi = src, using the incoming psi as the initial guess.
  // Runs up to MaxNumberOfRestarts restart cycles; on convergence the true
  // residual is recomputed and logged together with the timers. If no cycle
  // converges, a message is logged and, when ErrorOnNoConverge is set, the
  // solver asserts.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq; // target for the squared residual
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual:   src " << ssq   << std::endl;
    PrecTimer.Reset();
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual from scratch to report the true value
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }

  // Runs one restart cycle starting from the current psi and returns the
  // squared residual estimate reached. psi is updated in place. The cycle ends
  // after RestartLength steps, when MaxIterations is hit, or on convergence.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r  = src - w;
    cp = norm2(r);
    LinalgTimer.Stop();
    // The current iterate already meets the target (e.g. zero source with a
    // zero guess, or an exact initial guess). Return before normalising r:
    // dividing by a vanishing norm would fill the basis with NaNs and the
    // solver could never report convergence.
    if (cp <= rsq)
      return cp;
    // these should probably be made class members so that they are only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
    LinalgTimer.Start();
    gamma[0] = sqrt(cp);
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, z, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]); // squared magnitude, comparable to rsq
      std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(z, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }

  // One Arnoldi step: z[iter] = Preconditioner(v[iter]), w = LinOp * z[iter],
  // then w is orthogonalised against v[0..iter] (overlaps stored in
  // H(iter, .)) and its normalised remainder becomes v[iter + 1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
    PrecTimer.Start();
    Preconditioner(v[iter], z[iter]);
    PrecTimer.Stop();
    MatrixTimer.Start();
    LinOp.Op(z[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }

  // Applies the stored Givens rotations to the new entries H(iter, .),
  // computes the rotation that zeroes H(iter, iter + 1), and applies it to
  // gamma as well.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }

  // Back-substitutes for y[0..iter] and adds sum_i z[i] * y[i] to psi.
  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + z[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
@@ -1,242 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/GeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted GMRES solver for LinOp * psi = src (no preconditioner).
 //
 // Convergence is tested on squared norms: the solver stops once the squared
 // residual estimate drops to Tolerance^2 * |src|^2 or below.
 template<class Field>
 class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // Relative tolerance on the residual norm
  Integer MaxIterations;       // Upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // Number of Arnoldi steps between restarts
  Integer MaxNumberOfRestarts; // ceil(MaxIterations / RestartLength)
  Integer IterationCount; // Number of iterations the GMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  Eigen::MatrixXcd H;                      // H(j, i): coefficient of iteration j w.r.t. basis vector i
  std::vector<std::complex<double>> y;     // Solution coefficients of the small triangular system
  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients

  // restart_length is used as a divisor and must therefore be non-zero.
  // The initialisers are listed in member declaration order, which is the
  // order in which they are executed.
  GeneralisedMinimalResidual(RealD   tol,
                             Integer maxit,
                             Integer restart_length,
                             bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.) {}

  // Solves LinOp * psi = src, using the incoming psi as the initial guess.
  // Runs up to MaxNumberOfRestarts restart cycles; on convergence the true
  // residual is recomputed and logged together with the timers. If no cycle
  // converges, a message is logged and, when ErrorOnNoConverge is set, the
  // solver asserts.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq; // target for the squared residual
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "GeneralisedMinimalResidual:   src " << ssq   << std::endl;
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual from scratch to report the true value
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }

  // Runs one restart cycle starting from the current psi and returns the
  // squared residual estimate reached. psi is updated in place. The cycle ends
  // after RestartLength steps, when MaxIterations is hit, or on convergence.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r  = src - w;
    cp = norm2(r);
    LinalgTimer.Stop();
    // The current iterate already meets the target (e.g. zero source with a
    // zero guess, or an exact initial guess). Return before normalising r:
    // dividing by a vanishing norm would fill the basis with NaNs and the
    // solver could never report convergence.
    if (cp <= rsq)
      return cp;
    // this should probably be made a class member so that it is only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    LinalgTimer.Start();
    gamma[0] = sqrt(cp);
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]); // squared magnitude, comparable to rsq
      std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(v, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }

  // One Arnoldi step: w = LinOp * v[iter], then w is orthogonalised against
  // v[0..iter] (overlaps stored in H(iter, .)) and its normalised remainder
  // becomes v[iter + 1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
    MatrixTimer.Start();
    LinOp.Op(v[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }

  // Applies the stored Givens rotations to the new entries H(iter, .),
  // computes the rotation that zeroes H(iter, iter + 1), and applies it to
  // gamma as well.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }

  // Back-substitutes for y[0..iter] and adds sum_i v[i] * y[i] to psi.
  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + v[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/MinimalResidual.h
+++ b/Grid/algorithms/iterative/MinimalResidual.h
@@ -1,156 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/MinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_MINIMAL_RESIDUAL_H
 #define GRID_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Minimal residual (MR) iterative solver for Linop * psi = src.
 //
 // Every iteration moves psi along the current residual r by the step
 // a = overRelaxParam * <Mr, r> / |Mr|^2, where Mr = Linop * r. Convergence is
 // tested on squared norms: the solver stops once |r|^2 drops to
 // Tolerance^2 * |src|^2 or below.
 template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
                          // Defaults true.
  RealD   Tolerance;      // Relative tolerance on the residual norm
  Integer MaxIterations;  // Upper bound on the number of MR iterations
  RealD   overRelaxParam; // Factor multiplied into every step length
  Integer IterationsToComplete; // Number of iterations the MR took to finish.
                                // Filled in upon completion

  // The initialisers are listed in member declaration order, which is the
  // order in which they are executed.
  MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
    : ErrorOnNoConverge(err_on_no_conv)
    , Tolerance(tol)
    , MaxIterations(maxit)
    , overRelaxParam(ovrelparam)
    , IterationsToComplete(0) {}

  // Solves Linop * psi = src, using the incoming psi as the initial guess and
  // updating it in place. IterationsToComplete is set on every exit path.
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    Complex a, c;
    Real    d = 0;
    Field Mr(src);
    Field r(src);
    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq; // target for the squared residual
    Linop.Op(psi, Mr);
    r = src - Mr;
    RealD cp = norm2(r);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "MinimalResidual:   src " << ssq << std::endl;
    // No "mp" line here: d is not computed until the first iteration, so
    // printing it at this point would read a value that was never assigned.
    std::cout << GridLogIterative << "MinimalResidual:  cp,r " << cp << std::endl;
    if (cp <= rsq) {
      // The initial guess already satisfies the target: no iterations needed
      IterationsToComplete = 0;
      return;
    }
    std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      MatrixTimer.Start();
      Linop.Op(r, Mr);
      MatrixTimer.Stop();
      LinalgTimer.Start();
      c = innerProduct(Mr, r);
      d = norm2(Mr);
      a = c / d;
      a = a * overRelaxParam;
      psi = psi + r * a;
      r = r - Mr * a;
      cp = norm2(r);
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;
      std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual from scratch to report the true value
        Linop.Op(psi, Mr);
        r = src - Mr;
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "MinimalResidual Converged on iteration " << k
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Total   " << SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Matrix  " << MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Linalg  " << LinalgTimer.Elapsed() << std::endl;
        // Guard against an iterated residual that drifted far from the true one
        if (ErrorOnNoConverge)
          assert(true_residual / Tolerance < 10000.0);
        IterationsToComplete = k;
        return;
      }
    }
    std::cout << GridLogMessage << "MinimalResidual did NOT converge"
              << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
    // NOTE: the loop has run to completion, so k is MaxIterations + 1 here
    IterationsToComplete = k;
  }
 };
 } // namespace Grid
 #endif
--- a/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
@@ -1,273 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted flexible GMRES solver working on FieldD vectors, with the
 // preconditioner applied to FieldF copies living on SinglePrecGrid.
 // The template is only enabled when getPrecision<FieldD>::value == 2 and
 // getPrecision<FieldF>::value == 1 (presumably double and single precision).
 //
 // Usage: construct with tolerance, iteration cap, the FieldF grid, a
 // LinearFunction<FieldF> preconditioner and the restart length, then call
 // operator()(LinOp, src, psi). psi is used as the initial guess and is
 // overwritten with the solution.
 template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // relative residual target, |r| / |src|
  Integer MaxIterations;       // cap on the total number of Arnoldi steps
  Integer RestartLength;       // Arnoldi steps per restart cycle (must be > 0)
  Integer MaxNumberOfRestarts; // ceil(MaxIterations / RestartLength)
  Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch PrecTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  GridStopWatch ChangePrecTimer;
  // H(j, i) holds entry i of the j-th Hessenberg column, i.e. the first index
  // is the Arnoldi step. It is overwritten in place by the QR factor.
  Eigen::MatrixXcd H;
  std::vector<std::complex<double>> y;     // coefficients of the solution update
  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
  std::vector<std::complex<double>> c;     // Givens rotation "cosines"
  std::vector<std::complex<double>> s;     // Givens rotation "sines"
  GridBase* SinglePrecGrid;
  LinearFunction<FieldF> &Preconditioner;
  // tol            : relative residual target
  // maxit          : maximum total number of iterations
  // sp_grid        : grid on which the FieldF work vectors are allocated
  // Prec           : preconditioner, applied as Prec(in, out) on FieldF vectors
  // restart_length : number of Arnoldi steps before a restart
  // err_on_no_conv : assert(0) when the solver fails to converge
  // The initialisers are listed in member declaration order, which is the
  // order in which they actually run.
  MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD   tol,
                                                   Integer maxit,
                                                   GridBase * sp_grid,
                                                   LinearFunction<FieldF> &Prec,
                                                   Integer restart_length,
                                                   bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.)
      , SinglePrecGrid(sp_grid)
      , Preconditioner(Prec) {}
  // Solve LinOp * psi = src, starting from the guess held in psi.
  // Runs up to MaxNumberOfRestarts restart cycles and stops as soon as the
  // squared residual estimate drops to Tolerance^2 * |src|^2. On convergence
  // the true residual is recomputed and the timers are reported. Otherwise a
  // message is logged and, if ErrorOnNoConverge is set, assert(0) fires.
  void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    FieldD r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
    std::cout << GridLogIterative << "MPFGMRES:   src " << ssq   << std::endl;
    PrecTimer.Reset();
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    ChangePrecTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // r = M psi - src; only its norm is used below, so the sign is irrelevant
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "MPFGMRES: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total      " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon     " <<         PrecTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix     " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg     " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR         " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol    " << CompSolutionTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " <<   ChangePrecTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle: builds the residual of the current psi, performs up to
  // RestartLength Arnoldi steps and adds the resulting correction to psi.
  // Returns the squared residual norm (exact if the cycle exits before any
  // Arnoldi step, otherwise the |gamma[i+1]|^2 estimate).
  RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
    RealD cp = 0;
    FieldD w(src._grid);
    FieldD r(src._grid);
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r  = src - w;
    cp = norm2(r);
    LinalgTimer.Stop();
    // The current psi already meets the target (this includes a zero source or
    // an exact guess). Normalising r below would divide by gamma[0] == 0 in
    // that case and fill the Krylov basis with NaNs, so stop here.
    if (cp <= rsq)
      return cp;
    // these should probably be made class members so that they are only allocated once, not in every restart
    std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
    LinalgTimer.Start();
    gamma[0] = sqrt(cp);
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, z, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]);
      std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      // Leave the cycle on restart, on hitting the iteration cap, or on convergence
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(z, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Arnoldi step number iter: z[iter] = Prec(v[iter]) evaluated on FieldF
  // copies, w = LinOp * z[iter], then w is orthogonalised against
  // v[0..iter] one vector at a time, filling row iter of H, and the
  // normalised remainder becomes v[iter + 1].
  void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
    FieldF v_f(SinglePrecGrid);
    FieldF z_f(SinglePrecGrid);
    ChangePrecTimer.Start();
    precisionChange(v_f, v[iter]);
    precisionChange(z_f, z[iter]); // z_f enters the preconditioner holding the (zeroed) z[iter]
    ChangePrecTimer.Stop();
    PrecTimer.Start();
    Preconditioner(v_f, z_f);
    PrecTimer.Stop();
    ChangePrecTimer.Start();
    precisionChange(z[iter], z_f);
    ChangePrecTimer.Stop();
    MatrixTimer.Start();
    LinOp.Op(z[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Bring row iter of H to triangular form: apply the stored rotations
  // 0..iter-1, then build and apply a new rotation that zeroes
  // H(iter, iter + 1), and rotate gamma accordingly.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      // tmp keeps the new lower entry until the upper one has been updated
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    // gamma[iter + 1] must be formed before gamma[iter] is overwritten
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Back-substitute the triangular system for y[0..iter] and add the
  // correction sum_i z[i] * y[i] to psi.
  void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + z[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -139,10 +139,7 @@ namespace Grid {
      MatTimer.Start();
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
      MatTimer.Stop();
      LinalgTimer.Start();
      r=src-Az;
      LinalgTimer.Stop();
      /////////////////////
      // p = Prec(r)
@@ -155,10 +152,8 @@ namespace Grid {
      Linop.HermOp(z,tmp); 
      MatTimer.Stop();
      LinalgTimer.Start();
      ttmp=tmp;
      tmp=tmp-r;
      LinalgTimer.Stop();
      /*
      std::cout<<GridLogMessage<<r<<std::endl;
@@ -171,14 +166,12 @@ namespace Grid {
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
      MatTimer.Stop();
      LinalgTimer.Start();
      //p[0],q[0],qq[0] 
      p[0]= z;
      q[0]= Az;
      qq[0]= zAAz;
      cp =norm2(r);
      LinalgTimer.Stop();
      for(int k=0;k<nstep;k++){
@@ -188,14 +181,12 @@ namespace Grid {
 	int peri_k = k %mmax;
 	int peri_kp= kp%mmax;
        LinalgTimer.Start();
 	rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
 	a = rq/qq[peri_k];
 	axpy(psi,a,p[peri_k],psi);         
 	cp = axpy_norm(r,-a,q[peri_k],r);  
        LinalgTimer.Stop();
 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
@@ -211,8 +202,6 @@ namespace Grid {
 	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 	Linop.HermOp(z,tmp);
 	MatTimer.Stop();
        LinalgTimer.Start();
        tmp=tmp-r;
 	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
@@ -230,9 +219,9 @@ namespace Grid {
 	}
 	qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
        LinalgTimer.Stop();
      }
      }
      assert(0); // never reached
      return cp;
    }
--- a/Grid/algorithms/iterative/Reconstruct5Dprop.h
+++ b/Grid/algorithms/iterative/Reconstruct5Dprop.h
@@ -30,6 +30,49 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 namespace QCD {
 template<class Field>
 class PauliVillarsSolverUnprec
 {
 public:
  ConjugateGradient<Field> & CG;
  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    RealD m = _Matrix.Mass();
    Field A  (_Matrix.FermionGrid());
    MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
    _Matrix.SetMass(1.0);
    _Matrix.Mdag(src,A);
    CG(HermOp,A,sol);
    _Matrix.SetMass(m);
  };
 };
 // Pauli-Villars inverter using the red-black (SchurDiagMooee) solver.
 // Temporarily sets the mass of the supplied matrix to 1.0, runs
 // SchurRedBlackDiagMooeeSolve driven by the stored CG, and restores the
 // original mass afterwards.
 template<class Field>
 class PauliVillarsSolverRBprec
 {
 public:
  ConjugateGradient<Field> & CG; // solver handed to the Schur wrapper (not owned)
  PauliVillarsSolverRBprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
  // _Matrix : operator providing Mass/SetMass plus whatever the Schur solver needs
  // src     : right-hand side
  // sol     : receives the solution written by the Schur solver
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    // Remember the caller's mass so it can be put back at the end.
    // (The scratch field the unpreconditioned variant needs is not used
    // here, so none is allocated.)
    RealD m = _Matrix.Mass();
    _Matrix.SetMass(1.0);
    SchurRedBlackDiagMooeeSolve<Field> SchurSolver(CG);
    SchurSolver(_Matrix,src,sol);
    _Matrix.SetMass(m);
  };
 };
 template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 private:
  PVinverter & PauliVillarsSolver;
@@ -42,12 +85,20 @@ template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 // of the Mobius exact AMA corrections.
 //
 // TODO : understand absence of contact term in eqns in Hantao's thesis
- //        sol4 is contact term subtracted, but thesis & Brower's paper suggests not.
+ //        sol4 is contact term subtracted.
 //
- // Step 1: Localise PV inverse in a routine. [DONE]
+ // Options
 // a) Defect correction approach:
 //    1) Compute defect from current soln (initially guess).
 //       This is ...... outerToInner check !!!!
 //    2) Deflated Zmobius solve to get 4d soln
 //       Ensure deflation is working
 //    3) Refine 5d Outer using the inner 4d delta soln
 // 
 // Step 1: localise PV inverse in a routine. [DONE]
 // Step 2: Schur based PV inverse            [DONE]
- // Step 3: Fourier accelerated PV inverse    [DONE]
+ // Step 3: Fourier accelerated PV inverse
- //
+ // Step 4: 
 /////////////////////////////////////////////////////
  Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver) 
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -86,22 +86,228 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   */
 namespace Grid {
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Use base class to share code
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackBase {
+  // Now make the norm reflect extra factor of Mee
-  protected:
+  template<class Field> class SchurRedBlackStaggeredSolve {
-    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+  private:
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
  public:
-    SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
+    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  // Store the Hermitian red-black solver by reference, fix CBfactorise to 0
  // and record whether the initial guess is to be subtracted after the solve.
  SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
    : _HermitianRBSolver(HermitianRBSolver)
    , CBfactorise(0)
    , subGuess(initSubGuess)
    {
    };
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    bool isSubtractGuess(void)
    {
      return subGuess;
    }
    // Convenience overload: run the solve with a ZeroGuesser as the guesser.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> zeroGuess;
      (*this)(_Matrix,in,out,zeroGuess);
    }
    // Red-black solve for the staggered operator.
    //   _Matrix : checkerboarded matrix (Mooee, MooeeInv, Meooe, M, grids)
    //   in      : full-grid source
    //   out     : full-grid solution, both checkerboards are written
    //   guess   : callable guess(src_o, sol_o) producing the odd initial guess
    // The odd source is preconditioned, the stored Hermitian solver is run on
    // SchurStaggeredOperator, the even checkerboard is reconstructed, and the
    // unpreconditioned residual is logged unless the guess was subtracted.
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *rbGrid   = _Matrix.RedBlackGrid();
      GridBase *fullGrid = _Matrix.Grid();
      SchurStaggeredOperator<Matrix,Field> SchurOp(_Matrix);
      Field rhs_e(rbGrid);
      Field rhs_o(rbGrid);
      Field psi_e(rbGrid);
      Field psi_o(rbGrid);
      Field  work(rbGrid);
      Field   aux(rbGrid);   // first the Meooe result, later the saved guess
      Field residual(fullGrid);
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
      // Split source and current solution into their checkerboards
      pickCheckerboard(Even,rhs_e,in);
      pickCheckerboard(Odd ,rhs_o,in);
      pickCheckerboard(Even,psi_e,out);
      pickCheckerboard(Odd ,psi_o,out);
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
      // rhs_o <- Mooee * (source_o - Moe MeeInv source_e)
      _Matrix.MooeeInv(rhs_e,work);
      assert( work.checkerboard ==Even);
      _Matrix.Meooe(work,aux);
      assert(  aux.checkerboard ==Odd);
      work = rhs_o-aux;
      assert( work.checkerboard ==Odd);
      // Extra factor of "m" in source from dumb choice of matrix norm.
      _Matrix.Mooee(work,rhs_o);
      // Inner solve on the odd checkerboard, starting from the supplied guess
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
      guess(rhs_o, psi_o);
      aux = psi_o;
      _HermitianRBSolver(SchurOp,rhs_o,psi_o);
      assert(psi_o.checkerboard==Odd);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
      // Fionn A2A boolean behavioural control
      if (subGuess) {
        psi_o = psi_o-aux;
      }
      // psi_e <- M_ee^-1 * ( source_e - Meo psi_o )
      _Matrix.Meooe(psi_o,work);
      assert( work.checkerboard ==Even);
      rhs_e = rhs_e-work;
      assert(rhs_e.checkerboard ==Even);
      _Matrix.MooeeInv(rhs_e,psi_e);
      assert(psi_e.checkerboard ==Even);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
      // Write both checkerboards back into the full-grid solution
      setCheckerboard(out,psi_e);
      assert(psi_e.checkerboard ==Even);
      setCheckerboard(out,psi_o);
      assert(psi_o.checkerboard ==Odd );
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
      // Verify the unprec residual (skipped when the guess was subtracted)
      if ( subGuess ) {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      } else {
        _Matrix.M(out,residual);
        residual = residual-in;
        RealD srcNorm2 = norm2(in);
        RealD resNorm2 = norm2(residual);
        std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(resNorm2/srcNorm2) <<" nr "<< resNorm2 <<" ns "<<srcNorm2 << std::endl;
      }
    }     
  };
  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagMooeeSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false)  :  _HermitianRBSolver(HermitianRBSolver) 
  { 
    CBfactorise=cb;
    subtractGuess(initSubGuess);
  };
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    bool isSubtractGuess(void)
    {
      return subGuess;
    }
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      guess(src_o,sol_o);
      Mtmp = sol_o;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)        sol_o = sol_o-Mtmp;
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }     
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagTwoSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
     _HermitianRBSolver(HermitianRBSolver) 
    { 
      CBfactorise = 0;
@@ -116,86 +322,12 @@ namespace Grid {
      return subGuess;
    }
-    /////////////////////////////////////////////////////////////
+    template<class Matrix>
    // Shared code
    /////////////////////////////////////////////////////////////
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
-    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out) 
+    template<class Matrix,class Guesser>
    {
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    template<class Guesser>
    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      int nblock = in.size();
      std::vector<Field> src_o(nblock,grid);
      std::vector<Field> sol_o(nblock,grid);
      std::vector<Field> guess_save;
      Field resid(fgrid);
      Field tmp(grid);
      ////////////////////////////////////////////////
      // Prepare RedBlack source
      ////////////////////////////////////////////////
      for(int b=0;b<nblock;b++){
 	RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
      }
      ////////////////////////////////////////////////
      // Make the guesses
      ////////////////////////////////////////////////
      if ( subGuess ) guess_save.resize(nblock,grid);
      for(int b=0;b<nblock;b++){
 	guess(src_o[b],sol_o[b]); 
 	if ( subGuess ) { 
 	  guess_save[b] = sol_o[b];
 	}
      }
      //////////////////////////////////////////////////////////////
      // Call the block solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
      RedBlackSolve(_Matrix,src_o,sol_o);
      ////////////////////////////////////////////////
      // A2A boolean behavioural control & reconstruct other checkerboard
      ////////////////////////////////////////////////
      for(int b=0;b<nblock;b++) {
 	if (subGuess)   sol_o[b] = sol_o[b] - guess_save[b];
 	///////// Needs even source //////////////
 	pickCheckerboard(Even,tmp,in[b]);
 	RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
 	/////////////////////////////////////////////////
 	// Check unprec residual if possible
 	/////////////////////////////////////////////////
 	if ( ! subGuess ) {
 	  _Matrix.M(out[b],resid); 
 	  resid = resid-in[b];
 	  RealD ns = norm2(in[b]);
 	  RealD nr = norm2(resid);
 	  std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
 	}
      }
    }
    template<class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
@@ -203,105 +335,42 @@ namespace Grid {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
-      Field resid(fgrid);
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
-      Field src_o(grid);
+ 
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      ////////////////////////////////////////////////
      // RedBlack source
      ////////////////////////////////////////////////
      RedBlackSource(_Matrix,in,src_e,src_o);
      ////////////////////////////////
      // Construct the guess
      ////////////////////////////////
      Field   tmp(grid);
      guess(src_o,sol_o);
      Field  guess_save(grid);
      guess_save = sol_o;
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      RedBlackSolve(_Matrix,src_o,sol_o);
      ////////////////////////////////////////////////
      // Fionn A2A boolean behavioural control
      ////////////////////////////////////////////////
      if (subGuess)      sol_o= sol_o-guess_save;
      ///////////////////////////////////////////////////
      // RedBlack solution needs the even source
      ///////////////////////////////////////////////////
      RedBlackSolution(_Matrix,sol_o,src_e,out);
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
      } else {
        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
    }     
    /////////////////////////////////////////////////////////////
    // Override in derived. Not virtual as template methods
    /////////////////////////////////////////////////////////////
    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)=0;
  };
  template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) 
      :    SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) 
    {
    }
    //////////////////////////////////////////////////////
    // Override RedBlack specialisation
    //////////////////////////////////////////////////////
    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
-      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,src);
+      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
-      // src_o = (source_o - Moe MeeInv source_e)
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
+      // get the right MpcDag
-    }
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
-      Field   tmp(grid);
+      //////////////////////////////////////////////////////////////
-      Field   sol_e(grid);
+      // Call the red-black solver
-      Field   src_e(grid);
+      //////////////////////////////////////////////////////////////
-
+      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-      src_e = src_e_c; // Const correctness
+//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      guess(src_o,tmp);
      Mtmp = tmp;
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)      tmp = tmp-Mtmp;
      _Matrix.MooeeInv(tmp,sol_o);       assert(  sol_o.checkerboard   ==Odd);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
@@ -310,116 +379,78 @@ namespace Grid {
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
+      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
    }
    // Odd-checkerboard solve for a single field: wraps _Matrix in a
    // SchurStaggeredOperator and passes it, with src_o, to the stored
    // Hermitian solver, which writes its result into sol_o.
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
    {
      SchurStaggeredOperator<Matrix,Field> schurOp(_Matrix);
      this->_HermitianRBSolver(schurOp,src_o,sol_o);
      // The solver is required to leave the solution on the odd checkerboard.
      assert(sol_o.checkerboard==Odd);
    };
    // Multi-field variant: the same SchurStaggeredOperator wrapper is built
    // once and the whole vectors of sources/solutions are handed to the
    // stored Hermitian solver in a single call.
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
    {
      SchurStaggeredOperator<Matrix,Field> schurOp(_Matrix);
      this->_HermitianRBSolver(schurOp,src_o,sol_o);
    }
  };
  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }     
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Site diagonal has Mooee on it.
+  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
+  template<class Field> class SchurRedBlackDiagTwoMixed {
  private:
    LinearFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) {};
    //////////////////////////////////////////////////////
    // Override RedBlack specialisation
    //////////////////////////////////////////////////////
    // Build the red-black (even/odd) sources from the full-lattice source.
    //   src   : full source field (read only)
    //   src_e : receives the Even checkerboard of src
    //   src_o : receives the Odd checkerboard of src, then is overwritten with
    //           MpcDag( src_o - Meooe( MooeeInv( src_e ) ) )
    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      // NOTE(review): fgrid is not used anywhere in this method.
      GridBase *fgrid= _Matrix.Grid();
      // Both work fields live on the red-black grid; tmp is reused below,
      // first holding an Even result and later an Odd one.
      Field   tmp(grid);
      Field  Mtmp(grid);
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd ,src_o,src);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
      // tmp switches to the Odd checkerboard here.
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
      // get the right MpcDag
      // The operator type must match the one used in RedBlackSolve so that the
      // source is multiplied by the MpcDag of the same preconditioned operator.
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
    }
    // Reconstruct the full-lattice solution from the odd-checkerboard solution.
    //   sol_o : Odd checkerboard solution (read only)
    //   src_e : Even checkerboard source (read only)
    //   sol   : full field; both checkerboards are written into it
    // The Even part is computed as MooeeInv( src_e - Meooe( sol_o ) ).
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      // NOTE(review): fgrid is not used anywhere in this method.
      GridBase *fgrid= _Matrix.Grid();
      Field   tmp(grid);
      Field  sol_e(grid);
      // src_e is const, so the difference is formed in a separate work field.
      Field  src_e_i(grid);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.checkerboard   ==Even);
      src_e_i = src_e-tmp;               assert(  src_e_i.checkerboard ==Even);
      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.checkerboard ==Even);
      // Insert the Even half first, then the Odd half, into the full field.
      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
    }
    // Odd-checkerboard solve for a single field: wraps _Matrix in a
    // SchurDiagMooeeOperator and passes it, with src_o, to the stored
    // Hermitian solver, which writes its result into sol_o.
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
    {
      SchurDiagMooeeOperator<Matrix,Field> schurOp(_Matrix);
      this->_HermitianRBSolver(schurOp,src_o,sol_o);
      // The solver is required to leave the solution on the odd checkerboard.
      assert(sol_o.checkerboard==Odd);
    };
    // Multi-field variant: one SchurDiagMooeeOperator wrapper is built and the
    // whole vectors of sources/solutions are handed to the stored Hermitian
    // solver in a single call.
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
    {
      SchurDiagMooeeOperator<Matrix,Field> schurOp(_Matrix);
      this->_HermitianRBSolver(schurOp,src_o,sol_o);
    }
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal is identity, right preconditioned by Mee^inv
  // ( 1 - Meo Moo^inv Moe Mee^inv ) phi = ( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = eta
  //=> psi = MeeInv phi
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+  SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess) {};
+     _HermitianRBSolver(HermitianRBSolver) 
    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
    { 
      CBfactorise=0;
      subtractGuess(initSubGuess);
    };
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Getter for the subGuess flag set by subtractGuess().
    bool isSubtractGuess(void)
    {
      const bool current = subGuess;
      return current;
    }
    // Convenience overload without an explicit guesser: forwards to the
    // four-argument operator() using a default-constructed ZeroGuesser<Field>.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> zeroGuess;
      this->operator()(_Matrix,in,out,zeroGuess);
    }
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
-      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,src);
+      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
@@ -430,44 +461,43 @@ namespace Grid {
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
    }
-    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+      //////////////////////////////////////////////////////////////
-    {
+      // Call the red-black solver
-      GridBase *grid = _Matrix.RedBlackGrid();
+      //////////////////////////////////////////////////////////////
-      GridBase *fgrid= _Matrix.Grid();
+      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-
+//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      Field   sol_o_i(grid);
+//      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      Field   tmp(grid);
+      guess(src_o,tmp);
-      Field   sol_e(grid);
+      Mtmp = tmp;
-
+      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      ////////////////////////////////////////////////
+      // Fionn A2A boolean behavioural control
-      // MooeeInv due to pecond
+      if (subGuess)      tmp = tmp-Mtmp;
-      ////////////////////////////////////////////////
+      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
      _Matrix.MooeeInv(sol_o,tmp);
      sol_o_i = tmp;
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.checkerboard   ==Even);
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      tmp = src_e-tmp;               assert(  src_e.checkerboard ==Even);
+      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(sol,sol_e);    assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.checkerboard ==Odd );
+      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
    };
-    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+      // Verify the unprec residual
-    {
+      if ( ! subGuess ) {
-      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+        _Matrix.M(out,resid); 
-      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+        resid = resid-in;
-    };
+        RealD ns = norm2(in);
-    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+        RealD nr = norm2(resid);
-    {
+
-      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+        std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
-      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+      } else {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }     
  };
 }
 #endif
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -50,6 +50,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
      assert(0);
  }
  Grid_quiesce_nodes();
  // Never clean up as done once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
@@ -122,8 +124,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ; 
  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
  //  std::cout << " Parent size  "<<Nparent <<std::endl;
  int childsize=1;
  for(int d=0;d<processors.size();d++) {
@@ -132,6 +136,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);
  //  std::cout << " child size  "<<childsize <<std::endl;
  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // coor of split within parent
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -455,7 +455,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -499,7 +499,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
-      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
 	perror("failed mmap");     
 	assert(0);    
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -392,10 +392,14 @@ namespace Grid {
    void SeedUniqueString(const std::string &s){
      std::vector<int> seeds;
      std::stringstream sha;
      seeds = GridChecksum::sha256_seeds(s);
      for(int i=0;i<seeds.size();i++) { 
        sha << std::hex << seeds[i];
      }
      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" 
                << s << "'" << std::endl;
-      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
      SeedFixedIntegers(seeds);
    }
    void SeedFixedIntegers(const std::vector<int> &seeds){
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -464,11 +464,9 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
  assert(orthog>=0);
  for(int d=0;d<nh;d++){
    if ( d!=orthog ) {
    assert(lg->_processors[d]  == hg->_processors[d]);
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
  }
  }
  // the above should guarantee that the operations are local
  parallel_for(int idx=0;idx<lg->lSites();idx++){
@@ -487,7 +485,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
 template<class vobj>
-void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  typedef typename vobj::scalar_object sobj;
@@ -501,11 +499,9 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
  assert(orthog>=0);
  for(int d=0;d<nh;d++){
    if ( d!=orthog ) {
    assert(lg->_processors[d]  == hg->_processors[d]);
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
  }
  }
  // the above should guarantee that the operations are local
  parallel_for(int idx=0;idx<lg->lSites();idx++){
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -59,7 +59,6 @@ void GridLogTimestamp(int on){
 }
 Colours GridLogColours(0);
 GridLogger GridLogMG     (1, "MG"    , GridLogColours, "NORMAL");
 GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
 GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -146,11 +146,9 @@ public:
      if ( log.timestamp ) {
 	log.StopWatch->Stop();
 	GridTime now = log.StopWatch->Elapsed();
 	if ( log.timing_mode==1 ) log.StopWatch->Reset();
 	log.StopWatch->Start();
-	stream << log.evidence()
+	stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
 	       << now	       << log.background() << " : " ;
      }
      stream << log.colour();
      return stream;
@@ -169,7 +167,6 @@ public:
 void GridLogConfigure(std::vector<std::string> &logstreams);
 extern GridLogger GridLogMG;
 extern GridLogger GridLogIRL;
 extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
--- a/Grid/parallelIO/BinaryIO.cc
+++ b/Grid/parallelIO/BinaryIO.cc
@@ -1,3 +0,0 @@
 #include <Grid/GridCore.h>
 int Grid::BinaryIO::latticeWriteMaxRetry = -1;
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -81,7 +81,6 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
  static int latticeWriteMaxRetry;
  /////////////////////////////////////////////////////////////////////////////
  // more byte manipulation helpers
@@ -371,7 +370,7 @@ PARALLEL_CRITICAL
 #endif
      } else {
 	std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
+                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
        std::ifstream fin;
 	fin.open(file, std::ios::binary | std::ios::in);
        if (control & BINARYIO_MASTER_APPEND)
@@ -583,9 +582,7 @@ PARALLEL_CRITICAL
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
    GridBase *grid = Umu._grid;
-    uint64_t lsites = grid->lSites(), offsetCopy = offset;
+    uint64_t lsites = grid->lSites();
    int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
    bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@@ -600,35 +597,9 @@ PARALLEL_CRITICAL
    grid->Barrier();
    timer.Stop();
-    while (attemptsLeft >= 0)
+
    {
      grid->Barrier();
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
 	     nersc_csum,scidac_csuma,scidac_csumb);
      if (checkWrite)
      {
        std::vector<fobj> ckiodata(lsites);
        uint32_t          cknersc_csum, ckscidac_csuma, ckscidac_csumb;
        uint64_t          ckoffset = offsetCopy;
        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
        grid->Barrier();
        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
 	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
        {
          std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
          offset = offsetCopy;
        }
        else
        {
          std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
          break;
        }
      }
      attemptsLeft--;
    }
    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
  }
@@ -754,6 +725,5 @@ PARALLEL_CRITICAL
    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
  }
 };
 }
 #endif
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -233,8 +233,7 @@ class GridLimeReader : public BinaryIO {
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
 	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-  std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
+
  std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
 	// Insist checksum is next record
 	/////////////////////////////////////////////
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -49,38 +49,20 @@ inline double usecond(void) {
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
 typedef  std::chrono::seconds               GridSecs;
 typedef  std::chrono::milliseconds          GridMillisecs;
 typedef  std::chrono::microseconds          GridUsecs;
 typedef  std::chrono::microseconds          GridTime;
 typedef  std::chrono::microseconds          GridUsecs;
-inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
+inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 {
-  stream << time.count()<<" s";
+  stream << time.count()<<" ms";
  return stream;
 }
-inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
+inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
 {
-  GridSecs second(1);
+  stream << time.count()<<" usec";
  auto     secs       = now/second ; 
  auto     subseconds = now%second ;
  auto     fill       = stream.fill();
  stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
  stream.fill(fill);
  return stream;
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
 {
  GridSecs second(1);
  auto     seconds    = now/second ; 
  auto     subseconds = now%second ;
  auto     fill       = stream.fill();
  stream << seconds<<"."<<std::setw(6)<<std::setfill('0')<<subseconds.count()<<" s";
  stream.fill(fill);
  return stream;
 }
 class GridStopWatch {
 private:
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -90,7 +90,6 @@ namespace QCD {
    // That probably makes for GridRedBlack4dCartesian grid.
    // s,sp,c,spc,lc
    template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
    template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
    template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
@@ -102,8 +101,6 @@ namespace QCD {
    template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
@@ -135,24 +132,6 @@ namespace QCD {
    typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
    typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
    // SpinColourSpinColour matrix
    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
    // SpinColourSpinColour matrix
    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
    // LorentzColour
    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
    typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
@@ -250,9 +229,6 @@ namespace QCD {
    typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
    typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
    typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
    typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
    typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;
    typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
    typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -44,15 +44,12 @@ namespace QCD {
  struct WilsonImplParams {
    bool overlapCommsCompute;
    std::vector<Real> twist_n_2pi_L;
    std::vector<Complex> boundary_phases;
    WilsonImplParams() : overlapCommsCompute(false) {
      boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
    };
-    WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) {
+    WilsonImplParams(const std::vector<Complex> phi)
-      twist_n_2pi_L.resize(Nd, 0.0);
+      : boundary_phases(phi), overlapCommsCompute(false) {}
    }
  };
  struct StaggeredImplParams {
--- a/Grid/qcd/action/fermion/CayleyFermion5D.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc
@@ -485,13 +485,9 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  double bpc = b+c;
  double bmc = b-c;
  _b = b;
  _c = c;
  _gamma  = gamma; // Save the parameters so we can change mass later.
  _zolo_hi= zolo_hi;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
-    omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
    assert(omega[i]!=Coeff_t(0.0));
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -97,10 +97,7 @@ namespace Grid {
      // Support for MADWF tricks
      ///////////////////////////////////////////////////////////////
      RealD Mass(void) { return mass; };
-      void  SetMass(RealD _mass) { 
+      void  SetMass(RealD _mass) { mass=_mass; } ;
 	mass=_mass; 
 	SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
      } ;
      void  P(const FermionField &psi, FermionField &chi);
      void  Pdag(const FermionField &psi, FermionField &chi);
@@ -150,12 +147,6 @@ namespace Grid {
      //    protected:
      RealD mass;
      // Save arguments to SetCoefficientsInternal
      std::vector<Coeff_t> _gamma;
      RealD                _zolo_hi;
      RealD                _b;
      RealD                _c;
      // Cayley form Moebius (tanh and zolotarev)
      std::vector<Coeff_t> omega;
      std::vector<Coeff_t> bs;    // S dependent coeffs
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -80,24 +80,12 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ///////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/g5HermitianLinop.h>
 ///////////////////////////////////////////////////////////////////////////////
 // Fourier accelerated Pauli Villars inverse support
 ///////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/WilsonTMFermion5D.h>   
 ////////////////////////////////////////////////////////////////////////////////
 // Move this group to a DWF specific tools/algorithms subdir? 
 ////////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/FourierAcceleratedPV.h>
 #include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
 #include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 #include <Grid/qcd/action/fermion/MADWF.h>
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
 // are added, (e.g. extension for gparity, half precision project in comms etc..)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Cayley 5d
 namespace Grid {
  namespace QCD {
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -64,6 +64,12 @@ namespace Grid {
      virtual RealD  M    (const FermionField &in, FermionField &out)=0;
      virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
      // Query the even even properties to make algorithmic decisions
      virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
      virtual int    isTrivialEE(void) { return 0; };
      virtual RealD  Mass(void) {return 0.0;};
      virtual void SetMass(RealD _mass) { return; };
      // half checkerboard operations
      virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
      virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@@ -141,7 +141,6 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////
 #define INHERIT_FIMPL_TYPES(Impl)\
  typedef Impl Impl_t;							\
  typedef typename Impl::FermionField           FermionField;		\
  typedef typename Impl::PropagatorField     PropagatorField;		\
  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
@@ -240,30 +239,16 @@ namespace QCD {
      GaugeLinkField tmp(GaugeGrid);
      Lattice<iScalar<vInteger> > coor(GaugeGrid);
      ////////////////////////////////////////////////////
      // apply any boundary phase or twists
      ////////////////////////////////////////////////////
      for (int mu = 0; mu < Nd; mu++) {
 	////////// boundary phase /////////////
 	      auto pha = Params.boundary_phases[mu];
 	      scalar_type phase( real(pha),imag(pha) );
-	int L   = GaugeGrid->GlobalDimensions()[mu];
+        int Lmu = GaugeGrid->GlobalDimensions()[mu] - 1;
        int Lmu = L - 1;
        LatticeCoordinate(coor, mu);
        U = PeekIndex<LorentzIndex>(Umu, mu);
 	// apply any twists
 	RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
 	if ( theta != 0.0) { 
 	  scalar_type twphase(::cos(theta),::sin(theta));
 	  U = twphase*U;
 	  std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
 	}
        tmp = where(coor == Lmu, phase * U, U);
        PokeIndex<LorentzIndex>(Uds, tmp, mu);
--- a/Grid/qcd/action/fermion/FourierAcceleratedPV.h
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@@ -1,237 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/FourierAcceleratedPV.h
    Copyright (C) 2015
 Author: Christoph Lehner (lifted with permission by Peter Boyle, brought back to Grid)
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
  template<typename M>
    void get_real_const_bc(M& m, RealD& _b, RealD& _c) {
    ComplexD b,c;
    b=m.bs[0];
    c=m.cs[0];
    std::cout << GridLogMessage << "b=" << b << ", c=" << c << std::endl;
    for (size_t i=1;i<m.bs.size();i++) {
      assert(m.bs[i] == b);
      assert(m.cs[i] == c);
    }
    assert(b.imag() == 0.0);
    assert(c.imag() == 0.0);
    _b = b.real();
    _c = c.real();
  }
 /// Fourier-accelerated inverse of a Pauli-Villars operator.
 ///
 /// pvInv() rotates the source along dimension 0 (phase factor plus FFT),
 /// solves a WilsonTMFermion5D system for groups of (s, Ls-1-s) slice pairs,
 /// undoes a per-slice (pA -/+ pB*G5)/(pA^2 - pB^2) factor and rotates back.
 ///
 /// Vi is the fermion field type, M the operator type (pvInv reads bs, cs, M5,
 /// GaugeGrid(), GaugeRedBlackGrid() and Impl_t from it), G the gauge field type.
 ///
 /// NOTE(review): grid5D / gridRB5D come from the SpaceTimeGrid factories in the
 /// constructor and are never released by this class -- confirm ownership.
 template<typename Vi, typename M, typename G>
 class FourierAcceleratedPV {
 public:
  ConjugateGradient<Vi> &cg;
  M& dwfPV;
  G& Umu;
  GridCartesian* grid5D;
  GridRedBlackCartesian* gridRB5D;
  int group_in_s;
  /// Stores the references and builds the 5D grids (extent 2*group_in_s in
  /// dimension 0) on which the grouped solves run. The fermion grid extent in
  /// dimension 0 must be a multiple of 2*group_in_s.
  /// The initializer list follows the member declaration order.
  FourierAcceleratedPV(M& _dwfPV, G& _Umu, ConjugateGradient<Vi> &_cg, int _group_in_s = 2)
   : cg(_cg), dwfPV(_dwfPV), Umu(_Umu), group_in_s(_group_in_s)
  {
    assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
    grid5D = QCD::SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu._grid);
    gridRB5D = QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu._grid);
  }
  /// Rotate a field along dimension 0.
  ///   forward == false : multiply slice s by exp(-i*pi*s/Ls), then FFT::forward.
  ///   forward == true  : FFT::backward, then multiply slice s by exp(+i*pi*s/Ls).
  /// Logs the time spent in the phase loop and in the FFT.
  void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
    GridStopWatch gsw1, gsw2;
    typedef typename Vi::scalar_type Coeff_t;
    int Ls = dst._grid->_fdimensions[0];
    Vi _tmp(dst._grid);
    double phase = M_PI / (double)Ls;
    Coeff_t bzero(0.0,0.0);
    FFT theFFT((GridCartesian*)dst._grid);
    if (!forward) {
      gsw1.Start();
      for (int s=0;s<Ls;s++) {
        Coeff_t a(::cos(phase*s),-::sin(phase*s));
        axpby_ssp(_tmp,a,_src,bzero,_src,s,s);
      }
      gsw1.Stop();
      gsw2.Start();
      theFFT.FFT_dim(dst,_tmp,0,FFT::forward);
      gsw2.Stop();
    } else {
      gsw2.Start();
      theFFT.FFT_dim(_tmp,_src,0,FFT::backward);
      gsw2.Stop();
      gsw1.Start();
      for (int s=0;s<Ls;s++) {
        Coeff_t a(::cos(phase*s),::sin(phase*s));
        axpby_ssp(dst,a,_tmp,bzero,_tmp,s,s);
      }
      gsw1.Stop();
    }
    std::cout << GridLogMessage << "Timing rotatePV: " << gsw1.Elapsed() << ", " << gsw2.Elapsed() << std::endl;
  }
  /// Apply the Fourier-accelerated inverse to _src, writing the result to _dst.
  /// Requires constant real b,c coefficients (checked by get_real_const_bc).
  void pvInv(const Vi& _src, Vi& _dst) const {
    std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;
    typedef typename Vi::scalar_type Coeff_t;
    int Ls = _dst._grid->_fdimensions[0];
    GridStopWatch gswT;
    gswT.Start();
    RealD b,c;
    get_real_const_bc(dwfPV,b,c);
    RealD M5 = dwfPV.M5;
    // U(true) Rightinv TMinv U(false) = Minv
    Vi _src_diag(_dst._grid);
    Vi _src_diag_slice(dwfPV.GaugeGrid());
    Vi _dst_diag_slice(dwfPV.GaugeGrid());
    Vi _src_diag_slices(grid5D);
    Vi _dst_diag_slices(grid5D);
    Vi _dst_diag(_dst._grid);
    rotatePV(_src,_src_diag,false);
    // now do TM solves
    Gamma G5(Gamma::Algebra::Gamma5);
    GridStopWatch gswA, gswB;
    gswA.Start();
    typedef typename M::Impl_t Impl;
    //WilsonTMFermion<Impl> tm(x.Umu,*x.UGridF,*x.UrbGridF,0.0,0.0,solver_outer.parent.par.wparams_f);
    std::vector<RealD> vmass(grid5D->_fdimensions[0],0.0);
    std::vector<RealD> vmu(grid5D->_fdimensions[0],0.0);
    WilsonTMFermion5D<Impl> tm(Umu,*grid5D,*gridRB5D,
                           *(GridCartesian*)dwfPV.GaugeGrid(),
                           *(GridRedBlackCartesian*)dwfPV.GaugeRedBlackGrid(),
                           vmass,vmu);
    //SchurRedBlackDiagTwoSolve<Vi> sol(cg);
    SchurRedBlackDiagMooeeSolve<Vi> sol(cg); // same performance as DiagTwo
    gswA.Stop();
    gswB.Start();
    for (int sgroup=0;sgroup<Ls/2/group_in_s;sgroup++) {
      // Per-slice mass / twisted-mass parameters; slots 2*sidx and 2*sidx+1
      // share the mass and carry opposite-sign mu.
      for (int sidx=0;sidx<group_in_s;sidx++) {
        int s = sgroup*group_in_s + sidx;
        RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
        RealD cosp = ::cos(phase);
        RealD sinp = ::sin(phase);
        RealD denom = b*b + c*c + 2.0*b*c*cosp;
        RealD mass = -(b*b*M5 + c*(1.0 - cosp + c*M5) + b*(-1.0 + cosp + 2.0*c*cosp*M5))/denom;
        RealD mu = (b+c)*sinp/denom;
        vmass[2*sidx + 0] = mass;
        vmass[2*sidx + 1] = mass;
        vmu[2*sidx + 0] = mu;
        vmu[2*sidx + 1] = -mu;
      }
      tm.update(vmass,vmu);
      // Pack slices s and Ls-1-s of the rotated source into the grouped field.
      for (int sidx=0;sidx<group_in_s;sidx++) {
        int s = sgroup*group_in_s + sidx;
        int sprime = Ls-s-1;
        ExtractSlice(_src_diag_slice,_src_diag,s,0);
        InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 0,0);
        ExtractSlice(_src_diag_slice,_src_diag,sprime,0);
        InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 1,0);
      }
      GridStopWatch gsw;
      gsw.Start();
      _dst_diag_slices = zero; // zero guess
      sol(tm,_src_diag_slices,_dst_diag_slices);
      gsw.Stop();
      std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
      // Unpack the solutions and undo the (pA +/- pB*G5) factor per slice.
      for (int sidx=0;sidx<group_in_s;sidx++) {
        int s = sgroup*group_in_s + sidx;
        int sprime = Ls-s-1;
        RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
        RealD cosp = ::cos(phase);
        RealD sinp = ::sin(phase);
        // now rotate with inverse of
        Coeff_t pA = b + c*cosp;
        Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
        Coeff_t pABden = pA*pA - pB*pB;
        // (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
        ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 0,0);
        _dst_diag_slice = (pA/pABden) * _dst_diag_slice - (pB/pABden) * (G5 * _dst_diag_slice);
        InsertSlice(_dst_diag_slice,_dst_diag,s,0);
        ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 1,0);
        _dst_diag_slice = (pA/pABden) * _dst_diag_slice + (pB/pABden) * (G5 * _dst_diag_slice);
        InsertSlice(_dst_diag_slice,_dst_diag,sprime,0);
      }
    }
    gswB.Stop();
    rotatePV(_dst_diag,_dst,true);
    gswT.Stop();
    std::cout << GridLogMessage << "PV completed in " << gswT.Elapsed() << " (Setup: " << gswA.Elapsed() << ", s-loop: " << gswB.Elapsed() << ")" << std::endl;
  }
 };
 }}
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -1,193 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/MADWF.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
 // Copy `from` into `to` by calling precisionChange(to,from).
 // This overload is constrained by IfNotSame<Fieldi,Fieldo> (defined elsewhere;
 // presumably enabled only when the two field types differ -- confirm).
 template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0>
 inline void convert(const Fieldi &from,Fieldo &to) 
 {
  precisionChange(to,from);
 }
 // Copy `from` into `to` by plain assignment.
 // This overload is constrained by IfSame<Fieldi,Fieldo> (defined elsewhere;
 // presumably enabled only when both field types are identical -- confirm).
 template <class Fieldi, class Fieldo,IfSame<Fieldi,Fieldo> X=0>
 inline void convert(const Fieldi &from,Fieldo &to) 
 {
  to=from;
 }
 template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser> 
 class MADWF 
 {
 private:
  typedef typename Matrixo::FermionField FermionFieldo;
  typedef typename Matrixi::FermionField FermionFieldi;
  PVinverter  & PauliVillarsSolvero;// For the outer field
  SchurSolver & SchurSolveri;       // For the inner approx field
  Guesser     & Guesseri;           // To deflate the inner approx solves
  Matrixo & Mato;                   // Action object for outer
  Matrixi & Mati;                   // Action object for inner
  RealD target_resid;
  int   maxiter;
 public:
  MADWF(Matrixo &_Mato,
 	Matrixi &_Mati, 
 	PVinverter &_PauliVillarsSolvero, 
 	SchurSolver &_SchurSolveri,
 	Guesser & _Guesseri,
 	RealD resid,
 	int _maxiter) :
  Mato(_Mato),Mati(_Mati),
    SchurSolveri(_SchurSolveri),
    PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri)
  {   
    target_resid=resid;
    maxiter     =_maxiter; 
  };
  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
  {
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    FermionFieldi    c0i(Mati.GaugeGrid()); // 4d 
    FermionFieldi    y0i(Mati.GaugeGrid()); // 4d
    FermionFieldo    c0 (Mato.GaugeGrid()); // 4d 
    FermionFieldo    y0 (Mato.GaugeGrid()); // 4d
    FermionFieldo    A(Mato.FermionGrid()); // Temporary outer
    FermionFieldo    B(Mato.FermionGrid()); // Temporary outer
    FermionFieldo    b(Mato.FermionGrid()); // 5d source
    FermionFieldo    c(Mato.FermionGrid()); // PVinv source; reused so store
    FermionFieldo    defect(Mato.FermionGrid()); // 5d source
    FermionFieldi   ci(Mati.FermionGrid()); 
    FermionFieldi   yi(Mati.FermionGrid()); 
    FermionFieldi   xi(Mati.FermionGrid()); 
    FermionFieldi srci(Mati.FermionGrid()); 
    FermionFieldi   Ai(Mati.FermionGrid()); 
    RealD m=Mati.Mass();
    ///////////////////////////////////////
    //Import source, include Dminus factors
    ///////////////////////////////////////
    Mato.ImportPhysicalFermionSource(src4,b); 
    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;
    defect = b;
    sol5=zero;
    for (int i=0;i<maxiter;i++) {
      ///////////////////////////////////////
      // Set up c0 from current defect
      ///////////////////////////////////////
      PauliVillarsSolvero(Mato,defect,A);
      Mato.Pdag(A,c);
      ExtractSlice(c0, c, 0 , 0);
      ////////////////////////////////////////////////
      // Solve the inner system with surface term c0
      ////////////////////////////////////////////////
      ci = zero;  
      convert(c0,c0i); // Possible precison change
      InsertSlice(c0i,ci,0, 0);
      // Dwm P y = Dwm x = D(1) P (c0,0,0,0)^T
      Mati.P(ci,Ai);
      Mati.SetMass(1.0);      Mati.M(Ai,srci);      Mati.SetMass(m);
      SchurSolveri(Mati,srci,xi,Guesseri); 
      Mati.Pdag(xi,yi);
      ExtractSlice(y0i, yi, 0 , 0);
      convert(y0i,y0); // Possible precision change
      //////////////////////////////////////
      // Propagate solution back to outer system
      // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
      //////////////////////////////////////
      c0 = - y0;
      InsertSlice(c0, c, 0   , 0);
      /////////////////////////////
      // Reconstruct the bulk solution Pdag PV^-1 Dm P 
      /////////////////////////////
      Mato.P(c,B);
      Mato.M(B,A);
      PauliVillarsSolvero(Mato,A,B);
      Mato.Pdag(B,A);
      //////////////////////////////
      // Reinsert surface prop
      //////////////////////////////
      InsertSlice(y0,A,0,0);
      //////////////////////////////
      // Convert from y back to x 
      //////////////////////////////
      Mato.P(A,B);
      //         sol5' = sol5 + M^-1 defect
      //               = sol5 + M^-1 src - M^-1 M sol5  ...
      sol5 = sol5 + B;
      std::cout << GridLogMessage << "***************************************" <<std::endl;
      std::cout << GridLogMessage << " Sol5 update "<<std::endl;
      std::cout << GridLogMessage << "***************************************" <<std::endl;
      std::cout << GridLogMessage << " Sol5 now "<<norm2(sol5)<<std::endl;
      std::cout << GridLogMessage << " delta    "<<norm2(B)<<std::endl;
       // New defect  = b - M sol5
       Mato.M(sol5,A);
       defect = b - A;
       std::cout << GridLogMessage << " defect   "<<norm2(defect)<<std::endl;
       double resid = ::sqrt(norm2(defect) / norm2(b));
       std::cout << GridLogMessage << "Residual " << i << ": " << resid  << std::endl;
       std::cout << GridLogMessage << "***************************************" <<std::endl;
       if (resid < target_resid) {
 	 return;
       }
    }
    std::cout << GridLogMessage << "MADWF : Exceeded maxiter "<<std::endl;
    assert(0);
  }
 };
 }}
--- a/Grid/qcd/action/fermion/PauliVillarsInverters.h
+++ b/Grid/qcd/action/fermion/PauliVillarsInverters.h
@@ -1,95 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/PauliVillarsInverters.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
 template<class Field>
 class PauliVillarsSolverUnprec
 {
 public:
  ConjugateGradient<Field> & CG;
  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    RealD m = _Matrix.Mass();
    Field A  (_Matrix.FermionGrid());
    MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
    _Matrix.SetMass(1.0);
    _Matrix.Mdag(src,A);
    CG(HermOp,A,sol);
    _Matrix.SetMass(m);
  };
 };
 // Pauli-Villars inverse that delegates to a caller-supplied Schur solver:
 // the operator mass is temporarily set to 1.0 around the solve and then
 // restored to the value it had on entry.
 template<class Field,class SchurSolverType>
 class PauliVillarsSolverRBprec
 {
 public:
  SchurSolverType & SchurSolver;
  PauliVillarsSolverRBprec( SchurSolverType &_SchurSolver) : SchurSolver(_SchurSolver){};
  // src: right-hand side; sol: forwarded unchanged to SchurSolver.
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    RealD m = _Matrix.Mass();
    // No scratch field is needed here: the solver works on src/sol directly.
    _Matrix.SetMass(1.0);
    SchurSolver(_Matrix,src,sol);
    _Matrix.SetMass(m);
  };
 };
 template<class Field,class GaugeField>
 class PauliVillarsSolverFourierAccel
 {
 public:
  GaugeField      & Umu;
  ConjugateGradient<Field> & CG;
  PauliVillarsSolverFourierAccel(GaugeField &_Umu,ConjugateGradient<Field> &_CG) :  Umu(_Umu), CG(_CG)
  {
  };
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    FourierAcceleratedPV<Field, Matrix, typename Matrix::GaugeField > faPV(_Matrix,Umu,CG) ;
    faPV.pvInv(src,sol);
  };
 };
 }
 }
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -1,155 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/WilsonTMFermion5D.h
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once 
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>
 namespace Grid {
  namespace QCD {
    // WilsonFermion5D with a per-slice mass and twisted-mass term: on slice s
    // the site-diagonal part is (4 + mass[s]) + i*mu[s]*G5.
    // The mass / mu vectors hold one entry per slice of dimension 0 of the
    // fermion grid and can be replaced at any time through update().
    template<class Impl>
      class WilsonTMFermion5D : public WilsonFermion5D<Impl>
      {
      public:
        INHERIT_IMPL_TYPES(Impl);
      public:
        virtual void   Instantiatable(void) {};
        // Constructors
        // _mass / _mu are taken by const reference (no copy per argument);
        // update() stores its own copies.
        WilsonTMFermion5D(GaugeField &_Umu,
                          GridCartesian         &Fgrid,
                          GridRedBlackCartesian &Frbgrid,
                          GridCartesian         &Ugrid,
                          GridRedBlackCartesian &Urbgrid,
                          const std::vector<RealD> &_mass,
                          const std::vector<RealD> &_mu,
                          const ImplParams &p= ImplParams()
                          ) :
        WilsonFermion5D<Impl>(_Umu,
                              Fgrid,
                              Frbgrid,
                              Ugrid,
                              Urbgrid,
                              4.0,p)
          {
            update(_mass,_mu);
          }
        // Hopping term between checkerboards, chosen by the input's parity.
        virtual void Meooe(const FermionField &in, FermionField &out) {
          if (in.checkerboard == Odd) {
            this->DhopEO(in, out, DaggerNo);
          } else {
            this->DhopOE(in, out, DaggerNo);
          }
        }
        // Daggered hopping term between checkerboards.
        virtual void MeooeDag(const FermionField &in, FermionField &out) {
          if (in.checkerboard == Odd) {
            this->DhopEO(in, out, DaggerYes);
          } else {
            this->DhopOE(in, out, DaggerYes);
          }
        }
        // allow override for twisted mass and clover
        // out[s] = (4+mass[s])*in[s] + i*mu[s]*G5*in[s]
        virtual void Mooee(const FermionField &in, FermionField &out) {
          out.checkerboard = in.checkerboard;
          //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
          for (int s=0;s<(int)this->mass.size();s++) {
            ComplexD a = 4.0+this->mass[s];
            ComplexD b(0.0,this->mu[s]);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Same as Mooee with the sign of the i*mu*G5 term flipped.
        virtual void MooeeDag(const FermionField &in, FermionField &out) {
          out.checkerboard = in.checkerboard;
          for (int s=0;s<(int)this->mass.size();s++) {
            ComplexD a = 4.0+this->mass[s];
            ComplexD b(0.0,-this->mu[s]);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Inverse of Mooee: ((4+mass) - i*mu*G5) / ((4+mass)^2 + mu^2) per slice.
        virtual void MooeeInv(const FermionField &in, FermionField &out) {
          // Set explicitly, matching Mooee / MooeeDag.
          out.checkerboard = in.checkerboard;
          for (int s=0;s<(int)this->mass.size();s++) {
            RealD tm   = this->mu[s];
            RealD mtil = 4.0+this->mass[s];
            RealD sq   = mtil*mtil+tm*tm;
            ComplexD a    = mtil/sq;
            ComplexD b(0.0, -tm /sq);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Inverse of MooeeDag: ((4+mass) + i*mu*G5) / ((4+mass)^2 + mu^2) per slice.
        virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
          // Set explicitly, matching Mooee / MooeeDag.
          out.checkerboard = in.checkerboard;
          for (int s=0;s<(int)this->mass.size();s++) {
            RealD tm   = this->mu[s];
            RealD mtil = 4.0+this->mass[s];
            RealD sq   = mtil*mtil+tm*tm;
            ComplexD a    = mtil/sq;
            ComplexD b(0.0,tm /sq);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Full operator: out = Dhop(in) + diagonal term; returns the value of
        // axpy_norm on the result.
        virtual RealD M(const FermionField &in, FermionField &out) {
          out.checkerboard = in.checkerboard;
          this->Dhop(in, out, DaggerNo);
          FermionField tmp(out._grid);
          for (int s=0;s<(int)this->mass.size();s++) {
            ComplexD a = 4.0+this->mass[s];
            ComplexD b(0.0,this->mu[s]);
            axpbg5y_ssp(tmp,a,in,b,in,s,s);
          }
          return axpy_norm(out, 1.0, tmp, out);
        }
        // needed for fast PV
        // Replace the per-slice parameters; both vectors must have one entry
        // per slice of dimension 0 of the fermion grid.
        void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
          assert(_mass.size() == _mu.size());
          assert((int)_mass.size() == this->FermionGrid()->_fdimensions[0]);
          this->mass = _mass;
          this->mu = _mu;
        }
      private:
        std::vector<RealD> mu;
        std::vector<RealD> mass;
      };
    typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
    typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 
 }}
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@@ -4,11 +4,9 @@
 Source file: ./lib/qcd/action/gauge/Photon.h
-Copyright (C) 2015-2018
+ Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <J.Harrison@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -32,9 +30,8 @@ Copyright (C) 2015-2018
 namespace Grid{
 namespace QCD{
  template <class S>
-  class QedGImpl
+  class QedGimpl
  {
  public:
    typedef S Simd;
@@ -46,27 +43,27 @@ namespace QCD{
    typedef iImplGaugeLink<Simd>  SiteLink;
    typedef iImplGaugeField<Simd> SiteField;
-    typedef SiteLink              SiteComplex;
+    typedef SiteField             SiteComplex;
    typedef Lattice<SiteLink>  LinkField;
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
  };
-  typedef QedGImpl<vComplex> QedGImplR;
+  typedef QedGimpl<vComplex> QedGimplR;
-  template <class GImpl>
+  template<class Gimpl>
  class Photon
  {
  public:
-    INHERIT_GIMPL_TYPES(GImpl);
+    INHERIT_GIMPL_TYPES(Gimpl);
    typedef typename SiteGaugeLink::scalar_object ScalarSite;
    typedef typename ScalarSite::scalar_type      ScalarComplex;
    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
-    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);
  public:
-    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvement);
+    Photon(Gauge gauge, ZmScheme zmScheme);
-    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme);
+    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);
    Photon(Gauge gauge, ZmScheme zmScheme, Real G0);
    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
    virtual ~Photon(void) = default;
    void FreePropagator(const GaugeField &in, GaugeField &out);
    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
@@ -76,255 +73,345 @@ namespace QCD{
                         const GaugeLinkField &weight);
    void UnitField(GaugeField &out);
  private:
-    void makeSpatialNorm(LatticeInteger &spNrm);
+    void infVolPropagator(GaugeLinkField &out);
-    void makeKHat(std::vector<GaugeLinkField> &khat);
+    void invKHatSquared(GaugeLinkField &out);
    void makeInvKHatSquared(GaugeLinkField &out);
    void zmSub(GaugeLinkField &out);
    void transverseProjectSpatial(GaugeField &out);
    void gaugeTransform(GaugeField &out);
  private:
    GridBase          *grid_;
    Gauge    gauge_;
    ZmScheme zmScheme_;
    std::vector<Real>  improvement_;
    Real     G0_;
  };
-  typedef Photon<QedGImplR>  PhotonR;
+  typedef Photon<QedGimplR>  PhotonR;
-  template<class GImpl>
+  template<class Gimpl>
-  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme,
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()),
    G0_(0.15493339023106021408483720810737508876916113364521)
  {}
  template<class Gimpl>
  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
                        std::vector<Real> improvements)
-  : grid_(grid), gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements)
+  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements),
    G0_(0.15493339023106021408483720810737508876916113364521)
  {}
-  template<class GImpl>
+  template<class Gimpl>
-  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme)
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
-  : Photon(grid, gauge, zmScheme, std::vector<Real>())
+  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()), G0_(G0)
  {}
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::FreePropagator(const GaugeField &in, GaugeField &out)
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
-  {
+                        std::vector<Real> improvements, Real G0)
-    FFT        theFFT(dynamic_cast<GridCartesian *>(grid_));
+  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements), G0_(G0)
-    GaugeField in_k(grid_);
+  {}
    GaugeField prop_k(grid_);
-    theFFT.FFT_all_dim(in_k, in, FFT::forward);
+  template<class Gimpl>
-    MomentumSpacePropagator(prop_k, in_k);
+  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
-    theFFT.FFT_all_dim(out, prop_k, FFT::backward);
+  {
    FFT theFFT(in._grid);
    GaugeField in_k(in._grid);
    GaugeField prop_k(in._grid);
    theFFT.FFT_all_dim(in_k,in,FFT::forward);
    MomentumSpacePropagator(prop_k,in_k);
    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
  }
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::makeSpatialNorm(LatticeInteger &spNrm)
+  void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
  {
-    LatticeInteger   coor(grid_);
+    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    std::vector<int> l = grid_->FullDimensions();
+    LatticeReal        xmu(grid);
-
+    GaugeLinkField     one(grid);
-    spNrm = zero;
+    const unsigned int nd    = grid->_ndimension;
-    for(int mu = 0; mu < grid_->Nd() - 1; mu++)
+    std::vector<int>   &l    = grid->_fdimensions;
-    {
+    std::vector<int>   x0(nd,0);
-      LatticeCoordinate(coor, mu);
+    TComplex           Tone  = Complex(1.0,0.0);
-      coor  = where(coor < Integer(l[mu]/2), coor, coor - Integer(l[mu]));
+    TComplex           Tzero = Complex(G0_,0.0);
-      spNrm = spNrm + coor*coor;
+    FFT                fft(grid);
    }
  }
  template<class GImpl>
  void Photon<GImpl>::makeKHat(std::vector<GaugeLinkField> &khat)
  {
    const unsigned int nd = grid_->Nd();
    std::vector<int>   l  = grid_->FullDimensions();
    Complex            ci(0., 1.);
    khat.resize(nd, grid_);
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
      Real piL = M_PI/l[mu];
      LatticeCoordinate(khat[mu], mu);
      khat[mu] = exp(piL*ci*khat[mu])*2.*sin(piL*khat[mu]);
    }
  }
  template<class GImpl>
  void Photon<GImpl>::makeInvKHatSquared(GaugeLinkField &out)
  {
    std::vector<GaugeLinkField> khat;
    GaugeLinkField              lone(grid_);
    const unsigned int          nd = grid_->Nd();
    std::vector<int>            zm(nd, 0);
    ScalarSite                  one = ScalarComplex(1., 0.), z = ScalarComplex(0., 0.);
    one = Complex(1.0,0.0);
    out = zero;
    makeKHat(khat);
    for(int mu = 0; mu < nd; mu++)
    {
-      out = out + khat[mu]*conjugate(khat[mu]);
+      LatticeCoordinate(xmu,mu);
      Real lo2 = l[mu]/2.0;
      xmu = where(xmu < lo2, xmu, xmu-double(l[mu]));
      out = out + toComplex(4*M_PI*M_PI*xmu*xmu);
    }
-    lone = ScalarComplex(1., 0.);
+    pokeSite(Tone, out, x0);
-    pokeSite(one, out, zm);
+    out = one/out;
-    out = lone/out;
+    pokeSite(Tzero, out, x0);
-    pokeSite(z, out, zm);
+    fft.FFT_all_dim(out, out, FFT::forward);
  }
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::zmSub(GaugeLinkField &out)
+  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
  {
    GridBase           *grid = out._grid;
    GaugeLinkField     kmu(grid), one(grid);
    const unsigned int nd    = grid->_ndimension;
    std::vector<int>   &l    = grid->_fdimensions;
    std::vector<int>   zm(nd,0);
    TComplex           Tone = Complex(1.0,0.0);
    TComplex           Tzero= Complex(0.0,0.0);
    one = Complex(1.0,0.0);
    out = zero;
    for(int mu = 0; mu < nd; mu++)
    {
      Real twoPiL = M_PI*2./l[mu];
      LatticeCoordinate(kmu,mu);
      kmu = 2.*sin(.5*twoPiL*kmu);
      out = out + kmu*kmu;
    }
    pokeSite(Tone, out, zm);
    out = one/out;
    pokeSite(Tzero, out, zm);
  }
  template<class Gimpl>
  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
  {
    GridBase           *grid = out._grid;
    const unsigned int nd    = grid->_ndimension;
    std::vector<int>   &l    = grid->_fdimensions;
    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      {
-        std::vector<int> zm(grid_->Nd(), 0);
+        std::vector<int> zm(nd,0);
-        ScalarSite       z = ScalarComplex(0., 0.);
+        TComplex         Tzero = Complex(0.0,0.0);
        pokeSite(Tzero, out, zm);
        pokeSite(z, out, zm);
        break;
      }
      case ZmScheme::qedL:
      {
-        LatticeInteger spNrm(grid_);
+        LatticeInteger spNrm(grid), coor(grid);
        GaugeLinkField z(grid);
-        makeSpatialNorm(spNrm);
+        spNrm = zero;
        for(int d = 0; d < grid->_ndimension - 1; d++)
        {
          LatticeCoordinate(coor,d);
          coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
          spNrm = spNrm + coor*coor;
        }
        out = where(spNrm == Integer(0), 0.*out, out);
        // IR improvement
        for(int i = 0; i < improvement_.size(); i++)
        {
-          Real f = sqrt(improvement_[i] + 1);
+          Real f = sqrt(improvement_[i]+1);
-          out = where(spNrm == Integer(i + 1), f*out, out);
+          out = where(spNrm == Integer(i+1), f*out, out);
        }
        break;
      }
      default:
        assert(0);
        break;
    }
  }
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::transverseProjectSpatial(GaugeField &out)
+  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
  {
    const unsigned int          nd = grid_->Nd();
    GaugeLinkField              invKHat(grid_), cst(grid_), spdiv(grid_);
    LatticeInteger              spNrm(grid_);
    std::vector<GaugeLinkField> khat, a(nd, grid_), aProj(nd, grid_);
    invKHat = zero;
    makeSpatialNorm(spNrm);
    makeKHat(khat);
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
      a[mu] = peekLorentz(out, mu);
      if (mu < nd - 1)
      {
        invKHat += khat[mu]*conjugate(khat[mu]);
      }
    }
    cst     = ScalarComplex(1., 0.);
    invKHat = where(spNrm == Integer(0), cst, invKHat);
    invKHat = cst/invKHat;
    cst     = zero;
    invKHat = where(spNrm == Integer(0), cst, invKHat);
    spdiv   = zero;
    for (unsigned int nu = 0; nu < nd - 1; ++nu)
    {
      spdiv += conjugate(khat[nu])*a[nu];
    }
    spdiv *= invKHat;
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
      aProj[mu] = a[mu] - khat[mu]*spdiv;
      pokeLorentz(out, aProj[mu], mu);
    }
  }
  // Apply the gauge selected by gauge_ to the field `out`, in place.
  template<class GImpl>
  void Photon<GImpl>::gaugeTransform(GaugeField &out)
  {
    // Feynman gauge: the field is returned untouched.
    if (gauge_ == Gauge::feynman)
    {
      return;
    }
    // Coulomb gauge: delegate to the spatial transverse projector.
    if (gauge_ == Gauge::coulomb)
    {
      transverseProjectSpatial(out);
      return;
    }
    // Landau gauge and any unrecognised value are rejected.
    assert(0);
  }
  template<class GImpl>
  void Photon<GImpl>::MomentumSpacePropagator(const GaugeField &in,
                                               GaugeField &out)
  {
-    LatticeComplex momProp(grid_);
+  GridBase           *grid = out._grid;
    LatticeComplex     momProp(grid);
-    makeInvKHatSquared(momProp);
+    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      case ZmScheme::qedL:
      {
        invKHatSquared(momProp);
        zmSub(momProp);
        break;
      }
      case ZmScheme::qedInf:
      {
        infVolPropagator(momProp);
        break;
      }
      default:
        break;
    }
    out = in*momProp;
  }
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::StochasticWeight(GaugeLinkField &weight)
+  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
  {
-    const unsigned int nd  = grid_->Nd();
+    auto               *grid     = dynamic_cast<GridCartesian *>(weight._grid);
-    std::vector<int>   l   = grid_->FullDimensions();
+    const unsigned int nd        = grid->_ndimension;
-    Integer            vol = 1;
+    std::vector<int>   latt_size = grid->_fdimensions;
-    for(unsigned int mu = 0; mu < nd; mu++)
+    switch (zmScheme_)
    {
-      vol = vol*l[mu];
+      case ZmScheme::qedTL:
      case ZmScheme::qedL:
      {
        Integer vol = 1;
        for(int d = 0; d < nd; d++)
        {
          vol = vol * latt_size[d];
        }
-    makeInvKHatSquared(weight);
+        invKHatSquared(weight);
        weight = sqrt(vol)*sqrt(weight);
        zmSub(weight);
        break;
      }
      case ZmScheme::qedInf:
      {
        infVolPropagator(weight);
        weight = sqrt(real(weight));
        break;
      }
      default:
        break;
    }
  }
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
  {
-    GaugeLinkField weight(grid_);
+    auto           *grid = dynamic_cast<GridCartesian *>(out._grid);
    GaugeLinkField weight(grid);
    StochasticWeight(weight);
    StochasticField(out, rng, weight);
  }
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
                                      const GaugeLinkField &weight)
  {
-    const unsigned int nd = grid_->Nd();
+    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    GaugeLinkField     r(grid_);
+    const unsigned int nd = grid->_ndimension;
-    GaugeField         aTilde(grid_);
+    GaugeLinkField     r(grid);
-    FFT                fft(dynamic_cast<GridCartesian *>(grid_));
+    GaugeField         aTilde(grid);
    FFT                fft(grid);
-    for(unsigned int mu = 0; mu < nd; mu++)
+    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      case ZmScheme::qedL:
      {
        for(int mu = 0; mu < nd; mu++)
        {
          gaussian(rng, r);
          r = weight*r;
          pokeLorentz(aTilde, r, mu);
        }
-    gaugeTransform(aTilde);
+        break;
      }
      case ZmScheme::qedInf:
      {
        Complex                    shift(1., 1.); // This needs to be a GaugeLink element?
        for(int mu = 0; mu < nd; mu++)
        {
          bernoulli(rng, r);
          r = weight*(2.*r - shift);
          pokeLorentz(aTilde, r, mu);
        }
        break;
      }
      default:
        break;
    }
    fft.FFT_all_dim(out, aTilde, FFT::backward);
    out = real(out);
  }
-  template<class GImpl>
+  template<class Gimpl>
-  void Photon<GImpl>::UnitField(GaugeField &out)
+  void Photon<Gimpl>::UnitField(GaugeField &out)
  {
-    const unsigned int nd = grid_->Nd();
+    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    GaugeLinkField     r(grid_);
+    const unsigned int nd = grid->_ndimension;
    GaugeLinkField     r(grid);
-    r = ScalarComplex(1., 0.);
+    r = Complex(1.0,0.0);
-    for(unsigned int mu = 0; mu < nd; mu++)
+
    for(int mu = 0; mu < nd; mu++)
    {
      pokeLorentz(out, r, mu);
    }
    out = real(out);
  }
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
 //                                                            const GaugeField &in)
 //  {
 //    
 //    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
 //    
 //    GridBase *grid = out._grid;
 //    LatticeInteger     coor(grid);
 //    GaugeField zz(grid); zz=zero;
 //    
 //    // xyzt
 //    for(int d = 0; d < grid->_ndimension-1;d++){
 //      LatticeCoordinate(coor,d);
 //      out = where(coor==Integer(0),zz,out);
 //    }
 //  }
 //  
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
 //                                                             const GaugeField &in)
 //  {
 //    
 //    // what type LatticeComplex
 //    GridBase *grid = out._grid;
 //    int nd = grid->_ndimension;
 //    
 //    typedef typename GaugeField::vector_type vector_type;
 //    typedef typename GaugeField::scalar_type ScalComplex;
 //    typedef Lattice<iSinglet<vector_type> > LatComplex;
 //    
 //    std::vector<int> latt_size   = grid->_fdimensions;
 //    
 //    LatComplex denom(grid); denom= zero;
 //    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
 //    LatComplex   kmu(grid);
 //    
 //    ScalComplex ci(0.0,1.0);
 //    // momphase = n * 2pi / L
 //    for(int mu=0;mu<Nd;mu++) {
 //      
 //      LatticeCoordinate(kmu,mu);
 //      
 //      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 //      
 //      kmu = TwoPiL * kmu ;
 //      
 //      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
 //    }
 //    std::vector<int> zero_mode(nd,0);
 //    TComplexD Tone = ComplexD(1.0,0.0);
 //    TComplexD Tzero= ComplexD(0.0,0.0);
 //    
 //    pokeSite(Tone,denom,zero_mode);
 //    
 //    denom= one/denom;
 //    
 //    pokeSite(Tzero,denom,zero_mode);
 //    
 //    out = zero;
 //    out = in*denom;
 //  };
 }}
 #endif
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -27,13 +27,12 @@ public:
  typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;
-  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void MesonField(Eigen::Tensor<ComplexD,5> &mat, 
  static void MesonField(TensorType &mat, 
 			 const FermionField *lhs_wi,
 			 const FermionField *rhs_vj,
 			 std::vector<Gamma::Algebra> gammas,
 			 const std::vector<ComplexField > &mom,
-			 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+			 int orthogdim);
  static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat, 
 			     const FermionField *wi,
@@ -60,14 +59,6 @@ public:
 			  const FermionField *vj,
 			  int orthogdim);
  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
  static void AslashField(TensorType &mat, 
        const FermionField *lhs_wi,
        const FermionField *rhs_vj,
        const std::vector<ComplexField> &emB0,
        const std::vector<ComplexField> &emB1,
        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
 			   const Eigen::Tensor<ComplexD,3> &WW_sd,
 			   const FermionField *vs,
@@ -101,14 +92,13 @@ public:
 #endif
 };
-template <class FImpl>
+template<class FImpl>
-template <typename TensorType>
+void A2Autils<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat, 
 void A2Autils<FImpl>::MesonField(TensorType &mat, 
 				 const FermionField *lhs_wi,
 				 const FermionField *rhs_vj,
 				 std::vector<Gamma::Algebra> gammas,
 				 const std::vector<ComplexField > &mom,
-				 int orthogdim, double *t_kernel, double *t_gsum) 
+				 int orthogdim) 
 {
  typedef typename FImpl::SiteSpinor vobj;
@@ -156,7 +146,6 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  int stride=grid->_slice_stride[orthogdim];
  // potentially wasting cores here if local time extent too small
  if (t_kernel) *t_kernel = -usecond();
  parallel_for(int r=0;r<rd;r++){
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
@@ -223,7 +212,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
      }
    }}}
  }
-  if (t_kernel) *t_kernel += usecond();
+
  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Ngamma);
  assert(mat.dimension(2) == Nt);
@@ -267,9 +256,9 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
  // Healthy size that should suffice
  ////////////////////////////////////////////////////////////////////
-  if (t_gsum) *t_gsum = -usecond();
+
  grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-  if (t_gsum) *t_gsum += usecond();
+
 }
@@ -625,189 +614,6 @@ void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
  PionFieldXX(mat,vi,vj,orthogdim,nog5);
 }
 // "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
 //
 // With:
 //
 // B_0 = A_0 + i A_1
 // B_1 = A_2 + i A_3
 // 
 // then in spin space
 // 
 //                 ( 0          0          -conj(B_1) -B_0 )
 // i * A_mu g_mu = ( 0          0          -conj(B_0)  B_1 )
 //                 ( B_1        B_0        0          0    )
 //                 ( conj(B_0)  -conj(B_1) 0          0    )
 // Fill mat(m,0,t,i,j) with the slice sums, along orthogdim, of the spin structure
 // described in the comment above contracted between conjugate(lhs_wi[i]) and rhs_vj[j],
 // using the field pair (emB0[m], emB1[m]).
 //   mat       : rank-5 output; dimension(3)/dimension(4) give the left/right block sizes.
 //   lhs_wi    : array of at least mat.dimension(3) left fields (its first entry supplies the grid).
 //   rhs_vj    : array of at least mat.dimension(4) right fields.
 //   emB0/emB1 : equally sized lists of complex fields (asserted).
 //   orthogdim : lattice direction that is kept (not summed over).
 //   t_kernel  : if non-null, receives the usecond() difference across the local contraction
 //               and SIMD-lane reduction.
 //   t_gsum    : if non-null, receives the usecond() difference across the global sum.
 template <class FImpl>
 template <typename TensorType>
 void A2Autils<FImpl>::AslashField(TensorType &mat, 
          const FermionField *lhs_wi,
          const FermionField *rhs_vj,
          const std::vector<ComplexField> &emB0,
          const std::vector<ComplexField> &emB1,
          int orthogdim, double *t_kernel, double *t_gsum) 
 {
    typedef typename FermionField::vector_object vobj;
    typedef typename vobj::scalar_object         sobj;
    typedef typename vobj::scalar_type           scalar_type;
    typedef typename vobj::vector_type           vector_type;
    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
    typedef iSinglet<vector_type>    Singlet_v;
    typedef iSinglet<scalar_type>    Singlet_s;
    // block sizes are taken from the output tensor, not passed explicitly
    int Lblock = mat.dimension(3); 
    int Rblock = mat.dimension(4);
    GridBase *grid = lhs_wi[0]._grid;
    const int    Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
    int Nt  = grid->GlobalDimensions()[orthogdim];
    int Nem = emB0.size();
    assert(emB1.size() == Nem);
    // fd is not used below; ld sizes the scalar accumulator, rd the SIMD accumulator
    int fd=grid->_fdimensions[orthogdim];
    int ld=grid->_ldimensions[orthogdim];
    int rd=grid->_rdimensions[orthogdim];
    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    int MFrvol = rd*Lblock*Rblock*Nem;
    int MFlvol = ld*Lblock*Rblock*Nem;
    // SIMD accumulators, one per (m, i, j, r), zero-initialised
    Vector<vector_type> lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++)
    {
        lvSum[r] = zero;
    }
    // scalar accumulators, one per (m, i, j, local slice), zero-initialised
    Vector<scalar_type> lsSum(MFlvol);             
    parallel_for (int r = 0; r < MFlvol; r++)
    {
        lsSum[r] = scalar_type(0.0);
    }
    int e1=    grid->_slice_nblock[orthogdim];
    int e2=    grid->_slice_block [orthogdim];
    int stride=grid->_slice_stride[orthogdim];
    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    if (t_kernel) *t_kernel = -usecond();
    // each iteration writes only lvSum entries whose index contains its own r
    parallel_for(int r=0;r<rd;r++)
    {
        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++)
        {
            int ss= so+n*stride+b;
            for(int i=0;i<Lblock;i++)
            {
                auto left = conjugate(lhs_wi[i]._odata[ss]);
                for(int j=0;j<Rblock;j++)
                {
                    SpinMatrix_v vv;
                    auto right = rhs_vj[j]._odata[ss];
                    // spin matrix from the sum over the third index; exactly three
                    // components (0,1,2) are hard-coded here
                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++)
                    {
                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                        + left()(s2)(1) * right()(s1)(1)
                                        + left()(s2)(2) * right()(s1)(2);
                    }
                    // After getting the sitewise product do the mom phase loop
                    // accumulator layout: m fastest, then i, then j, then r
                    int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
                    for ( int m=0;m<Nem;m++)
                    {
                        int idx  = m+base;
                        auto b0  = emB0[m]._odata[ss];
                        auto b1  = emB1[m]._odata[ss];
                        auto cb0 = conjugate(b0);
                        auto cb1 = conjugate(b1);
                        // eight spin-matrix entries weighted by b0, b1 and their conjugates
                        lvSum[idx] += - vv()(3,0)()*b0()()()  - vv()(2,0)()*cb1()()()
                                      + vv()(3,1)()*b1()()()  - vv()(2,1)()*cb0()()()
                                      + vv()(0,2)()*b1()()()  + vv()(1,2)()*b0()()()
                                      + vv()(0,3)()*cb0()()() - vv()(1,3)()*cb1()()();
                    }
                }
            }
        }
    }
    // Sum across simd lanes in the plane, breaking out orthog dir.
    parallel_for(int rt=0;rt<rd;rt++)
    {
        std::vector<int> icoor(Nd);
        std::vector<scalar_type> extracted(Nsimd);               
        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
        for(int m=0;m<Nem;m++)
        {
            int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
            extract<vector_type,scalar_type>(lvSum[ij_rdx],extracted);
            for(int idx=0;idx<Nsimd;idx++)
            {
                // the lane's inner coordinate along orthogdim selects the local slice
                grid->iCoorFromIindex(icoor,idx);
                int ldx    = rt+icoor[orthogdim]*rd;
                int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
            }
        }
    }
    if (t_kernel) *t_kernel += usecond();
    // ld loop and local only??
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    // write this rank's slices into mat and zero the slices of every other
    // processor coordinate along orthogdim, ahead of the global sum below
    parallel_for_nest2(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++)
        {
            int t = lt + pt*ld;
            if (pt == pc)
            {
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nem;m++)
                {
                    int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
                    mat(m,0,t,i,j) = lsSum[ij_dx];
                }
            } 
            else 
            { 
                const scalar_type zz(0.0);
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nem;m++)
                {
                    mat(m,0,t,i,j) = zz;
                }
            }
        }
    }
    if (t_gsum) *t_gsum = -usecond();
    // NOTE(review): sums Nem*Nt*Lblock*Rblock elements starting at mat(0,0,0,0,0);
    // presumably relies on mat.dimension(1) == 1 and contiguous storage - confirm against callers
    grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);
    if (t_gsum) *t_gsum += usecond();
 }
 ////////////////////////////////////////////
 // Schematic thoughts about more generalised four quark insertion
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@@ -173,39 +173,6 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
    }
  }
 }
 }
-// I explicitly need these outside the QCD namespace
+}}
 // Multiply a lattice field by gamma5: z = G5 * x.
 // z takes over x's checkerboard first; the two fields are then checked with conformable().
 template<typename vobj>
 void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
 {
  // the previously declared local grid pointer was never read and has been dropped
  z.checkerboard = x.checkerboard;
  conformable(x, z);
  QCD::Gamma G5(QCD::Gamma::Algebra::Gamma5);
  z = G5 * x;
 }
 // Overload for lattices of nbasis-component vectors: the first nbasis/2 components
 // are copied from x, the remaining components are copied with their sign flipped.
 // nbasis must be even (enforced at compile time).
 template<class CComplex, int nbasis>
 void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
 {
  static_assert(nbasis % 2 == 0, "");
  GridBase *grid = x._grid;
  z.checkerboard = x.checkerboard;
  conformable(x, z);
  const int half = nbasis / 2;
  parallel_for(int ss = 0; ss < grid->oSites(); ss++) {
    for(int n = 0; n < nbasis; ++n) {
      if (n < half) {
        // lower half: plain copy
        z._odata[ss](n) = x._odata[ss](n);
      } else {
        // upper half: negated copy
        z._odata[ss](n) = -x._odata[ss](n);
      }
    }
  }
 }
 }
 #endif 
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -6,12 +6,10 @@
    Copyright (C) 2015
-    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: neo <cossu@post.kek.jp>
+Author: neo <cossu@post.kek.jp>
-    Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: James Harrison <J.Harrison@soton.ac.uk>
    Author: Antonin Portelli <antonin.portelli@me.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -647,184 +645,6 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      }
    }
  }
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
  // Builds into wl the ordered product of links around an Rmu x Rnu loop.
  //   wl     : output link-matrix field.
  //   U      : per-direction link fields, indexed by direction.
  //   Rmu/Rnu: number of links along directions mu / nu.
  // The statements below are order-sensitive: each call wraps the previous result.
  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
                           const int Rmu, const int Rnu,
                           const int mu, const int nu) {
    // start from a single nu link, then extend it to Rnu links (Rnu-1 further steps)
    wl = U[nu];
    for(int i = 0; i < Rnu-1; i++){
      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
    }
    // Rmu links forward along mu
    for(int i = 0; i < Rmu; i++){
      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
    }
    // Rnu links backward along nu
    for(int i = 0; i < Rnu; i++){
      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
    }
    // Rmu links backward along mu, closing the loop
    for(int i = 0; i < Rmu; i++){
      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
    }
  }
  //////////////////////////////////////////////////
  // trace of Wilson Loop oriented in mu,nu plane
  //////////////////////////////////////////////////
  // Site-wise trace of the Rmu x Rnu loop in the (mu,nu) plane, written to wl.
  static void traceWilsonLoop(LatticeComplex &wl,
                                const std::vector<GaugeMat> &U,
                                const int Rmu, const int Rnu,
                                const int mu, const int nu) {
    // scratch matrix field living on the same grid as the links
    GaugeMat loopMatrix(U[0]._grid);

    wilsonLoop(loopMatrix, U, Rmu, Rnu, mu, nu);
    wl = trace(loopMatrix);
  }
  //////////////////////////////////////////////////
  // sum over all planes of Wilson loop
  //////////////////////////////////////////////////
  // Accumulates into Wl the traced loops over every plane (mu > nu), taking both
  // the R1 x R2 and the R2 x R1 orientation in each plane.
  static void siteWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
    LatticeComplex planeTrace(U[0]._grid);

    Wl = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension; ++mu) {
      for (int nu = 0; nu < mu; ++nu) {
        // first orientation
        traceWilsonLoop(planeTrace, U, R1, R2, mu, nu);
        Wl = Wl + planeTrace;
        // swapped extents
        traceWilsonLoop(planeTrace, U, R2, R1, mu, nu);
        Wl = Wl + planeTrace;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over planes of Wilson loop with length R1
  // in the time direction
  //////////////////////////////////////////////////
  // Accumulates into Wl the traced loops in every plane that contains the last
  // lattice direction; R1 is the extent along that last direction, R2 along nu.
  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
    LatticeComplex planeTrace(U[0]._grid);
    const int ndim = U[0]._grid->_ndimension;
    const int tdir = ndim - 1;   // last direction plays the role of time

    Wl = zero;
    for (int nu = 0; nu < tdir; ++nu) {
      traceWilsonLoop(planeTrace, U, R1, R2, tdir, nu);
      Wl = Wl + planeTrace;
    }
  }
  //////////////////////////////////////////////////
  // sum Wilson loop over all planes orthogonal to the time direction
  //////////////////////////////////////////////////
  // Accumulates into Wl the traced loops over every plane that excludes the last
  // lattice direction, taking both the R1 x R2 and R2 x R1 orientation.
  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
    LatticeComplex planeTrace(U[0]._grid);

    Wl = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; ++mu) {
      for (int nu = 0; nu < mu; ++nu) {
        // first orientation
        traceWilsonLoop(planeTrace, U, R1, R2, mu, nu);
        Wl = Wl + planeTrace;
        // swapped extents
        traceWilsonLoop(planeTrace, U, R2, R1, mu, nu);
        Wl = Wl + planeTrace;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  // Returns the real part of the lattice sum of siteWilsonLoop(R1, R2) for the
  // Lorentz-indexed gauge field Umu.
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    const int ndim = Umu._grid->_ndimension;
    // one link field per lattice direction: sizing by ndim (rather than a fixed 4)
    // keeps the fill loop below inside the vector for any grid dimension
    std::vector<GaugeMat> U(ndim, Umu._grid);
    for (int mu = 0; mu < ndim; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteWilsonLoop(Wl, U, R1, R2);
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  // Returns the real part of the lattice sum of siteTimelikeWilsonLoop(R1, R2)
  // for the Lorentz-indexed gauge field Umu.
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    const int ndim = Umu._grid->_ndimension;
    // one link field per lattice direction: sizing by ndim (rather than a fixed 4)
    // keeps the fill loop below inside the vector for any grid dimension
    std::vector<GaugeMat> U(ndim, Umu._grid);
    for (int mu = 0; mu < ndim; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteTimelikeWilsonLoop(Wl, U, R1, R2);
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  // Returns the real part of the lattice sum of siteSpatialWilsonLoop(R1, R2)
  // for the Lorentz-indexed gauge field Umu.
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    const int ndim = Umu._grid->_ndimension;
    // one link field per lattice direction: sizing by ndim (rather than a fixed 4)
    // keeps the fill loop below inside the vector for any grid dimension
    std::vector<GaugeMat> U(ndim, Umu._grid);
    for (int mu = 0; mu < ndim; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteSpatialWilsonLoop(Wl, U, R1, R2);
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  // sumWilsonLoop divided by the global site count, by ndim*(ndim-1) and by Nc.
  static Real avgWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    const int  ndim      = Umu._grid->_ndimension;
    const Real loopSum   = sumWilsonLoop(Umu, R1, R2);
    const Real siteCount = Umu._grid->gSites();
    // number of (plane, orientation) terms summed by siteWilsonLoop
    const Real planes    = 1.0 * ndim * (ndim - 1);

    return loopSum / siteCount / planes / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  // sumTimelikeWilsonLoop divided by the global site count, by (ndim-1) and by Nc.
  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    const int  ndim      = Umu._grid->_ndimension;
    const Real loopSum   = sumTimelikeWilsonLoop(Umu, R1, R2);
    const Real siteCount = Umu._grid->gSites();
    // number of planes summed by siteTimelikeWilsonLoop
    const Real planes    = 1.0 * (ndim - 1);

    return loopSum / siteCount / planes / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  // sumSpatialWilsonLoop divided by the global site count, by (ndim-1)*(ndim-2) and by Nc.
  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    const int  ndim      = Umu._grid->_ndimension;
    const Real loopSum   = sumSpatialWilsonLoop(Umu, R1, R2);
    const Real siteCount = Umu._grid->gSites();
    // number of (plane, orientation) terms summed by siteSpatialWilsonLoop
    const Real planes    = 1.0 * (ndim - 1) * (ndim - 2);

    return loopSum / siteCount / planes / Nc; // Nc dependent... FIXME
  }
 };
 typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
--- a/Grid/serialisation/Hdf5IO.cc
+++ b/Grid/serialisation/Hdf5IO.cc
@@ -61,9 +61,9 @@ Group & Hdf5Writer::getGroup(void)
 }
 // Reader implementation ///////////////////////////////////////////////////////
-Hdf5Reader::Hdf5Reader(const std::string &fileName, const bool readOnly)
+Hdf5Reader::Hdf5Reader(const std::string &fileName)
 : fileName_(fileName)
-, file_(fileName.c_str(), readOnly ? H5F_ACC_RDONLY : H5F_ACC_RDWR)
+, file_(fileName.c_str(), H5F_ACC_RDWR)
 {
  group_ = file_.openGroup("/");
  readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
--- a/Grid/serialisation/Hdf5IO.h
+++ b/Grid/serialisation/Hdf5IO.h
@@ -54,7 +54,7 @@ namespace Grid
  class Hdf5Reader: public Reader<Hdf5Reader>
  {
  public:
-    Hdf5Reader(const std::string &fileName, const bool readOnly = true);
+    Hdf5Reader(const std::string &fileName);
    virtual ~Hdf5Reader(void) = default;
    bool push(const std::string &s);
    void pop(void);
@@ -124,11 +124,8 @@ namespace Grid
    if (flatx.size() > dataSetThres_)
    {
      H5NS::DataSet dataSet;
      H5NS::DSetCreatPropList plist;
-      plist.setChunk(dim.size(), dim.data());
+      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace);
      plist.setFletcher32();
      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace, plist);
      dataSet.write(flatx.data(), Hdf5Type<Element>::type());
    }
    else
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -47,7 +47,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
 #define PARALLEL_FOR_LOOP_REDUCE(op, var)
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_NESTED_LOOP5
 #define PARALLEL_REGION
@@ -59,7 +58,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
 #define parallel_critical PARALLEL_CRITICAL
 namespace Grid {
--- a/Grid/util/Sha.h
+++ b/Grid/util/Sha.h
@@ -28,46 +28,17 @@
 extern "C" {
 #include <openssl/sha.h>
 }
 #ifdef USE_IPP
 #include "ipp.h"
 #endif
 #pragma once
 class GridChecksum
 {
 public:
-  static inline uint32_t crc32(const void *data, size_t bytes)
+  static inline uint32_t crc32(void *data,size_t bytes)
  {
    return ::crc32(0L,(unsigned char *)data,bytes);
  }
-
+  static inline std::vector<unsigned char> sha256(void *data,size_t bytes)
 #ifdef USE_IPP
  // CRC32C of `bytes` bytes starting at `data`, computed with the IPP primitives.
  // Only compiled when USE_IPP is defined.
  static inline uint32_t crc32c(const void* data, size_t bytes)
  {
      // running value starts with all bits set
      uint32_t crc32c = ~(uint32_t)0;
      // NOTE(review): `bytes` is a size_t; confirm the width of the IPP length parameter for large buffers
      ippsCRC32C_8u(reinterpret_cast<const unsigned char *>(data), bytes, &crc32c);
      // byte-swap the single 32-bit word in place, then return its bitwise complement
      ippsSwapBytes_32u_I(&crc32c, 1);
      return ~crc32c;
  }
 #endif
  // Render a digest as lowercase hexadecimal text.
  //   hash : digest elements; each one is converted to unsigned int before printing.
  // Returns the concatenated hex representation, two characters per element for
  // values below 0x100. Values below 0x10 are zero-padded: without the padding
  // {0x01,0x23} and {0x12,0x03} would both print as "123".
  // NOTE(review): strings written by the earlier, unpadded format differ from this
  // output whenever a digest contains a byte below 0x10.
  template <typename T>
  static inline std::string sha256_string(const std::vector<T> &hash)
  {
    std::stringstream sha;

    sha << std::hex;
    for(unsigned int i = 0; i < hash.size(); i++) 
    { 
        const unsigned int value = static_cast<unsigned int>(hash[i]);

        // keep every byte two characters wide
        if (value < 0x10u)
        {
            sha << '0';
        }
        sha << value;
    }

    return sha.str();
  }
  static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
  {
    std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
    SHA256_CTX sha256;
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -7,7 +7,6 @@ Source file: Hadrons/A2AMatrix.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,397 +29,38 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define A2A_Matrix_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
 #endif
 #ifndef HADRONS_A2AM_IO_TYPE
 #define HADRONS_A2AM_IO_TYPE ComplexF
 #endif
 #define HADRONS_A2AM_PARALLEL_IO
 BEGIN_HADRONS_NAMESPACE
-// general A2A matrix set based on Eigen tensors and Grid-allocated memory
+template <typename T, typename MetadataType>
 // Dimensions:
 //   0 - ext - external field (momentum, EM field, ...)
 //   1 - str - spin-color structure
 //   2 - t   - timeslice
 //   3 - i   - left  A2A mode index
 //   4 - j   - right A2A mode index
 template <typename T>
 using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
 template <typename T>
 using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
 template <typename T>
 using A2AMatrixMap = Eigen::Map<A2AMatrix<T>>;
 template <typename T>
 using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
 /******************************************************************************
 *                      Abstract class for A2A kernels                        *
 ******************************************************************************/
 // Interface every all-to-all contraction kernel has to implement.
 //   T     - element type of the matrix set filled by the kernel
 //   Field - type of the left/right vectors handed to the kernel
 template <typename T, typename Field>
 class A2AKernel
 {
 public:
    A2AKernel() = default;
    virtual ~A2AKernel() = default;
    // Fill `m` from the vectors starting at `left` and `right`; `orthogDim` is
    // the orthogonal dimension index and `time` receives the kernel's own
    // timing (units are defined by the implementation).
    virtual void operator()(A2AMatrixSet<T> &m,
                            const Field *left,
                            const Field *right,
                            const unsigned int orthogDim,
                            double &time) = 0;
    // Floating-point operations performed for a blockSizei x blockSizej block.
    virtual double flops(const unsigned int blockSizei,
                         const unsigned int blockSizej) = 0;
    // Bytes moved for a blockSizei x blockSizej block.
    virtual double bytes(const unsigned int blockSizei,
                         const unsigned int blockSizej) = 0;
 };
 /******************************************************************************
 *                  Class to handle A2A matrix block HDF5 I/O                 *
 ******************************************************************************/
 template <typename T>
 class A2AMatrixIo
 {
 public:
    // constructors
    A2AMatrixIo(void) = default;
    A2AMatrixIo(std::string filename, std::string dataname, 
-                const unsigned int nt, const unsigned int ni = 0,
+                const unsigned int nt, const unsigned int ni,
-                const unsigned int nj = 0);
+                const unsigned int nj);
    // destructor
    ~A2AMatrixIo(void) = default;
    // access
    unsigned int getNi(void) const;
    unsigned int getNj(void) const;
    unsigned int getNt(void) const;
    size_t       getSize(void) const;
    // file allocation
    template <typename MetadataType>
    void initFile(const MetadataType &d, const unsigned int chunkSize);
    // block I/O
    void saveBlock(const T *data, const unsigned int i, const unsigned int j,
                   const unsigned int blockSizei, const unsigned int blockSizej);
    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
                   const unsigned int i, const unsigned int j);
    template <template <class> class Vec, typename VecT>
    void load(Vec<VecT> &v, double *tRead = nullptr, const bool useCache = true);
 private:
-    std::string  filename_{""}, dataname_{""};
+    std::string  filename_, dataname_;
-    unsigned int nt_{0}, ni_{0}, nj_{0};
+    unsigned int nt_, ni_, nj_;
 };
-/******************************************************************************
+template <typename T, typename MetadataType>
- *                  Wrapper for A2A matrix block computation                  *
+A2AMatrixIo<T, MetadataType>::A2AMatrixIo(std::string filename, 
- ******************************************************************************/
+                                          std::string dataname, 
-template <typename T, typename Field, typename MetadataType, typename TIo = T>
+                                          const unsigned int nt, 
-class A2AMatrixBlockComputation
+                                          const unsigned int ni,
 {
 private:
    struct IoHelper
    {
        A2AMatrixIo<TIo> io;
        MetadataType     md;
        unsigned int     e, s, i, j;
    };
    typedef std::function<std::string(const unsigned int, const unsigned int)>  FilenameFn;
    typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn;
 public:
    // constructor
    A2AMatrixBlockComputation(GridBase *grid,
                              const unsigned int orthogDim,
                              const unsigned int next,
                              const unsigned int nstr,
                              const unsigned int blockSize,
                              const unsigned int cacheBlockSize,
                              TimerArray *tArray = nullptr);
    // execution
    void execute(const std::vector<Field> &left, 
                 const std::vector<Field> &right,
                 A2AKernel<T, Field> &kernel,
                 const FilenameFn &ionameFn,
                 const FilenameFn &filenameFn,
                 const MetadataFn &metadataFn);
 private:
    // I/O handler
    void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
 private:
    TimerArray            *tArray_;
    GridBase              *grid_;
    unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
    Vector<T>             mCache_;
    Vector<TIo>           mBuf_;
    std::vector<IoHelper> nodeIo_;
 };
 /******************************************************************************
 *                       A2A matrix contraction kernels                       *
 ******************************************************************************/
 class A2AContraction
 {
 public:
    // accTrMul(acc, a, b): acc += tr(a*b)
    template <typename C, typename MatLeft, typename MatRight>
    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
    {
        if ((MatLeft::Options == Eigen::RowMajor) and
            (MatRight::Options == Eigen::ColMajor))
        {
            parallel_for (unsigned int r = 0; r < a.rows(); ++r)
            {
                C tmp;
 #ifdef USE_MKL
                dotuRow(tmp, r, a, b);
 #else
                tmp = a.row(r).conjugate().dot(b.col(r));
 #endif
                parallel_critical
                {
                    acc += tmp;
                }
            }
        }
        else
        {
            parallel_for (unsigned int c = 0; c < a.cols(); ++c)
            {
                C tmp;
 #ifdef USE_MKL 
                dotuCol(tmp, c, a, b);
 #else
                tmp = a.col(c).conjugate().dot(b.row(c));
 #endif
                parallel_critical
                {
                    acc += tmp;
                }
            }
        }
    }
    // Floating-point operation count charged to accTrMul(acc, a, b): 8 per
    // entry of `a`. `b` is not consulted.
    template <typename MatLeft, typename MatRight>
    static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
    {
        const double entries = a.rows()*a.cols();

        return entries*8.;
    }
    // mul(res, a, b): res = a*b
    // With USE_MKL the product is delegated to CBLAS gemm (zgemm for ComplexD,
    // cgemm for ComplexF) with alpha = 1 and beta = 0; `res` is resized to
    // a.rows() x b.cols() first when its shape differs. Without USE_MKL the
    // product is left to the matrix type's own operator*.
 #ifdef USE_MKL
    // double-precision complex overload
    template <template <class, int...> class Mat, int... Opts>
    static inline void mul(Mat<ComplexD, Opts...> &res, 
                           const Mat<ComplexD, Opts...> &a, 
                           const Mat<ComplexD, Opts...> &b)
    {
        static const ComplexD one(1., 0.), zero(0., 0.);
        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
        {
            res.resize(a.rows(), b.cols());
        }
        // row-major storage: leading dimensions are the column counts
        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
        {
            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                        res.data(), res.cols());
        }
        // column-major storage: leading dimensions are the row counts
        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
        {
            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                        res.data(), res.rows());
        }
    }
    // single-precision complex overload, same structure as above
    template <template <class, int...> class Mat, int... Opts>
    static inline void mul(Mat<ComplexF, Opts...> &res, 
                           const Mat<ComplexF, Opts...> &a, 
                           const Mat<ComplexF, Opts...> &b)
    {
        static const ComplexF one(1., 0.), zero(0., 0.);
        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
        {
            res.resize(a.rows(), b.cols());
        }
        // row-major storage: leading dimensions are the column counts
        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
        {
            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                        res.data(), res.cols());
        }
        // column-major storage: leading dimensions are the row counts
        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
        {
            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                        res.data(), res.rows());
        }
    }
 #else
    // generic fallback: plain matrix product
    template <typename Mat>
    static inline void mul(Mat &res, const Mat &a, const Mat &b)
    {
        res = a*b;
    }
 #endif
    // Floating-point operation count for mul(res, a, b), i.e. res = a*b.
    // Each of the a.rows()*b.cols() entries of the product is charged a.cols()
    // multiplications at 6 flops and a.cols() - 1 additions at 2 flops. For
    // b.cols() == a.rows() this equals the a.rows()^2 based count.
    // Returns 0 when the inner dimension a.cols() is 0 (nothing is computed).
    template <typename Mat>
    static inline double mulFlops(const Mat &a, const Mat &b)
    {
        const double nr = a.rows(), nc = a.cols(), nbc = b.cols();

        // without this guard the "nc - 1" term would give a negative count
        if (nc == 0.)
        {
            return 0.;
        }

        return nr*nbc*(6.*nc + 2.*(nc - 1.));
    }
 private:
    // Set up the pointer/stride pairs for a strided dot product between row
    // `aRow` of `a` and column `aRow` of `b`, for either storage order of each
    // operand. Outputs are left untouched for an operand whose Options is
    // neither Eigen::RowMajor nor Eigen::ColMajor.
    template <typename C, typename MatLeft, typename MatRight>
    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                    unsigned int &bInc, const unsigned int aRow, 
                                    const MatLeft &a, const MatRight &b)
    {
        // right operand: column aRow of b
        if (MatRight::Options == Eigen::RowMajor)
        {
            bInc = b.cols();
            bPt  = b.data() + aRow;
        }
        else if (MatRight::Options == Eigen::ColMajor)
        {
            bInc = 1;
            bPt  = b.data() + aRow*b.rows();
        }
        // left operand: row aRow of a
        if (MatLeft::Options == Eigen::RowMajor)
        {
            aInc = 1;
            aPt  = a.data() + aRow*a.cols();
        }
        else if (MatLeft::Options == Eigen::ColMajor)
        {
            aInc = a.rows();
            aPt  = a.data() + aRow;
        }
    }
 #ifdef USE_MKL
    // Set up the pointer/stride pairs for a strided dot product between column
    // `aCol` of `a` and row `aCol` of `b`, for either storage order of each
    // operand. Outputs are left untouched for an operand whose Options is
    // neither Eigen::RowMajor nor Eigen::ColMajor.
    template <typename C, typename MatLeft, typename MatRight>
    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                    unsigned int &bInc, const unsigned int aCol, 
                                    const MatLeft &a, const MatRight &b)
    {
        // right operand: row aCol of b
        if (MatRight::Options == Eigen::RowMajor)
        {
            bInc = 1;
            bPt  = b.data() + aCol*b.cols();
        }
        else if (MatRight::Options == Eigen::ColMajor)
        {
            bInc = b.rows();
            bPt  = b.data() + aCol;
        }
        // left operand: column aCol of a
        if (MatLeft::Options == Eigen::RowMajor)
        {
            aInc = a.cols();
            aPt  = a.data() + aCol;
        }
        else if (MatLeft::Options == Eigen::ColMajor)
        {
            aInc = 1;
            aPt  = a.data() + aCol*a.rows();
        }
    }
    // CBLAS-backed unconjugated dot products used by accTrMul when USE_MKL is
    // defined. dotuRow: row `aRow` of `a` with column `aRow` of `b`, a.cols()
    // terms. dotuCol: column `aCol` of `a` with row `aCol` of `b`, a.rows()
    // terms. The result is written to `res`. Pointers and strides come from
    // makeDotRowPt / makeDotColPt.

    // single precision, row of a times column of b
    template <typename MatLeft, typename MatRight>
    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexF *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
    }
    // single precision, column of a times row of b
    template <typename MatLeft, typename MatRight>
    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexF *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
    }
    // double precision, row of a times column of b
    template <typename MatLeft, typename MatRight>
    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexD *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
    }
    // double precision, column of a times row of b
    template <typename MatLeft, typename MatRight>
    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexD *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
    }
 #endif
 };
 /******************************************************************************
 *                     A2AMatrixIo template implementation                    *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Store the file name, the dataset group name and the matrix dimensions
 // (nt time slices of ni x nj entries); no file access happens here.
 template <typename T>
 A2AMatrixIo<T>::A2AMatrixIo(std::string filename,
                            std::string dataname,
                            const unsigned int nt,
                            const unsigned int ni,
                            const unsigned int nj)
 : filename_(filename)
 , dataname_(dataname)
 , nt_(nt)
 , ni_(ni)
 , nj_(nj)
 {}
-// access //////////////////////////////////////////////////////////////////////
+template <typename T, typename MetadataType>
-template <typename T>
+void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d, const unsigned int chunkSize)
 unsigned int A2AMatrixIo<T>::getNt(void) const
 {
    return nt_;
 }
 // Number of left mode indices (second matrix dimension, ni_).
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNi(void) const
 {
    return ni_;
 }
 // Number of right mode indices (third matrix dimension, nj_).
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNj(void) const
 {
    return nj_;
 }
 // Size in bytes of the full nt_ x ni_ x nj_ matrix set with elements of type T.
 template <typename T>
 size_t A2AMatrixIo<T>::getSize(void) const
 {
    // widen before multiplying: the three dimensions are unsigned int and
    // their product can exceed 32 bits for large mode counts
    return static_cast<size_t>(nt_)*ni_*nj_*sizeof(T);
 }
 // file allocation /////////////////////////////////////////////////////////////
 template <typename T>
 template <typename MetadataType>
 void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
 {
 #ifdef HAVE_HDF5
    std::vector<hsize_t>    dim = {static_cast<hsize_t>(nt_), 
@@ -440,28 +80,26 @@ void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSiz
    }
    // create the dataset
-    Hdf5Reader reader(filename_, false);
+    Hdf5Reader reader(filename_);
    push(reader, dataname_);
    auto &group = reader.getGroup();
    plist.setChunk(chunk.size(), chunk.data());
-    plist.setFletcher32();
+    dataset = group.createDataSet("data", Hdf5Type<T>::type(), dataspace, plist);
    dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
-// block I/O ///////////////////////////////////////////////////////////////////
+template <typename T, typename MetadataType>
-template <typename T>
+void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data, 
 void A2AMatrixIo<T>::saveBlock(const T *data, 
                                             const unsigned int i, 
                                             const unsigned int j,
                                             const unsigned int blockSizei,
                                             const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
-    Hdf5Reader           reader(filename_, false);
+    Hdf5Reader           reader(filename_);
    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
@@ -473,7 +111,7 @@ void A2AMatrixIo<T>::saveBlock(const T *data,
    push(reader, dataname_);
    auto &group = reader.getGroup();
-    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
+    dataset     = group.openDataSet("data");
    dataspace   = dataset.getSpace();
    dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                              stride.data(), block.data());
@@ -483,286 +121,6 @@ void A2AMatrixIo<T>::saveBlock(const T *data,
 #endif
 }
 // Save the (ext, str) slice of the matrix set `m` as the block whose first
 // mode indices are (i, j). The block extents are the last two dimensions of
 // `m`; the slice is located by its flat row-major offset inside m.data().
 template <typename T>
 void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
                               const unsigned int ext, const unsigned int str,
                               const unsigned int i, const unsigned int j)
 {
    const unsigned int blockSizei = m.dimension(3);
    const unsigned int blockSizej = m.dimension(4);
    // size_t so that the whole offset product below is evaluated in size_t:
    // in unsigned int it can wrap for large blocks
    const size_t       nstr       = m.dimension(1);
    const size_t       offset     = (ext*nstr + str)*nt_*blockSizei*blockSizej;

    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
 }
 // Load the matrix stored in dataset HADRONS_A2AM_NAME of group `dataname_`
 // in file `filename_` into `v`, one ni_ x nj_ matrix per time slice, cast
 // element-wise to VecT.
 //   v        - indexable container; v[t] is assigned for t in [0, nt_)
 //              (it is not resized here)
 //   tRead    - optional; with useCache == false it is reset and then
 //              accumulates the usecond() differences around each read.
 //              It is not touched when useCache == true.
 //   useCache - true: read the whole dataset in one go through a temporary
 //              buffer; false: read time slice by time slice
 // If ni_ or nj_ is 0 they are taken from the file after checking the time
 // extent; otherwise the file dimensions must match nt_ x ni_ x nj_.
 // Errors are raised through HADRONS_ERROR (Size on a dimension mismatch,
 // Implementation when built without HDF5).
 template <typename T>
 template <template <class> class Vec, typename VecT>
 void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, const bool useCache)
 {
 #ifdef HAVE_HDF5
    Hdf5Reader           reader(filename_);
    std::vector<hsize_t> hdim;
    H5NS::DataSet        dataset;
    H5NS::DataSpace      dataspace;
    H5NS::CompType       datatype;

    push(reader, dataname_);
    auto &group = reader.getGroup();
    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    datatype    = dataset.getCompType();
    dataspace   = dataset.getSpace();
    hdim.resize(dataspace.getSimpleExtentNdims());
    dataspace.getSimpleExtentDims(hdim.data());
    // everything below indexes hdim[0..2]
    if (hdim.size() != 3)
    {
        HADRONS_ERROR(Size, "all-to-all matrix dataset has rank "
            + std::to_string(hdim.size()) + ", expected 3");
    }
    if ((nt_ != 0) and (ni_ != 0) and (nj_ != 0) and
        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
    {
        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
            + std::to_string(hdim[2]) + ", expected "
            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
            + std::to_string(nj_) + ")");
    }
    else if ((ni_ == 0) or (nj_ == 0))
    {
        if (hdim[0] != nt_)
        {
            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
                + std::to_string(hdim[0]) + ", expected "
                + std::to_string(nt_) + ")");
        }
        ni_ = hdim[1];
        nj_ = hdim[2];
    }
    if (useCache)
    {
        // sizes in size_t: the unsigned int products can exceed 32 bits
        const size_t   sliceSize = static_cast<size_t>(ni_)*nj_;
        std::vector<T> buf(sliceSize*nt_);
        T              *pt;

        dataset.read(buf.data(), datatype);
        pt = buf.data();
        for (unsigned int t = 0; t < nt_; ++t)
        {
            A2AMatrixMap<T> bufMap(pt, ni_, nj_);

            v[t]  = bufMap.template cast<VecT>();
            pt   += sliceSize;
        }
    }
    // if useCache = false, do I/O timeslice per timeslice (much slower)
    else
    {
        A2AMatrix<T>         buf(ni_, nj_);
        std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
                                        static_cast<hsize_t>(nj_)},
                             stride   = {1, 1, 1},
                             block    = {1, 1, 1},
                             memCount = {static_cast<hsize_t>(ni_),
                                         static_cast<hsize_t>(nj_)};
        H5NS::DataSpace      memspace(memCount.size(), memCount.data());

        std::cout << "Loading timeslice";
        std::cout.flush();
        // tRead defaults to nullptr: only reset it when a timer was supplied
        if (tRead) *tRead = 0.;
        for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
        {
            unsigned int         t      = tp1 - 1;
            std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
            
            if (t % 10 == 0)
            {
                std::cout << " " << t;
                std::cout.flush();
            }
            dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                                      stride.data(), block.data());
            if (tRead) *tRead -= usecond();    
            dataset.read(buf.data(), datatype, memspace, dataspace);
            if (tRead) *tRead += usecond();
            v[t] = buf.template cast<VecT>();
        }
        std::cout << std::endl;
    }
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 /******************************************************************************
 *               A2AMatrixBlockComputation template implementation            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Set up a block computation on `grid`: the time extent nt_ is the global
 // size of `grid` along `orthogDim`, and the two work buffers are sized for
 // one cache block (mCache_) and one I/O block (mBuf_) of next x nstr x nt
 // matrices. `tArray` may be null, in which case no timers are recorded.
 template <typename T, typename Field, typename MetadataType, typename TIo>
 A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::A2AMatrixBlockComputation(GridBase *grid,
                            const unsigned int orthogDim,
                            const unsigned int next, 
                            const unsigned int nstr,
                            const unsigned int blockSize, 
                            const unsigned int cacheBlockSize,
                            TimerArray *tArray)
 // initialisers listed in member declaration order
 : tArray_(tArray), grid_(grid), orthogDim_(orthogDim)
 , nt_(grid->GlobalDimensions()[orthogDim])
 , next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
 {
    // buffer sizes in size_t: the unsigned int product wraps for large blocks
    // (e.g. 64*1*16*2048*2048 == 2^32 would give a zero-sized buffer)
    const size_t nMat = static_cast<size_t>(nt_)*next_*nstr_;

    mCache_.resize(nMat*cacheBlockSize_*cacheBlockSize_);
    mBuf_.resize(nMat*blockSize_*blockSize_);
 }
 #define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
 #define STOP_TIMER(name)  if (tArray_) tArray_->stopTimer(name)
 #define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
 // execution ///////////////////////////////////////////////////////////////////
 // Compute the all-to-all matrix between `left` and `right` block by block and
 // write every block to disk.
 //   left, right - vectors fed to the kernel
 //   kernel      - fills one cache block of the matrix set per call and
 //                 reports its flops/bytes for the performance log
 //   ionameFn    - dataset group name for a given (ext, str) pair
 //   filenameFn  - output file name for a given (ext, str) pair
 //   metadataFn  - metadata passed to the file initialisation for (ext, str)
 // With HADRONS_A2AM_PARALLEL_IO the (ext, str) files are distributed over the
 // ranks of the grid, otherwise every rank loops over all of them.
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::execute(const std::vector<Field> &left, const std::vector<Field> &right,
          A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
          const FilenameFn &filenameFn, const MetadataFn &metadataFn)
 {
    //////////////////////////////////////////////////////////////////////////
    // i,j   is first  loop over blockSize_ factors
    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
    // iii,jjj are loops within cacheBlock
    // Total index is sum of these  i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    int    N_i = left.size();
    int    N_j = right.size();
    double flops, bytes, t_kernel;
    double nodes = grid_->NodeCount();
    
    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);

    for(int i=0;i<N_i;i+=blockSize_)
    for(int j=0;j<N_j;j+=blockSize_)
    {
        // Get the W and V vectors for this block^2 set of terms
        int N_ii = MIN(N_i-i,blockSize_);
        int N_jj = MIN(N_j-j,blockSize_);
        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);

        LOG(Message) << "All-to-all matrix block " 
                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                     << std::endl;
        // Series of cache blocked chunks of the contractions within this block
        flops    = 0.0;
        bytes    = 0.0;
        t_kernel = 0.0;
        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
        {
            double t;
            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);

            START_TIMER("kernel");
            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
            STOP_TIMER("kernel");
            t_kernel += t;
            flops    += kernel.flops(N_iii, N_jjj);
            bytes    += kernel.bytes(N_iii, N_jjj);

            // copy the cache block into its place inside the I/O block
            START_TIMER("cache copy");
            parallel_for_nest5(int e =0;e<next_;e++)
            for(int s =0;s< nstr_;s++)
            for(int t =0;t< nt_;t++)
            for(int iii=0;iii< N_iii;iii++)
            for(int jjj=0;jjj< N_jjj;jjj++)
            {
                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
            }
            STOP_TIMER("cache copy");
        }

        // perf
        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
                     << " Gflop/s/node " << std::endl;
        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
                     << " GB/s/node "  << std::endl;

        // IO
        double       blockSize, ioTime;
        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
    
        LOG(Message) << "Writing block to disk" << std::endl;
        ioTime = -GET_TIMER("IO: write block");
        START_TIMER("IO: total");
        makeFileDir(filenameFn(0, 0), grid_);
 #ifdef HADRONS_A2AM_PARALLEL_IO
        grid_->Barrier();
        // make task list for current node
        nodeIo_.clear();
        for(int f = myRank; f < next_*nstr_; f += nRank)
        {
            IoHelper h;

            h.i  = i;
            h.j  = j;
            h.e  = f/nstr_;
            h.s  = f % nstr_;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            nodeIo_.push_back(h);
        }
        // parallel IO
        for (auto &h: nodeIo_)
        {
            saveBlock(mBlock, h);
        }
        grid_->Barrier();
 #else
        // serial IO, for testing purposes only
        for(int e = 0; e < next_; e++)
        for(int s = 0; s < nstr_; s++)
        {
            IoHelper h;

            h.i  = i;
            h.j  = j;
            h.e  = e;
            h.s  = s;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            // the I/O block of this iteration is mBlock
            saveBlock(mBlock, h);
        }
 #endif
        STOP_TIMER("IO: total");
        // evaluated in double: the unsigned int product can wrap before the
        // multiplication by sizeof(TIo) widens it
        blockSize  = static_cast<double>(next_)*nstr_*nt_*N_ii*N_jj*sizeof(TIo);
        ioTime    += GET_TIMER("IO: write block");
        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
                     << ioTime  << " us (" 
                     << blockSize/ioTime*1.0e6/1024/1024
                     << " MB/s)" << std::endl;
    }
 }
 // I/O handler /////////////////////////////////////////////////////////////////
 // Write the (h.e, h.s) slice of `m` through the I/O object of `h`. The output
 // file is initialised (metadata h.md, chunk size blockSize_) when the block
 // is the first one, i.e. h.i == 0 and h.j == 0.
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
 {
    const bool isFirstBlock = (h.i == 0) and (h.j == 0);

    if (isFirstBlock)
    {
        START_TIMER("IO: file creation");
        h.io.initFile(h.md, blockSize_);
        STOP_TIMER("IO: file creation");
    }

    START_TIMER("IO: write block");
    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
    STOP_TIMER("IO: write block");
 }
 #undef START_TIMER
 #undef STOP_TIMER
 #undef GET_TIMER
 END_HADRONS_NAMESPACE
 #endif // A2A_Matrix_hpp_
--- a/Hadrons/A2AVectors.hpp
+++ b/Hadrons/A2AVectors.hpp
@@ -36,7 +36,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
- *                 Class to generate V & W all-to-all vectors                 *
+ *               Classes to generate V & W all-to-all vectors                 *
 ******************************************************************************/
 template <typename FImpl>
 class A2AVectorsSchurDiagTwo
@@ -70,42 +70,6 @@ private:
    SchurDiagTwoOperator<FMat, FermionField> op_;
 };
 /******************************************************************************
 *                  Methods for V & W all-to-all vectors I/O                  *
 ******************************************************************************/
 // Static helpers to write and read sets of all-to-all vectors, either as one
 // file holding every vector or as a directory with one file per vector.
 class A2AVectorsIo
 {
 public:
    // Record stored alongside each vector: its index in the set.
    struct Record: Serializable
    {
        GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
                                        unsigned int, index);
        Record(void): index(0) {}
    };
 public:
    // Write `vec` under `fileStem`; a negative `trajectory` adds no suffix.
    template <typename Field>
    static void write(const std::string fileStem, std::vector<Field> &vec, 
                      const bool multiFile, const int trajectory = -1);
    // Read vec.size() vectors back from `fileStem`.
    template <typename Field>
    static void read(std::vector<Field> &vec, const std::string fileStem,
                     const bool multiFile, const int trajectory = -1);
 private:
    // Build "<stem>[.<traj>]" for the multi-file layout and
    // "<stem>[.<traj>].bin" for the single-file layout; the trajectory part is
    // left out when `traj` is negative.
    static inline std::string vecFilename(const std::string stem, const int traj, 
                                          const bool multiFile)
    {
        const std::string suffix = (traj < 0) ? "" : ("." + std::to_string(traj));
        const std::string base   = stem + suffix;

        return multiFile ? base : (base + ".bin");
    }
 };
 /******************************************************************************
 *               A2AVectorsSchurDiagTwo template implementation               *
 ******************************************************************************/
@@ -253,90 +217,6 @@ void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d,
    }
 }
 /******************************************************************************
 *               all-to-all vectors I/O template implementation               *
 ******************************************************************************/
 // Write every field of `vec` as a SciDAC record tagged with its index.
 //   multiFile == false: one file (see vecFilename) holding all records
 //   multiFile == true : one file "<name>/elem<i>.bin" per vector
 // `vec` must not be empty: the grid is taken from vec[0].
 template <typename Field>
 void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec, 
                         const bool multiFile, const int trajectory)
 {
    GridBase          *grid = vec[0]._grid;
    const std::string target = vecFilename(fileStem, trajectory, multiFile);
    ScidacWriter      binWriter(grid->IsBoss());
    Record            record;

    if (!multiFile)
    {
        // single file: open once, append one record per vector
        makeFileDir(target, grid);
        binWriter.open(target);
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            LOG(Message) << "Writing vector " << i << std::endl;
            record.index = i;
            binWriter.writeScidacFieldRecord(vec[i], record);
        }
        binWriter.close();

        return;
    }
    // one file per vector inside the directory `target`
    for (unsigned int i = 0; i < vec.size(); ++i)
    {
        const std::string elemFile = target + "/elem" + std::to_string(i) + ".bin";

        LOG(Message) << "Writing vector " << i << std::endl;
        makeFileDir(elemFile, grid);
        binWriter.open(elemFile);
        record.index = i;
        binWriter.writeScidacFieldRecord(vec[i], record);
        binWriter.close();
    }
 }
 // Read vec.size() fields back from disk, using the same file layout as
 // write(). After each record is read its stored index is compared with the
 // position in `vec`; a mismatch raises HADRONS_ERROR(Io, ...).
 template <typename Field>
 void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem, 
                        const bool multiFile, const int trajectory)
 {
    const std::string source = vecFilename(fileStem, trajectory, multiFile);
    ScidacReader      binReader;
    Record            record;

    if (!multiFile)
    {
        // single file: records are read in sequence
        binReader.open(source);
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            LOG(Message) << "Reading vector " << i << std::endl;
            binReader.readScidacFieldRecord(vec[i], record);
            if (record.index != i)
            {
                HADRONS_ERROR(Io, "vector index mismatch");
            }
        }
        binReader.close();

        return;
    }
    // one file per vector inside the directory `source`
    for (unsigned int i = 0; i < vec.size(); ++i)
    {
        const std::string elemFile = source + "/elem" + std::to_string(i) + ".bin";

        LOG(Message) << "Reading vector " << i << std::endl;
        binReader.open(elemFile);
        binReader.readScidacFieldRecord(vec[i], record);
        binReader.close();
        if (record.index != i)
        {
            HADRONS_ERROR(Io, "vector index mismatch");
        }
    }
 }
 END_HADRONS_NAMESPACE
 #endif // A2A_Vectors_hpp_
--- a/Hadrons/Application.cc
+++ b/Hadrons/Application.cc
@@ -108,9 +108,6 @@ void Application::run(void)
        HADRONS_ERROR(Definition, "run id is empty");
    }
    LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
    BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
    LOG(Message) << "Attempt(s) for resilient parallel I/O: " 
                 << BinaryIO::latticeWriteMaxRetry << std::endl;
    vm().setRunId(getPar().runId);
    vm().printContent();
    env().printContent();
--- a/Hadrons/Application.hpp
+++ b/Hadrons/Application.hpp
@@ -41,6 +41,14 @@ BEGIN_HADRONS_NAMESPACE
 class Application
 {
 public:
    // Serializable description of a trajectory range, made of three unsigned
    // integers: start, end and step.
    // NOTE(review): presumably iterated as start, start+step, ... up to end;
    // whether end is inclusive is not visible here - confirm in Application::run.
    class TrajRange: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
                                        unsigned int, start,
                                        unsigned int, end,
                                        unsigned int, step);
    };
    class GlobalPar: Serializable
    {
    public:
@@ -48,9 +56,7 @@ public:
                                        TrajRange,                  trajCounter,
                                        VirtualMachine::GeneticPar, genetic,
                                        std::string,                runId,
-                                        std::string,                graphFile,
+                                        std::string,                graphFile);
                                        int,                        parallelWriteMaxRetry);
        GlobalPar(void): parallelWriteMaxRetry{-1} {}
    };
 public:
    // constructors
--- a/Hadrons/DilutedNoise.hpp
+++ b/Hadrons/DilutedNoise.hpp
@@ -7,7 +7,6 @@ Source file: Hadrons/DilutedNoise.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -77,22 +76,6 @@ private:
    unsigned int nt_;
 };
 // Noise set built from a full-volume random complex field, diluted over spin
 // and colour components (see generateNoise()).
 template <typename FImpl>
 class FullVolumeSpinColorDiagonalNoise: public DilutedNoise<FImpl>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    // g: grid the noise fields live on; n_src: number of sources, each source
    // contributes Ns*FImpl::Dimension entries to the base-class noise set
    FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src);
    virtual ~FullVolumeSpinColorDiagonalNoise(void) = default;
    // generate noise
    // (re)fills every noise vector using the supplied parallel RNG
    virtual void generateNoise(GridParallelRNG &rng);
 private:
    // number of sources requested at construction
    unsigned int nSrc_;
 };
 /******************************************************************************
 *                    DilutedNoise template implementation                    *
 ******************************************************************************/
@@ -203,47 +186,6 @@ void TimeDilutedSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rn
    }
 }
 /******************************************************************************
 *        FullVolumeSpinColorDiagonalNoise template implementation           *
 ******************************************************************************/
 // Construct on grid g for nSrc sources. The base class is given
 // nSrc*Ns*FImpl::Dimension as its second argument (presumably the total
 // number of noise vectors - confirm against DilutedNoise's constructor), and
 // nSrc is remembered for generateNoise().
 template <typename FImpl>
 FullVolumeSpinColorDiagonalNoise<FImpl>::
 FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc)
 : DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc)
 {}
 // Fill the noise set with spin/colour diluted copies of one random complex
 // field eta.
 //
 // eta is drawn with bernoulli() and mapped through (2*eta - (1+i))/sqrt(2)
 // (presumably giving (+-1 +- i)/sqrt(2) per site - confirm against
 // bernoulli()'s definition for complex fields). For every source n, spin s
 // and colour c, noise[i] is zero except for eta poked into component (s, c);
 // i runs consecutively with c fastest, then s, then n.
 //
 // NOTE(review): eta is generated once, before the loop over n, so all nSrc_
 // sources receive the same random field - confirm this is intended.
 template <typename FImpl>
 void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
 {
    typedef decltype(peekColour((*this)[0], 0)) SpinField;

    auto                       &noise = *this;
    auto                       g      = this->getGrid();
    auto                       nc     = FImpl::Dimension;
    Complex                    shift(1., 1.);
    LatticeComplex             eta(g);
    SpinField                  etas(g);
    unsigned int               i = 0;

    bernoulli(rng, eta);
    eta = (2.*eta - shift)*(1./::sqrt(2.));
    for (unsigned int n = 0; n < nSrc_; ++n)
    {
        for (unsigned int s = 0; s < Ns; ++s)
        {
            // spin-diluted field: eta in spin component s only
            etas = zero;
            pokeSpin(etas, eta, s);
            for (unsigned int c = 0; c < nc; ++c)
            {
                // colour-diluted field: etas in colour component c only
                noise[i] = zero;
                pokeColour(noise[i], etas, c);
                ++i;
            }
        }
    }
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_DilutedNoise_hpp_
--- a/Hadrons/DiskVector.hpp
+++ b/Hadrons/DiskVector.hpp
@@ -29,18 +29,11 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_DiskVector_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <deque>
 #include <sys/stat.h>
 #include <ftw.h>
 #include <unistd.h>
 #ifdef DV_DEBUG
 #define DV_DEBUG_MSG(dv, stream) LOG(Debug) << "diskvector " << (dv) << ": " << stream << std::endl
 #else
 #define DV_DEBUG_MSG(dv, stream)
 #endif
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
@@ -60,18 +53,16 @@ public:
        : master_(master), cmaster_(master), i_(i) {}
        // operator=: somebody is trying to store a vector element
-        // write to cache and tag as modified
+        // write to disk and cache
        T &operator=(const T &obj) const
        {
-            auto &cache    = *master_.cachePtr_;
+#ifdef DV_DEBUG
-            auto &modified = *master_.modifiedPtr_;
+            LOG(Debug) << "diskvector " << &master_ << ": writing to " << i_ << std::endl;
-            auto &index    = *master_.indexPtr_;
+#endif
            DV_DEBUG_MSG(&master_, "writing to " << i_);
            master_.cacheInsert(i_, obj);
-            modified[index.at(i_)] = true;
+            master_.save(master_.filename(i_), obj);
-            return cache[index.at(i_)];
+            return master_.cachePtr_->at(i_);
        }
        // implicit cast to const object reference and redirection
@@ -88,12 +79,9 @@ public:
 public:
    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
                   const unsigned int cacheSize = 1, const bool clean = true);
    DiskVectorBase(DiskVectorBase<T> &&v) = default;
    virtual ~DiskVectorBase(void);
    const T & operator[](const unsigned int i) const;
    RwAccessHelper operator[](const unsigned int i);
    double hitRatio(void) const;
    void resetStat(void);
 private:
    virtual void load(T &obj, const std::string filename) const = 0;
    virtual void save(const std::string filename, const T &obj) const = 0;
@@ -105,14 +93,10 @@ private:
 private:
    std::string                                dirname_;
    unsigned int                               size_, cacheSize_;
    double                                                access_{0.}, hit_{0.};
    bool                                       clean_;
    // using pointers to allow modifications when class is const
    // semantic: const means data unmodified, but cache modification allowed
-    std::unique_ptr<std::vector<T>>                       cachePtr_;
+    std::unique_ptr<std::map<unsigned int, T>> cachePtr_;
    std::unique_ptr<std::vector<bool>>                    modifiedPtr_;
    std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
    std::unique_ptr<std::stack<unsigned int>>             freePtr_;
    std::unique_ptr<std::deque<unsigned int>>  loadsPtr_;                
 };
@@ -131,7 +115,6 @@ private:
        read(reader, basename(filename), obj);
    }
    virtual void save(const std::string filename, const T &obj) const
    {
        Writer writer(filename);
@@ -140,100 +123,20 @@ private:
    }
 };
 /******************************************************************************
 *                      Specialisation for Eigen matrices                     *
 ******************************************************************************/
 template <typename T>
 using EigenDiskVectorMat = A2AMatrix<T>;
 // Disk vector of Eigen matrices (EigenDiskVectorMat<T>) using a raw binary
 // file per element, laid out as written by save():
 //   [crc : uint32_t][nRow : Eigen::Index][nCol : Eigen::Index][nRow*nCol*sizeof(T) bytes of data]
 // All fields are the in-memory bytes of the corresponding objects. The
 // checksum covers the matrix data only and is verified by load().
 template <typename T>
 class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
 {
 public:
    using DiskVectorBase<EigenDiskVectorMat<T>>::DiskVectorBase;
    typedef EigenDiskVectorMat<T> Matrix;
 public:
    // coefficient (j, k) of the i-th matrix, fetched through the read-only
    // operator[] of the base class
    T operator()(const unsigned int i, const Eigen::Index j,
                 const Eigen::Index k) const
    {
        return (*this)[i](j, k);
    }
 private:
    // Read a matrix from 'filename' into obj (resized to the stored
    // dimensions) and verify its checksum. Raises HADRONS_ERROR(Io, ...) if
    // the file cannot be opened, is truncated, or fails the checksum.
    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
    {
        std::ifstream f(filename, std::ios::binary);
        uint32_t      crc, check;
        Eigen::Index  nRow, nCol;
        size_t        matSize;
        double        tRead, tHash;

        if (!f.is_open())
        {
            HADRONS_ERROR(Io, "cannot open '" + filename + "' for reading")
        }
        f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
        f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
        f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
        // the dimensions must not be used for resize() unless the header was
        // actually read, otherwise they are uninitialised
        if (!f)
        {
            HADRONS_ERROR(Io, "cannot read header of '" + filename + "'")
        }
        obj.resize(nRow, nCol);
        matSize = nRow*nCol*sizeof(T);
        tRead  = -usecond();
        f.read(reinterpret_cast<char *>(obj.data()), matSize);
        tRead += usecond();
        if (!f)
        {
            HADRONS_ERROR(Io, "cannot read matrix data from '" + filename + "'")
        }
        tHash  = -usecond();
 #ifdef USE_IPP
        check  = GridChecksum::crc32c(obj.data(), matSize);
 #else
        check  = GridChecksum::crc32(obj.data(), matSize);
 #endif
        tHash += usecond();
        DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
        if (crc != check)
        {
            HADRONS_ERROR(Io, "checksum failed")
        }
    }

    // Write obj to 'filename' (checksum, dimensions, then data). Raises
    // HADRONS_ERROR(Io, ...) if the file cannot be opened or written.
    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
    {
        std::ofstream f(filename, std::ios::binary);
        uint32_t      crc;
        Eigen::Index  nRow, nCol;
        size_t        matSize;
        double        tWrite, tHash;

        if (!f.is_open())
        {
            HADRONS_ERROR(Io, "cannot open '" + filename + "' for writing")
        }
        nRow    = obj.rows();
        nCol    = obj.cols();
        matSize = nRow*nCol*sizeof(T);
        tHash   = -usecond();
 #ifdef USE_IPP
        crc     = GridChecksum::crc32c(obj.data(), matSize);
 #else
        crc     = GridChecksum::crc32(obj.data(), matSize);
 #endif
        tHash  += usecond();
        f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
        f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
        f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
        tWrite = -usecond();
        f.write(reinterpret_cast<const char *>(obj.data()), matSize);
        tWrite += usecond();
        // close explicitly so that buffered write errors (e.g. disk full) are
        // detected here rather than as a checksum failure on a later load
        f.close();
        if (!f)
        {
            HADRONS_ERROR(Io, "cannot write to '" + filename + "'")
        }
        DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
    }
 };
 /******************************************************************************
 *                       DiskVectorBase implementation                         *
 ******************************************************************************/
 #ifdef DV_DEBUG
 #define DV_DEBUG_MSG(stream) LOG(Debug) << "diskvector " << this << ": " << stream << std::endl
 #endif
 template <typename T>
 DiskVectorBase<T>::DiskVectorBase(const std::string dirname, 
                                  const unsigned int size,
                                  const unsigned int cacheSize,
                                  const bool clean)
 : dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
-, cachePtr_(new std::vector<T>(size))
+, cachePtr_(new std::map<unsigned int, T>())
 , modifiedPtr_(new std::vector<bool>(size, false))
 , indexPtr_(new std::map<unsigned int, unsigned int>())
 , freePtr_(new std::stack<unsigned int>)
 , loadsPtr_(new std::deque<unsigned int>())
 {
    struct stat s;
@@ -243,10 +146,6 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
        HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
    }
    mkdir(dirname);
    for (unsigned int i = 0; i < cacheSize_; ++i)
    {
        freePtr_->push(i);
    }
 }
 template <typename T>
@@ -262,30 +161,27 @@ template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
    auto &cache = *cachePtr_;
    auto &index   = *indexPtr_;
    auto &freeInd = *freePtr_;
    auto &loads = *loadsPtr_;
-    DV_DEBUG_MSG(this, "accessing " << i << " (RO)");
+    DV_DEBUG_MSG("accessing " << i << " (RO)");
    if (i >= size_)
    {
        HADRONS_ERROR(Size, "index out of range");
    }
-    const_cast<double &>(access_)++;
+
-    if (index.find(i) == index.end())
+    if (cache.find(i) == cache.end())
    {
        // cache miss
-        DV_DEBUG_MSG(this, "cache miss");
+        DV_DEBUG_MSG("cache miss");
        fetch(i);
    }
    else
    {
-        DV_DEBUG_MSG(this, "cache hit");
+        DV_DEBUG_MSG("cache hit");
        auto pos = std::find(loads.begin(), loads.end(), i);
        const_cast<double &>(hit_)++;
        loads.erase(pos);
        loads.push_back(i);
    }
@@ -297,16 +193,16 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
    {
        msg += std::to_string(p) + " ";
    }
-    DV_DEBUG_MSG(this, "in cache: " << msg);
+    DV_DEBUG_MSG("in cache: " << msg);
 #endif
-    return cache[index.at(i)];
+    return cache.at(i);
 }
 template <typename T>
 typename DiskVectorBase<T>::RwAccessHelper DiskVectorBase<T>::operator[](const unsigned int i)
 {
-    DV_DEBUG_MSG(this, "accessing " << i << " (RW)");
+    DV_DEBUG_MSG("accessing " << i << " (RW)");
    if (i >= size_)
    {
@@ -316,19 +212,6 @@ typename DiskVectorBase<T>::RwAccessHelper DiskVectorBase<T>::operator[](const u
    return RwAccessHelper(*this, i);
 }
 // Ratio of the hit_ counter to the access_ counter. No guard is applied:
 // when access_ is zero the result is whatever the floating-point division
 // yields.
 template <typename T>
 double DiskVectorBase<T>::hitRatio(void) const
 {
    const double ratio = hit_/access_;

    return ratio;
 }
 // Zero both statistics counters (access_ and hit_) used by hitRatio().
 template <typename T>
 void DiskVectorBase<T>::resetStat(void)
 {
    hit_    = 0.;
    access_ = 0.;
 }
 template <typename T>
 std::string DiskVectorBase<T>::filename(const unsigned int i) const
 {
@@ -339,23 +222,12 @@ template <typename T>
 void DiskVectorBase<T>::evict(void) const
 {
    auto &cache = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads = *loadsPtr_;
-    if (index.size() >= cacheSize_)
+    if (cache.size() >= cacheSize_)
    {
-        unsigned int i = loads.front();
+        DV_DEBUG_MSG("evicting " << loads.front());
-        
+        cache.erase(loads.front());
        DV_DEBUG_MSG(this, "evicting " << i);
        if (modified[index.at(i)])
        {
            DV_DEBUG_MSG(this, "element " << i << " modified, saving to disk");
            save(filename(i), cache[index.at(i)]);
        }
        freeInd.push(index.at(i));
        index.erase(i);
        loads.pop_front();
    }
 }
@@ -364,43 +236,29 @@ template <typename T>
 void DiskVectorBase<T>::fetch(const unsigned int i) const
 {
    auto &cache = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads = *loadsPtr_;
    struct stat s;
-    DV_DEBUG_MSG(this, "loading " << i << " from disk");
+    DV_DEBUG_MSG("loading " << i << " from disk");
    evict();
    if(stat(filename(i).c_str(), &s) != 0)
    {
        HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
    }
-    index[i] = freeInd.top();
+    load(cache[i], filename(i));
    freeInd.pop();
    load(cache[index.at(i)], filename(i));
    loads.push_back(i);
    modified[index.at(i)] = false;
 }
 template <typename T>
 void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 {
    auto &cache = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads = *loadsPtr_;
    evict();
-    index[i] = freeInd.top();
+    cache[i] = obj;
    freeInd.pop();
    cache[index.at(i)] = obj;
    loads.push_back(i);
    modified[index.at(i)] = false;
 #ifdef DV_DEBUG
    std::string msg;
@@ -409,7 +267,7 @@ void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
    {
        msg += std::to_string(p) + " ";
    }
-    DV_DEBUG_MSG(this, "in cache: " << msg);
+    DV_DEBUG_MSG("in cache: " << msg);
 #endif
 }
--- a/Hadrons/EigenPack.hpp
+++ b/Hadrons/EigenPack.hpp
@@ -39,12 +39,12 @@ BEGIN_HADRONS_NAMESPACE
 #define HADRONS_DEFAULT_LANCZOS_NBASIS 60
 #endif
-#define HADRONS_DUMP_EP_METADATA(record) \
+#define HADRONS_DUMP_EP_METADATA \
 LOG(Message) << "Eigenpack metadata:" << std::endl;\
 LOG(Message) << "* operator" << std::endl;\
-LOG(Message) << (record).operatorXml << std::endl;\
+LOG(Message) << record.operatorXml << std::endl;\
 LOG(Message) << "* solver" << std::endl;\
-LOG(Message) << (record).solverXml << std::endl;
+LOG(Message) << record.solverXml << std::endl;
 struct PackRecord
 {
@@ -59,9 +59,66 @@ struct VecRecord: Serializable
    VecRecord(void): index(0), eval(0.) {}
 };
-namespace EigenPackIo
+template <typename F>
 class EigenPack
 {
-    inline void readHeader(PackRecord &record, ScidacReader &binReader)
+public:
    typedef F Field;
 public:
    std::vector<RealD> eval;
    std::vector<F>     evec;
    PackRecord         record;
 public:
    EigenPack(void)          = default;
    virtual ~EigenPack(void) = default;
    EigenPack(const size_t size, GridBase *grid)
    {
        resize(size, grid);
    }
    void resize(const size_t size, GridBase *grid)
    {
        eval.resize(size);
        evec.resize(size, grid);
    }
    // Load eigenvectors and eigenvalues into evec/eval; the number of vectors
    // read is the current size of evec.
    //   multiFile == false: everything comes from the single file returned by
    //                       evecFilename(fileStem, -1, traj)
    //   multiFile == true:  vector k comes from evecFilename(fileStem, k, traj)
    // The pack metadata is logged once (after the first vector in multi-file
    // mode).
    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        if (!multiFile)
        {
            basicRead(evec, eval, evecFilename(fileStem, -1, traj), evec.size());
            HADRONS_DUMP_EP_METADATA;
            return;
        }
        for(int k = 0; k < evec.size(); ++k)
        {
            basicReadSingle(evec[k], eval[k], evecFilename(fileStem, k, traj), k);
            if (k == 0)
            {
                HADRONS_DUMP_EP_METADATA;
            }
        }
    }
    // Store evec/eval on disk.
    //   multiFile == false: one file, evecFilename(fileStem, -1, traj)
    //   multiFile == true:  vector k goes to evecFilename(fileStem, k, traj)
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        if (!multiFile)
        {
            basicWrite(evecFilename(fileStem, -1, traj), evec, eval, evec.size());
            return;
        }
        for(int k = 0; k < evec.size(); ++k)
        {
            basicWriteSingle(evecFilename(fileStem, k, traj), evec[k], eval[k], k);
        }
    }
    static void readHeader(PackRecord &record, ScidacReader &binReader)
    {
        std::string recordXml;
@@ -73,75 +130,13 @@ namespace EigenPackIo
        xmlReader.readCurrentSubtree(record.solverXml);
    }
-    template <typename T, typename TIo = T>
+    template <typename T>
-    void readElement(T &evec, RealD &eval, const unsigned int index,
+    static void readElement(T &evec, VecRecord &vecRecord, ScidacReader &binReader)
                     ScidacReader &binReader, TIo *ioBuf = nullptr)
    {
        VecRecord vecRecord;
        LOG(Message) << "Reading eigenvector " << index << std::endl;
        if (ioBuf == nullptr)
    {
        binReader.readScidacFieldRecord(evec, vecRecord);
    }
        else
        {
            binReader.readScidacFieldRecord(*ioBuf, vecRecord);
            precisionChange(evec, *ioBuf);
        }
        if (vecRecord.index != index)
        {
            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
                            + " wrong index (expected " + std::to_string(vecRecord.index) 
                            + ")");
        }
        eval = vecRecord.eval;
    }
-    template <typename T, typename TIo = T>
+    static void writeHeader(ScidacWriter &binWriter, PackRecord &record)
    static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
                         PackRecord &record, const std::string filename, 
                         const unsigned int size, bool multiFile, 
                         GridBase *gridIo = nullptr)
    {
        std::unique_ptr<TIo> ioBuf{nullptr};
        ScidacReader         binReader;
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
        }
        if (multiFile)
        {
            std::string fullFilename;
            for(int k = 0; k < size; ++k) 
            {
                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
                binReader.open(fullFilename);
                readHeader(record, binReader);
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
                binReader.close();
            }
        }
        else
        {
            binReader.open(filename);
            readHeader(record, binReader);
            for(int k = 0; k < size; ++k) 
            {
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
            }
            binReader.close();
        }
    }
    inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
    {
        XmlWriter xmlWriter("", "eigenPackPar");
@@ -150,217 +145,165 @@ namespace EigenPackIo
        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
    }
-    template <typename T, typename TIo = T>
+    template <typename T>
-    void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval, 
+    static void writeElement(ScidacWriter &binWriter, T &evec, VecRecord &vecRecord)
                      const unsigned int index, TIo *ioBuf, 
                      T *testBuf = nullptr)
    {
        VecRecord vecRecord;
        LOG(Message) << "Writing eigenvector " << index << std::endl;
        vecRecord.eval  = eval;
        vecRecord.index = index;
        if ((ioBuf == nullptr) || (testBuf == nullptr))
    {
        binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
    }
 protected:
    std::string evecFilename(const std::string stem, const int vec, const int traj)
    {
        std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
        if (vec == -1)
        {
            return stem + t + ".bin";
        }
        else
        {
-            precisionChange(*ioBuf, evec);
+            return stem + t + "/v" + std::to_string(vec) + ".bin";
            precisionChange(*testBuf, *ioBuf);
            *testBuf -= evec;
            LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
            binWriter.writeScidacFieldRecord(*ioBuf, vecRecord, DEFAULT_ASCII_PREC);
        }
    }
-    template <typename T, typename TIo = T>
+    template <typename T>
-    static void writePack(const std::string filename, std::vector<T> &evec, 
+    void basicRead(std::vector<T> &evec, std::vector<RealD> &eval,
-                          std::vector<RealD> &eval, PackRecord &record, 
+                   const std::string filename, const unsigned int size)
                          const unsigned int size, bool multiFile, 
                          GridBase *gridIo = nullptr)
    {
-        GridBase             *grid = evec[0]._grid;
+        ScidacReader binReader;
        std::unique_ptr<TIo> ioBuf{nullptr}; 
        std::unique_ptr<T>   testBuf{nullptr};
        ScidacWriter         binWriter(grid->IsBoss());
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
            testBuf.reset(new T(grid));
        }
        if (multiFile)
        {
            std::string fullFilename;
        binReader.open(filename);
        readHeader(record, binReader);
        for(int k = 0; k < size; ++k) 
        {
-                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
+            VecRecord vecRecord;
-                makeFileDir(fullFilename, grid);
+            LOG(Message) << "Reading eigenvector " << k << std::endl;
-                binWriter.open(fullFilename);
+            readElement(evec[k], vecRecord, binReader);
-                writeHeader(binWriter, record);
+            if (vecRecord.index != k)
                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
                binWriter.close();
            }
        }
        else
            {
-            makeFileDir(filename, grid);
+                HADRONS_ERROR(Io, "Eigenvector " + std::to_string(k) + " has a"
                              + " wrong index (expected " + std::to_string(vecRecord.index) 
                              + ") in file '" + filename + "'");
            }
            eval[k] = vecRecord.eval;
        }
        binReader.close();
    }
    // Read a single eigenvector and its eigenvalue from 'filename'.
    //   evec, eval  outputs: the field and the eigenvalue stored in its record
    //   index       index the record is expected to carry
    // The file header is read into the 'record' member first. A record whose
    // stored index differs from 'index' is reported through
    // HADRONS_ERROR(Io, ...); the message gives both the index found in the
    // file and the one that was expected.
    template <typename T>
    void basicReadSingle(T &evec, RealD &eval, const std::string filename, 
                         const unsigned int index)
    {
        ScidacReader binReader;
        VecRecord    vecRecord;

        binReader.open(filename);
        readHeader(record, binReader);
        LOG(Message) << "Reading eigenvector " << index << std::endl;
        readElement(evec, vecRecord, binReader);
        if (vecRecord.index != index)
        {
            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
                          + " wrong index (found " + std::to_string(vecRecord.index) 
                          + ", expected " + std::to_string(index)
                          + ") in file '" + filename + "'");
        }
        eval = vecRecord.eval;
        binReader.close();
    }
    template <typename T>
    void basicWrite(const std::string filename, std::vector<T> &evec, 
                    const std::vector<RealD> &eval, const unsigned int size)
    {
        ScidacWriter binWriter(evec[0]._grid->IsBoss());
        makeFileDir(filename, evec[0]._grid);
        binWriter.open(filename);
        writeHeader(binWriter, record);
        for(int k = 0; k < size; ++k) 
        {
-                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
+            VecRecord vecRecord;
            vecRecord.index = k;
            vecRecord.eval  = eval[k];
            LOG(Message) << "Writing eigenvector " << k << std::endl;
            writeElement(binWriter, evec[k], vecRecord);
        }
        binWriter.close();
    }
    }
 }
-template <typename F>
+    template <typename T>
-class BaseEigenPack
+    void basicWriteSingle(const std::string filename, T &evec, 
-{
+                          const RealD eval, const unsigned int index)
 public:
    typedef F Field;
 public:
    std::vector<RealD> eval;
    std::vector<F>     evec;
    PackRecord         record;
 public:
    BaseEigenPack(void)          = default;
    BaseEigenPack(const size_t size, GridBase *grid)
    {
-        resize(size, grid);
+        ScidacWriter binWriter(evec._grid->IsBoss());
-    }
+        VecRecord    vecRecord;
-    virtual ~BaseEigenPack(void) = default;
+
-    void resize(const size_t size, GridBase *grid)
+        makeFileDir(filename, evec._grid);
-    {
+        binWriter.open(filename);
-        eval.resize(size);
+        writeHeader(binWriter, record);
-        evec.resize(size, grid);
+        vecRecord.index = index;
        vecRecord.eval  = eval;
        LOG(Message) << "Writing eigenvector " << index << std::endl;
        writeElement(binWriter, evec, vecRecord);
        binWriter.close();
    }
 };
-template <typename F, typename FIo = F>
+template <typename FineF, typename CoarseF>
-class EigenPack: public BaseEigenPack<F>
+class CoarseEigenPack: public EigenPack<FineF>
 {
 public:
    typedef F   Field;
    typedef FIo FieldIo;
 public:
    EigenPack(void)          = default;
    virtual ~EigenPack(void) = default;
    EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
    : BaseEigenPack<F>(size, grid)
    {
        if (typeHash<F>() != typeHash<FIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
        }
        gridIo_ = gridIo;
    }
    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record, 
                                      evecFilename(fileStem, traj, multiFile), 
                                      this->evec.size(), multiFile, gridIo_);
        HADRONS_DUMP_EP_METADATA(this->record);
    }
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile), 
                                       this->evec, this->eval, this->record, 
                                       this->evec.size(), multiFile, gridIo_);
    }
 protected:
    // Build the on-disk name of an eigenpack:
    //   "<stem>"            followed by ".<traj>" when traj is non-negative,
    //   and a final ".bin"  unless multiFile is set (in which case the result
    //                       is used as a prefix for the per-vector files).
    std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
    {
        std::string name = stem;

        if (traj >= 0)
        {
            name += "." + std::to_string(traj);
        }
        if (!multiFile)
        {
            name += ".bin";
        }

        return name;
    }
 protected:
    GridBase *gridIo_;
 };
 template <typename FineF, typename CoarseF, 
          typename FineFIo = FineF, typename CoarseFIo = CoarseF>
 class CoarseEigenPack: public EigenPack<FineF, FineFIo>
 {
 public:
    typedef CoarseF CoarseField;
-    std::vector<CoarseF> evecCoarse;
+public:
    std::vector<RealD>   evalCoarse;
    std::vector<CoarseF> evecCoarse;
 public:
    CoarseEigenPack(void)          = default;
    virtual ~CoarseEigenPack(void) = default;
    CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse, 
-                    GridBase *gridFine, GridBase *gridCoarse,
+                    GridBase *gridFine, GridBase *gridCoarse)
                    GridBase *gridFineIo = nullptr, 
                    GridBase *gridCoarseIo = nullptr)
    {
        if (typeHash<FineF>() != typeHash<FineFIo>())
        {
            if (gridFineIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "Fine I/O type different from vector type but null fine I/O grid passed");
            }
        }
        if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
        {
            if (gridCoarseIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "Coarse I/O type different from vector type but null coarse I/O grid passed");
            }
        }
        this->gridIo_ = gridFineIo;
        gridCoarseIo_ = gridCoarseIo;
        resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
    }
    void resize(const size_t sizeFine, const size_t sizeCoarse, 
                GridBase *gridFine, GridBase *gridCoarse)
    {
-        EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
+        EigenPack<FineF>::resize(sizeFine, gridFine);
        evalCoarse.resize(sizeCoarse);
        evecCoarse.resize(sizeCoarse, gridCoarse);
    }
    void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
+        if (multiFile)
        {
            for(int k = 0; k < this->evec.size(); ++k)
            {
                this->basicReadSingle(this->evec[k], this->eval[k], this->evecFilename(fileStem + "_fine", k, traj), k);
            }
        }
        else
        {
            this->basicRead(this->evec, this->eval, this->evecFilename(fileStem + "_fine", -1, traj), this->evec.size());
        }
    }
    void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        PackRecord dummy;
+        if (multiFile)
-
+        {
-        EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy, 
+            for(int k = 0; k < evecCoarse.size(); ++k)
-                              this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+            {
-                              evecCoarse.size(), multiFile, gridCoarseIo_);
+                this->basicReadSingle(evecCoarse[k], evalCoarse[k], this->evecFilename(fileStem + "_coarse", k, traj), k);
            }
        }
        else
        {
            this->basicRead(evecCoarse, evalCoarse, this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse.size());
        }
    }
    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -371,14 +314,32 @@ public:
    // Write the fine eigenvectors and eigenvalues under fileStem + "_fine".
    // Diff hunk: the '-' line is the removed delegation to the base-class
    // write; the '+' and context lines spell the write out explicitly:
    //  - multiFile: one basicWriteSingle call per vector k, with the file
    //    name built from index k;
    //  - otherwise: a single basicWrite of all vectors, with index -1 passed
    //    to evecFilename.
    void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
+        if (multiFile)
        {
            for(int k = 0; k < this->evec.size(); ++k)
            {
                this->basicWriteSingle(this->evecFilename(fileStem + "_fine", k, traj), this->evec[k], this->eval[k], k);
            }
        }
        else
        {
            this->basicWrite(this->evecFilename(fileStem + "_fine", -1, traj), this->evec, this->eval, this->evec.size());
        }
    }
    // Write the coarse eigenvectors and eigenvalues under
    // fileStem + "_coarse".
    // Diff hunk with interleaved sides: the '-' lines are the removed
    // EigenPackIo::writePack call; the '+' and context lines are the explicit
    // version:
    //  - multiFile: one basicWriteSingle call per coarse vector k;
    //  - otherwise: a single basicWrite of all coarse vectors, with index -1
    //    passed to evecFilename.
    void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+        if (multiFile)
-                                                   evecCoarse, evalCoarse, this->record, 
+        {
-                                                   evecCoarse.size(), multiFile, gridCoarseIo_);
+            for(int k = 0; k < evecCoarse.size(); ++k)
            {
                this->basicWriteSingle(this->evecFilename(fileStem + "_coarse", k, traj), evecCoarse[k], evalCoarse[k], k);
            }
        }
        else
        {
            this->basicWrite(this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse, evalCoarse, evecCoarse.size());
        }
    }
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -386,25 +347,16 @@ public:
        writeFine(fileStem, multiFile, traj);
        writeCoarse(fileStem, multiFile, traj);
    }
 private:
    GridBase *gridCoarseIo_;
 };
 template <typename FImpl>
-using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>;
+using FermionEigenPack = EigenPack<typename FImpl::FermionField>;
-template <typename FImpl, typename FImplIo = FImpl>
+template <typename FImpl, int nBasis>
 using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
 template <typename FImpl, int nBasis, typename FImplIo = FImpl>
 using CoarseFermionEigenPack = CoarseEigenPack<
    typename FImpl::FermionField,
    typename LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                   typename FImpl::SiteComplex, 
                                   nBasis>::CoarseField,
    typename FImplIo::FermionField,
    typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor, 
                                   typename FImplIo::SiteComplex, 
                                   nBasis>::CoarseField>;
 #undef HADRONS_DUMP_EP_METADATA
--- a/Hadrons/Global.cc
+++ b/Hadrons/Global.cc
@@ -166,13 +166,7 @@ std::string Hadrons::dirname(const std::string &s)
 void Hadrons::makeFileDir(const std::string filename, GridBase *g)
 {
-    bool doIt = true;
+    if (g->IsBoss())
    if (g)
    {
        doIt = g->IsBoss();
    }
    if (doIt)
    {
        std::string dir    = dirname(filename);
        int         status = mkdir(dir);
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -32,7 +32,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <set>
 #include <stack>
 #include <regex>
 #include <Grid/Grid.h>
 #include <cxxabi.h>
@@ -218,15 +217,15 @@ typedef XmlReader ResultReader;
 typedef XmlWriter ResultWriter;
 #endif
-#define RESULT_FILE_NAME(name, traj) \
+#define RESULT_FILE_NAME(name) \
-name + "." + std::to_string(traj) + "." + resultFileExt
+name + "." + std::to_string(vm().getTrajectory()) + "." + resultFileExt
 // recursive mkdir
 #define MAX_PATH_LENGTH 512u
 int         mkdir(const std::string dirName);
 std::string basename(const std::string &s);
 std::string dirname(const std::string &s);
-void        makeFileDir(const std::string filename, GridBase *g = nullptr);
+void        makeFileDir(const std::string filename, GridBase *g);
 // default Schur convention
 #ifndef HADRONS_DEFAULT_SCHUR 
@@ -249,47 +248,6 @@ void        makeFileDir(const std::string filename, GridBase *g = nullptr);
 // pretty print time profile
 void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
 // token replacement utility
 // Substitute the first occurrence of <mark><token><mark> found in `str` with
 // std::to_string(x). `str` is left untouched when the marked token is absent;
 // later occurrences are not replaced.
 template <typename T>
 void tokenReplace(std::string &str, const std::string token,
                  const T &x, const std::string mark = "@")
 {
    const std::string            needle = mark + token + mark;
    const std::string::size_type where  = str.find(needle);
    if (where == std::string::npos)
    {
        return;
    }
    str.replace(where, needle.size(), std::to_string(x));
 }
 // trajectory range
 // Serialisable description of a set of trajectories: the arithmetic sequence
 // start, start + step, ... strictly below `end`, minus the values listed in
 // the `exclude` string.
 class TrajRange: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
                                    unsigned int, start,
                                    unsigned int, end,
                                    unsigned int, step,
                                    std::string,  exclude);
    // Expand the range into an explicit list of trajectory numbers.
    // Returns the values t = start + k*step with t < end that do not appear
    // in `exclude` (parsed with strToVec<unsigned int>).
    // A zero step yields at most the single value `start` instead of looping
    // forever.
    inline std::vector<unsigned int> getTrajectoryList(void)
    {
        std::vector<unsigned int> excVec = strToVec<unsigned int>(exclude);
        std::vector<unsigned int> list;
        for (unsigned int t = start; t < end; t += step)
        {
            if (std::find(excVec.begin(), excVec.end(), t) == excVec.end())
            {
                list.push_back(t);
            }
            // Stop explicitly when the next value cannot be below `end`:
            // a zero step never advances, and a step at least as large as the
            // remaining distance would either end the loop anyway or wrap the
            // unsigned counter back below `end`.
            if ((step == 0) || (end - t <= step))
            {
                break;
            }
        }
        return list;
    }
 };
 END_HADRONS_NAMESPACE
 #include <Hadrons/Exceptions.hpp>
--- a/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -5,17 +5,16 @@ lib_LIBRARIES = libHadrons.a
 include modules.inc
 libHadrons_a_SOURCES = \
    $(modules_cc)      \
    Application.cc     \
    Environment.cc     \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
-	TimerArray.cc      \
+	VirtualMachine.cc
 	VirtualMachine.cc  \
 	$(modules_cc)
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
 	$(modules_hpp)            \
 	A2AVectors.hpp            \
 	A2AMatrix.hpp             \
 	Application.hpp           \
@@ -32,7 +31,4 @@ nobase_libHadrons_a_HEADERS = \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
 	Solver.hpp                \
-	TimerArray.hpp            \
+	VirtualMachine.hpp
 	VirtualMachine.hpp        \
 	Utilities/Contractor.hpp  \
 	$(modules_hpp)
--- a/Hadrons/Module.cc
+++ b/Hadrons/Module.cc
@@ -66,6 +66,101 @@ void ModuleBase::operator()(void)
    stopAllTimers();
 }
 // timers //////////////////////////////////////////////////////////////////////
 // Start the stopwatch stored under `name`; the map entry is created on first
 // use. An empty name is ignored.
 void ModuleBase::startTimer(const std::string &name)
 {
    if (name.empty())
    {
        return;
    }
    timer_[name].Start();
 }
 // Return the elapsed time of the timer `name`.
 // A running timer is stopped for the reading and restarted afterwards.
 // An empty or unknown name gives GridTime::zero().
 GridTime ModuleBase::getTimer(const std::string &name)
 {
    if (name.empty())
    {
        return GridTime::zero();
    }

    GridTime elapsed;
    try
    {
        // at() throws std::out_of_range for a name that was never started
        const bool wasRunning = timer_.at(name).isRunning();
        if (wasRunning)
        {
            stopTimer(name);
        }
        elapsed = timer_.at(name).Elapsed();
        if (wasRunning)
        {
            startTimer(name);
        }
    }
    catch (std::out_of_range &)
    {
        elapsed = GridTime::zero();
    }
    return elapsed;
 }
 double ModuleBase::getDTimer(const std::string &name)
 {
    return static_cast<double>(getTimer(name).count());
 }
 // Make `name` the current timer: stop the previous current timer (if any),
 // start `name` and remember it. An empty name is ignored.
 void ModuleBase::startCurrentTimer(const std::string &name)
 {
    if (name.empty())
    {
        return;
    }
    stopCurrentTimer();
    startTimer(name);
    currentTimer_ = name;
 }
 // Stop the timer `name` if it is running.
 // The lookup uses at(), so an unknown name throws std::out_of_range.
 void ModuleBase::stopTimer(const std::string &name)
 {
    auto &watch = timer_.at(name);
    if (watch.isRunning())
    {
        watch.Stop();
    }
 }
 // Stop the timer recorded as current and clear that record; does nothing
 // when no current timer is set.
 void ModuleBase::stopCurrentTimer(void)
 {
    if (currentTimer_.empty())
    {
        return;
    }
    stopTimer(currentTimer_);
    currentTimer_ = "";
 }
 // Stop every registered timer and forget which one was current.
 void ModuleBase::stopAllTimers(void)
 {
    for (auto &entry: timer_)
    {
        const auto &timerName = entry.first;
        stopTimer(timerName);
    }
    currentTimer_ = "";
 }
 // Drop all timers and clear the current-timer name.
 void ModuleBase::resetTimers(void)
 {
    currentTimer_ = "";
    timer_.clear();
 }
 std::map<std::string, GridTime> ModuleBase::getTimings(void)
 {
    std::map<std::string, GridTime> timing;
    for (auto &t: timer_)
    {
        timing[t.first] = t.second.Elapsed();
    }
    return timing;
 }
 std::string ModuleBase::makeSeedString(void)
 {
    std::string seed;
--- a/Hadrons/Module.hpp
+++ b/Hadrons/Module.hpp
@@ -30,7 +30,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_Module_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Hadrons/VirtualMachine.hpp>
 BEGIN_HADRONS_NAMESPACE
@@ -144,7 +143,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 {\
    makeFileDir(ioStem, env().getGrid());\
    {\
-        ResultWriter _writer(RESULT_FILE_NAME(ioStem, vm().getTrajectory()));\
+        ResultWriter _writer(RESULT_FILE_NAME(ioStem));\
        write(_writer, name, result);\
    }\
 }
@@ -153,7 +152,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 *                            Module class                                    *
 ******************************************************************************/
 // base class
-class ModuleBase: public TimerArray
+class ModuleBase
 {
 public:
    // constructor
@@ -181,6 +180,16 @@ public:
    virtual void execute(void) = 0;
    // execution
    void operator()(void);
    // timers
    void                            startTimer(const std::string &name);
    GridTime                        getTimer(const std::string &name);
    double                          getDTimer(const std::string &name);
    void                            startCurrentTimer(const std::string &name);
    void                            stopTimer(const std::string &name);
    void                            stopCurrentTimer(void);
    void                            stopAllTimers(void);
    void                            resetTimers(void);
    std::map<std::string, GridTime> getTimings(void);
 protected:
    // environment shortcut
    DEFINE_ENV_ALIAS;
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,6 +1,6 @@
 #include <Hadrons/Modules/MContraction/Baryon.hpp>
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>
 #include <Hadrons/Modules/MContraction/Meson.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
@@ -16,7 +16,6 @@
 #include <Hadrons/Modules/MSource/Wall.hpp>
 #include <Hadrons/Modules/MSource/Z2.hpp>
 #include <Hadrons/Modules/MSource/SeqConserved.hpp>
 #include <Hadrons/Modules/MSource/Momentum.hpp>
 #include <Hadrons/Modules/MSink/Smear.hpp>
 #include <Hadrons/Modules/MSink/Point.hpp>
 #include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
@@ -24,17 +23,13 @@
 #include <Hadrons/Modules/MSolver/Guesser.hpp>
 #include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
 #include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 #include <Hadrons/Modules/MGauge/UnitEm.hpp>
 #include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
 #include <Hadrons/Modules/MGauge/Unit.hpp>
 #include <Hadrons/Modules/MGauge/Electrify.hpp>
 #include <Hadrons/Modules/MGauge/Random.hpp>
 #include <Hadrons/Modules/MGauge/GaugeFix.hpp>
 #include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
 #include <Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
 #include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
 #include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
@@ -45,9 +40,6 @@
 #include <Hadrons/Modules/MScalar/ScalarVP.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 #include <Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Hadrons/Modules/MNPR/Bilinear.hpp>
 #include <Hadrons/Modules/MNPR/Amputate.hpp>
 #include <Hadrons/Modules/MNPR/FourQuark.hpp>
 #include <Hadrons/Modules/MAction/DWF.hpp>
 #include <Hadrons/Modules/MAction/MobiusDWF.hpp>
 #include <Hadrons/Modules/MAction/Wilson.hpp>
@@ -58,6 +50,7 @@
 #include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
 #include <Hadrons/Modules/MScalarSUN/ShiftProbe.hpp>
 #include <Hadrons/Modules/MScalarSUN/Div.hpp>
 #include <Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
 #include <Hadrons/Modules/MScalarSUN/EMT.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
@@ -68,7 +61,6 @@
 #include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
 #include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadNersc.hpp>
 #include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
 #include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
 #include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadBinary.hpp>
--- a/Hadrons/Modules/MAction/DWF.cc
+++ b/Hadrons/Modules/MAction/DWF.cc
@@ -32,6 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/DWF.hpp
+++ b/Hadrons/Modules/MAction/DWF.hpp
@@ -49,8 +49,7 @@ public:
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
-                                    std::string , boundary,
+                                    std::string , boundary);
                                    std::string , twist);
 };
 template <typename FImpl>
@@ -74,9 +73,7 @@ protected:
 };
 MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                        DWF template implementation                         *
@@ -120,9 +117,8 @@ void TDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    typename DomainWallFermion<FImpl>::ImplParams implParams;
+    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, implParams);
 }
--- a/Hadrons/Modules/MAction/MobiusDWF.cc
+++ b/Hadrons/Modules/MAction/MobiusDWF.cc
@@ -32,6 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/MobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/MobiusDWF.hpp
@@ -49,8 +49,7 @@ public:
                                    double      , M5,
                                    double      , b,
                                    double      , c,
-                                    std::string , boundary,
+                                    std::string , boundary);
                                    std::string , twist);
 };
 template <typename FImpl>
@@ -73,9 +72,7 @@ public:
 };
 MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                      TMobiusDWF implementation                             *
@@ -120,9 +117,8 @@ void TMobiusDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    typename MobiusFermion<FImpl>::ImplParams implParams;
+    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, par().b, par().c,
                     implParams);
--- a/Hadrons/Modules/MAction/ScaledDWF.cc
+++ b/Hadrons/Modules/MAction/ScaledDWF.cc
@@ -32,6 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/ScaledDWF.hpp
+++ b/Hadrons/Modules/MAction/ScaledDWF.hpp
@@ -48,8 +48,7 @@ public:
                                    double      , mass,
                                    double      , M5,
                                    double      , scale,
-                                    std::string , boundary,
+                                    std::string , boundary);
                                    std::string , twist);
 };
 template <typename FImpl>
@@ -72,9 +71,7 @@ public:
 };
 MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                      TScaledDWF implementation                             *
@@ -119,9 +116,8 @@ void TScaledDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    typename ScaledShamirFermion<FImpl>::ImplParams implParams;
+    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, par().scale,
                     implParams);
--- a/Hadrons/Modules/MAction/Wilson.cc
+++ b/Hadrons/Modules/MAction/Wilson.cc
@@ -32,6 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TWilson<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/Wilson.hpp
+++ b/Hadrons/Modules/MAction/Wilson.hpp
@@ -47,9 +47,7 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
                                    std::string, gauge,
                                    double     , mass,
-                                    std::string, boundary,
+                                    std::string, boundary);
                                    std::string, string,
                                    std::string, twist);
 };
 template <typename FImpl>
@@ -73,9 +71,7 @@ protected:
 };
 MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                     TWilson template implementation                        *
@@ -115,9 +111,8 @@ void TWilson<FImpl>::setup(void)
    auto &U      = envGet(GaugeField, par().gauge);
    auto &grid   = *envGetGrid(FermionField);
    auto &gridRb = *envGetRbGrid(FermionField);
-    typename WilsonFermion<FImpl>::ImplParams implParams;
+    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
                     par().mass, implParams);
 }
--- a/Hadrons/Modules/MAction/WilsonClover.cc
+++ b/Hadrons/Modules/MAction/WilsonClover.cc
@@ -32,6 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/WilsonClover.hpp
+++ b/Hadrons/Modules/MAction/WilsonClover.hpp
@@ -51,8 +51,7 @@ public:
 				                    double     , csw_r,
 				                    double     , csw_t,
 				                    WilsonAnisotropyCoefficients ,clover_anisotropy,
-                                    std::string, boundary,
+                                    std::string, boundary
                                    std::string, twist
 				    );
 };
@@ -76,9 +75,7 @@ public:
 };
 MODULE_REGISTER_TMP(WilsonClover, TWilsonClover<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(WilsonCloverF, TWilsonClover<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                    TWilsonClover template implementation                   *
@@ -120,9 +117,8 @@ void TWilsonClover<FImpl>::setup(void)
    auto &U      = envGet(GaugeField, par().gauge);
    auto &grid   = *envGetGrid(FermionField);
    auto &gridRb = *envGetRbGrid(FermionField);
-    typename WilsonCloverFermion<FImpl>::ImplParams implParams;
+    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    typename WilsonCloverFermion<FImpl>::ImplParams implParams(boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid,
                     gridRb, par().mass, par().csw_r, par().csw_t, 
                     par().clover_anisotropy, implParams); 
--- a/Hadrons/Modules/MAction/ZMobiusDWF.cc
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.cc
@@ -32,6 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/ZMobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.hpp
@@ -50,8 +50,7 @@ public:
                                    double                           , b,
                                    double                           , c,
                                    std::vector<std::complex<double>>, omega,
-                                    std::string                      , boundary,
+                                    std::string                      , boundary);
                                    std::string                      , twist);
 };
 template <typename FImpl>
@@ -74,9 +73,7 @@ public:
 };
 MODULE_REGISTER_TMP(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(ZMobiusDWFF, TZMobiusDWF<ZFIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                     TZMobiusDWF implementation                             *
@@ -128,9 +125,8 @@ void TZMobiusDWF<FImpl>::setup(void)
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto omega = par().omega;
-    typename ZMobiusFermion<FImpl>::ImplParams implParams;
+    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    typename ZMobiusFermion<FImpl>::ImplParams implParams(boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, omega,
                     par().b, par().c, implParams);
--- a/Hadrons/Modules/MContraction/A2AAslashField.cc
+++ b/Hadrons/Modules/MContraction/A2AAslashField.cc
@@ -1,34 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AAslashField.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 template class Grid::Hadrons::MContraction::TA2AAslashField<FIMPL, PhotonR>;
--- a/Hadrons/Modules/MContraction/A2AAslashField.hpp
+++ b/Hadrons/Modules/MContraction/A2AAslashField.hpp
@@ -1,246 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AAslashField.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_A2AAslashField_hpp_
 #define Hadrons_MContraction_A2AAslashField_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         A2AAslashField                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Serialisable parameters of the A2AAslashField module.
 //  cacheBlock, block: forwarded to the Computation object built in setup()
 //  left, right:       names of the std::vector<FermionField> objects fetched
 //                     from the environment in execute()
 //  output:            stem of the output path (trajectory number and field
 //                     name are appended in execute())
 //  emField:           names of the EM fields to process
 class A2AAslashFieldPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldPar,
                                    int, cacheBlock,
                                    int, block,
                                    std::string, left,
                                    std::string, right,
                                    std::string, output,
                                    std::vector<std::string>, emField);
 };
 // Serialisable metadata attached to each output: the name of the EM field
 // the result was computed with.
 class A2AAslashFieldMetadata: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldMetadata,
                                    std::string, emFieldName);
 };
 // A2A kernel that forwards each block to A2Autils<FImpl>::AslashField using
 // the two sets of complex EM fields supplied at construction.
 // The field vectors are held by reference, not copied.
 template <typename T, typename FImpl>
 class AslashFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // Stores references to emB0/emB1 and the grid pointer, and accumulates
    // the product of the grid's global dimensions into vol_.
    AslashFieldKernel(const std::vector<LatticeComplex> &emB0,
                      const std::vector<LatticeComplex> &emB1,
                      GridBase *grid)
    : emB0_(emB0), emB1_(emB1), grid_(grid), vol_(1.)
    {
        for (auto &extent: grid_->GlobalDimensions())
        {
            vol_ *= extent;
        }
    }
    virtual ~AslashFieldKernel(void) = default;
    // Kernel evaluation: delegates to A2Autils<FImpl>::AslashField, passing
    // the address of `t` as its last argument.
    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left,
                            const FermionField *right,
                            const unsigned int orthogDim, double &t)
    {
        A2Autils<FImpl>::AslashField(m, left, right, emB0_, emB1_, orthogDim, &t);
    }
    // No flop estimate is provided: always 0.
    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return 0.;
    }
    // No byte estimate is provided: always 0.
    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return 0.;
    }
 private:
    const std::vector<LatticeComplex> &emB0_, &emB1_;
    GridBase                          *grid_;
    double                            vol_;
 };
 // Module computing all-to-all A-slash fields from two sets of fermion
 // vectors and a list of EM fields (see execute()).
 //  FImpl:      fermion implementation providing the field types
 //  PhotonImpl: implementation whose GaugeField type is used for EM fields
 template <typename FImpl, typename PhotonImpl>
 class TA2AAslashField: public Module<A2AAslashFieldPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    typedef typename PhotonImpl::GaugeField EmField;
    // block computation driver, parametrised on the metadata and I/O types
    typedef A2AMatrixBlockComputation<Complex, 
                                      FermionField, 
                                      A2AAslashFieldMetadata, 
                                      HADRONS_A2AM_IO_TYPE> Computation;
    typedef AslashFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
    TA2AAslashField(const std::string name);
    // destructor
    virtual ~TA2AAslashField(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(A2AAslashField, ARG(TA2AAslashField<FIMPL, PhotonR>), MContraction);
 /******************************************************************************
 *                 TA2AAslashField implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<A2AAslashFieldPar> base; no other
 // initialisation is done here.
 template <typename FImpl, typename PhotonImpl>
 TA2AAslashField<FImpl, PhotonImpl>::TA2AAslashField(const std::string name)
 : Module<A2AAslashFieldPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // Input dependencies: every EM field name, followed by the left and right
 // vector-set names, in that order.
 template <typename FImpl, typename PhotonImpl>
 std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getInput(void)
 {
    std::vector<std::string> deps(par().emField);
    deps.push_back(par().left);
    deps.push_back(par().right);
    return deps;
 }
 // This module registers no output object: the returned list is empty.
 template <typename FImpl, typename PhotonImpl>
 std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getOutput(void)
 {
    return std::vector<std::string>();
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Register the temporaries used by execute():
 //  - "computation": the block computation object, built from the fermion
 //    grid, env().getNd() - 1, the number of EM fields and the block sizes
 //    from the parameters;
 //  - "B0", "B1": vectors of complex fields with one entry per EM field;
 //  - "Amu": a single complex lattice field.
 template <typename FImpl, typename PhotonImpl>
 void TA2AAslashField<FImpl, PhotonImpl>::setup(void)
 {
    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
           env().getNd() - 1, par().emField.size(), 1, par().block, 
           par().cacheBlock, this);
    envTmp(std::vector<ComplexField>, "B0", 1, 
           par().emField.size(), envGetGrid(ComplexField));
    envTmp(std::vector<ComplexField>, "B1", 1, 
           par().emField.size(), envGetGrid(ComplexField));
    envTmpLat(ComplexField, "Amu");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Compute the all-to-all A-slash fields for every EM field listed in the
 // parameters.
 // Steps:
 //  1. fetch the left/right vector sets and log the problem size;
 //  2. build, for each EM field A, the complex combinations
 //     B0 = A_0 + i A_1 and B1 = A_2 + i A_3;
 //  3. run the block computation with a kernel built on B0/B1, giving it
 //     lambdas for the dataset name, the output file name and the metadata.
 template <typename FImpl, typename PhotonImpl>
 void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
 {
    auto &left  = envGet(std::vector<FermionField>, par().left);
    auto &right = envGet(std::vector<FermionField>, par().right);
    int nt  = env().getDim().back();
    int N_i = left.size();
    int N_j = right.size();
    LOG(Message) << "Computing all-to-all A-slash fields" << std::endl;
    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
    LOG(Message) << "EM fields:" << std::endl;
    for (auto &name: par().emField)
    {
        LOG(Message) << "  " << name << std::endl;
    }
    // The byte count is computed in size_t: multiplying the three ints first
    // overflows signed int for large vector sets.
    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
                 << " (filesize " << sizeString(static_cast<size_t>(nt)*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
                 << "/EM field)" << std::endl;
    // preparing "B" complexified fields
    startTimer("Complexify EM fields");
    envGetTmp(std::vector<ComplexField>, B0);
    envGetTmp(std::vector<ComplexField>, B1);
    for (unsigned int i = 0; i < par().emField.size(); ++i)
    {
        auto &A = envGet(EmField, par().emField[i]);
        // NOTE(review): Amu is fetched but not used below; kept because
        // setup() registers it - confirm whether both can be dropped.
        envGetTmp(ComplexField, Amu);
        B0[i]  = peekLorentz(A, 0);
        B0[i] += timesI(peekLorentz(A, 1));
        B1[i]  = peekLorentz(A, 2);
        B1[i] += timesI(peekLorentz(A, 3));
    }
    stopTimer("Complexify EM fields");
    // I/O name & metadata lambdas
    // dataset name: the EM field name itself
    auto ionameFn = [this](const unsigned int em, const unsigned int dummy)
    {
        return par().emField[em];
    };
    // file name: <output>.<trajectory>/<EM field name>.h5
    auto filenameFn = [this, &ionameFn](const unsigned int em, const unsigned int dummy)
    {
        return par().output + "." + std::to_string(vm().getTrajectory()) 
               + "/" + ionameFn(em, dummy) + ".h5";
    };
    // metadata: records which EM field the output belongs to
    auto metadataFn = [this](const unsigned int em, const unsigned int dummy)
    {
        A2AAslashFieldMetadata md;
        md.emFieldName = par().emField[em];
        return md;
    };
    // executing computation
    Kernel kernel(B0, B1, envGetGrid(FermionField));
    envGetTmp(Computation, computation);
    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_A2AAslashField_hpp_
--- a/Hadrons/Modules/MContraction/A2AMesonField.cc
+++ b/Hadrons/Modules/MContraction/A2AMesonField.cc
@@ -33,3 +33,4 @@ using namespace Hadrons;
 using namespace MContraction;
 template class Grid::Hadrons::MContraction::TA2AMesonField<FIMPL>;
 template class Grid::Hadrons::MContraction::TA2AMesonField<ZFIMPL>;
--- a/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -33,7 +33,15 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AVectors.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>
 #define MF_PARALLEL_IO
 #ifndef MF_IO_TYPE
 #define MF_IO_TYPE ComplexF
 #endif
 BEGIN_HADRONS_NAMESPACE
@@ -48,8 +56,8 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                    int, cacheBlock,
                                    int, block,
-                                    std::string, left,
+                                    std::string, v,
-                                    std::string, right,
+                                    std::string, w,
                                    std::string, output,
                                    std::string, gammas,
                                    std::vector<std::string>, mom);
@@ -63,59 +71,22 @@ public:
                                    Gamma::Algebra, gamma);
 };
 // A2AKernel specialisation for meson fields: the call operator forwards to
 // A2Autils<FImpl>::MesonField, while flops()/bytes() return cost estimates
 // for a block of blockSizei x blockSizej vector pairs.
 // The gamma and momentum vectors are held by reference, so the objects passed
 // to the constructor must outlive the kernel.
 template <typename T, typename FImpl>
 class MesonFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // keep references to the gamma/momentum lists and cache the global volume
    MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma,
                     const std::vector<LatticeComplex> &mom,
                     GridBase *grid)
    : gamma_(gamma), mom_(mom), grid_(grid), vol_(1.)
    {
        // global volume = product of all global lattice extents
        for (const auto &extent: grid_->GlobalDimensions())
        {
            vol_ *= extent;
        }
    }
    virtual ~MesonFieldKernel(void) = default;
    // compute the meson field set m from the left/right vector blocks
    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
                            const FermionField *right,
                            const unsigned int orthogDim, double &t)
    {
        A2Autils<FImpl>::MesonField(m, left, right, gamma_, mom_, orthogDim, &t);
    }
    // floating point operation estimate for one block
    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        const double perSite = 2*8.0+6.0+8.0*mom_.size();

        return vol_*perSite*blockSizei*blockSizej*gamma_.size();
    }
    // memory traffic estimate for one block
    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        const double fieldTerm = vol_*(12.0*sizeof(T))*blockSizei*blockSizej;
        const double momTerm   = vol_*(2.0*sizeof(T)*mom_.size())*blockSizei*blockSizej*gamma_.size();

        return fieldTerm +  momTerm;
    }
 private:
    const std::vector<Gamma::Algebra> &gamma_;
    const std::vector<LatticeComplex> &mom_;
    GridBase                          *grid_;
    double                            vol_;
 };
 template <typename FImpl>
 class TA2AMesonField : public Module<A2AMesonFieldPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
-    typedef A2AMatrixBlockComputation<Complex, 
+    SOLVER_TYPE_ALIASES(FImpl,);
-                                      FermionField, 
+    typedef Eigen::TensorMap<Eigen::Tensor<Complex, 5, Eigen::RowMajor>>    MesonField;
-                                      A2AMesonFieldMetadata, 
+    typedef Eigen::TensorMap<Eigen::Tensor<MF_IO_TYPE, 5, Eigen::RowMajor>> MesonFieldIo;
-                                      HADRONS_A2AM_IO_TYPE> Computation;
+    typedef A2AMatrixIo<MF_IO_TYPE, A2AMesonFieldMetadata>                  MatrixIo;
-    typedef MesonFieldKernel<Complex, FImpl> Kernel;
+    struct IoHelper
    {
        MatrixIo              io;
        A2AMesonFieldMetadata metadata;
        size_t                offset;
        unsigned int          i, j, blockSizei, blockSizej;
    };
 public:
    // constructor
    TA2AMesonField(const std::string name);
@@ -128,14 +99,21 @@ public:
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // IO
    std::string ioname(const unsigned int m, const unsigned int g) const;
    std::string filename(const unsigned int m, const unsigned int g) const;
    void saveBlock(const MF_IO_TYPE *data, IoHelper &h);
 private:
    bool                                               hasPhase_{false};
    std::string                                        momphName_;
    std::vector<Gamma::Algebra>                        gamma_;
    std::vector<std::vector<Real>>                     mom_;
    std::vector<IoHelper>                              nodeIo_;
 };
 MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
 MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField<ZFIMPL>), MContraction);
 /******************************************************************************
 *                  TA2AMesonField implementation                             *
@@ -152,7 +130,7 @@ TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
 template <typename FImpl>
 std::vector<std::string> TA2AMesonField<FImpl>::getInput(void)
 {
-    std::vector<std::string> in = {par().left, par().right};
+    std::vector<std::string> in = {par().v, par().w};
    return in;
 }
@@ -208,31 +186,34 @@ void TA2AMesonField<FImpl>::setup(void)
        }
        mom_.push_back(p);
    }
    envCache(std::vector<ComplexField>, momphName_, 1, 
             par().mom.size(), envGetGrid(ComplexField));
    envTmpLat(ComplexField, "coor");
-    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
+    // preallocate memory for meson field block
-           env().getNd() - 1, mom_.size(), gamma_.size(), par().block, 
+    auto tgp = env().getDim().back()*gamma_.size()*mom_.size();
-           par().cacheBlock, this);
+
    envTmp(Vector<MF_IO_TYPE>, "mfBuf", 1, tgp*par().block*par().block);
    envTmp(Vector<Complex>, "mfCache", 1, tgp*par().cacheBlock*par().cacheBlock);
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TA2AMesonField<FImpl>::execute(void)
 {
-    auto &left  = envGet(std::vector<FermionField>, par().left);
+    auto &v = envGet(std::vector<FermionField>, par().v);
-    auto &right = envGet(std::vector<FermionField>, par().right);
+    auto &w = envGet(std::vector<FermionField>, par().w);
    int nt         = env().getDim().back();
-    int N_i        = left.size();
+    int N_i        = w.size();
-    int N_j        = right.size();
+    int N_j        = v.size();
    int ngamma     = gamma_.size();
    int nmom       = mom_.size();
    int block      = par().block;
    int cacheBlock = par().cacheBlock;
    LOG(Message) << "Computing all-to-all meson fields" << std::endl;
-    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
+    LOG(Message) << "W: '" << par().w << "' V: '" << par().v << "'" << std::endl;
    LOG(Message) << "Momenta:" << std::endl;
    for (auto &p: mom_)
    {
@@ -244,9 +225,12 @@ void TA2AMesonField<FImpl>::execute(void)
        LOG(Message) << "  " << g << std::endl;
    }
    LOG(Message) << "Meson field size: " << nt << "*" << N_i << "*" << N_j 
-                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
+                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(MF_IO_TYPE)) 
                 << "/momentum/bilinear)" << std::endl;
    ///////////////////////////////////////////////
    // Momentum setup
    ///////////////////////////////////////////////
    auto &ph = envGet(std::vector<ComplexField>, momphName_);
    if (!hasPhase_)
@@ -270,8 +254,158 @@ void TA2AMesonField<FImpl>::execute(void)
        stopTimer("Momentum phases");
    }
-    auto ionameFn = [this](const unsigned int m, const unsigned int g)
+    //////////////////////////////////////////////////////////////////////////
    // i,j   is first  loop over SchurBlock factors reusing 5D matrices
    // ii,jj is second loop over cacheBlock factors for high perf contraction
    // iii,jjj are loops within cacheBlock
    // Total index is sum of these  i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    double flops;
    double bytes;
    double vol      = env().getVolume();
    double t_kernel = 0.0;
    double nodes    = env().getGrid()->NodeCount();
    double tot_kernel;
    envGetTmp(Vector<MF_IO_TYPE>, mfBuf);
    envGetTmp(Vector<Complex>, mfCache);
    double t0    = usecond();
    int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0);
    int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0);
    for(int i=0;i<N_i;i+=block)
    for(int j=0;j<N_j;j+=block)
    {
        // Get the W and V vectors for this block^2 set of terms
        int N_ii = MIN(N_i-i,block);
        int N_jj = MIN(N_j-j,block);
        LOG(Message) << "Meson field block " 
                    << j/block + NBlock_j*i/block + 1 
                    << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                    << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                    << std::endl;
        MesonFieldIo mfBlock(mfBuf.data(),nmom,ngamma,nt,N_ii,N_jj);
        // Series of cache blocked chunks of the contractions within this block
        flops = 0.0;
        bytes = 0.0;
        for(int ii=0;ii<N_ii;ii+=cacheBlock)
        for(int jj=0;jj<N_jj;jj+=cacheBlock)
        {
            int N_iii = MIN(N_ii-ii,cacheBlock);
            int N_jjj = MIN(N_jj-jj,cacheBlock);
            MesonField mfCacheBlock(mfCache.data(),nmom,ngamma,nt,N_iii,N_jjj);    
            startTimer("contraction: total");
            makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, 
                                env().getNd() - 1, this);
            stopTimer("contraction: total");
            // flops for general N_c & N_s
            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
                     +  vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
            startTimer("cache copy");
            parallel_for_nest5(int m =0;m< nmom;m++)
            for(int g =0;g< ngamma;g++)
            for(int t =0;t< nt;t++)
            for(int iii=0;iii< N_iii;iii++)
            for(int jjj=0;jjj< N_jjj;jjj++)
            {
                mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj);
            }
            stopTimer("cache copy");
        }
        // perf
        tot_kernel = getDTimer("contraction: colour trace & mom.")
                     + getDTimer("contraction: local space sum");
        t_kernel   = tot_kernel - t_kernel;
        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
                     << " Gflop/s/node " << std::endl;
        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
                     << " GB/s/node "  << std::endl;
        t_kernel = tot_kernel;
        // IO
        if (!par().output.empty())
        {
            double       blockSize, ioTime;
            unsigned int myRank = env().getGrid()->ThisRank(),
                         nRank  = env().getGrid()->RankCount();
            LOG(Message) << "Writing block to disk" << std::endl;
            ioTime = -getDTimer("IO: write block");
            startTimer("IO: total");
            makeFileDir(filename(0, 0), env().getGrid());
 #ifdef MF_PARALLEL_IO
            env().getGrid()->Barrier();
            nodeIo_.clear();
            for(int f = myRank; f < nmom*ngamma; f += nRank)
            {
                const unsigned int    m = f/ngamma, g = f % ngamma;
                IoHelper              h;
                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j);
                for (auto pmu: mom_[m])
                {
                    h.metadata.momentum.push_back(pmu);
                }
                h.metadata.gamma = gamma_[g];
                h.i              = i;
                h.j              = j;
                h.blockSizei     = mfBlock.dimension(3);
                h.blockSizej     = mfBlock.dimension(4);
                h.offset         = (m*ngamma + g)*nt*h.blockSizei*h.blockSizej;
                nodeIo_.push_back(h);
            }
            // parallel IO
            for (auto &h: nodeIo_)
            {
                saveBlock(mfBlock.data(), h);
            }
            env().getGrid()->Barrier();
 #else
            // serial IO
            for(int m = 0; m < nmom; m++)
            for(int g = 0; g < ngamma; g++)
            {
                IoHelper h;
                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j);
                for (auto pmu: mom_[m])
                {
                    h.metadata.momentum.push_back(pmu);
                }
                h.metadata.gamma = gamma_[g];
                h.i              = i;
                h.j              = j;
                h.blockSizei     = mfBlock.dimension(3);
                h.blockSizej     = mfBlock.dimension(4);
                h.offset         = (m*ngamma + g)*nt*h.blockSizei*h.blockSizej;
                saveBlock(mfBlock.data(), h);
            }
 #endif
            stopTimer("IO: total");
            blockSize  = static_cast<double>(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE));
            ioTime    += getDTimer("IO: write block");
            LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
                         << ioTime  << " us (" 
                         << blockSize/ioTime*1.0e6/1024/1024
                         << " MB/s)" << std::endl;
        }
    }
 }
 // IO
 template <typename FImpl>
 std::string TA2AMesonField<FImpl>::ioname(unsigned int m, unsigned int g) const
 {
    std::stringstream ss;
    ss << gamma_[g] << "_";
@@ -281,31 +415,27 @@ void TA2AMesonField<FImpl>::execute(void)
    }
    return ss.str();
-    };
+}
-    auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g)
+template <typename FImpl>
-    {
+std::string TA2AMesonField<FImpl>::filename(unsigned int m, unsigned int g) const
 {
    return par().output + "." + std::to_string(vm().getTrajectory()) 
-               + "/" + ionameFn(m, g) + ".h5";
+           + "/" + ioname(m, g) + ".h5";
-    };
+}
-    auto metadataFn = [this](const unsigned int m, const unsigned int g)
+template <typename FImpl>
 void TA2AMesonField<FImpl>::saveBlock(const MF_IO_TYPE *data, IoHelper &h)
 {
    if ((h.i == 0) and (h.j == 0))
    {
-        A2AMesonFieldMetadata md;
+        startTimer("IO: file creation");
-
+        h.io.initFile(h.metadata, par().block);
-        for (auto pmu: mom_[m])
+        stopTimer("IO: file creation");
        {
            md.momentum.push_back(pmu);
    }
-        md.gamma = gamma_[g];
+    startTimer("IO: write block");
-        
+    h.io.saveBlock(data + h.offset, h.i, h.j, h.blockSizei, h.blockSizej);
-        return md;
+    stopTimer("IO: write block");
    };
    Kernel      kernel(gamma_, ph, envGetGrid(FermionField));
    envGetTmp(Computation, computation);
    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
 }
 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
@@ -0,0 +1,224 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_A2AMesonFieldKernels_hpp_
 #define Hadrons_MContraction_A2AMesonFieldKernels_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 BEGIN_HADRONS_NAMESPACE
 BEGIN_MODULE_NAMESPACE(MContraction)
 ////////////////////////////////////////////////////////////////////////////////
 // Cache blocked arithmetic routine
 // Could move to Grid ???
 ////////////////////////////////////////////////////////////////////////////////
 template <typename Field, typename MesonField>
 void makeMesonFieldBlock(MesonField &mat, 
                         const Field *lhs_wi,
                         const Field *rhs_vj,
                         std::vector<Gamma::Algebra> gamma,
                         const std::vector<LatticeComplex> &mom,
                         int orthogdim,
                         ModuleBase *caller = nullptr) 
 {
    typedef typename Field::vector_object vobj;
    typedef typename vobj::scalar_object  sobj;
    typedef typename vobj::scalar_type    scalar_type;
    typedef typename vobj::vector_type    vector_type;
    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
    int Lblock = mat.dimension(3); 
    int Rblock = mat.dimension(4);
    GridBase *grid = lhs_wi[0]._grid;
    const int    Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
    int Nt     = grid->GlobalDimensions()[orthogdim];
    int Ngamma = gamma.size();
    int Nmom   = mom.size();
    int fd=grid->_fdimensions[orthogdim];
    int ld=grid->_ldimensions[orthogdim];
    int rd=grid->_rdimensions[orthogdim];
    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    int MFrvol = rd*Lblock*Rblock*Nmom;
    int MFlvol = ld*Lblock*Rblock*Nmom;
    Vector<SpinMatrix_v > lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++)
    {
        lvSum[r] = zero;
    }
    Vector<SpinMatrix_s > lsSum(MFlvol);             
    parallel_for (int r = 0; r < MFlvol; r++)
    {
        lsSum[r]=scalar_type(0.0);
    }
    int e1=    grid->_slice_nblock[orthogdim];
    int e2=    grid->_slice_block [orthogdim];
    int stride=grid->_slice_stride[orthogdim];
    if (caller) caller->startTimer("contraction: colour trace & mom.");
    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    parallel_for(int r=0;r<rd;r++)
    {
        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++)
        {
            int ss= so+n*stride+b;
            for(int i=0;i<Lblock;i++)
            {
                auto left = conjugate(lhs_wi[i]._odata[ss]);
                for(int j=0;j<Rblock;j++)
                {
                    SpinMatrix_v vv;
                    auto right = rhs_vj[j]._odata[ss];
                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++)
                    {
                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                        + left()(s2)(1) * right()(s1)(1)
                                        + left()(s2)(2) * right()(s1)(2);
                    }
                    // After getting the sitewise product do the mom phase loop
                    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
                    for ( int m=0;m<Nmom;m++)
                    {
                        int idx = m+base;
                        auto phase = mom[m]._odata[ss];
                        mac(&lvSum[idx],&vv,&phase);
                    }
                }
            }
        }
    }
    if (caller) caller->stopTimer("contraction: colour trace & mom.");
    // Sum across simd lanes in the plane, breaking out orthog dir.
    if (caller) caller->startTimer("contraction: local space sum");
    parallel_for(int rt=0;rt<rd;rt++)
    {
        std::vector<int> icoor(Nd);
        std::vector<SpinMatrix_s> extracted(Nsimd);               
        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
        for(int m=0;m<Nmom;m++)
        {
            int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
            extract(lvSum[ij_rdx],extracted);
            for(int idx=0;idx<Nsimd;idx++)
            {
                grid->iCoorFromIindex(icoor,idx);
                int ldx    = rt+icoor[orthogdim]*rd;
                int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
            }
        }
    }
    if (caller) caller->stopTimer("contraction: local space sum");
    // ld loop and local only??
    if (caller) caller->startTimer("contraction: spin trace");
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    parallel_for_nest2(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++)
        {
            int t = lt + pt*ld;
            if (pt == pc)
            {
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nmom;m++)
                {
                    int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
                    for(int mu=0;mu<Ngamma;mu++)
                    {
                        // this is a bit slow
                        mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gamma[mu]));
                    }
                }
            } 
            else 
            { 
                const scalar_type zz(0.0);
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int mu=0;mu<Ngamma;mu++)
                for(int m=0;m<Nmom;m++)
                {
                    mat(m,mu,t,i,j) =zz;
                }
            }
        }
    }
    if (caller) caller->stopTimer("contraction: spin trace");
    ////////////////////////////////////////////////////////////////////
    // This global sum is taking as much as 50% of time on 16 nodes
    // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
    // Healthy size that should suffice
    ////////////////////////////////////////////////////////////////////
    if (caller) caller->startTimer("contraction: global sum");
    grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
    if (caller) caller->stopTimer("contraction: global sum");
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_A2AMesonFieldKernels_hpp_
--- a/Hadrons/Modules/MGauge/Electrify.cc
+++ b/Hadrons/Modules/MGauge/Electrify.cc
@@ -1,34 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MGauge/Electrify.cc
 Copyright (C) 2015-2018
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MGauge/Electrify.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
 template class Grid::Hadrons::MGauge::TElectrify<GIMPL>;
--- a/Hadrons/Modules/MGauge/Electrify.hpp
+++ b/Hadrons/Modules/MGauge/Electrify.hpp
@@ -1,151 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MGauge/Electrify.hpp
 Copyright (C) 2015-2018
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MGauge_Electrify_hpp_
 #define Hadrons_MGauge_Electrify_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                              Electrify gauge                               *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 /****************************************************************************
 *  Electrify a gauge field:
 *
 *  Ue_mu(x) = U_mu(x)*exp(ieqA_mu(x))
 *
 *  with
 *
 *  - gauge: U_mu(x): gauge field
 *  - emField: A_mu(x): electromagnetic photon field
 *  - e: value for the elementary charge
 *  - q: charge in units of e
 *
 *****************************************************************************/
 // Serializable parameters of the Electrify module:
 //   gauge   - name of the input gauge field U_mu(x)
 //   emField - name of the electromagnetic photon field A_mu(x)
 //   e       - value for the elementary charge
 //   charge  - charge q in units of e
 class ElectrifyPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ElectrifyPar,
                                    std::string, gauge,
                                    std::string, emField,
                                    double,      e,
                                    double,      charge);
 };
 // Module producing, under its own name, the gauge field
 // Ue_mu(x) = U_mu(x)*exp(ieqA_mu(x)) built from the gauge and photon fields
 // named in ElectrifyPar.
 template <typename GImpl>
 class TElectrify: public Module<ElectrifyPar>
 {
 public:
    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // type used to read the photon field from the environment
    using EmField = PhotonR::GaugeField;
 public:
    // constructor
    TElectrify(const std::string name);
    // destructor
    virtual ~TElectrify(void) = default;
    // dependencies/products
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(Electrify, TElectrify<GIMPL>, MGauge);
 /******************************************************************************
 *                            TElectrify implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<ElectrifyPar> base; nothing else
 // is initialised here.
 template <typename GImpl>
 TElectrify<GImpl>::TElectrify(const std::string name)
     : Module<ElectrifyPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Inputs: the gauge field and the photon field named in the parameters.
 template <typename GImpl>
 std::vector<std::string> TElectrify<GImpl>::getInput(void)
 {
    return {par().gauge, par().emField};
 }
 // Output: a single object stored under the module's own name.
 template <typename GImpl>
 std::vector<std::string> TElectrify<GImpl>::getOutput(void)
 {
    std::vector<std::string> products;

    products.push_back(getName());

    return products;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the objects this module needs: the output gauge field and one
 // temporary complex lattice used while building it.
 template <typename GImpl>
 void TElectrify<GImpl>::setup(void)
 {
    // output gauge field, stored under the module name (read back in execute())
    envCreateLat(GaugeField, getName());
    // temporary named "eiAmu" (fetched in execute() with envGetTmp)
    envTmpLat(LatticeComplex, "eiAmu");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // For every direction mu, multiplies the input gauge link by
 // exp(i*e*charge*A_mu) and stores the result in the output gauge field.
 template <typename GImpl>
 void TElectrify<GImpl>::execute(void)
 {
    LOG(Message) << "Electrify the gauge field " << par().gauge << " using the photon field " 
                  << par().emField << " with charge e*q= " << par().e << "*" << par().charge << std::endl;
    auto &electrified = envGet(GaugeField, getName());
    auto &gaugeIn     = envGet(GaugeField, par().gauge);
    auto &photon      = envGet(EmField,  par().emField);
    envGetTmp(LatticeComplex, eiAmu);
    // constant prefactor i*e*q of the photon field in the exponent
    const Complex imagUnit(0.0,1.0);
    const Complex ieq = imagUnit * (Real)(par().e * par().charge);
    for(unsigned int mu = 0; mu < env().getNd(); mu++)
    {
        // phase field for this direction
        eiAmu = exp(ieq * PeekIndex<LorentzIndex>(photon, mu));
        // dress the link and write it into component mu of the output
        PokeIndex<LorentzIndex>(electrified, PeekIndex<LorentzIndex>(gaugeIn, mu) * eiAmu, mu);
    }
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MGauge_Electrify_hpp_
--- a/Hadrons/Modules/MGauge/GaugeFix.cc
+++ b/Hadrons/Modules/MGauge/GaugeFix.cc
@@ -1,36 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MGauge/GaugeFix.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MGauge/GaugeFix.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
 template class Grid::Hadrons::MGauge::TGaugeFix<GIMPL>;
--- a/Hadrons/Modules/MGauge/GaugeFix.hpp
+++ b/Hadrons/Modules/MGauge/GaugeFix.hpp
@@ -1,135 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MGauge/GaugeFix.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MGaugeFix_hpp_
 #define Hadrons_MGaugeFix_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Grid/qcd/utils/GaugeFix.h>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                              Fix gauge                                    *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 // Serialisable parameter set of the MGauge::GaugeFix module.
 //   gauge     : name of the gauge field object to fix (looked up in the environment)
 //   alpha, maxiter, Omega_tol, Phi_tol, Fourier :
 //               forwarded unchanged, in this order, to
 //               FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix by
 //               TGaugeFix::execute
 // The member order below is the order handed to the serialisation macro;
 // do not reorder it without checking existing parameter files.
 class GaugeFixPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugeFixPar,
                                    std::string, gauge,
                                    Real,  alpha,
                                    int, maxiter, 
                                    Real, Omega_tol, 
                                    Real, Phi_tol,
                                    bool, Fourier);
 };
 // Hadrons module that produces a gauge-fixed copy of an existing gauge field.
 // GImpl is the gauge implementation whose type aliases (GaugeField, ...) are
 // imported through GAUGE_TYPE_ALIASES.
 template <typename GImpl>
 class TGaugeFix: public Module<GaugeFixPar>
 {
 public:
    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor: takes the module instance name
    TGaugeFix(const std::string name);
    // destructor
    virtual ~TGaugeFix(void) {};
    // dependencies/products: names of objects read and created by the module
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup: creates the output gauge field
    virtual void setup(void);
    // execution: runs the gauge fixing
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(GaugeFix, TGaugeFix<GIMPL>, MGauge);
 /******************************************************************************
 *                            TGaugeFix implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Construct the module by forwarding its instance name to Module<GaugeFixPar>.
 template <typename GImpl>
 TGaugeFix<GImpl>::TGaugeFix(const std::string name) : Module<GaugeFixPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Objects this module reads: only the gauge field named in the parameters.
 template <typename GImpl>
 std::vector<std::string> TGaugeFix<GImpl>::getInput(void)
 {
    return {par().gauge};
 }
 // Objects this module creates: a single entry named after the module itself.
 template <typename GImpl>
 std::vector<std::string> TGaugeFix<GImpl>::getOutput(void)
 {
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Registers the output object: a GaugeField stored under the module's own
 // name, which execute() later fetches with envGet and fills.
 template <typename GImpl>
 void TGaugeFix<GImpl>::setup(void)
 {
    envCreateLat(GaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Gauge-fixes the field named by par().gauge and stores the result in the
 // module's own output field (getName()).
 //
 // The input is copied into the output first and the fixer is run on that
 // copy. The fixer transforms the field it is handed in place, so running it
 // on the input directly would overwrite an object owned by another module and
 // silently change what every other consumer of par().gauge sees.
 //
 // NOTE(review): the fixer is instantiated for PeriodicGimplR rather than
 // GImpl, so the module presumably only builds for that implementation -
 // confirm before instantiating it with another GImpl.
 template <typename GImpl>
 void TGaugeFix<GImpl>::execute(void)
 {
    LOG(Message) << "Fixing the Gauge" << std::endl;
    LOG(Message) << par().gauge << std::endl;
    const auto &U = envGet(GaugeField, par().gauge);
    auto &Umu     = envGet(GaugeField, getName());
    LOG(Message) << "Gauge Field fetched" << std::endl;
    // All steering parameters come straight from the module's parameter set.
    Real alpha     = par().alpha;
    int  maxiter   = par().maxiter;
    Real Omega_tol = par().Omega_tol;
    Real Phi_tol   = par().Phi_tol;
    bool Fourier   = par().Fourier;
    // Work on the output copy so the input gauge field is left untouched.
    Umu = U;
    FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,maxiter,Omega_tol,Phi_tol,Fourier);
    LOG(Message) << "Gauge Fixed" << std::endl;
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MGaugeFix_hpp_
--- a/Hadrons/Modules/MGauge/StochEm.cc
+++ b/Hadrons/Modules/MGauge/StochEm.cc
@@ -70,7 +70,7 @@ void TStochEm::execute(void)
    LOG(Message) << "Generating stochastic EM potential..." << std::endl;
    std::vector<Real> improvements = strToVec<Real>(par().improvement);
-    PhotonR photon(envGetGrid(EmField), par().gauge, par().zmScheme, improvements);
+    PhotonR photon(par().gauge, par().zmScheme, improvements, par().G0_qedInf);
    auto    &a = envGet(EmField, getName());
    auto    &w = envGet(EmComp, "_" + getName() + "_weight");
--- a/Hadrons/Modules/MGauge/StochEm.hpp
+++ b/Hadrons/Modules/MGauge/StochEm.hpp
@@ -47,7 +47,8 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
                                    PhotonR::Gauge,    gauge,
                                    PhotonR::ZmScheme, zmScheme,
-                                    std::string,       improvement);
+                                    std::string,       improvement,
                                    Real,              G0_qedInf);
 };
 class TStochEm: public Module<StochEmPar>
--- a/Hadrons/Modules/MGauge/UnitEm.cc
+++ b/Hadrons/Modules/MGauge/UnitEm.cc
@@ -62,7 +62,7 @@ void TUnitEm::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TUnitEm::execute(void)
 {
-    PhotonR photon(envGetGrid(EmField), 0, 0); // Just chose arbitrary input values here
+    PhotonR photon(0, 0); // Just chose arbitrary input values here
    auto    &a = envGet(EmField, getName());
    LOG(Message) << "Generating unit EM potential..." << std::endl;
    photon.UnitField(a);
--- a/Hadrons/Modules/MIO/LoadA2AVectors.hpp
+++ b/Hadrons/Modules/MIO/LoadA2AVectors.hpp
@@ -1,120 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MIO/LoadA2AVectors.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MIO_LoadA2AVectors_hpp_
 #define Hadrons_MIO_LoadA2AVectors_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AVectors.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                    Module to load all-to-all vectors                       *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MIO)
 // Serialisable parameter set of the MIO::LoadA2AVectors module.
 //   filestem  : passed to A2AVectorsIo::read as the file stem
 //   multiFile : passed to A2AVectorsIo::read as its multi-file flag
 //   size      : number of vectors allocated for the output in setup()
 class LoadA2AVectorsPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadA2AVectorsPar,
                                    std::string,  filestem,
                                    bool,         multiFile,
                                    unsigned int, size);
 };
 // Hadrons module that reads a set of all-to-all vectors from disk into a
 // std::vector<FermionField> stored under the module's name.
 // FImpl is the fermion implementation providing the FermionField alias.
 template <typename FImpl>
 class TLoadA2AVectors: public Module<LoadA2AVectorsPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor: takes the module instance name
    TLoadA2AVectors(const std::string name);
    // destructor
    virtual ~TLoadA2AVectors(void) {};
    // dependency relation: the module has no inputs and one output
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup: allocates the output vector set
    virtual void setup(void);
    // execution: reads the vectors from file
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(LoadA2AVectors, TLoadA2AVectors<FIMPL>, MIO);
 /******************************************************************************
 *                      TLoadA2AVectors implementation                        *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Construct the module by forwarding its instance name to Module<LoadA2AVectorsPar>.
 template <typename FImpl>
 TLoadA2AVectors<FImpl>::TLoadA2AVectors(const std::string name) : Module<LoadA2AVectorsPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // The module reads from disk only, so it depends on no environment object.
 template <typename FImpl>
 std::vector<std::string> TLoadA2AVectors<FImpl>::getInput(void)
 {
    return std::vector<std::string>();
 }
 // Objects this module creates: a single entry named after the module itself.
 template <typename FImpl>
 std::vector<std::string> TLoadA2AVectors<FImpl>::getOutput(void)
 {
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Registers the output object under the module's name: a
 // std::vector<FermionField> built from par().size and the FermionField grid.
 // NOTE(review): the literal 1 is presumably the Ls argument of envCreate -
 // confirm against the macro definition.
 template <typename FImpl>
 void TLoadA2AVectors<FImpl>::setup(void)
 {
    envCreate(std::vector<FermionField>, getName(), 1, par().size, 
              envGetGrid(FermionField));
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Fills the module's vector set from disk through A2AVectorsIo::read, using
 // the file stem and multi-file flag from the parameters and the current
 // trajectory reported by the virtual machine.
 template <typename FImpl>
 void TLoadA2AVectors<FImpl>::execute(void)
 {
    const auto &stem       = par().filestem;
    const auto  multiFile  = par().multiFile;
    const auto  trajectory = vm().getTrajectory();
    auto       &a2aVec     = envGet(std::vector<FermionField>, getName());

    A2AVectorsIo::read(a2aVec, stem, multiFile, trajectory);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MIO_LoadA2AVectors_hpp_
--- a/Hadrons/Modules/MIO/LoadEigenPack.cc
+++ b/Hadrons/Modules/MIO/LoadEigenPack.cc
@@ -32,6 +32,4 @@ using namespace Hadrons;
 using namespace MIO;
 template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL>>;
-#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+
 template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>;
 #endif
--- a/Hadrons/Modules/MIO/LoadEigenPack.hpp
+++ b/Hadrons/Modules/MIO/LoadEigenPack.hpp
@@ -54,9 +54,7 @@ template <typename Pack>
 class TLoadEigenPack: public Module<LoadEigenPackPar>
 {
 public:
-    typedef typename Pack::Field   Field;
+    typedef EigenPack<typename Pack::Field> BasePack;
    typedef typename Pack::FieldIo FieldIo;
    typedef BaseEigenPack<Field>   BasePack;
 public:
    // constructor
    TLoadEigenPack(const std::string name);
@@ -72,9 +70,6 @@ public:
 };
 MODULE_REGISTER_TMP(LoadFermionEigenPack, TLoadEigenPack<FermionEigenPack<FIMPL>>, MIO);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(LoadFermionEigenPackIo32, ARG(TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>), MIO);
 #endif
 /******************************************************************************
 *                    TLoadEigenPack implementation                           *
@@ -106,14 +101,9 @@ std::vector<std::string> TLoadEigenPack<Pack>::getOutput(void)
 template <typename Pack>
 void TLoadEigenPack<Pack>::setup(void)
 {
-    GridBase *gridIo = nullptr;
+    env().createGrid(par().Ls);
    if (typeHash<Field>() != typeHash<FieldIo>())
    {
        gridIo = envGetRbGrid(FieldIo, par().Ls);
    }
    envCreateDerived(BasePack, Pack, getName(), par().Ls, par().size, 
-                     envGetRbGrid(Field, par().Ls), gridIo);
+                     env().getRbGrid(par().Ls));
 }
 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MNPR/Amputate.cc
+++ b/Hadrons/Modules/MNPR/Amputate.cc
@@ -1,36 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNPR/Amputate.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MNPR/Amputate.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MNPR;
 template class Grid::Hadrons::MNPR::TAmputate<FIMPL,FIMPL>;
--- a/Hadrons/Modules/MNPR/Amputate.hpp
+++ b/Hadrons/Modules/MNPR/Amputate.hpp
@@ -1,200 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNPR/Amputate.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Amputate_hpp_
 #define Hadrons_Amputate_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Grid/Eigen/LU>
 //#include <Grid/qcd/utils/PropagatorUtils.h>
 //#include <Grid/serialisation/Serialisation.h>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                                TAmputate                                       *
        Performs bilinear contractions of the type tr[g5*adj(Sout)*g5*G*Sin]
        Suitable for non exceptional momenta
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MNPR)
 // Serialisable parameter set of the MNPR::Amputate module.
 //   Sin, Sout : names of the two propagator objects fetched from the environment
 //   vertex    : listed as a module input by getInput(); execute() itself reads
 //               the vertices from the file named by `input`
 //   pin, pout : momenta of the two legs, parsed from strings into integer vectors
 //   output    : path opened with BinaryWriter
 //   input     : path opened with BinaryReader, from which "vertex" is read
 class AmputatePar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(AmputatePar,
                                    std::string,    Sin, // TODO: should this be a propagator type rather than a name?
                                    std::string,    Sout, // same question as for Sin
                                    std::string,    vertex,
                                    std::string,    pin,
                                    std::string,    pout,
                                    std::string,    output,
                                    std::string,    input);
 };
 // Hadrons module that amputates bilinear vertices with the inverse
 // momentum-space propagators of the two external legs.
 // FImpl1 / FImpl2 are the fermion implementations of the incoming and outgoing
 // propagators; their aliases are imported with suffixes 1 and 2.
 template <typename FImpl1, typename FImpl2>
 class TAmputate: public Module<AmputatePar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    // Result record: one amputated vertex value per gamma structure.
    class Result: Serializable
    {
    public:
        // No trailing comma after the last member: it would hand an empty
        // extra argument to the variadic serialisation macro.
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        std::vector<Complex>, Vamp);
    };
 public:
    // constructor: takes the module instance name
    TAmputate(const std::string name);
    // destructor
    virtual ~TAmputate(void) {};
    // dependencies/products
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // dense inversion of a single spin-colour matrix
    virtual SpinColourMatrix invertspincolmat(SpinColourMatrix &scmat);
    // execution
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(Amputate, ARG(TAmputate<FIMPL, FIMPL>), MNPR);
 /******************************************************************************
 *                           TAmputate implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Construct the module by forwarding its instance name to Module<AmputatePar>.
 template <typename FImpl1, typename FImpl2>
 TAmputate<FImpl1, FImpl2>::TAmputate(const std::string name) : Module<AmputatePar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Objects this module depends on: both propagators and the vertex entry.
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TAmputate<FImpl1, FImpl2>::getInput(void)
 {
    return {par().Sin, par().Sout, par().vertex};
 }
 // Objects this module creates: a single entry named after the module itself.
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TAmputate<FImpl1, FImpl2>::getOutput(void)
 {
    return {getName()};
 }
 // Invert a spin-colour matrix using Eigen.
 //
 // The matrix is flattened to a dense (Ns*Nc) x (Ns*Nc) Eigen matrix with
 // row index Ns*ic+is and column index Ns*jc+js, inverted there, and unpacked
 // with the same index map.
 //
 // The Eigen matrix uses the Complex scalar type rather than MatrixXcf: a
 // hard-wired single-precision matrix would round the propagator and its
 // inverse to float even in a double-precision build.
 // NOTE(review): no singularity check is made; inverse() of a singular input
 // yields non-finite entries.
 template <typename FImpl1, typename FImpl2>
 SpinColourMatrix TAmputate<FImpl1, FImpl2>::invertspincolmat(SpinColourMatrix &scmat)
 {
    typedef Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic> DenseMatrix;

    DenseMatrix scmat_2d(Ns*Nc,Ns*Nc);
    for(int ic=0; ic<Nc; ic++){
    for(int jc=0; jc<Nc; jc++){
        for(int is=0; is<Ns; is++){
        for(int js=0; js<Ns; js++){
            scmat_2d(Ns*ic+is,Ns*jc+js) = scmat()(is,js)(ic,jc);
        }}
    }}
    DenseMatrix scmat_2d_inv = scmat_2d.inverse();
    SpinColourMatrix scmat_inv;
    for(int ic=0; ic<Nc; ic++){
    for(int jc=0; jc<Nc; jc++){
        for(int is=0; is<Ns; is++){
        for(int js=0; js<Ns; js++){
            scmat_inv()(is,js)(ic,jc) = scmat_2d_inv(Ns*ic+is,Ns*jc+js);
        }}
    }}
    return scmat_inv;
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Amputates the bilinear vertices read from par().input with the inverse
 // momentum-space propagators of the two legs.
 //
 // Steps:
 //   1. build p.x for pin and pout, with p_mu = 2*pi*n_mu/L_mu and L_mu taken
 //      from the full dimensions of the grid,
 //   2. multiply each propagator by exp(-i p.x) and sum it over the lattice,
 //   3. invert the two resulting spin-colour matrices,
 //   4. for mu in [0, Gamma::nGamma/2) form
 //      1/12 tr[ adj(Gamma(2*mu+1)) g5 Sout^-1 g5 vertex[mu] Sin^-1 ],
 //   5. write the values to par().output.
 //
 // The phases are applied to local copies, so the propagators stored in the
 // environment are not modified and can be reused by other modules.
 // NOTE(review): pin/pout are indexed for mu < 4 and vertex[] for
 // mu < Gamma::nGamma/2 without size checks - inputs must be at least that long.
 template <typename FImpl1, typename FImpl2>
 void TAmputate<FImpl1, FImpl2>::execute(void)
 {
    LOG(Message) << "Computing bilinear amputations '" << getName() << "' using"
                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
                 << std::endl;
    BinaryWriter                    writer(par().output);
    const PropagatorField1          &Sin = *env().template getObject<PropagatorField1>(par().Sin);
    const PropagatorField2          &Sout = *env().template getObject<PropagatorField2>(par().Sout);
    std::vector<int>                pin  = strToVec<int>(par().pin), pout = strToVec<int>(par().pout);
    // Lattice extents come from the grid, not from the momentum: dividing by
    // the momentum components gives a wrong phase and a division by zero for
    // any vanishing component.
    const auto                      &latt_size = env().getGrid()->_fdimensions;
    LatticeComplex                  pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
    LOG(Message) << "Propagators set up " << std::endl;
    std::vector<SpinColourMatrix>   vertex; // filled from the input file below
    Gamma                           g5(Gamma::Algebra::Gamma5);
    Result                          result;
    LOG(Message) << "reading file - "  << par().input << std::endl;
    BinaryReader                    reader(par().input); 
    Complex                         Ci(0.0,1.0);
    read(reader,"vertex", vertex);
    LOG(Message) << "vertex read" << std::endl;
    pdotxin=zero;
    pdotxout=zero;
    for (unsigned int mu = 0; mu < 4; ++mu)
    {
        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
        LatticeCoordinate(coor,mu);
        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
    }
    // Phase corrections on temporaries: the environment objects stay intact.
    PropagatorField1 SinPhased(env().getGrid());
    PropagatorField2 SoutPhased(env().getGrid());
    SinPhased  = Sin*exp(-Ci*pdotxin);
    SoutPhased = Sout*exp(-Ci*pdotxout);
    SpinColourMatrix Sin_mom = sum(SinPhased);
    SpinColourMatrix Sout_mom = sum(SoutPhased);
    LOG(Message) << "summed over lattice" << std::endl;
    LOG(Message) << "Lattice -> spincolourmatrix conversion" << std::endl;
    SpinColourMatrix Sin_inv = invertspincolmat(Sin_mom);
    SpinColourMatrix Sout_inv = invertspincolmat(Sout_mom);
    LOG(Message) << "Inversions done" << std::endl;
    result.Vamp.resize(Gamma::nGamma/2);
    for( int mu=0; mu < Gamma::nGamma/2; mu++){
        result.Vamp[mu] = 1/12.0*trace(adj(Gamma(mu*2+1))*g5*Sout_inv*g5*vertex[mu]*Sin_inv);
        LOG(Message) << "Vamp[" << mu << "] - " << result.Vamp[mu] << std::endl;
    }
    // Persist the result: the writer was opened on par().output but the
    // amputated vertices were previously only logged.
    write(writer, "Vamp", result.Vamp);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Amputate_hpp_
--- a/Hadrons/Modules/MNPR/Bilinear.cc
+++ b/Hadrons/Modules/MNPR/Bilinear.cc
@@ -1,36 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNPR/Bilinear.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MNPR/Bilinear.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MNPR;
 template class Grid::Hadrons::MNPR::TBilinear<FIMPL,FIMPL>;
--- a/Hadrons/Modules/MNPR/Bilinear.hpp
+++ b/Hadrons/Modules/MNPR/Bilinear.hpp
@@ -1,225 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNPR/Bilinear.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Bilinear_hpp_
 #define Hadrons_Bilinear_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 //#include <Grid/qcd/utils/PropagatorUtils.h>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                                TBilinear                                       *
        Performs bilinear contractions of the type tr[g5*adj(Sout)*g5*G*Sin]
        Suitable for non exceptional momenta in Rome-Southampton NPR
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MNPR)
 // Serialisable parameter set of the MNPR::Bilinear module.
 //   Sin, Sout : names of the two propagator objects fetched from the environment
 //   pin, pout : momenta of the two legs, parsed from strings into Real vectors
 //   output    : path opened with BinaryWriter; also used as the record name
 //               when the result is written
 class BilinearPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(BilinearPar,
                                    std::string,    Sin,
                                    std::string,    Sout,
                                    std::string,    pin,
                                    std::string,    pout,
                                    std::string,    output);
 };
 // Hadrons module that computes the lattice-summed bilinear vertices
 // g5 * adj(Sout) * g5 * Gamma * Sin for every gamma structure.
 // FImpl1 / FImpl2 are the fermion implementations whose aliases are imported
 // with suffixes 1 and 2.
 template <typename FImpl1, typename FImpl2>
 class TBilinear: public Module<BilinearPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    // Result record: one summed spin-colour matrix per gamma structure.
    class Result: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result, 
                                        std::vector<SpinColourMatrix>, bilinear);
    };
 public:
    // constructor: takes the module instance name
    TBilinear(const std::string name);
    // destructor
    virtual ~TBilinear(void) {};
    // dependencies/products
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // disabled helper; its implementation is commented out in this file
    //LatticeSpinColourMatrix PhaseProps(LatticeSpinColourMatrix S, std::vector<Real> p);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(Bilinear, ARG(TBilinear<FIMPL, FIMPL>), MNPR);
 /******************************************************************************
 *                           TBilinear implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Construct the module by forwarding its instance name to Module<BilinearPar>.
 template <typename FImpl1, typename FImpl2>
 TBilinear<FImpl1, FImpl2>::TBilinear(const std::string name) : Module<BilinearPar>(name)
 {
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Intentionally empty: the module registers no environment object here.
 // The two registration calls below are disabled and kept for reference only.
 template <typename FImpl1, typename FImpl2>
 void TBilinear<FImpl1, FImpl2>::setup(void)
 {
    //env().template registerLattice<LatticeSpinColourMatrix>(getName());
    //env().template registerObject<SpinColourMatrix>(getName());
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Objects this module depends on: the incoming and outgoing propagators.
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TBilinear<FImpl1, FImpl2>::getInput(void)
 {
    return {par().Sin, par().Sout};
 }
 // Objects this module creates: a single entry named after the module itself.
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TBilinear<FImpl1, FImpl2>::getOutput(void)
 {
    return {getName()};
 }
 /*
 /////Phase propagators//////////////////////////
 template <typename FImpl1, typename FImpl2>
 LatticeSpinColourMatrix TBilinear<FImpl1, FImpl2>::PhaseProps(LatticeSpinColourMatrix S, std::vector<Real> p)
 {
    GridBase *grid = S._grid;
    LatticeComplex      pdotx(grid),  coor(grid);
    std::vector<int>   latt_size = grid->_fdimensions; 
    Complex             Ci(0.0,1.0);
    pdotx=zero;
    for (unsigned int mu = 0; mu < 4; ++mu)
    {
        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
        LatticeCoordinate(coor,mu);
        pdotx = pdotx +(TwoPiL * p[mu]) * coor;
    }
    S = S*exp(-Ci*pdotx);
    return S;
 }
 */
 // execution ///////////////////////////////////////////////////////////////////
 // Computes and writes the lattice-summed bilinear vertices for all
 // Gamma::nGamma gamma structures.
 //
 // The plane-wave phases use p_mu = 2*pi*n_mu/L_mu with L_mu taken from the
 // full dimensions of the grid, and are applied to local copies of the
 // propagators so the objects stored in the environment are left unmodified.
 // NOTE(review): pin/pout are indexed for mu < 4 without a size check.
 template <typename FImpl1, typename FImpl2>
 void TBilinear<FImpl1, FImpl2>::execute(void)
 {
 /**************************************************************************
 Compute the bilinear vertex needed for the NPR.
 V(G) = sum_x  [ g5 * adj(S'(x,p2)) * g5 * G * S'(x,p1) ]_{si,sj,ci,cj}
 G is one of the 16 gamma vertices [I,gmu,g5,g5gmu,sig(mu,nu)]
        * G
       / \
    p1/   \p2
     /     \
    /       \
 Returns a spin-colour matrix, with indices si,sj, ci,cj
 Conventions:
 p1 - incoming momenta
 p2 - outgoing momenta
 q = (p1-p2)
 **************************************************************************/
    LOG(Message) << "Computing bilinear contractions '" << getName() << "' using"
                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
                 << std::endl;
    BinaryWriter             writer(par().output);
    // Propagators (read-only: the phased versions are built as temporaries)
    const LatticeSpinColourMatrix &Sin = *env().template getObject<LatticeSpinColourMatrix>(par().Sin);
    const LatticeSpinColourMatrix &Sout = *env().template getObject<LatticeSpinColourMatrix>(par().Sout);
    LatticeComplex              pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
    // momentum on legs
    std::vector<Real>           pin  = strToVec<Real>(par().pin), pout = strToVec<Real>(par().pout);
    // Lattice extents come from the grid, not from the momentum: dividing by
    // the momentum components gives a wrong phase and a division by zero for
    // any vanishing component.
    const auto                  &latt_size = env().getGrid()->_fdimensions;
    // bilinears
    LatticeSpinColourMatrix     bilinear_x(env().getGrid());
    Gamma                       g5(Gamma::Algebra::Gamma5);
    Result                      result;
    Complex                     Ci(0.0,1.0);
    // accumulate p.x for both legs
    pdotxin=zero;
    pdotxout=zero;
    for (unsigned int mu = 0; mu < 4; ++mu)
    {
        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
        LatticeCoordinate(coor,mu);
        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
    }
    // Phase corrections on temporaries: the environment objects stay intact.
    LatticeSpinColourMatrix     SinPhased(env().getGrid()), SoutPhased(env().getGrid());
    SinPhased  = Sin*exp(-Ci*pdotxin);
    SoutPhased = Sout*exp(-Ci*pdotxout);
    ////Set up gamma vector//////////////////////////
    std::vector<Gamma> gammavector;
    for( int i=0; i<Gamma::nGamma; i++){
        Gamma::Algebra gam = i;
        gammavector.push_back(Gamma(gam));
    }
    result.bilinear.resize(Gamma::nGamma);
    ////////Form Vertex//////////////////////////////
    for (int i=0; i < Gamma::nGamma; i++){
        bilinear_x = g5*adj(SoutPhased)*g5*gammavector[i]*SinPhased; 
        result.bilinear[i] = sum(bilinear_x); //sum over lattice sites
    }
    //////////////////////////////////////////////////
    write(writer, par().output, result.bilinear);
    LOG(Message) << "Complete. Writing results to " << par().output << std:: endl;
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Bilinear_hpp_
--- a/Hadrons/Modules/MNPR/FourQuark.cc
+++ b/Hadrons/Modules/MNPR/FourQuark.cc
@@ -1,36 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNPR/FourQuark.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MNPR/FourQuark.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MNPR;
 template class Grid::Hadrons::MNPR::TFourQuark<FIMPL,FIMPL>;
--- a/Hadrons/Modules/MNPR/FourQuark.hpp
+++ b/Hadrons/Modules/MNPR/FourQuark.hpp
@@ -1,274 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNPR/FourQuark.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_FourQuark_hpp_
 #define Hadrons_FourQuark_hpp_
 #include <typeinfo>
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Grid/serialisation/Serialisation.h>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                                TFourQuark                                   *
 *  Builds four-quark vertices as site-wise outer products of bilinears of the *
 *  type g5*adj(Sout)*g5*G*Sin (spin and colour indices are left open, no      *
 *  trace is taken), summed over the lattice.                                  *
 *  Suitable for non-exceptional momenta.                                      *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MNPR)
 // Parameters of the FourQuark module, declared through GRID_SERIALIZABLE_CLASS_MEMBERS.
 //   Sin, Sout : names of the two propagator objects fetched from the environment in execute()
 //   pin, pout : momenta given as strings, parsed with strToVec<Real> in execute()
 //   fullbasis : true  -> every (mu, nu) pair of gamma structures is contracted
 //               false -> only the diagonal branch of execute() is run
 //   output    : handed to the BinaryWriter that stores the result
 class FourQuarkPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(FourQuarkPar,
                                    std::string,    Sin, // TODO(review): should this be a propagator type rather than a name?
                                    std::string,    Sout, // same question as for Sin
                                    std::string,    pin,
                                    std::string,    pout,
                                    bool,           fullbasis,
                                    std::string,    output);
 };
 // Module computing four-quark vertices from two propagators.
 // FImpl1 / FImpl2 are the fermion implementations of the incoming (Sin) and
 // outgoing (Sout) propagator respectively; FERM_TYPE_ALIASES supplies the
 // suffixed type aliases (PropagatorField1, PropagatorField2, ...) used below.
 template <typename FImpl1, typename FImpl2>
 class TFourQuark: public Module<FourQuarkPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    // Serialisable container for the lattice-summed vertices written by execute().
    class Result: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        std::vector<SpinColourSpinColourMatrix>, fourquark);
    };
 public:
    // constructor
    TFourQuark(const std::string name);
    // destructor
    virtual ~TFourQuark(void) {};
    // dependencies/products
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // helper: site-wise outer product of a and b, stored in lret
    // (a and b are taken by value, i.e. both lattices are copied on every call)
    virtual void tensorprod(LatticeSpinColourSpinColourMatrix &lret, LatticeSpinColourMatrix a, LatticeSpinColourMatrix b);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 // Registers the FIMPL/FIMPL instantiation under the name FourQuark in namespace MNPR.
 // ARG(...) wraps the template arguments, presumably so the comma inside <FIMPL, FIMPL>
 // is not read as a macro-argument separator -- confirm against the macro definition.
 MODULE_REGISTER_TMP(FourQuark, ARG(TFourQuark<FIMPL, FIMPL>), MNPR);
 /******************************************************************************
 *                           TFourQuark implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<FourQuarkPar> base; no other state is set up here.
 template <typename FImpl1, typename FImpl2>
 TFourQuark<FImpl1, FImpl2>::TFourQuark(const std::string name)
 : Module<FourQuarkPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // Dependencies: the two propagators named by the Sin and Sout parameters, in that order.
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TFourQuark<FImpl1, FImpl2>::getInput(void)
 {
    std::vector<std::string> deps;

    deps.push_back(par().Sin);
    deps.push_back(par().Sout);

    return deps;
 }
 // Products: a single object carrying the module's own name.
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TFourQuark<FImpl1, FImpl2>::getOutput(void)
 {
    return std::vector<std::string>(1, getName());
 }
 // Site-wise outer (tensor) product of two spin-colour matrices:
 //   lret(x)(si,sj)(ci,cj)(sk,sl)(ck,cl) = a(x)(si,sj)(ci,cj) * b(x)(sk,sl)(ck,cl)
 //
 //   lret : output lattice, overwritten for every (si,sj,ci,cj) entry on every site
 //   a    : left factor, supplies the outer (first) spin/colour index pairs
 //   b    : right factor, supplies the inner (second) spin/colour index pairs
 //
 // The disabled element-by-element reference loop that used to sit in an `#if 0`
 // branch was removed: all eight of its loop counters were declared without an
 // initialiser (`for (int si; ...)`) and it hard-coded 4/3 instead of Ns/Nc, so it
 // could not have been re-enabled safely.
 //
 // NOTE(review): a and b are passed by value, so both lattices are copied on each
 // call; the signature is kept because it is part of the virtual interface.
 template <typename FImpl1, typename FImpl2>
 void TFourQuark<FImpl1, FImpl2>::tensorprod(LatticeSpinColourSpinColourMatrix &lret, LatticeSpinColourMatrix a, LatticeSpinColourMatrix b)
 {
    // FIXME ; is there a general need for this construct ? In which case we should encapsulate the
    //         below loops in a helper function.
    parallel_for(auto site=lret.begin();site<lret.end();site++) {
        vTComplex left;
        for(int si=0; si < Ns; ++si){
        for(int sj=0; sj < Ns; ++sj){
            for (int ci=0; ci < Nc; ++ci){
            for (int cj=0; cj < Nc; ++cj){
                // One complex entry of a scales the whole spin-colour matrix b(x).
                left()()() = a[site]()(si,sj)(ci,cj);
                lret[site]()(si,sj)(ci,cj)=left()*b[site]();
            }}
        }}
    }
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Creates a LatticeSpinColourMatrix in the environment under the module's name.
 // NOTE(review): execute() never reads or writes this object (its results go to the
 // BinaryWriter only) -- confirm whether the allocation is still needed.
 template <typename FImpl1, typename FImpl2>
 void TFourQuark<FImpl1, FImpl2>::setup(void)
 {
    envCreateLat(LatticeSpinColourMatrix, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2>
 void TFourQuark<FImpl1, FImpl2>::execute(void)
 {
 /*********************************************************************************
 TFourQuark : Creates the four quark vertex required for the NPR of four-quark ops
 V_{Gamma_1,Gamma_2} = sum_x [ ( g5 * adj(S'(x,p2)) * g5 * G1 * S'(x,p1) )_ci,cj;si,sj x ( g5 * adj(S'(x,p2)) * g5 * G2 S'(x,p1) )_ck,cl;sk,sl ]
 Create a bilinear vertex for G1 and G2  the spin and colour indices are kept free. Where there are 16 potential Gs.
 We then find the outer product of V1 and V2, keeping the spin and colour indices uncontracted
 Then this is summed over the lattice coordinate
 Result is a SpinColourSpinColourMatrix - with 4 colour and 4 spin indices. 
 We have up to 256 of these including the offdiag (G1 != G2).
        \         /
         \p1   p1/
          \     /
           \   /
         G1 * * G2
           /   \
          /     \
         /p2   p2\
        /         \

 Notes on this implementation:
  - Sin and Sout are references to the objects stored in the environment and the
    phase factors are multiplied into them in place.
    NOTE(review): any later consumer of these propagators will see the phased
    fields -- confirm this is intended.
  - pin and pout are indexed for mu = 0..3, so both parameters must provide at
    least four components.
  - With fullbasis the vertex for (mu, nu) is stored at index mu*nBasis + nu,
    where nBasis is the number of gamma structures collected below.
  - Without fullbasis only mu = 0 is computed; the remaining nBasis - 1 entries
    of the result keep whatever resize() put there.
    TODO(review): confirm whether the commented-out full loop should be restored.
 *********************************************************************************/
    LOG(Message) << "Computing fourquark contractions '" << getName() << "' using"
                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
                 << std::endl;
    BinaryWriter             writer(par().output);
    PropagatorField1                            &Sin = *env().template getObject<PropagatorField1>(par().Sin);
    PropagatorField2                            &Sout = *env().template getObject<PropagatorField2>(par().Sout);
    std::vector<Real>                           pin  = strToVec<Real>(par().pin), pout = strToVec<Real>(par().pout);
    bool                                        fullbasis = par().fullbasis;
    Gamma                                       g5(Gamma::Algebra::Gamma5);
    Result                                      result;
    LatticeComplex                              pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
    LatticeSpinColourMatrix                     bilinear_mu(env().getGrid()), bilinear_nu(env().getGrid());
    LatticeSpinColourSpinColourMatrix           lret(env().getGrid()); 
    Complex                         Ci(0.0,1.0);
    //Phase propagators
    //Sin = Grid::QCD::PropUtils::PhaseProps(Sin,pin);
    //Sout = Grid::QCD::PropUtils::PhaseProps(Sout,pout);
    //find p.x for in and out so phase can be accounted for in propagators
    pdotxin=zero;
    pdotxout=zero;
    for (unsigned int mu = 0; mu < 4; ++mu)
    {
        // The phase is 2*pi*p_mu*x_mu/L_mu, so L_mu must be the lattice extent taken
        // from the grid. It used to be copied from pin, which divided by the momentum
        // itself (and by zero for a vanishing component).
        Real TwoPiL =  M_PI * 2.0/ env().getGrid()->_fdimensions[mu];
        LatticeCoordinate(coor,mu);
        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
    }
    Sin = Sin*exp(-Ci*pdotxin); //phase corrections
    Sout = Sout*exp(-Ci*pdotxout);
    //Set up Gammas : every odd index of the Gamma algebra enumeration
    std::vector<Gamma> gammavector;
     for( int i=1; i<Gamma::nGamma; i+=2){
         Gamma::Algebra gam = i;
         gammavector.push_back(Gamma(gam));
       }
    // Number of gamma structures actually available; all loop bounds and the flat
    // result index are derived from it so they can never run past gammavector.
    const int nBasis = static_cast<int>(gammavector.size());
    lret = zero;
    if (fullbasis == true){ // all combinations of mu and nu
        result.fourquark.resize(nBasis*nBasis);
        for( int mu=0; mu<nBasis; mu++){ 
            bilinear_mu = g5*adj(Sout)*g5*gammavector[mu]*Sin;
            // nu used to run up to Gamma::nGamma, i.e. past the end of both
            // gammavector and result.fourquark.
            for ( int nu=0; nu<nBasis; nu++){
                bilinear_nu = g5*adj(Sout)*g5*gammavector[nu]*Sin;
                LOG(Message) << "bilinear_nu for nu = " << nu << " is - " << bilinear_nu << std::endl;
                tensorprod(lret,bilinear_mu,bilinear_nu);
                result.fourquark[mu*nBasis + nu] = sum(lret);
            }
        }
    } else {
        result.fourquark.resize(nBasis);
        for ( int mu=0; mu<1; mu++){
        //for( int mu=0; mu<Gamma::nGamma/2; mu++ ){
            bilinear_mu = g5*adj(Sout)*g5*gammavector[mu]*Sin;
            //LOG(Message) << "bilinear_mu for mu = " << mu << " is - " << bilinear_mu << std::endl;
            tensorprod(lret,bilinear_mu,bilinear_mu); //tensor outer product
            result.fourquark[mu] = sum(lret);
        }
    }
    write(writer, "fourquark", result.fourquark);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_FourQuark_hpp_
--- a/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
+++ b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
@@ -1,36 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MNoise;
 // Explicit instantiations for the FIMPL and ZFIMPL fermion implementations.
 template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<FIMPL>;
 template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<ZFIMPL>;
--- a/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
+++ b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
@@ -1,121 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
 #define Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/DilutedNoise.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *             Generate full volume spin-color diagonal noise                *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MNoise)
 // Parameters of the FullVolumeSpinColorDiagonal module.
 //   nsrc : forwarded by setup() as the last constructor argument of
 //          FullVolumeSpinColorDiagonalNoise (presumably the number of noise
 //          sources -- confirm against DilutedNoise.hpp)
 class FullVolumeSpinColorDiagonalPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(FullVolumeSpinColorDiagonalPar,
                                    unsigned int, nsrc);
 };
 // Module that creates a FullVolumeSpinColorDiagonalNoise object (stored through its
 // DilutedNoise<FImpl> base) in setup() and fills it with noise in execute().
 // It has no inputs; its only product is the object named after the module.
 template <typename FImpl>
 class TFullVolumeSpinColorDiagonal: public Module<FullVolumeSpinColorDiagonalPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TFullVolumeSpinColorDiagonal(const std::string name);
    // destructor
    virtual ~TFullVolumeSpinColorDiagonal(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 // Registers the FIMPL and ZFIMPL instantiations in namespace MNoise; the ZFIMPL
 // variant is exposed under the Z-prefixed name.
 MODULE_REGISTER_TMP(FullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<FIMPL>, MNoise);
 MODULE_REGISTER_TMP(ZFullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<ZFIMPL>, MNoise);
 /******************************************************************************
 *              TFullVolumeSpinColorDiagonal implementation                  *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<FullVolumeSpinColorDiagonalPar> base.
 template <typename FImpl>
 TFullVolumeSpinColorDiagonal<FImpl>::TFullVolumeSpinColorDiagonal(const std::string name)
 : Module<FullVolumeSpinColorDiagonalPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // The noise generator depends on no other object: the dependency list is empty.
 template <typename FImpl>
 std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getInput(void)
 {
    return std::vector<std::string>();
 }
 // Single product: the noise object, named after the module itself.
 template <typename FImpl>
 std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getOutput(void)
 {
    std::vector<std::string> products;

    products.push_back(getName());

    return products;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Creates the noise object under the module's name: a
 // FullVolumeSpinColorDiagonalNoise<FImpl> registered through its
 // DilutedNoise<FImpl> base type (the type execute() retrieves it as).
 // The trailing arguments are the grid obtained for FermionField and the nsrc
 // parameter; the literal 1 is presumably the Ls (fifth-dimension) argument of
 // the envCreate macro family -- confirm against the macro definition.
 template <typename FImpl>
 void TFullVolumeSpinColorDiagonal<FImpl>::setup(void)
 {
    envCreateDerived(DilutedNoise<FImpl>, 
                     FullVolumeSpinColorDiagonalNoise<FImpl>,
                     getName(), 1, envGetGrid(FermionField), par().nsrc);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Fetches the noise object created in setup() and fills it using the generator
 // returned by rng4d().
 template <typename FImpl>
 void TFullVolumeSpinColorDiagonal<FImpl>::execute(void)
 {
    // Retrieved through the DilutedNoise<FImpl> base type it was registered under.
    auto &noise = envGet(DilutedNoise<FImpl>, getName());
    LOG(Message) << "Generating full volume, spin-color diagonal noise" << std::endl;
    noise.generateNoise(rng4d());
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
--- a/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -146,7 +146,7 @@ void TChargedProp::execute(void)
        std::vector<int>    siteCoor;
        LOG(Message) << "Saving momentum-projected propagator to '"
-                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
+                     << RESULT_FILE_NAME(par().output) << "'..."
                     << std::endl;
        result.projection.resize(par().outputMom.size());
        result.lattice_size = env().getGrid()->_fdimensions;
--- a/Hadrons/Modules/MScalar/ScalarVP.cc
+++ b/Hadrons/Modules/MScalar/ScalarVP.cc
@@ -462,7 +462,7 @@ void TScalarVP::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected HVP to '"
-                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
+                     << RESULT_FILE_NAME(par().output) << "'..."
                     << std::endl;
        saveResult(par().output, "HVP", outputData);
    }
--- a/Hadrons/Modules/MScalar/VPCounterTerms.cc
+++ b/Hadrons/Modules/MScalar/VPCounterTerms.cc
@@ -239,7 +239,7 @@ void TVPCounterTerms::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected correlators to '"
-                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
+                     << RESULT_FILE_NAME(par().output) << "'..."
                     << std::endl;
        saveResult(par().output, "scalar_loops", outputData);
    }
--- a/Hadrons/Modules/MScalarSUN/TimeMomProbe.cc
+++ b/Hadrons/Modules/MScalarSUN/TimeMomProbe.cc
@@ -2,7 +2,7 @@
 Grid physics library, www.github.com/paboyle/Grid 
-Source file: Hadrons/Modules/MIO/LoadA2AVectors.cc
+Source file: Hadrons/Modules/MScalarSUN/TimeMomProbe.cc
 Copyright (C) 2015-2018
@@ -25,10 +25,14 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
+#include <Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp>
 using namespace Grid;
 using namespace Hadrons;
-using namespace MIO;
+using namespace MScalarSUN;
-template class Grid::Hadrons::MIO::TLoadA2AVectors<FIMPL>;
+template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<2>>;
 template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<3>>;
 template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<4>>;
 template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<5>>;
 template class Grid::Hadrons::MScalarSUN::TTimeMomProbe<ScalarNxNAdjImplR<6>>;
--- a/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
+++ b/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
@@ -0,0 +1,268 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MScalarSUN_TimeMomProbe_hpp_
 #define Hadrons_MScalarSUN_TimeMomProbe_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/Modules/MScalarSUN/Utils.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *          n-point functions O(t,p)*tr(phi(t_1,p_1)*...*phi(t_n,p_n))        *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalarSUN)
 // Parameters of the TimeMomProbe module.
 //   field   : name of the scalar field object (fetched as Field in execute())
 //   op      : names of the operator fields (each fetched as ComplexField)
 //   timeMom : one entry per probe; each probe is a list of strings, each parsed
 //             with strToVec<int> into a (time, momentum components) vector
 //   output  : passed to saveResult()
 class TimeMomProbePar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TimeMomProbePar,
                                    std::string,              field,
                                    std::vector<std::string>, op,
                                    std::vector<std::vector<std::string>>, timeMom,
                                    std::string,              output);
 };
 // One saved correlator: the operator name, the parsed (time, momentum) points of
 // the probe it was correlated with, and the values returned by makeTwoPoint().
 class TimeMomProbeResult: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TimeMomProbeResult,
                                    std::string,                   op,
                                    std::vector<std::vector<int>>, timeMom,
                                    std::vector<Complex>,          data);
 };
 // Module correlating operator fields with probes built from traces of products
 // of the Fourier-transformed scalar field at given (time, momentum) points.
 // SImpl is the scalar implementation providing the field types aliased below.
 template <typename SImpl>
 class TTimeMomProbe: public Module<TimeMomProbePar>
 {
 public:
    typedef typename SImpl::Field                    Field;        // scalar field lattice
    typedef typename SImpl::SiteField::scalar_object Site;         // single-site value of the field
    typedef typename SImpl::ComplexField             ComplexField; // complex lattice (operators)
    typedef          std::vector<Complex>            SlicedOp;     // one complex value per time slice
 public:
    // constructor
    TTimeMomProbe(const std::string name);
    // destructor
    virtual ~TTimeMomProbe(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // wraps every component of v into [0, lattice extent) in place
    void vectorModulo(std::vector<int> &v);
 };
 // Registers one module per ScalarNxNAdjImplR<N> instantiation, N = 2..6, under the
 // names TimeMomProbeSU2 ... TimeMomProbeSU6 in namespace MScalarSUN.
 MODULE_REGISTER_TMP(TimeMomProbeSU2, TTimeMomProbe<ScalarNxNAdjImplR<2>>, MScalarSUN);
 MODULE_REGISTER_TMP(TimeMomProbeSU3, TTimeMomProbe<ScalarNxNAdjImplR<3>>, MScalarSUN);
 MODULE_REGISTER_TMP(TimeMomProbeSU4, TTimeMomProbe<ScalarNxNAdjImplR<4>>, MScalarSUN);
 MODULE_REGISTER_TMP(TimeMomProbeSU5, TTimeMomProbe<ScalarNxNAdjImplR<5>>, MScalarSUN);
 MODULE_REGISTER_TMP(TimeMomProbeSU6, TTimeMomProbe<ScalarNxNAdjImplR<6>>, MScalarSUN);
 /******************************************************************************
 *                        TTimeMomProbe implementation                        *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<TimeMomProbePar> base.
 template <typename SImpl>
 TTimeMomProbe<SImpl>::TTimeMomProbe(const std::string name)
 : Module<TimeMomProbePar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // Dependencies: every operator listed in the parameters, followed by the field.
 template <typename SImpl>
 std::vector<std::string> TTimeMomProbe<SImpl>::getInput(void)
 {
    std::vector<std::string> deps(par().op);

    deps.push_back(par().field);

    return deps;
 }
 // The module creates no environment object (results are only saved), so the
 // product list is empty.
 template <typename SImpl>
 std::vector<std::string> TTimeMomProbe<SImpl>::getOutput(void)
 {
    return std::vector<std::string>();
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the two temporary lattices execute() retrieves with envGetTmp:
 //   ftBuf    : ComplexField buffer receiving the FFT of each operator
 //   ftMatBuf : Field buffer receiving the FFT of the scalar field
 template <typename SImpl>
 void TTimeMomProbe<SImpl>::setup(void)
 {
    envTmpLat(ComplexField, "ftBuf");
    envTmpLat(Field, "ftMatBuf");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // NB: time is direction 0
 // Wraps each of the first env().getNd() components of v into [0, extent) in place.
 // The double modulo maps negative components to their non-negative representative.
 // NOTE(review): this relies on env().getDim() returning a signed type; with an
 // unsigned extent the first `%` would already operate on a wrapped value.
 // v must hold at least env().getNd() entries.
 template <typename SImpl>
 void TTimeMomProbe<SImpl>::vectorModulo(std::vector<int> &v)
 {
    for (unsigned int dir = 0; dir < env().getNd(); ++dir)
    {
        auto extent = env().getDim(dir);
        auto &comp  = v[dir];

        comp = ((comp % extent) + extent) % extent;
    }
 }
 // Builds, for every operator o and every probe p, the correlator returned by
 // makeTwoPoint(slicedOp[o][p], slicedProbe[p], 1/partVol) and saves all of them.
 //
 // Steps:
 //   1. parse the probe definitions and derive, per probe, the momentum assigned
 //      to the operator (minus the sum of the probe points' momentum components);
 //   2. Fourier transform the field with direction 0 (time) masked out and, for
 //      each time shift t, take the trace of the ordered product of the
 //      transformed field at the probe's points;
 //   3. Fourier transform each operator the same way and read it at (t, transfer
 //      momentum);
 //   4. combine both time series with makeTwoPoint (declared elsewhere, presumably
 //      in the included MScalarSUN/Utils.hpp) and save.
 template <typename SImpl>
 void TTimeMomProbe<SImpl>::execute(void)
 {
    const unsigned int                           nd = env().getNd();
    const unsigned int                           nt = env().getDim(0);
    double                                       partVol = 1.;
    // NOTE(review): timeMomSet is filled below but never read in this function.
    std::set<std::vector<int>>                   timeMomSet;
    std::vector<std::vector<std::vector<int>>>   timeMom;
    std::vector<std::vector<int>>                transferMom;
    FFT                                          fft(envGetGrid(Field));
    std::vector<int>                             dMask(nd, 1);
    std::vector<TimeMomProbeResult>              result;
    std::map<std::string, std::vector<SlicedOp>> slicedOp;
    std::vector<SlicedOp>                        slicedProbe;
    auto                                         &phi = envGet(Field, par().field);
    envGetTmp(ComplexField, ftBuf);
    envGetTmp(Field, ftMatBuf);
    // Mask out direction 0 (time) for the FFT calls below.
    dMask[0] = 0;
    // partVol = product of the extents of directions 1..nd-1.
    for (unsigned int mu = 1; mu < nd; ++mu)
    {
        partVol *= env().getDim(mu);
    }
    timeMom.resize(par().timeMom.size());
    for (unsigned int p = 0; p < timeMom.size(); ++p)
    {
        // Parse every "(t, p_1, ...)" string of probe p into integers.
        for (auto &tms: par().timeMom[p])
        {
            std::vector<int> tm = strToVec<int>(tms);
            timeMom[p].push_back(tm);
            timeMomSet.insert(tm);
        }
        // transferMom[p] = -(sum over the probe's points of components 1..nd-1).
        // Each parsed point is indexed up to nd-1, so it must have nd components.
        transferMom.push_back(std::vector<int>(nd - 1, 0));
        for (auto &tm: timeMom[p])
        {
            for (unsigned int j = 1; j < nd; ++j)
            {
                transferMom[p][j - 1] -= tm[j];
            }
        }
        LOG(Message) << "Probe " << p << " (" << timeMom[p].size() << " points) : " << std::endl;
        LOG(Message) << "  phi(t_i, p_i) for (t_i, p_i) in " << timeMom[p] << std::endl;
        LOG(Message) << "  operator with momentum " << transferMom[p] << std::endl;
    }
    LOG(Message) << "FFT: field '" << par().field << "'" << std::endl;
    fft.FFT_dim_mask(ftMatBuf, phi, dMask, FFT::forward);
    slicedProbe.resize(timeMom.size());
    for (unsigned int p = 0; p < timeMom.size(); ++p)
    {
        std::vector<int> qt;
        LOG(Message) << "Making probe " << p << std::endl;
        slicedProbe[p].resize(nt);
        for (unsigned int t = 0; t < nt; ++t)
        {
            // NOTE(review): acc is only assigned inside the loop below, so a probe
            // with no points would take the trace of an unassigned Site.
            Site acc;
            for (unsigned int i = 0; i < timeMom[p].size(); ++i)
            {
                Site buf;
                // Shift the point's time component by t and wrap into the lattice.
                qt     = timeMom[p][i];
                qt[0] += t;
                vectorModulo(qt);
                peekSite(buf, ftMatBuf, qt);
                // Ordered product of the field values at the probe's points.
                if (i == 0)
                {
                    acc = buf;
                }
                else
                {
                    acc *= buf;
                }
            }
            slicedProbe[p][t] = TensorRemove(trace(acc));
        }
        //std::cout << slicedProbe[p]<< std::endl;
    }
    for (auto &o: par().op)
    {
        auto &op = envGet(ComplexField, o);
        slicedOp[o].resize(transferMom.size());
        LOG(Message) << "FFT: operator '" << o << "'" << std::endl;
        fft.FFT_dim_mask(ftBuf, op, dMask, FFT::forward);
        //std::cout << ftBuf << std::endl;
        for (unsigned int p = 0; p < transferMom.size(); ++p)
        {
            // Coordinate (t, transferMom[p]) at which the transformed operator is read.
            std::vector<int> qt(nd, 0);
            for (unsigned int j = 1; j < nd; ++j)
            {
                qt[j] = transferMom[p][j - 1];
            }
            slicedOp[o][p].resize(nt);
            for (unsigned int t = 0; t < nt; ++t)
            {
                TComplex buf;
                qt[0] = t;
                // Wraps the (possibly negative) momentum components; after the first
                // pass qt is already inside the lattice.
                vectorModulo(qt);
                peekSite(buf, ftBuf, qt);
                slicedOp[o][p][t] = TensorRemove(buf);
            }
            //std::cout << ftBuf << std::endl;
            //std::cout << slicedOp[o][p] << std::endl;
        }
    }
    LOG(Message) << "Making correlators" << std::endl;
    // One result per (operator, probe) pair, operators in the outer loop.
    for (auto &o: par().op)
    for (unsigned int p = 0; p < timeMom.size(); ++p)
    {
        TimeMomProbeResult r;
        LOG(Message) << "  <" << o << " probe_" << p << ">" << std::endl;
        r.op      = o;
        r.timeMom = timeMom[p];
        r.data    = makeTwoPoint(slicedOp[o][p], slicedProbe[p], 1./partVol);
        result.push_back(r);
    }
    saveResult(par().output, "timemomprobe", result);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MScalarSUN_TimeMomProbe_hpp_
--- a/Hadrons/Modules/MScalarSUN/TrMag.hpp
+++ b/Hadrons/Modules/MScalarSUN/TrMag.hpp
@@ -124,8 +124,7 @@ void TTrMag<SImpl>::execute(void)
    std::vector<TrMagResult> result;
    auto                     &phi = envGet(Field, par().field);
-    auto m2 = sum(phi);
+    auto m2 = sum(phi), mn = m2;
    auto mn = m2;
    m2 = -m2*m2;
    mn = 1.;
--- a/Hadrons/Modules/MScalarSUN/Utils.hpp
+++ b/Hadrons/Modules/MScalarSUN/Utils.hpp
@@ -103,7 +103,7 @@ std::vector<Complex> makeTwoPoint(const std::vector<SinkSite>   &sink,
    {
        for (unsigned int t  = 0; t < nt; ++t)
        {
-            res[dt] += trace(sink[(t+dt)%nt]*adj(source[t]));
+            res[dt] += trace(sink[(t+dt)%nt]*source[t]);
        }
        res[dt] *= factor/static_cast<double>(nt);
    }
--- a/Hadrons/Modules/MSolver/A2AAslashVectors.cc
+++ b/Hadrons/Modules/MSolver/A2AAslashVectors.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MSolver/A2AAslashVectors.cc
 Copyright (C) 2015-2018
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MSolver;
 // Explicit instantiations for the FIMPL and ZFIMPL fermion implementations.
 template class Grid::Hadrons::MSolver::TA2AAslashVectors<FIMPL>;
 template class Grid::Hadrons::MSolver::TA2AAslashVectors<ZFIMPL>;
--- a/Show More
+++ b/Show More
		`@@ -1,3 +0,0 @@`
			`#include <Grid/GridCore.h>`

			`int Grid::BinaryIO::latticeWriteMaxRetry = -1;`