Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2025-10-01 23:24:43 +01:00 · 2020-04-23 04:35:42 -04:00
parent edec9ee2e2 0782b76ed4
commit c2c3cad20d
307 changed files with 4394 additions and 31968 deletions
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -35,17 +35,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/Zolotarev.h>
 #include <Grid/algorithms/approx/Chebyshev.h>
 #include <Grid/algorithms/approx/JacobiPolynomial.h>
 #include <Grid/algorithms/approx/Remez.h>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
 #include <Grid/algorithms/approx/RemezGeneral.h>
 #include <Grid/algorithms/approx/ZMobius.h>
 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 #include <Grid/algorithms/iterative/MinimalResidual.h>
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -1,3 +1,14 @@
    // blockZaxpy in blockPromote - 3s, 5%
    // noncoalesced linalg in Preconditioner ~ 3s 5%
    // Lanczos tuning or replace 10-20s ~ 25%, open ended
    // setup tuning   5s  ~  8%
    //    -- e.g. ordermin, orderstep tunables.
    // MdagM path without norm in LinOp code.     few seconds
    // Mdir calc blocking kernels
    // Fuse kernels in blockMaskedInnerProduct
    // preallocate Vectors in Cayley 5D ~ few percent few seconds
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -34,15 +45,36 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////
 // Masked block inner product.
 //
 // Forms the site-local inner product of fineX and fineY on the fine grid, multiplies
 // that field by FineMask, and hands the product to blockSum to fill CoarseInner.
 //
 //   CoarseInner : output field, filled by blockSum
 //   FineMask    : fine-grid field of the inner-product type, applied multiplicatively
 //   fineX,fineY : fine-grid operands passed to localInnerProduct
 //
 // Two fine-grid temporaries are allocated on every call (see the fusion notes below).
 // NOTE(review): blockSum is defined elsewhere; presumably it accumulates the fine sites
 // belonging to each coarse block -- confirm against its definition.
 ////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj,class CComplex>
 inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
 				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask,
 				    const Lattice<vobj> &fineX,
 				    const Lattice<vobj> &fineY)
 {
  // Per-site result type of an inner product of two vobj
  typedef decltype(innerProduct(vobj(),vobj())) dotp;
  GridBase *coarse(CoarseInner.Grid()); // not referenced again in this function
  GridBase *fine  (fineX.Grid());
  // The unmasked temporary takes its checkerboard from fineX
  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
  Lattice<dotp> fine_inner_msk(fine);
  // Multiply could be fused with innerProduct
  // Single block sum kernel could do both masks.
  fine_inner = localInnerProduct(fineX,fineY);
  mult(fine_inner_msk, fine_inner,FineMask);
  blockSum(CoarseInner,fine_inner_msk);
 }
 class Geometry {
  //    int dimension;
 public:
  int npoint;
  std::vector<int> directions   ;
  std::vector<int> displacements;
  Geometry(int _d)  {
-  
+    
    int base = (_d==5) ? 1:0;
    // make coarse grid stencil for 4d , not 5d
@@ -52,10 +84,10 @@ public:
    directions.resize(npoint);
    displacements.resize(npoint);
    for(int d=0;d<_d;d++){
-      directions[2*d  ] = d+base;
+      directions[d   ] = d+base;
-      directions[2*d+1] = d+base;
+      directions[d+_d] = d+base;
-      displacements[2*d  ] = +1;
+      displacements[d  ] = +1;
-      displacements[2*d+1] = -1;
+      displacements[d+_d]= -1;
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
@@ -63,7 +95,7 @@ public:
    //// report back
    std::cout<<GridLogMessage<<"directions    :";
    for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
-    std::cout <<std::endl;
+    std::cout<<std::endl;
    std::cout<<GridLogMessage<<"displacements :";
    for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
    std::cout<<std::endl;
@@ -115,10 +147,10 @@ public:
  void Orthogonalise(void){
    CoarseScalar InnerProd(CoarseGrid); 
-    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
    //    blockOrthogonalise(InnerProd,subspace);
    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
    //      CheckOrthogonal();
  } 
@@ -128,7 +160,7 @@ public:
    for(int i=0;i<nbasis;i++){
      blockProject(iProj,subspace[i],subspace);
      eProj=Zero(); 
-      thread_for(ss, CoarseGrid->oSites(),{
+      accelerator_for(ss, CoarseGrid->oSites(),1,{
 	eProj[ss](i)=CComplex(1.0);
      });
      eProj=eProj - iProj;
@@ -146,61 +178,9 @@ public:
  // Fill every one of the nbasis subspace vectors from the supplied parallel RNG,
  // logging norm2 of each vector as it is generated, then call Orthogonalise()
  // on the resulting set.
  //   RNG : generator handed to random() for each vector in turn
  void CreateSubspaceRandom(GridParallelRNG &RNG){
    for(int i=0;i<nbasis;i++){
      random(RNG,subspace[i]);
      std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
    }
    Orthogonalise();
  }
  /*
    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
    {
    // Run a Lanczos with sloppy convergence
    const int Nstop = nn;
    const int Nk = nn+20;
    const int Np = nn+20;
    const int Nm = Nk+Np;
    const int MaxIt= 10000;
    RealD resid = 1.0e-3;
    Chebyshev<FineField> Cheb(0.5,64.0,21);
    ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
    //	IRL.lock = 1;
    FineField noise(FineGrid); gaussian(RNG,noise);
    FineField tmp(FineGrid); 
    std::vector<RealD>     eval(Nm);
    std::vector<FineField> evec(Nm,FineGrid);
    int Nconv;
    IRL.calc(eval,evec,
    noise,
    Nconv);
    // pull back nn vectors
    for(int b=0;b<nn;b++){
    subspace[b]   = evec[b];
    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
    hermop.Op(subspace[b],tmp); 
    std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
    noise = tmp -  sqrt(eval[b])*subspace[b] ;
    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
    noise = tmp +  eval[b]*subspace[b] ;
    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
    }
    Orthogonalise();
    for(int b=0;b<nn;b++){
    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
    }
    }
  */
  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
    RealD scale;
@@ -232,54 +212,316 @@ public:
      subspace[b]   = noise;
    }
    Orthogonalise();
  }
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
+  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
  ////////////////////////////////////////////////////////////////////////////////////////////////
 #if 1
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo,
 				       int orderfilter,
 				       int ordermin,
 				       int orderstep,
 				       double filterlo
 				       ) {
    RealD scale;
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
    FineField tmp(FineGrid);
-    Chebyshev<FineField> Cheb(0.1,64.0,900);
+    // New normalised noise
    gaussian(RNG,noise);
    scale = std::pow(norm2(noise),-0.5); 
    noise=noise*scale;
    // Initial matrix element
    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
    int b =0;
    {
      // Filter
      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
      hermop.Op(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
      b++;
    }
    // Generate a full sequence of Chebyshevs
    {
      lo=filterlo;
      noise=Mn;
      FineField T0(FineGrid); T0 = noise;  
      FineField T1(FineGrid); 
      FineField T2(FineGrid);
      FineField y(FineGrid);
      FineField *Tnm = &T0;
      FineField *Tn  = &T1;
      FineField *Tnp = &T2;
      // Tn=T1 = (xscale M + mscale)in
      RealD xscale = 2.0/(hi-lo);
      RealD mscale = -(hi+lo)/(hi-lo);
      hermop.HermOp(T0,y);
      T1=y*xscale+noise*mscale;
      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
 	hermop.HermOp(*Tn,y);
 	auto y_v = y.View();
 	auto Tn_v = Tn->View();
 	auto Tnp_v = Tnp->View();
 	auto Tnm_v = Tnm->View();
 	const int Nsimd = CComplex::Nsimd();
 	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
        });
 	// Possible more fine grained control is needed than a linear sweep,
 	// but huge productivity gain if this is simple algorithm and not a tunable
 	int m =1;
 	if ( n>=ordermin ) m=n-ordermin;
 	if ( (m%orderstep)==0 ) { 
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
 	  hermop.Op(Mn,tmp); 
 	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
 	  b++;
 	}
 	// Cycle pointers to avoid copies
 	FineField *swizzle = Tnm;
 	Tnm    =Tn;
 	Tn     =Tnp;
 	Tnp    =swizzle;
      }
    }
    assert(b==nn);
  }
 #endif
 #if 0
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo,
 				       int orderfilter,
 				       int ordermin,
 				       int orderstep,
 				       double filterlo
 				       ) {
    RealD scale;
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
    FineField tmp(FineGrid);
    FineField combined(FineGrid);
-    for(int b=0;b<nn;b++){
+    // New normalised noise
-	
+    gaussian(RNG,noise);
-      gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
-      scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
      noise=noise*scale;
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+    // Initial matrix element
    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-      Cheb(hermop,noise,Mn);
+    int b =0;
 #define FILTERb(llo,hhi,oorder)						\
    {									\
      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
      Cheb(hermop,noise,Mn);						\
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
      subspace[b]   = Mn;						\
      hermop.Op(Mn,tmp);						\
      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
      b++;								\
    }									
-      scale = std::pow(norm2(Mn),-0.5); 
+    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\
      Mn=Mn*scale;
      subspace[b]   = Mn;
-      hermop.Op(Mn,noise); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(noise)<<std::endl;
+    RealD alpha=-0.8;
-
+    RealD beta =-0.8;
-    }
+#define FILTER(llo,hhi,oorder)						\
-
+    {									\
-    Orthogonalise();
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
      Cheb(hermop,noise,Mn);						\
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
      subspace[b]   = Mn;						\
      hermop.Op(Mn,tmp);						\
      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
      b++;								\
    }									
 #define FILTERc(llo,hhi,oorder)				\
    {							\
      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
      Cheb(hermop,noise,combined);			\
    }									
    double node = 0.000;
    FILTERb(lo,hi,orderfilter);// 0
    //    FILTERc(node,hi,51);// 0
    noise = Mn;
    int base = 0;
    int mult = 100;
    FILTER(node,hi,base+1*mult);
    FILTER(node,hi,base+2*mult);
    FILTER(node,hi,base+3*mult);
    FILTER(node,hi,base+4*mult);
    FILTER(node,hi,base+5*mult);
    FILTER(node,hi,base+6*mult);
    FILTER(node,hi,base+7*mult);
    FILTER(node,hi,base+8*mult);
    FILTER(node,hi,base+9*mult);
    FILTER(node,hi,base+10*mult);
    FILTER(node,hi,base+11*mult);
    FILTER(node,hi,base+12*mult);
    FILTER(node,hi,base+13*mult);
    FILTER(node,hi,base+14*mult);
    FILTER(node,hi,base+15*mult);
    assert(b==nn);
  }
 #endif
 #if 0
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo,
 				       int orderfilter,
 				       int ordermin,
 				       int orderstep,
 				       double filterlo
 				       ) {
    RealD scale;
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
    FineField tmp(FineGrid);
    FineField combined(FineGrid);
    // New normalised noise
    gaussian(RNG,noise);
    scale = std::pow(norm2(noise),-0.5); 
    noise=noise*scale;
    // Initial matrix element
    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
    int b =0;
    {						
      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
      JacobiPoly(hermop,noise,Mn);
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
      subspace[b]   = Mn;
      hermop.Op(Mn,tmp);
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
      b++;
      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
      //      subspace[b]   = tmp;      b++;
      //    }									
    }									
 #define FILTER(lambda)						\
    {								\
      hermop.HermOp(subspace[0],tmp);				\
      tmp = tmp - lambda *subspace[0];				\
      scale = std::pow(norm2(tmp),-0.5);			\
      tmp=tmp*scale;							\
      subspace[b]   = tmp;						\
      hermop.Op(subspace[b],tmp);					\
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
      b++;								\
    }									
    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
    //      subspace[b]   = tmp;      b++;
    //    }									
    FILTER(2.0e-5);
    FILTER(2.0e-4);
    FILTER(4.0e-4);
    FILTER(8.0e-4);
    FILTER(8.0e-4);
    FILTER(2.0e-3);
    FILTER(3.0e-3);
    FILTER(4.0e-3);
    FILTER(5.0e-3);
    FILTER(6.0e-3);
    FILTER(2.5e-3);
    FILTER(3.5e-3);
    FILTER(4.5e-3);
    FILTER(5.5e-3);
    FILTER(6.5e-3);
    //    FILTER(6.0e-5);//6
    //    FILTER(7.0e-5);//8
    //    FILTER(8.0e-5);//9
    //    FILTER(9.0e-5);//3
    /*
    //    FILTER(1.0e-4);//10
    FILTER(2.0e-4);//11
    //   FILTER(3.0e-4);//12
    //    FILTER(4.0e-4);//13
    FILTER(5.0e-4);//14
    FILTER(6.0e-3);//4
    FILTER(7.0e-4);//1
    FILTER(8.0e-4);//7
    FILTER(9.0e-4);//15
    FILTER(1.0e-3);//2
    FILTER(2.0e-3);//2
    FILTER(3.0e-3);//2
    FILTER(4.0e-3);//2
    FILTER(5.0e-3);//2
    FILTER(6.0e-3);//2
    FILTER(7.0e-3);//2
    FILTER(8.0e-3);//2
    FILTER(1.0e-2);//2
    */
    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
    assert(b==nn);
  }
 #endif
 };
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
 class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
 public:
-  typedef iVector<CComplex,nbasis >             siteVector;
+  typedef iVector<CComplex,nbasis >           siteVector;
  typedef Lattice<CComplex >                  CoarseComplexField;
  typedef Lattice<siteVector>                 CoarseVector;
  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-
+  typedef iMatrix<CComplex,nbasis >  Cobj;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;
@@ -293,7 +535,6 @@ public:
  CartesianStencil<siteVector,siteVector,int> Stencil; 
  std::vector<CoarseMatrix> A;
  ///////////////////////
  // Interface
@@ -305,33 +546,71 @@ public:
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());
-    RealD Nin = norm2(in);
+    //    RealD Nin = norm2(in);
    SimpleCompressor<siteVector> compressor;
    double comms_usec = -usecond();
    Stencil.HaloExchange(in,compressor);
    comms_usec += usecond();
    auto in_v = in.View();
    auto out_v = out.View();
-    thread_for(ss,Grid()->oSites(),{
+    typedef LatticeView<Cobj> Aview;
-      siteVector res = Zero();
+
-      siteVector nbr;
+    Vector<Aview> AcceleratorViewContainer;
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
    Aview *Aview_p = & AcceleratorViewContainer[0];
    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
    GridStopWatch ArithmeticTimer;
    int osites=Grid()->oSites();
    //    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
    //    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
    double usecs =-usecond();
    // assert(geom.npoint==9);
    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
      int ss = sss/nbasis;
      int b  = sss%nbasis;
      calcComplex res = Zero();
      calcVector nbr;
      int ptype;
      StencilEntry *SE;
      int lane=SIMTlane(Nsimd);
      for(int point=0;point<geom.npoint;point++){
 	SE=Stencil.GetEntry(ptype,point,ss);
-	if(SE->_is_local&&SE->_permute) { 
+	if(SE->_is_local) { 
-	  permute(nbr,in_v[SE->_offset],ptype);
+	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
 	} else if(SE->_is_local) { 
 	  nbr = in_v[SE->_offset];
 	} else {
-	  nbr = Stencil.CommBuf()[SE->_offset];
+	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
 	}
 	synchronise();
 	for(int bb=0;bb<nbasis;bb++) {
 	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 	}
 	auto A_point = A[point].View();
 	res = res + A_point[ss]*nbr;
      }
-      vstream(out_v[ss],res);
+      coalescedWrite(out_v[ss](b),res,lane);
    });
    usecs +=usecond();
    double nrm_usec=-usecond();
    RealD Nout= norm2(out);
    nrm_usec+=usecond();
    /*
        std::cout << GridLogMessage << "\tNorm        " << nrm_usec << " us" <<std::endl;
        std::cout << GridLogMessage << "\tHalo        " << comms_usec << " us" <<std::endl;
        std::cout << GridLogMessage << "\tMatrix      " << usecs << " us" <<std::endl;
        std::cout << GridLogMessage << "\t  mflop/s   " << flops/usecs<<std::endl;
        std::cout << GridLogMessage << "\t  MB/s      " << bytes/usecs<<std::endl;
    */
    return Nout;
  };
@@ -349,25 +628,54 @@ public:
      return norm2(out);
    }
  };
-
+  void MdirComms(const CoarseVector &in)
-  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
+  {
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());
    SimpleCompressor<siteVector> compressor;
    Stencil.HaloExchange(in,compressor);
-    
+  }
-    auto point = [dir, disp](){
+  void MdirCalc(const CoarseVector &in, CoarseVector &out, int point)
-      if(dir == 0 and disp == 0)
+  {
-	return 8;
+    conformable(_grid,in.Grid());
-      else
+    conformable(_grid,out.Grid());
-	return (4 * dir + 1 - disp) / 2;
+
-    }();
+    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
    Aview *Aview_p = & AcceleratorViewContainer[0];
    auto out_v = out.View();
    auto in_v  = in.View();
-    thread_for(ss,Grid()->oSites(),{
+
    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
      int ss = sss/nbasis;
      int b  = sss%nbasis;
      calcComplex res = Zero();
      calcVector nbr;
      int ptype;
      StencilEntry *SE;
      int lane=SIMTlane(Nsimd);
      SE=Stencil.GetEntry(ptype,point,ss);
      if(SE->_is_local) { 
 	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
      } else {
 	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
      }
      synchronise();
      for(int bb=0;bb<nbasis;bb++) {
 	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
      }
      coalescedWrite(out_v[ss](b),res,lane);
    });
 #if 0
    accelerator_for(ss,Grid()->oSites(),1,{
      siteVector res = Zero();
      siteVector nbr;
      int ptype;
@@ -382,16 +690,65 @@ public:
      } else {
 	nbr = Stencil.CommBuf()[SE->_offset];
      }
      synchronise();
-      auto A_point = A[point].View();
+      res = res + Aview_p[point][ss]*nbr;
      res = res + A_point[ss]*nbr;
-      vstream(out_v[ss],res);
+      out_v[ss]=res;
    });
 #endif
  }
  // Apply every non-self stencil leg to `in` after a single call to MdirComms.
  //   in  : coarse input vector
  //   out : one result per leg; out[p] receives MdirCalc(in, ., p) for p in [0, ndir),
  //         where ndir = geom.npoint-1.
  // out may hold either ndir or ndir+1 entries; when the extra entry is present it is
  // not written here. Any other size is reported on std::cout and trips assert(0).
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
    this->MdirComms(in);
    // Number of legs excluding the final stencil point (index geom.npoint-1)
    int ndir=geom.npoint-1;
    if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { 
      std::cout <<"MdirAll out size "<< out.size()<<std::endl;
      std::cout <<"MdirAll ndir "<< ndir<<std::endl;
      assert(0);
    }
    // MdirComms was already called above; each leg below is computed by MdirCalc only
    for(int p=0;p<ndir;p++){
      MdirCalc(in,out[p],p);
    }
  };
  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
    this->MdirComms(in);
    int ndim = in.Grid()->Nd();
    //////////////
    // 4D action like wilson
    // 0+ => 0 
    // 0- => 1
    // 1+ => 2 
    // 1- => 3
    // etc..
    //////////////
    // 5D action like DWF
    // 1+ => 0 
    // 1- => 1
    // 2+ => 2 
    // 2- => 3
    // etc..
    auto point = [dir, disp, ndim](){
      if(dir == 0 and disp == 0)
 	return 8;
      else if ( ndim==4 ) { 
 	return (4 * dir + 1 - disp) / 2;
      } else { 
 	return (4 * (dir-1) + 1 - disp) / 2;
      }
    }();
    MdirCalc(in,out,point);
  };
-  void Mdiag(const CoarseVector &in, CoarseVector &out){
+  void Mdiag(const CoarseVector &in, CoarseVector &out)
-    Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
+  {
    int point=geom.npoint-1;
    MdirCalc(in, out, point); // No comms
  };
@@ -401,25 +758,44 @@ public:
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-    A(geom.npoint,&CoarseGrid)
+      A(geom.npoint,&CoarseGrid)
  {
  };
  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace){
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
    typedef typename Fobj::scalar_type scalar_type;
-    FineField iblock(FineGrid); // contributions from within this block
+    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
-    FineField oblock(FineGrid); // contributions from outwith this block
+    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
    std::vector<FineComplexField> masks(geom.npoint,FineGrid);
    FineComplexField imask(FineGrid); // contributions from within this block
    FineComplexField omask(FineGrid); // contributions from outwith this block
    FineComplexField evenmask(FineGrid);
    FineComplexField oddmask(FineGrid); 
    FineField     phi(FineGrid);
    FineField     tmp(FineGrid);
    FineField     zz(FineGrid); zz=Zero();
    FineField    Mphi(FineGrid);
    FineField    Mphie(FineGrid);
    FineField    Mphio(FineGrid);
    std::vector<FineField>     Mphi_p(geom.npoint,FineGrid);
-    Lattice<iScalar<vInteger> > coor(FineGrid);
+    Lattice<iScalar<vInteger> > coor (FineGrid);
    Lattice<iScalar<vInteger> > bcoor(FineGrid);
    Lattice<iScalar<vInteger> > bcb  (FineGrid); bcb = Zero();
    CoarseVector iProj(Grid()); 
    CoarseVector oProj(Grid()); 
    CoarseVector SelfProj(Grid()); 
    CoarseComplexField iZProj(Grid()); 
    CoarseComplexField oZProj(Grid()); 
    CoarseScalar InnerProd(Grid()); 
    // Orthogonalise the subblocks over the basis
@@ -428,69 +804,117 @@ public:
    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
    int self_stencil=-1;
-    for(int p=0;p<geom.npoint;p++){ 
+    for(int p=0;p<geom.npoint;p++)
    { 
      int dir   = geom.directions[p];
      int disp  = geom.displacements[p];
      A[p]=Zero();
      if( geom.displacements[p]==0){
 	self_stencil=p;
      }
      Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
      LatticeCoordinate(coor,dir);
      ///////////////////////////////////////////////////////
      // Work out even and odd block checkerboarding for fast diagonal term
      ///////////////////////////////////////////////////////
      if ( disp==1 ) {
 	bcb   = bcb + div(coor,block);
      }
      if ( disp==0 ) {
 	  masks[p]= Zero();
      } else if ( disp==1 ) {
 	masks[p] = where(mod(coor,block)==(block-1),one,zero);
      } else if ( disp==-1 ) {
 	masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
      }
    }
    evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
    oddmask  = one-evenmask;
    assert(self_stencil!=-1);
    for(int i=0;i<nbasis;i++){
      phi=Subspace.subspace[i];
-	
+
-      std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
      linop.OpDirAll(phi,Mphi_p);
      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);
      for(int p=0;p<geom.npoint;p++){ 
 	Mphi = Mphi_p[p];
 	int dir   = geom.directions[p];
 	int disp  = geom.displacements[p];
-	Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+	if ( (disp==-1) || (!hermitian ) ) {
-	LatticeCoordinate(coor,dir);
+	  ////////////////////////////////////////////////////////////////////////
-
+	  // Pick out contributions coming from this cell and neighbour cell
-	if ( disp==0 ){
+	  ////////////////////////////////////////////////////////////////////////
-	  linop.OpDiag(phi,Mphi);
+	  omask = masks[p];
-	}
+	  imask = one-omask;
-	else  {
+	
 	  linop.OpDir(phi,Mphi,dir,disp); 
 	}
 	////////////////////////////////////////////////////////////////////////
 	// Pick out contributions coming from this cell and neighbour cell
 	////////////////////////////////////////////////////////////////////////
 	if ( disp==0 ) {
 	  iblock = Mphi;
 	  oblock = Zero();
 	} else if ( disp==1 ) {
 	  oblock = where(mod(coor,block)==(block-1),Mphi,zz);
 	  iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
 	} else if ( disp==-1 ) {
 	  oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
 	  iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
 	} else {
 	  assert(0);
 	}
 	Subspace.ProjectToSubspace(iProj,iblock);
 	Subspace.ProjectToSubspace(oProj,oblock);
 	//	  blockProject(iProj,iblock,Subspace.subspace);
 	//	  blockProject(oProj,oblock,Subspace.subspace);
 	auto iProj_v = iProj.View() ;
 	auto oProj_v = oProj.View() ;
 	auto A_p     =  A[p].View();
 	auto A_self  = A[self_stencil].View();
 	thread_for(ss, Grid()->oSites(),{
 	  for(int j=0;j<nbasis;j++){
-	    if( disp!= 0 ) {
+	    
-	      A_p[ss](j,i) = oProj_v[ss](j);
+	    blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
-	    }
+	    
-	    A_self[ss](j,i) =	A_self[ss](j,i) + iProj_v[ss](j);
+	    auto iZProj_v = iZProj.View() ;
 	    auto oZProj_v = oZProj.View() ;
 	    auto A_p     =  A[p].View();
 	    auto A_self  = A[self_stencil].View();
 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
 	    //      if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
 	    //	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
 	  }
 	}
      }
      ///////////////////////////////////////////
      // Faster alternate self coupling.. use hermiticity to save 2x
      ///////////////////////////////////////////
      {
 	mult(tmp,phi,evenmask);  linop.Op(tmp,Mphie);
 	mult(tmp,phi,oddmask );  linop.Op(tmp,Mphio);
 	{
 	  auto tmp_      = tmp.View();
 	  auto evenmask_ = evenmask.View();
 	  auto oddmask_  =  oddmask.View();
 	  auto Mphie_    =  Mphie.View();
 	  auto Mphio_    =  Mphio.View();
 	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ 
 	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
 	    });
 	}
 	blockProject(SelfProj,tmp,Subspace.subspace);
 	auto SelfProj_ = SelfProj.View();
 	auto A_self  = A[self_stencil].View();
 	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
 	  for(int j=0;j<nbasis;j++){
 	    coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
 	  }
 	});
      }
    }
    if(hermitian) {
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
      ForceHermitian();
    }
      // AssertHermitian();
      // ForceDiagonal();
  }
 #if 0
    ///////////////////////////
@@ -513,17 +937,26 @@ public:
    std::cout<<GridLogMessage<< iProj <<std::endl;
    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
-      //      ForceHermitian();
+
      // AssertHermitian();
      // ForceDiagonal();
  }
  void ForceHermitian(void) {
-    for(int d=0;d<4;d++){
+    CoarseMatrix Diff  (Grid());
-      int dd=d+1;
+    for(int p=0;p<geom.npoint;p++){
-      A[2*d] = adj(Cshift(A[2*d+1],dd,1));
+      int dir   = geom.directions[p];
      int disp  = geom.displacements[p];
      if(disp==-1) {
 	// Find the opposite link
 	for(int pp=0;pp<geom.npoint;pp++){
 	  int dirp   = geom.directions[pp];
 	  int dispp  = geom.displacements[pp];
 	  if ( (dirp==dir) && (dispp==1) ){
 	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
 	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
 	    A[pp] = adj(Cshift(A[p],dir,1));
 	  }
 	}
      }
    }
    //      A[8] = 0.5*(A[8] + adj(A[8]));
  }
  void AssertHermitian(void) {
    CoarseMatrix AA    (Grid());
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -47,6 +47,7 @@ public:
  // Support for coarsening to a multigrid
  virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
  virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
  virtual void OpDirAll  (const Field &in, std::vector<Field> &out) = 0; // Abstract base
  virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
@@ -83,6 +84,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    _Mat.MdirAll(in,out);
  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -93,8 +97,7 @@ public:
    _Mat.MdagM(in,out,n1,n2);
  }
  void HermOp(const Field &in, Field &out){
-    RealD n1,n2;
+    _Mat.MdagM(in,out);
    HermOpAndNorm(in,out,n1,n2);
  }
 };
@@ -116,6 +119,9 @@ public:
    _Mat.Mdir(in,out,dir,disp);
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
    assert(0);
@@ -154,6 +160,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    _Mat.MdirAll(in,out);
  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -162,7 +171,6 @@ public:
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    _Mat.M(in,out);
    ComplexD dot= innerProduct(in,out); n1=real(dot);
    n2=norm2(out);
  }
@@ -183,6 +191,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    _Mat.MdirAll(in,out);
  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -234,6 +245,9 @@ public:
      void OpDir  (const Field &in, Field &out,int dir,int disp) {
 	assert(0);
      }
      void OpDirAll  (const Field &in, std::vector<Field> &out){
 	assert(0);
      };
    };
    template<class Matrix,class Field>
    class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
@@ -320,9 +334,135 @@ public:
 	return axpy_norm(out,-1.0,tmp,in);
      }
    };
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Common base for Schur-preconditioned operators that are NOT Hermitian.
    // Derived classes supply Mpc / MpcDag; Op and AdjOp forward to them.
    // The Hermitian entry points and the coarsening hooks are deliberately unsupported
    // and trap with assert(0).
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    template<class Field>
    class NonHermitianSchurOperatorBase :  public LinearOperatorBase<Field> 
    {
      public:
        // Preconditioned operator and its adjoint; the RealD result is whatever the
        // derived class returns.
        virtual RealD Mpc      (const Field& in, Field& out) = 0;
        virtual RealD MpcDag   (const Field& in, Field& out) = 0;
        // out = MpcDag( Mpc(in) ); ni and no receive the two return values in that order.
        virtual void  MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) {
          Field tmp(in.Grid());
          tmp.Checkerboard() = in.Checkerboard();
          ni = Mpc(in,tmp);
          no = MpcDag(tmp,out);
        }
        // Not meaningful for a non-Hermitian operator
        virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
          assert(0);
        }
        virtual void HermOp(const Field& in, Field& out) {
          assert(0);
        }
        void Op(const Field& in, Field& out) {
          Mpc(in, out);
        }
        void AdjOp(const Field& in, Field& out) { 
          MpcDag(in, out);
        }
        // Support for coarsening to a multigrid
        void OpDiag(const Field& in, Field& out) {
          assert(0); // must coarsen the unpreconditioned system
        }
        void OpDir(const Field& in, Field& out, int dir, int disp) {
          assert(0);
        }
        // LinearOperatorBase declares OpDirAll pure virtual; without this override every
        // class derived from this base would remain abstract.
        void OpDirAll(const Field& in, std::vector<Field>& out) {
          assert(0);
        }
    };
    // Non-Hermitian Schur operator built from Mooee, MooeeInv and Meooe of the wrapped matrix.
    // NOTE(review): presumably axpy_norm(r,a,x,y) forms r = a*x + y and returns its norm,
    // which would make Mpc = Mooee - Meooe MooeeInv Meooe -- confirm against axpy_norm.
    template<class Matrix, class Field>
    class NonHermitianSchurDiagMooeeOperator :  public NonHermitianSchurOperatorBase<Field> 
    {
      public:
        Matrix& _Mat;
        NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat) {}
        virtual RealD Mpc(const Field& in, Field& out) {
          Field scratch(in.Grid());
          // the hopping term lands on the opposite checkerboard
          scratch.Checkerboard() = !in.Checkerboard();
          _Mat.Meooe   (in,      scratch);
          _Mat.MooeeInv(scratch, out);
          _Mat.Meooe   (out,     scratch);
          _Mat.Mooee   (in,      out);
          const RealD result = axpy_norm(out, -1.0, scratch, out);
          return result;
        }
        virtual RealD MpcDag(const Field& in, Field& out) {
          Field scratch(in.Grid());
          _Mat.MeooeDag   (in,      scratch);
          _Mat.MooeeInvDag(scratch, out);
          _Mat.MeooeDag   (out,     scratch);
          _Mat.MooeeDag   (in,      out);
          const RealD result = axpy_norm(out, -1.0, scratch, out);
          return result;
        }
    };
    // Non-Hermitian Schur operator: Mpc chains Meooe, MooeeInv, Meooe, MooeeInv on the input
    // and combines the result with the input through axpy_norm(out,-1.0,work,in).
    // MpcDag applies the daggered pieces in the reverse order.
    template<class Matrix,class Field>
    class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field> 
    {
      protected:
        Matrix &_Mat;
      public:
        NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat) {}
        virtual RealD Mpc(const Field& in, Field& out) {
          Field work(in.Grid());
          _Mat.Meooe   (in,   out);
          _Mat.MooeeInv(out,  work);
          _Mat.Meooe   (work, out);
          _Mat.MooeeInv(out,  work);
          const RealD result = axpy_norm(out, -1.0, work, in);
          return result;
        }
        virtual RealD MpcDag(const Field& in, Field& out) {
          Field work(in.Grid());
          _Mat.MooeeInvDag(in,   out);
          _Mat.MeooeDag   (out,  work);
          _Mat.MooeeInvDag(work, out);
          _Mat.MeooeDag   (out,  work);
          const RealD result = axpy_norm(out, -1.0, work, in);
          return result;
        }
    };
    // Non-Hermitian Schur operator: Mpc chains MooeeInv, Meooe, MooeeInv, Meooe on the input
    // and combines the result with the input through axpy_norm(out,-1.0,work,in).
    // MpcDag applies the daggered pieces in the reverse order.
    template<class Matrix, class Field>
    class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field> 
    {
      protected:
        Matrix& _Mat;
      public:
        NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat) {}
        virtual RealD Mpc(const Field& in, Field& out) {
          Field work(in.Grid());
          _Mat.MooeeInv(in,   out);
          _Mat.Meooe   (out,  work);
          _Mat.MooeeInv(work, out);
          _Mat.Meooe   (out,  work);
          const RealD result = axpy_norm(out, -1.0, work, in);
          return result;
        }
        virtual RealD MpcDag(const Field& in, Field& out) {
          Field work(in.Grid());
          _Mat.MeooeDag   (in,   out);
          _Mat.MooeeInvDag(out,  work);
          _Mat.MeooeDag   (work, out);
          _Mat.MooeeInvDag(out,  work);
          const RealD result = axpy_norm(out, -1.0, work, in);
          return result;
        }
    };
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
-    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
+    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -45,8 +45,13 @@ public:
    ni=M(in,tmp);
    no=Mdag(tmp,out);
  }
  virtual void  MdagM(const Field &in, Field &out) {
    RealD ni, no;
    MdagM(in,out,ni,no);
  }
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -56,12 +61,12 @@ template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrix
 public:
  virtual GridBase *RedBlackGrid(void)=0;
-      //////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////
-      // Query the even even properties to make algorithmic decisions
+  // Query the even even properties to make algorithmic decisions
-      //////////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////////
-      virtual RealD  Mass(void)        { return 0.0; };
+  virtual RealD  Mass(void)        { return 0.0; };
-      virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
+  virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
-      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+  virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
  // half checkerboard operations
  virtual  void Meooe    (const Field &in, Field &out)=0;
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -94,6 +94,24 @@ public:
    Coeffs.assign(0.,order);
    Coeffs[order-1] = 1.;
  };
  // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
  // Similar kick effect below the threshold as Lanczos filter approach
  // Store the window [_lo,_hi] and the order, then fill Coeffs with
  //   Coeffs[j] = (2/order) * cos( j*pi*(order-0.5)/order ),  j = 0..order-1.
  // Terminates the process via exit(-1) when _order < 2.
  void InitLowPass(RealD _lo,RealD _hi,int _order)
  {
    lo    = _lo;
    hi    = _hi;
    order = _order;
    if ( order < 2 ) {
      exit(-1);
    }
    Coeffs.resize(order);
    const RealD k = (order-1.0);   // loop invariant
    int j = 0;
    while ( j < order ) {
      const RealD s = std::cos( j*M_PI*(k+0.5)/order );
      Coeffs[j] = s * 2.0/order;
      ++j;
    }
  };
  void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
  {
@@ -234,20 +252,20 @@ public:
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y);
-    T1=y*xscale+in*mscale;
+    axpby(T1,xscale,mscale,y,in);
    // sum = .5 c[0] T0 + c[1] T1
-    out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
+    //    out = ()*T0 + Coeffs[1]*T1;
    axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
    for(int n=2;n<order;n++){
      Linop.HermOp(*Tn,y);
-
+      //     y=xscale*y+mscale*(*Tn);
-      y=xscale*y+mscale*(*Tn);
+      //      *Tnp=2.0*y-(*Tnm);
-
+      //      out=out+Coeffs[n]* (*Tnp);
-      *Tnp=2.0*y-(*Tnm);
+      axpby(y,xscale,mscale,y,(*Tn));
-
+      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
-      out=out+Coeffs[n]* (*Tnp);
+      axpy(out,Coeffs[n],*Tnp,out);
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
--- a/Grid/algorithms/approx/JacobiPolynomial.h
+++ b/Grid/algorithms/approx/JacobiPolynomial.h
@@ -0,0 +1,129 @@
 #ifndef GRID_JACOBIPOLYNOMIAL_H
 #define GRID_JACOBIPOLYNOMIAL_H
 #include <Grid/algorithms/LinearOperator.h>
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////
 // Jacobi polynomial P_order^{(alpha,beta)} as an operator function.
 // The argument is mapped linearly so that [lo,hi] corresponds to [-1,1]:
 //      y = ( x - 0.5*(hi+lo) ) / ( 0.5*(hi-lo) )
 // and the polynomial is built with the three-term recurrence
 //      cnp P_n = ( cny*y + cn1 ) P_{n-1} + cnm P_{n-2}
 // approx() evaluates it on a scalar; operator() applies it to a field through
 // Linop.HermOp. Both use the same coefficients.
 ////////////////////////////////////////////////////////////////////////////////////////
 template<class Field>
 class JacobiPolynomial : public OperatorFunction<Field> {
 private:
  using OperatorFunction<Field>::operator();
  int order;
  RealD hi;
  RealD lo;
  RealD alpha;
  RealD beta;
 public:
  // Tabulate "x approx(x)" over the stored window [lo,hi]
  void csv(std::ostream &out){
    csv(out,lo,hi);
  }
  // Tabulate "x approx(x)" from llo-delta up to hhi in steps of delta = 1e-5*(hhi-llo)
  void csv(std::ostream &out,RealD llo,RealD hhi){
    RealD diff = hhi-llo;
    RealD delta = diff*1.0e-5;
    for (RealD x=llo-delta; x<=hhi; x+=delta) {
      RealD f = approx(x);
      out<< x<<" "<<f <<std::endl;
    }
  }
  // NOTE: the default constructor leaves every parameter unset
  JacobiPolynomial(){};
  JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
  {
      lo=_lo;
      hi=_hi;
      alpha=_alpha;
      beta=_beta;
      order=_order;
  };
  RealD approx(RealD x) // Convenience for plotting the approximation
  {
    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
    RealD Tnm=1.0;                                       // P_0
    if ( order < 1 ) return Tnm;                         // previously returned an uninitialised value
    RealD Tn =(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;   // P_1
    for(int n=2;n<=order;n++){
      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
      RealD Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
      Tnm=Tn;
      Tn =Tnp;
    }
    return Tn; // highest order term computed (P_1 when order==1)
  };
  // Implement the required interface: out = P_order( mapped Linop ) in
  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
    GridBase *grid=in.Grid();
    Field T0(grid);
    Field T1(grid);
    Field T2(grid);
    Field y(grid);
    Field *Tnm = &T0;
    Field *Tn  = &T1;
    Field *Tnp = &T2;
    //    RealD T0=1.0;
    T0=in;
    if ( order < 1 ) { // P_0 is the identity
      out=T0;
      return;
    }
    //    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
    //           = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y); // applied once; the duplicated call only recomputed the same y
    y=y*xscale+in*mscale;
    // RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
    RealD halfAmB  = (alpha-beta)*0.5;
    RealD halfApBp2= (alpha+beta+2.0)*0.5;
    T1 = halfAmB * in + halfApBp2*y;
    for(int n=2;n<=order;n++){
      Linop.HermOp(*Tn,y);
      y=xscale*y+mscale*(*Tn);
      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
      //      Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
      // Each coefficient is normalised by cnp exactly once, as in approx()
      cny=cny/cnp;
      cn1=cn1/cnp;
      cnm=cnm/cnp;
      *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
      Tn     =Tnp;
      Tnp    =swizzle;
    }
    // After the rotation the newest polynomial lives in *Tn; *Tnp is recycled scratch
    out=*Tn;
  }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/approx/RemezGeneral.cc
+++ b/Grid/algorithms/approx/RemezGeneral.cc
@@ -0,0 +1,473 @@
 #include<math.h>
 #include<stdio.h>
 #include<stdlib.h>
 #include<string>
 #include<iostream>
 #include<iomanip>
 #include<cassert>
 #include<Grid/algorithms/approx/RemezGeneral.h>
 // Constructor
 // Store the target function, its user data, the approximation interval and the
 // arithmetic precision; the polynomial degrees start at zero until generateApprox runs.
 // The initialisers are listed in member declaration order, which is the order the
 // language applies them in regardless of how they are written.
 AlgRemezGeneral::AlgRemezGeneral(double lower, double upper, long precision,
 				 bigfloat (*f)(bigfloat x, void *data), void *data)
   : f(f),
     data(data),
     n(0), d(0), pow_n(0), pow_d(0),
     apstrt(lower), apwidt(upper - lower), apend(upper),
     prec(precision)
 {
   bigfloat::setDefaultPrecision(prec);
   std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
   std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
 }
 //Determine the properties of the numerator and denominator polynomials
 // Record the degrees and parity restrictions of numerator and denominator and build
 // the power -> parameter-slot maps (num_pows / den_pows, -1 marks an absent power).
 // On exit n and d hold (number of terms present) - 1 for numerator and denominator.
 void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in){
   pow_n = num_degree;
   pow_d = den_degree;
   // The highest power must be compatible with the requested parity
   if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
   if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
   if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
   if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
   num_type = num_type_in;
   den_type = den_type_in;
   num_pows.resize(pow_n+1);
   den_pows.resize(pow_d+1);
   // Number the powers 0..degree that the parity type allows, lowest power first;
   // returns how many powers were kept.
   auto assignSlots = [](std::vector<int> &pows, int degree, PolyType type) -> int {
     const bool keepOdd  = (type == PolyType::Full) || (type == PolyType::Odd);
     const bool keepEven = (type == PolyType::Full) || (type == PolyType::Even);
     int nterms = 0;
     for(int p=0; p<=degree; ++p){
       const bool keep = (p % 2 == 0) ? keepEven : keepOdd;
       pows[p] = keep ? nterms++ : -1;
     }
     return nterms;
   };
   const int num_terms = assignSlots(num_pows, pow_n, num_type);
   std::cout << num_terms << " terms in numerator" << std::endl;
   const int den_terms = assignSlots(den_pows, pow_d, den_type);
   std::cout << den_terms << " terms in denominator" << std::endl;
   //power is 1 less than the number of terms, eg  pow=1   a x^1  + b x^0
   n = num_terms - 1;
   d = den_terms - 1;
 }
 //Setup algorithm
 // Reset the iteration state and size every work array for the current n and d,
 // then seed the extrema/zeros (initialGuess) and the search steps (stpini).
 void AlgRemezGeneral::reinitializeAlgorithm(){
   iter   = 0;
   spread = 1.0e37;
   //not n+d+2 because highest-power term in denominator is fixed to 1
   neq = n + d + 1;
   // Solution vector and linear-system temporaries
   param.resize(neq);
   A.resize(neq*neq);
   B.resize(neq);
   IPS.resize(neq);
   // Error extrema (mm, yy), zeros (xx) and search steps
   yy.resize(neq+1);
   mm.resize(neq+1);
   xx.resize(neq+2);
   step.resize(neq+1);
   // stpini reads the xx values produced by initialGuess, so the order matters here
   initialGuess();
   stpini();
 }
 // Run the Remez iteration for a rational approximation with the given numerator and
 // denominator degrees / parity types.
 //   _tolerance  : iteration stops once the relative spread of the error extrema drops to it
 //   report_freq : print progress every report_freq iterations; 0 disables the reports
 // Returns the error measured at the first extremum after convergence.
 // Asserts if the step scale delta falls below the tolerance before convergence.
 double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degree, 
 				       const PolyType num_type_in, const PolyType den_type_in, 
 				       const double _tolerance, const int report_freq){
   //Setup the properties of the polynomial
   setupPolyProperties(num_degree, den_degree, num_type_in, den_type_in);
   //Setup the algorithm
   reinitializeAlgorithm();
   bigfloat tolerance = _tolerance;
   //Iterate until convergence
   while (spread > tolerance) { 
     // Guard the modulo: report_freq == 0 used to divide by zero
     if (report_freq != 0 && iter % report_freq == 0)
       std::cout<<"Iteration " <<iter<<" spread "<<(double)spread<<" delta "<<(double)delta << std::endl; 
     iter++;
     equations();
     if (delta < tolerance) {
       std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
       assert(0);
     }
     search();
   }
   int sign;
   double error = (double)getErr(mm[0],&sign);
   std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
   // Return the maximum error in the approximation
   return error;
 }
 // Initial values of maximal and minimal errors
 void AlgRemezGeneral::initialGuess(){
  // Supply initial guesses for solution points
  long ncheb = neq;			// Degree of Chebyshev error estimate
  // Find ncheb+1 extrema of Chebyshev polynomial
  bigfloat a = ncheb;
  bigfloat r;
  mm[0] = apstrt;
  for (long i = 1; i < ncheb; i++) {
    r = 0.5 * (1 - cos((M_PI * i)/(double) a));
    //r *= sqrt_bf(r);
    r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
    mm[i] = apstrt + r * apwidt;
  }
  mm[ncheb] = apend;
  a = 2.0 * ncheb;
  for (long i = 0; i <= ncheb; i++) {
    r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a));
    //r *= sqrt_bf(r); // Squeeze to low end of interval
    r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
    xx[i] = apstrt + r * apwidt;
  }
 }
 // Initialise step sizes
 // Initial search steps: the spacing between consecutive zeros, with the first step
 // measured from the lower bound and the last one duplicated. Also resets delta.
 void AlgRemezGeneral::stpini(){
   delta = 0.25;
   xx[neq+1] = apend;
   step[0] = xx[0] - apstrt;
   int i = 1;
   while (i < neq) {
     step[i] = xx[i] - xx[i-1];
     ++i;
   }
   step[neq] = step[neq-1];
 }
 // Search for error maxima and minima
 // One search sweep of the Remez iteration.
 //  1. For each of the neq+1 intervals between consecutive zeros, walk from the current
 //     extremum mm[i] in steps of step[i] (at most 10 steps, never crossing the
 //     bracketing zeros xx0/xx1) while the error magnitude does not decrease; store the
 //     resulting position in mm[i] and error in yy[i].
 //  2. Update spread = (largest - smallest extremal error), divided by the smallest
 //     when that is non-zero; halve delta if the spread did not shrink.
 //  3. Recompute the step sizes from the ratios of neighbouring extremal errors.
 //  4. Shift the zeros xx[i], keeping each one between its neighbouring extrema.
 void AlgRemezGeneral::search(){
   bigfloat a, q, xm, ym, xn, yn, xx1;
   int emsign, ensign, steps;
   int meq = neq + 1;          // number of extrema
   bigfloat eclose = 1.0e30;   // smallest extremal error seen in this sweep
   bigfloat farther = 0l;      // largest extremal error seen in this sweep
   bigfloat xx0 = apstrt;      // left boundary of the current interval
   for (int i = 0; i < meq; i++) {
     steps = 0;
     xx1 = xx[i]; // Next zero
     if (i == meq-1) xx1 = apend;
     xm = mm[i];
     ym = getErr(xm,&emsign);
     q = step[i];
     xn = xm + q;
     if (xn < xx0 || xn >= xx1) {	// Cannot skip over adjacent boundaries
       q = -q;
       xn = xm;
       yn = ym;
       ensign = emsign;
     } else {
       yn = getErr(xn,&ensign);
       // The trial step made things worse: reverse direction and restart from xm
       if (yn < ym) {
 	q = -q;
 	xn = xm;
 	yn = ym;
 	ensign = emsign;
       }
     }
     while(yn >= ym) {		// March until error becomes smaller.
       if (++steps > 10)
       	break;
       ym = yn;
       xm = xn;
       emsign = ensign;
       a = xm + q;
       if (a == xm || a <= xx0 || a >= xx1)
 	break;// Must not skip over the zeros either side.      
       xn = a;
       yn = getErr(xn,&ensign);
     }
     mm[i] = xm;			// Position of maximum
     yy[i] = ym;			// Value of maximum
     if (eclose > ym) eclose = ym;
     if (farther < ym) farther = ym;
     xx0 = xx1; // Walk to next zero.
   } // end of search loop
   q = (farther - eclose);	// Decrease step size if error spread increased
   if (eclose != 0.0) q /= eclose; // Relative error spread
   if (q >= spread)
     delta *= 0.5; // Spread is increasing; decrease step size
   spread = q;
   // New step for each zero: (ratio of neighbouring errors - 1), capped at 0.25,
   // times the distance between the neighbouring extrema, times delta
   for (int i = 0; i < neq; i++) {
     q = yy[i+1];
     if (q != 0.0) q = yy[i] / q  - (bigfloat)1l;
     else q = 0.0625;
     if (q > (bigfloat)0.25) q = 0.25;
     q *= mm[i+1] - mm[i];
     step[i] = q * delta;
   }
   step[neq] = step[neq-1];
   for (int i = 0; i < neq; i++) {	// Insert new locations for the zeros.
     xm = xx[i] - step[i];
     // A proposal outside the approximation interval is discarded (zero left where it is)
     if (xm <= apstrt)
       continue;
     if (xm >= apend)
       continue;
     // A proposal that crosses a neighbouring extremum is pulled back to the midpoint
     if (xm <= mm[i])
       xm = (bigfloat)0.5 * (mm[i] + xx[i]);    
     if (xm >= mm[i+1])
       xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
     xx[i] = xm;
   }
 }
 // Solve the equations
 // Build and solve the linear system that makes P(x)/Q(x) interpolate the target
 // function at the current zeros xx[0..neq-1]:
 //    sum_j p_j x^j  -  f(x) * sum_{k<pow_d} q_k x^k  =  f(x) * x^pow_d
 // Only the powers present in num_pows / den_pows get a column; the leading
 // denominator coefficient is fixed to one, which produces the right-hand side.
 // The solution is left in param by simq(). Terminates the process with a failure
 // status if the system cannot be solved.
 void AlgRemezGeneral::equations(){
   bigfloat x, y, z;
   bigfloat *aa;
   for (int i = 0; i < neq; i++) {	// set up the equations for solution by simq()
     int ip = neq * i;		// offset to 1st element of this row of matrix
     x = xx[i];			// the guess for this row
     y = func(x);		// right-hand-side vector
     z = (bigfloat)1l;
     aa = A.data()+ip;
     // Numerator columns: x^j for every power that is present
     int t = 0;
     for (int j = 0; j <= pow_n; j++) {
       if(num_pows[j] != -1){ *aa++ = z; t++; }
       z *= x;
     }
     assert(t == n+1);
     // Denominator columns: -f(x) x^j for every present power below the leading one
     z = (bigfloat)1l;
     t = 0;
     for (int j = 0; j < pow_d; j++) {
       if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
       z *= x;
     }
     assert(t == d);
     B[i] = y * z;		// Right hand side vector
   }
   // Solve the simultaneous linear equations.
   if (simq()){
     std::cout<<"simq failed\n";
     exit(EXIT_FAILURE); // was exit(0), which reported success to the caller's environment
   }
 }
 // Evaluate the rational form P(x)/Q(x) using coefficients
 // from the solution vector param
 // Horner evaluation of numerator and denominator. Parameters are consumed from the
 // highest present power downwards; absent powers contribute a zero coefficient.
 bigfloat AlgRemezGeneral::approx(const bigfloat x) const{
   // Numerator: slots param[0..n]
   int slot = n;
   bigfloat numer = param[slot--];		// Highest order numerator coefficient
   for (int p = pow_n-1; p >= 0; p--) {
     bigfloat coeff(0l);
     if (num_pows[p] != -1) coeff = param[slot--];
     numer = x * numer  +  coeff;
   }
   // Denominator: slots param[n+1..n+d]
   slot = n+d;
   bigfloat denom = 1l; //Highest degree coefficient is 1.0
   for (int p = pow_d-1; p >= 0; p--) {
     bigfloat coeff(0l);
     if (den_pows[p] != -1) coeff = param[slot--];
     denom = x * denom  +  coeff;
   }
   return(numer/denom);
 }
 // Compute size and sign of the approximation error at x
 // Returns |approx(x) - f(x)|, divided by f(x) when f(x) is non-zero before the
 // magnitude is taken, and writes the sign of the signed error (+1 or -1) to *sign.
 bigfloat AlgRemezGeneral::getErr(bigfloat x, int *sign) const{
   bigfloat target = func(x);
   bigfloat err = approx(x) - target;
   if (target != 0) err /= target;
   if (err < (bigfloat)0.0) {
     *sign = -1;
     return(-err);
   }
   *sign = 1;
   return(err);
 }
 // Solve the system AX=B
 // Solve the neq x neq system A X = B, with X = param, by Gaussian elimination with
 // scaled partial pivoting. A is row-major and is overwritten by its factorisation;
 // rows are never moved, the permutation is tracked in IPS.
 // Returns 0 on success, or
 //   1 : a row of A is entirely zero
 //   2 : no non-zero pivot candidate found in some column
 //   3 : the last pivot is zero
 int AlgRemezGeneral::simq(){
   int ip, ipj, ipk, ipn;
   int idxpiv;
   int kp, kp1, kpk, kpn;
   int nip, nkp;
   bigfloat em, q, rownrm, big, size, pivot, sum;
   bigfloat *aa;
   bigfloat *X = param.data();
   int n = neq;   // NB: local n is the system size and shadows the member n
   int nm1 = n - 1;
   // Initialize IPS and X
   // X temporarily holds 1/(largest magnitude in each row), used to scale pivot candidates
   int ij = 0;
   for (int i = 0; i < n; i++) {
     IPS[i] = i;
     rownrm = 0.0;
     for(int j = 0; j < n; j++) {
       q = abs_bf(A[ij]);
       if(rownrm < q) rownrm = q;
       ++ij;
     }
     if (rownrm == (bigfloat)0l) {
       std::cout<<"simq rownrm=0\n";
       return(1);
     }
     X[i] = (bigfloat)1.0 / rownrm;
   }
   // Elimination: for each column k pick the row with the largest scaled entry
   for (int k = 0; k < nm1; k++) {
     big = 0.0;
     idxpiv = 0;
     for (int i = k; i < n; i++) {
       ip = IPS[i];
       ipk = n*ip + k;
       size = abs_bf(A[ipk]) * X[ip];
       if (size > big) {
 	big = size;
 	idxpiv = i;
       }
     }
     if (big == (bigfloat)0l) {
       std::cout<<"simq big=0\n";
       return(2);
     }
     // Swap the permutation entries rather than the rows themselves
     if (idxpiv != k) {
       int j = IPS[k];
       IPS[k] = IPS[idxpiv];
       IPS[idxpiv] = j;
     }
     kp = IPS[k];
     kpk = n*kp + k;
     pivot = A[kpk];
     kp1 = k+1;
     for (int i = kp1; i < n; i++) {
       ip = IPS[i];
       ipk = n*ip + k;
       em = -A[ipk] / pivot;
       A[ipk] = -em;   // keep the multiplier in place of the eliminated entry
       nip = n*ip;
       nkp = n*kp;
       aa = A.data()+nkp+kp1;
       for (int j = kp1; j < n; j++) {
 	ipj = nip + j;
 	A[ipj] = A[ipj] + em * *aa++;
       }
     }
   }
   kpn = n * IPS[n-1] + n - 1;	// last element of IPS[n] th row
   if (A[kpn] == (bigfloat)0l) {
     std::cout<<"simq A[kpn]=0\n";
     return(3);
   }
   // Forward substitution using the stored multipliers (X is reused for the result)
   ip = IPS[0];
   X[0] = B[ip];
   for (int i = 1; i < n; i++) {
     ip = IPS[i];
     ipj = n * ip;
     sum = 0.0;
     for (int j = 0; j < i; j++) {
       sum += A[ipj] * X[j];
       ++ipj;
     }
     X[i] = B[ip] - sum;
   }
   // Back substitution through the upper triangle
   ipn = n * IPS[n-1] + n - 1;
   X[n-1] = X[n-1] / A[ipn];
   for (int iback = 1; iback < n; iback++) {
     //i goes (n-2),...,0
     int i = nm1 - iback;
     ip = IPS[i];
     nip = n*ip;
     sum = 0.0;
     aa = A.data()+nip+i+1;
     for (int j= i + 1; j < n; j++) 
       sum += *aa++ * X[j];
     X[i] = (X[i] - sum) / A[nip+i];
   }
   return(0);
 }
 void AlgRemezGeneral::csv(std::ostream & os) const{
  os << "Numerator" << std::endl;
  for(int i=0;i<=pow_n;i++){
    os << getCoeffNum(i) << "*x^" << i;
    if(i!=pow_n) os << " + ";
  }
  os << std::endl;
  os << "Denominator" << std::endl;
  for(int i=0;i<=pow_d;i++){
    os << getCoeffDen(i) << "*x^" << i;
    if(i!=pow_d) os << " + ";
  }
  os << std::endl;
  //For a true minimax solution the errors should all be equal and the signs should oscillate +-+-+- etc
  int sign;
  os << "Errors at maxima: coordinate, error, (sign)" << std::endl;
  for(int i=0;i<neq+1;i++){ 
    os << mm[i] << " " << getErr(mm[i],&sign) << " (" << sign << ")" << std::endl;
  }
  os << "Scan over range:" << std::endl;
  int npt = 60;
  bigfloat dlt = (apend - apstrt)/bigfloat(npt-1);
  for (bigfloat x=apstrt; x<=apend; x = x + dlt) {
    double f = evaluateFunc(x);
    double r = evaluateApprox(x);
    os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
  }
  return;
 }
--- a/Grid/algorithms/approx/RemezGeneral.h
+++ b/Grid/algorithms/approx/RemezGeneral.h
@@ -0,0 +1,170 @@
 /*
  C.Kelly Jan 2020 based on implementation by M. Clark May 2005
  AlgRemezGeneral is an implementation of the Remez algorithm for approximating an arbitrary function by a rational polynomial 
  It includes optional restriction to odd/even polynomials for the numerator and/or denominator
 */
 #ifndef INCLUDED_ALG_REMEZ_GENERAL_H
 #define INCLUDED_ALG_REMEZ_GENERAL_H
 #include <stddef.h>
 #include <Grid/GridStd.h>
 #ifdef HAVE_LIBGMP
 #include "bigfloat.h"
 #else
 #include "bigfloat_double.h"
 #endif
 // Remez exchange algorithm for a rational approximation P(x)/Q(x) to a user supplied
 // function on [apstrt, apend]. Numerator and denominator may each be restricted to
 // even or odd powers. The denominator's leading coefficient is fixed to 1.
 class AlgRemezGeneral{
 public:
   // Which powers a polynomial may contain
   enum PolyType { Even, Odd, Full };
 private:
   // In GSL-style, pass the function as a function pointer. Any data required to evaluate the function is passed in as a void pointer
   bigfloat (*f)(bigfloat x, void *data);
   void *data;
   // The approximation parameters: numerator coefficients in slots [0,n],
   // followed by the free denominator coefficients in slots [n+1,n+d]
   std::vector<bigfloat> param;
   bigfloat norm;
   // One less than the number of non-zero terms in the numerator and denominator
   // (see setupPolyProperties)
   int n, d;
   // The numerator and denominator degree (i.e.  the largest power)
   int pow_n, pow_d;
   // Specify if the numerator and/or denominator are odd/even polynomials
   PolyType num_type;
   PolyType den_type;
   std::vector<int> num_pows; //contains the mapping power -> term index, with -1 if not present
   std::vector<int> den_pows;
   // The bounds of the approximation
   bigfloat apstrt, apwidt, apend;
   // Variables used to calculate the approximation
   int nd1, iter;
   std::vector<bigfloat> xx;   // current zeros of the error
   std::vector<bigfloat> mm;   // current extrema of the error
   std::vector<bigfloat> step; // search step per extremum
   bigfloat delta, spread;
   // Variables used in search: error value at each extremum
   std::vector<bigfloat> yy;
   // Variables used in solving linear equations
   std::vector<bigfloat> A;    // neq x neq matrix, row-major
   std::vector<bigfloat> B;    // right hand side
   std::vector<int> IPS;       // row permutation used by simq
   // The number of equations we must solve at each iteration (n+d+1)
   int neq;
   // The precision of the GNU MP library
   long prec;
   // Initialize member variables associated with the polynomial's properties
   void setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in);
   // Initial values of maximal and minimal errors
   void initialGuess();
   // Initialise step sizes
   void stpini();
   // Initialize the algorithm
   void reinitializeAlgorithm();
   // Solve the equations
   void equations();
   // Search for error maxima and minima
   void search(); 
   // Calculate function required for the approximation
   inline bigfloat func(bigfloat x) const{
     return f(x, data);
   }
   // Compute size and sign of the approximation error at x
   bigfloat getErr(bigfloat x, int *sign) const;
   // Solve the system AX=B   where X = param
   int simq();
   // Evaluate the rational form P(x)/Q(x) using coefficients from the solution vector param
   bigfloat approx(bigfloat x) const;
 public:
   // lower/upper: approximation interval; prec: arithmetic precision handed to bigfloat;
   // f/data: target function and the opaque pointer passed back to it on every call
   AlgRemezGeneral(double lower, double upper, long prec,
 		  bigfloat (*f)(bigfloat x, void *data), void *data);
   // Only valid when numerator and denominator have the same number of terms (asserts n==d)
   inline int getDegree(void) const{ 
     assert(n==d);
     return n;
   }
   // Reset the bounds of the approximation
   inline void setBounds(double lower, double upper) {
     apstrt = lower;
     apend = upper;
     apwidt = apend - apstrt;
   }
   // Get the bounds of the approximation
   inline void getBounds(double &lower, double &upper) const{ 
     lower=(double)apstrt;
     upper=(double)apend;
   }
   // Run the algorithm to generate the rational approximation
   // Returns the approximation error found at convergence
   double generateApprox(int num_degree, int den_degree, 
 			PolyType num_type, PolyType den_type,
 			const double tolerance = 1e-15, const int report_freq = 1000);
   // Convenience overload: unrestricted (Full) numerator and denominator
   inline double generateApprox(int num_degree, int den_degree, 
 			       const double tolerance = 1e-15, const int report_freq = 1000){
     return generateApprox(num_degree, den_degree, Full, Full, tolerance, report_freq);
   }
   // Evaluate the rational form P(x)/Q(x) using coefficients from the
   // solution vector param
   inline double evaluateApprox(double x) const{
     return (double)approx((bigfloat)x);
   }
   // Evaluate the rational form Q(x)/P(x) using coefficients from the solution vector param
   inline double evaluateInverseApprox(double x) const{
     return 1.0/(double)approx((bigfloat)x);
   }  
   // Calculate function required for the approximation
   inline double evaluateFunc(double x) const{
     return (double)func((bigfloat)x);
   }
   // Calculate the reciprocal of the function required for the approximation
   inline double evaluateInverseFunc(double x) const{
     return 1.0/(double)func((bigfloat)x);
   }
   // Dump csv of function, approx and error
   void csv(std::ostream &os = std::cout) const;
   // Get the coefficient of the term x^i in the numerator (0 for an absent power);
   // i is not range checked and must lie in [0,pow_n]
   inline double getCoeffNum(const int i) const{    
     return num_pows[i] == -1 ? 0. : double(param[num_pows[i]]);
   }
   // Get the coefficient of the term x^i in the denominator (1 for the leading power,
   // 0 for an absent power); i is not range checked and must lie in [0,pow_d]
   inline double getCoeffDen(const int i) const{ 
     if(i == pow_d) return 1.0;
     else return den_pows[i] == -1 ? 0. : double(param[den_pows[i]+n+1]); 
   }
 };
 #endif
--- a/Grid/algorithms/approx/ZMobius.cc
+++ b/Grid/algorithms/approx/ZMobius.cc
@@ -0,0 +1,183 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/ZMobius.cc
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/algorithms/approx/ZMobius.h>
 #include <Grid/algorithms/approx/RemezGeneral.h>
 NAMESPACE_BEGIN(Grid);
 NAMESPACE_BEGIN(Approx);
 //Compute the tanh approximation
 inline double epsilonMobius(const double x, const std::vector<ComplexD> &w){
  int Ls = w.size();
  ComplexD fxp = 1., fmp = 1.;
  for(int i=0;i<Ls;i++){
    fxp = fxp * ( w[i] + x );
    fmp = fmp * ( w[i] - x );
  }
  return ((fxp - fmp)/(fxp + fmp)).real();
 }
 inline double epsilonMobius(const double x, const std::vector<RealD> &w){
  int Ls = w.size();
  double fxp = 1., fmp = 1.;
  for(int i=0;i<Ls;i++){
    fxp = fxp * ( w[i] + x );
    fmp = fmp * ( w[i] - x );
  }
  return (fxp - fmp)/(fxp + fmp);
 }
 //Compute the tanh approximation in a form suitable for the Remez:
 //the callback takes the evaluation point as a bigfloat and an opaque pointer,
 //which must point to a std::vector<RealD> holding the omega values
 bigfloat epsilonMobius(bigfloat x, void* data){
  const std::vector<RealD> *omega_ptr = static_cast<const std::vector<RealD>*>(data);
  bigfloat prod_plus(1.0);
  bigfloat prod_minus(1.0);
  for(const RealD &omega_s : *omega_ptr){
    prod_plus  = prod_plus  * ( bigfloat(omega_s) + x);
    prod_minus = prod_minus * ( bigfloat(omega_s) - x);
  }
  return (prod_plus - prod_minus)/(prod_plus + prod_minus);
 }
 //Compute the Zmobius Omega parameters suitable for eigenvalue range   -lambda_bound <= lambda <= lambda_bound
 //Note omega_i = 1/(b_i + c_i)   where b_i and c_i are the Mobius parameters
 //
 //  omega_out    : (output) resized to Ls_out and filled with the complex omega of the compressed action
 //  Ls_out       : number of omega to generate; must be positive
 //  omega_in     : real omega of the input action whose tanh approximation is to be reproduced
 //  Ls_in        : must equal omega_in.size()
 //  lambda_bound : upper edge of the interval [0, lambda_bound] handed to the Remez
 //
 //Side effects: prints the Remez csv table, the resulting omega and a comparison table to std::cout
 //
 //NOTE(review): the degrees requested from the Remez (odd numerator of degree Ls_out-1, even monic
 //denominator of degree Ls_out) only match f(x) -/+ f(-x) when Ls_out is even; behaviour for odd
 //Ls_out should be confirmed before relying on it.
 void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
 			 const std::vector<RealD> &omega_in, const int Ls_in,
 			 const RealD lambda_bound){
  assert(Ls_out > 0); //rows/columns Ls_out-1 are indexed below
  assert(static_cast<int>(omega_in.size()) == Ls_in);
  omega_out.resize(Ls_out);

  //Use the Remez algorithm to generate the appropriate rational polynomial
  //For odd polynomial, to satisfy Haar condition must take either positive or negative half of range (cf https://arxiv.org/pdf/0803.0439.pdf page 6)  
  AlgRemezGeneral remez(0, lambda_bound, 64, &epsilonMobius, (void*)&omega_in); 
  remez.generateApprox(Ls_out-1, Ls_out,AlgRemezGeneral::Odd, AlgRemezGeneral::Even, 1e-15, 100);
  remez.csv(std::cout);

  //The rational approximation has the form  [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  where  f(x) = \Prod_{i=0}^{L_s-1} ( \omega_i + x )
  //cf https://academiccommons.columbia.edu/doi/10.7916/D8T72HD7  pg 102
  //omega_i are therefore the negative of the complex roots of f(x)

  //We can find the roots by recognizing that the eigenvalues of a matrix A are the roots of the characteristic polynomial
  // \rho(\lambda) = det( A - \lambda I )    where I is the unit matrix
  //The matrix whose characteristic polynomial is an arbitrary monic polynomial a0 + a1 x + a2 x^2 + ... x^n   is the companion matrix 
  // A = | 0    1   0    0 0 .... 0 |
  //     | 0    0   1    0 0 .... 0 |
  //     | :    :   :    : :      : |
  //     | 0    0   0    0 0      1
  //     | -a0 -a1 -a2  ...  ... -an|

  //Note the Remez defines the largest power to have unit coefficient
  //f(x) collects the even powers from the denominator and the odd powers from the numerator
  std::vector<RealD> coeffs(Ls_out+1);
  for(int i=0;i<Ls_out+1;i+=2) coeffs[i] = remez.getCoeffDen(i); //even powers
  for(int i=1;i<Ls_out+1;i+=2) coeffs[i] = remez.getCoeffNum(i); //odd powers

  //Form the companion matrix: ones on the super-diagonal, minus the coefficients along the last row
  Eigen::MatrixXd compn = Eigen::MatrixXd::Zero(Ls_out,Ls_out);
  for(int i=0;i<Ls_out-1;i++) compn(i,i+1) = 1.;
  for(int j=0;j<Ls_out;j++)   compn(Ls_out - 1, j) = -coeffs[j];

  //Eigensolve (eigenvalues only)
  Eigen::EigenSolver<Eigen::MatrixXd> slv(compn, false);

  const auto & ev = slv.eigenvalues();
  for(int i=0;i<Ls_out;i++)
    omega_out[i] = -ev(i);

  //Sort ascending (smallest at start of vector!): by real part, ties broken by imaginary part
  std::sort(omega_out.begin(), omega_out.end(), 
 	    [](const ComplexD &a, const ComplexD &b){ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag()); });

  //McGlynn thesis pg 122 suggest improved iteration counts if magnitude of omega diminishes towards the center of the 5th dimension
  //Walk the sorted list from the largest entry down, placing entries alternately at the low and high ends
  std::vector<ComplexD> omega_tmp = omega_out;
  int s_low=0, s_high=Ls_out-1, ss=0;
  for(int s_from = Ls_out-1; s_from >= 0; s_from--){ //loop from largest omega
    int s_to;
    if(ss % 2 == 0){
      s_to = s_low++;
    }else{
      s_to = s_high--;
    }
    omega_out[s_to] = omega_tmp[s_from];
    ++ss;
  }
  
  std::cout << "Resulting omega_i:" << std::endl;  
  for(int i=0;i<Ls_out;i++)
    std::cout << omega_out[i] << std::endl;

  //Cross-check on a uniform grid of points in [0, lambda_bound]
  std::cout << "Test result matches the approximate polynomial found by the Remez" << std::endl;
  std::cout << "<x> <remez approx> <poly approx> <diff poly approx remez approx> <exact> <diff poly approx exact>\n";
  
  int npt = 60;
  double dlt = lambda_bound/double(npt-1);

  for (int i =0; i<npt; i++){
    double x = i*dlt;
    double r = remez.evaluateApprox(x);
    double p = epsilonMobius(x, omega_out);
    double e = epsilonMobius(x, omega_in);

    std::cout << x<< " " << r << " " << p <<" " <<r-p << " " << e << " " << e-p << std::endl;
  }
 }
 //mobius_param = b+c   with b-c=1
 //Uniform-input convenience overload: every one of the Ls_in input omega is 1/mobius_param
 void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
  const std::vector<RealD> uniform_omega(Ls_in, 1./mobius_param);
  computeZmobiusOmega(omega_out, Ls_out, uniform_omega, Ls_in, lambda_bound);
 }
 //ZMobius class takes  gamma_i = (b+c) omega_i as its input, where b, c are factored out
 void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, 
 			 const RealD mobius_param_out, const int Ls_out, 
 			 const RealD mobius_param_in, const int Ls_in,
 			 const RealD lambda_bound){
  computeZmobiusOmega(gamma_out, Ls_out, mobius_param_in, Ls_in, lambda_bound);
  for(int i=0;i<Ls_out;i++) gamma_out[i] = gamma_out[i] * mobius_param_out;
 }
 //Assumes mobius_param_out == mobius_param_in
 void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
  computeZmobiusGamma(gamma_out, mobius_param, Ls_out, mobius_param, Ls_in, lambda_bound);
 }
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/approx/ZMobius.h
+++ b/Grid/algorithms/approx/ZMobius.h
@@ -0,0 +1,57 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/ZMobius.h
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_ZMOBIUS_APPROX_H
 #define GRID_ZMOBIUS_APPROX_H
 #include <Grid/GridCore.h>
 NAMESPACE_BEGIN(Grid);
 NAMESPACE_BEGIN(Approx);
 //Compute the Zmobius Omega parameters suitable for eigenvalue range   -lambda_bound <= lambda <= lambda_bound
 //Note omega_i = 1/(b_i + c_i)   where b_i and c_i are the Mobius parameters
 //  omega_out : output, resized to Ls_out
 //  omega_in  : real input omega; its size must equal Ls_in (asserted in the implementation)
 //The implementation also prints diagnostic tables to std::cout
 void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
 			 const std::vector<RealD> &omega_in, const int Ls_in,
 			 const RealD lambda_bound);
 //mobius_param = b+c   with b-c=1
 //Same as above with all Ls_in input omega set to 1/mobius_param
 void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
 //ZMobius class takes  gamma_i = (b+c) omega_i as its input, where b, c are factored out
 //The omega are computed from (mobius_param_in, Ls_in) and each is multiplied by mobius_param_out
 void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, 
 			 const RealD mobius_param_out, const int Ls_out, 
 			 const RealD mobius_param_in, const int Ls_in,
 			 const RealD lambda_bound);
 //Assumes mobius_param_out == mobius_param_in
 void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/approx/bigfloat_double.h
+++ b/Grid/algorithms/approx/bigfloat_double.h
@@ -25,6 +25,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef INCLUDED_BIGFLOAT_DOUBLE_H
 #define INCLUDED_BIGFLOAT_DOUBLE_H
 #include <math.h>
 typedef double mfloat; 
@@ -186,4 +190,6 @@ public:
  //  friend bigfloat& random(void);
 };
 #endif
--- a/Grid/algorithms/iterative/BiCGSTAB.h
+++ b/Grid/algorithms/iterative/BiCGSTAB.h
@@ -0,0 +1,222 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/BiCGSTAB.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: juettner <juettner@soton.ac.uk>
 Author: David Murphy <djmurphy@mit.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_BICGSTAB_H
 #define GRID_BICGSTAB_H
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////
 // Stabilised bi-conjugate gradient (BiCGSTAB) solver.
 // Single input vec, single output vec: iterates on  Linop.Op(psi) = src,
 // taking the value of psi on entry as the initial guess.
 //
 // Convergence is declared when  norm2(r) <= Tolerance^2 * norm2(src).
 //
 // NOTE(review): only the real parts of the inner products are used for
 // rho, alpha and omega; confirm this is intended for the operators in use.
 /////////////////////////////////////////////////////////////
 template <class Field>
 class BiCGSTAB : public OperatorFunction<Field> 
 {
  public:
    using OperatorFunction<Field>::operator();

    bool ErrorOnNoConverge;  // assert when the solver fails to converge.
                             // Defaults true.
    RealD Tolerance;         // target relative residual
    Integer MaxIterations;   // iteration cap
    Integer IterationsToComplete; // Number of iterations the solver took to finish. Filled in upon completion
                                  // (0 when the initial guess already satisfies the stopping condition)

    // Initialisers are listed in member declaration order; IterationsToComplete
    // starts at zero so it is never read uninitialised.
    BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) : 
      ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), IterationsToComplete(0) {}

    // Solve Linop.Op(psi) = src.
    //   Linop : operator to invert (only Linop.Op is used)
    //   src   : right-hand side
    //   psi   : initial guess on entry, approximate solution on exit
    void operator()(LinearOperatorBase<Field>& Linop, const Field& src, Field& psi) 
    {
      psi.Checkerboard() = src.Checkerboard();
      conformable(psi, src);

      RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1);
      RealD a(0), bo(0), b(0), ssq(0);

      // Work vectors, all shaped like src
      Field p(src);
      Field r(src);
      Field rhat(src);
      Field v(src);
      Field s(src);
      Field t(src);
      Field h(src);

      v = Zero();
      p = Zero();

      // Initial residual computation & set up
      RealD guess = norm2(psi);
      assert(std::isnan(guess) == 0);
    
      Linop.Op(psi, v);
      b = norm2(v);

      r = src - v;
      rhat = r;        // fixed shadow residual
      a = norm2(r);
      ssq = norm2(src);

      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl;
      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:   src " << ssq << std::endl;
      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:    mp " << b << std::endl;
      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:     r " << a << std::endl;

      RealD rsq = Tolerance * Tolerance * ssq;

      // Check if guess is really REALLY good :)
      // No iterations were needed: record that so callers summing
      // IterationsToComplete never read a stale or indeterminate value.
      if(a <= rsq){
        IterationsToComplete = 0;
        return;
      }

      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl;

      GridStopWatch LinalgTimer;
      GridStopWatch InnerTimer;
      GridStopWatch AxpyNormTimer;   // never started in this solver; reported for a uniform time breakdown
      GridStopWatch LinearCombTimer;
      GridStopWatch MatrixTimer;
      GridStopWatch SolverTimer;

      SolverTimer.Start();
      int k;
      for (k = 1; k <= MaxIterations; k++) 
      {
        rho_prev = rho;

        LinalgTimer.Start();
        InnerTimer.Start();
        ComplexD Crho  = innerProduct(rhat,r);
        InnerTimer.Stop();
        rho = Crho.real();

        beta = (rho / rho_prev) * (alpha / omega);

        // p = beta*p - beta*omega*v + r
        LinearCombTimer.Start();
        bo = beta * omega;
        auto p_v = p.View();
        auto r_v = r.View();
        auto v_v = v.View();
        accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
          coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
        });
        LinearCombTimer.Stop();
        LinalgTimer.Stop();

        // v = A p
        MatrixTimer.Start();
        Linop.Op(p,v);
        MatrixTimer.Stop();

        LinalgTimer.Start();
        InnerTimer.Start();
        ComplexD Calpha = innerProduct(rhat,v);
        InnerTimer.Stop();
        alpha = rho / Calpha.real();

        // h = psi + alpha*p ;  s = r - alpha*v
        LinearCombTimer.Start();
        auto h_v = h.View();
        auto psi_v = psi.View();
        accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
          coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
        });
        
        auto s_v = s.View();
        accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
          coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
        });
        LinearCombTimer.Stop();
        LinalgTimer.Stop();

        // t = A s
        MatrixTimer.Start();
        Linop.Op(s,t);
        MatrixTimer.Stop();

        LinalgTimer.Start();
        InnerTimer.Start();
        ComplexD Comega = innerProduct(t,s);
        InnerTimer.Stop();
        omega = Comega.real() / norm2(t);

        // psi = h + omega*s ;  r = s - omega*t
        LinearCombTimer.Start();
        auto t_v = t.View();
        accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
          coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
          coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
        });
        LinearCombTimer.Stop();
        
        cp = norm2(r);
        LinalgTimer.Stop();

        std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;

        // Stopping condition
        if(cp <= rsq) 
        {
          SolverTimer.Stop();
          // Recompute the residual from scratch to report the true residual
          Linop.Op(psi, v);
          p = v - src;

          RealD srcnorm = sqrt(norm2(src));
          RealD resnorm = sqrt(norm2(p));
          RealD true_residual = resnorm / srcnorm;

          std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl;
          std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl;
          std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl;
          std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;

          std::cout << GridLogMessage << "Time breakdown " << std::endl;
          std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() << std::endl;
          std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() << std::endl;
          std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() << std::endl;
          std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() << std::endl;
          std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() << std::endl;
          std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;

          // The true residual may drift from the recursive one; allow a generous factor
          if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }

          IterationsToComplete = k;	

          return;
        }
      }
      
      std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;

      if(ErrorOnNoConverge){ assert(0); }
      IterationsToComplete = k;
    }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@@ -0,0 +1,158 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: ./lib/algorithms/iterative/BiCGSTABMixedPrec.h
 Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
 Author: David Murphy <djmurphy@mit.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_BICGSTAB_MIXED_PREC_H
 #define GRID_BICGSTAB_MIXED_PREC_H
 NAMESPACE_BEGIN(Grid);
 // Mixed precision restarted defect correction BiCGSTAB
 //
 // Each outer iteration computes the double-precision residual, solves for a
 // correction with a single-precision BiCGSTAB, and adds the correction to the
 // double-precision solution. Once the residual falls below
 // OuterLoopNormMult * (Tolerance^2 * norm2(src)), or MaxOuterIterations is
 // reached, a final double-precision BiCGSTAB is run from the accumulated solution.
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
  public:                                                
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner solver. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid; // Grid for single-precision fields
    RealD OuterLoopNormMult; // Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance

    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;

    Integer TotalInnerIterations; //Number of inner solver iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of solver iterations in final patch-up step

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
    
    // Initialisers are listed in member declaration order; the iteration counters start at zero.
    MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, 
        LinearOperatorBase<FieldF>& _Linop_f, LinearOperatorBase<FieldD>& _Linop_d) : 
      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.),
      Linop_f(_Linop_f), Linop_d(_Linop_d),
      TotalInnerIterations(0), TotalOuterIterations(0), TotalFinalStepIterations(0),
      guesser(nullptr) {}

    // Install a guesser; the object is held by pointer, so it must outlive this solver's use of it
    void useGuesser(LinearFunction<FieldF>& g){
      guesser = &g;
    }
  
    // Solve Linop_d.Op(sol_d) = src_d_in.
    // sol_d on entry is used as the starting point: the first residual is computed from it.
    void operator() (const FieldD& src_d_in, FieldD& sol_d)
    {
      TotalInnerIterations = 0;
    
      GridStopWatch TotalTimer;
      TotalTimer.Start();
      
      int cb = src_d_in.Checkerboard();
      sol_d.Checkerboard() = cb;
      
      RealD src_norm = norm2(src_d_in);
      RealD stop = src_norm * Tolerance*Tolerance;

      GridBase* DoublePrecGrid = src_d_in.Grid();
      FieldD tmp_d(DoublePrecGrid);
      tmp_d.Checkerboard() = cb;
      
      FieldD tmp2_d(DoublePrecGrid);
      tmp2_d.Checkerboard() = cb;
      
      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
      
      RealD inner_tol = InnerTolerance;
      
      FieldF src_f(SinglePrecGrid);
      src_f.Checkerboard() = cb;
      
      FieldF sol_f(SinglePrecGrid);
      sol_f.Checkerboard() = cb;
      
      // The inner solver must not assert: failing to reach inner_tol is acceptable, the outer loop restarts
      BiCGSTAB<FieldF> CG_f(inner_tol, MaxInnerIterations);
      CG_f.ErrorOnNoConverge = false;

      GridStopWatch InnerCGtimer;

      GridStopWatch PrecChangeTimer;
      
      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
        
      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++)
      {
        // Compute double precision rsd and also new RHS vector.
        Linop_d.Op(sol_d, tmp_d);
        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
        
        std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl;

        if(norm < OuterLoopNormMult * stop){
          std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl;
          break;
        }
        // Loosen the inner tolerance so the inner solve does not aim below the overall target
        while(norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ??

        PrecChangeTimer.Start();
        precisionChange(src_f, src_d);
        PrecChangeTimer.Stop();
        
        sol_f = Zero();

        //Optionally improve inner solver guess (eg using known eigenvectors)
        if(guesser != nullptr){ (*guesser)(src_f, sol_f); }

        //Inner solve
        CG_f.Tolerance = inner_tol;
        // Reset the count first: the inner solver can return early (guess already
        // converged), in which case the value read below would otherwise be stale.
        CG_f.IterationsToComplete = 0;
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f, sol_f);
        InnerCGtimer.Stop();
        TotalInnerIterations += CG_f.IterationsToComplete;
        
        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f);
        PrecChangeTimer.Stop();
        
        axpy(sol_d, 1.0, tmp_d, sol_d);
      }
      
      //Final trial solve
      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl;
      
      BiCGSTAB<FieldD> CG_d(Tolerance, MaxInnerIterations);
      // As above: make sure the count is defined even if the final solve returns immediately
      CG_d.IterationsToComplete = 0;
      CG_d(Linop_d, src_d_in, sol_d);
      TotalFinalStepIterations = CG_d.IterationsToComplete;

      TotalTimer.Stop();
      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/BlockConjugateGradient.h
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@@ -52,6 +52,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  Integer PrintInterval; //GridLogMessages or Iterative
  RealD TrueResidual;
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
@@ -306,7 +307,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
      Linop.HermOp(X, AD);
      AD = AD-B;
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+      TrueResidual = std::sqrt(norm2(AD)/norm2(B));
      std::cout << GridLogMessage <<"\tTrue residual is " << TrueResidual <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -442,7 +444,8 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
      Linop.HermOp(Psi, AP);
      AP = AP-Src;
-      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+      TrueResidual = std::sqrt(norm2(AP)/norm2(Src));
      std::cout <<GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -653,7 +656,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
      if ( rr > max_resid ) max_resid = rr;
    }
-    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
    if ( max_resid < Tolerance*Tolerance ) { 
@@ -668,7 +671,8 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+      TrueResidual = std::sqrt(normv(AD)/normv(B));
      std::cout << GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -49,6 +49,7 @@ public:
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  RealD TrueResidual;
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol),
@@ -71,7 +72,6 @@ public:
    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    Linop.HermOpAndNorm(psi, mmp, d, b);
@@ -82,6 +82,14 @@ public:
    cp = a;
    ssq = norm2(src);
    // Handle trivial case of zero src
    if (ssq == 0.){
      psi = Zero();
      IterationsToComplete = 1;
      TrueResidual = 0.;
      return;
    }
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
@@ -93,6 +101,7 @@ public:
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;	
      return;
@@ -142,7 +151,7 @@ public:
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-                << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      // Stopping condition
      if (cp <= rsq) {
@@ -154,26 +163,33 @@ public:
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
-        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
+        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
-        std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
+		  << "\tComputed residual " << std::sqrt(cp / ssq)
-	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
+		  << "\tTrue residual " << true_residual
-	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+		  << "\tTarget " << Tolerance << std::endl;
-        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
+        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
 	IterationsToComplete = k;	
 	TrueResidual = true_residual;
        return;
      }
    }
    // Failed. Calculate true residual before giving up                                                         
    Linop.HermOpAndNorm(psi, mmp, d, qq);
    p = mmp - src;
    TrueResidual = sqrt(norm2(p)/ssq);
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
    if (ErrorOnNoConverge) assert(0);
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -46,15 +46,19 @@ public:
  RealD   Tolerance;
  Integer MaxIterations;
-    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
@@ -125,6 +129,17 @@ public:
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
@@ -270,6 +285,7 @@ public:
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
@@ -299,7 +315,8 @@ public:
 	  axpy(r,-alpha[s],src,tmp);
 	  RealD rn = norm2(r);
 	  RealD cn = norm2(src);
-	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<< TrueResidualShift[s] <<std::endl;
 	}
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -43,6 +43,11 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
 {
  // If we assume the basis[j] are already orthonormal, we
  // can take all inner products in parallel saving 2x bandwidth
  // Save 3x bandwidth on the second line of loop.
  // perhaps 2.5x speed up.
  // 2x overall in Multigrid Lanczos  
  for(int j=0; j<k; ++j){
    auto ip = innerProduct(basis[j],w);
    w = w - ip*basis[j];
@@ -54,16 +59,15 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
 {
  typedef decltype(basis[0].View()) View;
  auto tmp_v = basis[0].View();
-  std::vector<View> basis_v(basis.size(),tmp_v);
+  Vector<View> basis_v(basis.size(),tmp_v);
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
-      
+
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].View();
  }
-
+#if 0
  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
  thread_region
  {
    vobj* B = Bt.data() + Nm * thread_num();
@@ -81,24 +85,89 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
      }
    });
  }
 #else
  int nrot = j1-j0;
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
  //  printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
  Vector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];
  // GPU readable copy of Eigen matrix
  Vector<double> Qt_jv(Nm*Nm);
  double *Qt_p = & Qt_jv[0];
  for(int k=0;k<Nm;++k){
    for(int j=0;j<Nm;++j){
      Qt_p[j*Nm+k]=Qt(j,k);
    }
  }
  // Block the loop to keep storage footprint down
  vobj zz=Zero();
  for(uint64_t s=0;s<oSites;s+=siteBlock){
    // remaining work in this block
    int ssites=MIN(siteBlock,oSites-s);
    // zero out the accumulators
    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
 	auto z=coalescedRead(zz);
 	coalescedWrite(Bp[ss],z);
    });
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
      int j =sj%nrot;
      int jj  =j0+j;
      int ss =sj/nrot;
      int sss=ss+s;
      for(int k=k0; k<k1; ++k){
 	auto tmp = coalescedRead(Bp[ss*nrot+j]);
 	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
      }
    });
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
      int j =sj%nrot;
      int jj  =j0+j;
      int ss =sj/nrot;
      int sss=ss+s;
      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
    });
  }
 #endif
 }
 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
  typedef decltype(basis[0].View()) View;
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
  result.Checkerboard() = basis[0].Checkerboard();
  auto result_v=result.View();
-  thread_for(ss, grid->oSites(),{
+  Vector<View> basis_v(basis.size(),result_v);
-    vobj B = Zero();
+  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].View();
  }
  vobj zz=Zero();
  Vector<double> Qt_jv(Nm);
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    auto B=coalescedRead(zz);
    for(int k=k0; k<k1; ++k){
-      auto basis_k = basis[k].View();
+      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
      B +=Qt(j,k) * basis_k[ss];
    }
-    result_v[ss] = B;
+    coalescedWrite(result_v[ss], B);
  });
 }
@@ -282,7 +351,7 @@ public:
 			    RealD _eresid, // resid in lmdue deficit 
 			    int _MaxIter, // Max iterations
 			    RealD _betastp=0.0, // if beta(k) < betastp: converged
-			    int _MinRestart=1, int _orth_period = 1,
+			    int _MinRestart=0, int _orth_period = 1,
 			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    SimpleTester(HermOp), _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(Tester),
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
@@ -298,7 +367,7 @@ public:
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _MaxIter, // Max iterations
 			       RealD _betastp=0.0, // if beta(k) < betastp: converged
-			       int _MinRestart=1, int _orth_period = 1,
+			       int _MinRestart=0, int _orth_period = 1,
 			       IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    SimpleTester(HermOp),  _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(SimpleTester),
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
@@ -347,7 +416,7 @@ until convergence
    GridBase *grid = src.Grid();
    assert(grid == evec[0].Grid());
-    GridLogIRL.TimingMode(1);
+    //    GridLogIRL.TimingMode(1);
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -372,14 +441,17 @@ until convergence
    {
      auto src_n = src;
      auto tmp = src;
      std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
      const int _MAX_ITER_IRL_MEVAPP_ = 50;
      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
 	normalise(src_n);
 	_HermOp(src_n,tmp);
 	//	std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
 	//	std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
 	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
 	RealD vden = norm2(src_n);
 	RealD na = vnum/vden;
-	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
+	if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
 	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@@ -577,11 +649,11 @@ until convergence
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
 2. For k = 1,2,...,m Do:
-3. wk:=Avk−βkv_{k−1}      
+3. wk:=Avk - b_k v_{k-1}      
-4. αk:=(wk,vk)       // 
+4. ak:=(wk,vk)       // 
-5. wk:=wk−αkvk       // wk orthog vk 
+5. wk:=wk-akvk       // wk orthog vk 
-6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop
-7. vk+1 := wk/βk+1
+7. vk+1 := wk/b_k+1
 8. EndDo
 */
  void step(std::vector<RealD>& lmd,
@@ -589,6 +661,7 @@ until convergence
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
    const RealD tiny = 1.0e-20;
    assert( k< Nm );
@@ -600,20 +673,20 @@ until convergence
    if(k>0) w -= lme[k-1] * evec[k-1];
-    ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
+    ComplexD zalph = innerProduct(evec_k,w);
    RealD     alph = real(zalph);
-    w = w - alph * evec_k;// 5. wk:=wk−αkvk
+    w = w - alph * evec_k;
-    RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+    RealD beta = normalise(w); 
    // 7. vk+1 := wk/βk+1
    lmd[k] = alph;
    lme[k] = beta;
-    if (k>0 && k % orth_period == 0) {
+    if ( (k>0) && ( (k % orth_period) == 0 )) {
      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
      orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
    }
    if(k < Nm-1) evec[k+1] = w;
@@ -621,6 +694,8 @@ until convergence
    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
  }
  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
--- a/Grid/algorithms/iterative/NormalEquations.h
+++ b/Grid/algorithms/iterative/NormalEquations.h
@@ -33,26 +33,78 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations : public OperatorFunction<Field>{
+template<class Field> class NormalEquations {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
-
+  LinearFunction<Field>   & _Guess;
 public:
  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick:
  // apply Mdag to the input and hand MdagM together with that
  // source to the supplied Hermitian solver.
  // All three constructor arguments are stored by reference.
  /////////////////////////////////////////////////////
-  NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver) 
+ NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
-    :  _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; 
+		 LinearFunction<Field> &Guess) 
   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
  // in  : right-hand side of the original system.
  // out : handed to _Guess and then to the Hermitian solver as the solution field.
  void operator() (const Field &in, Field &out){
    Field src(in.Grid());
    // NOTE(review): tmp is declared but not used in the lines shown here.
    Field tmp(in.Grid());
    MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
    // src = Mdag in : the right-hand side of the normal equations
    _Matrix.Mdag(in,src);
-    _HermitianSolver(src,out);  // Mdag M out = Mdag in
+    _Guess(src,out);
    // NOTE(review): presumably _Guess seeds out with an initial guess before the solve - confirm.
    _HermitianSolver(MdagMOp,src,out);  // Mdag M out = Mdag in
  }     
 };
 template<class Field> class HPDSolver {
 private:
  LinearOperatorBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
  LinearFunction<Field>   & _Guess;
 public:
  /////////////////////////////////////////////////////
  // Solve Matrix out = in with the supplied Hermitian solver.
  // No Mdag is applied here: Matrix is passed to the solver as-is
  // and the input is used unmodified as the source.
  // NOTE(review): the class name suggests Matrix is expected to be
  // Hermitian positive definite - nothing here checks that.
  // All three constructor arguments are stored by reference.
  /////////////////////////////////////////////////////
 HPDSolver(LinearOperatorBase<Field> &Matrix,
 	   OperatorFunction<Field> &HermitianSolver,
 	   LinearFunction<Field> &Guess) 
   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
  // in  : right-hand side, passed unmodified to the solver.
  // out : handed to _Guess first, then to the solver as its solution field.
  void operator() (const Field &in, Field &out){
    // NOTE(review): presumably _Guess seeds out with an initial guess - confirm.
    _Guess(in,out);
    _HermitianSolver(_Matrix,in,out);  // Matrix out = in
  }     
 };
 template<class Field> class MdagMSolver {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
  LinearFunction<Field>   & _Guess;
 public:
  /////////////////////////////////////////////////////
  // Solve MdagM out = in with the supplied Hermitian solver.
  // MdagM is built from Matrix, but - unlike NormalEquations -
  // Mdag is NOT applied to the input: in is the source as given.
  // All three constructor arguments are stored by reference.
  /////////////////////////////////////////////////////
 MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
 	     LinearFunction<Field> &Guess) 
   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
  // in  : right-hand side, passed unmodified to the solver.
  // out : handed to _Guess first, then to the solver as its solution field.
  void operator() (const Field &in, Field &out){
    MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
    // NOTE(review): presumably _Guess seeds out with an initial guess - confirm.
    _Guess(in,out);
    _HermitianSolver(MdagMOp,in,out);  // Mdag M out = in
  }     
 };
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -30,12 +30,12 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
-      if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { 
+      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
 	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
 	return evalMaxApprox; 
      } 
      evalMaxApprox = na; 
      std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
      src_n = tmp;
    }
    assert(0);
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -38,10 +38,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 #define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" " 
 template<class Field>
-class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
+class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
  using OperatorFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
@@ -49,23 +50,29 @@ public:
  int mmax;
  int nstep;
  int steps;
  int level;
  GridStopWatch PrecTimer;
  GridStopWatch MatTimer;
  GridStopWatch LinalgTimer;
-  LinearFunction<Field> &Preconditioner;
+  LinearFunction<Field>     &Preconditioner;
  LinearOperatorBase<Field> &Linop;
-  PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
+  void Level(int lv) { level=lv; };
  PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
    Tolerance(tol), 
    MaxIterations(maxit),
    Linop(_Linop),
    Preconditioner(Prec),
    mmax(_mmax),
    nstep(_nstep)
  { 
    level=1;
    verbose=1;
  };
-  void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+  void operator() (const Field &src, Field &psi){
    psi=Zero();
    RealD cp, ssq,rsq;
@@ -84,9 +91,9 @@ public:
    steps=0;
    for(int k=0;k<MaxIterations;k++){
-      cp=GCRnStep(Linop,src,psi,rsq);
+      cp=GCRnStep(src,psi,rsq);
-      std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
+      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
      if(cp<rsq) {
@@ -95,24 +102,26 @@ public:
 	Linop.HermOp(psi,r);
 	axpy(r,-1.0,src,r);
 	RealD tr = norm2(r);
-	std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
+	GCRLogLevel<<"PGCR: Converged on iteration " <<steps
 		 << " computed residual "<<sqrt(cp/ssq)
 		 << " true residual "    <<sqrt(tr/ssq)
 		 << " target "           <<Tolerance <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+	GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
+	/*
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
+	  GCRLogLevel<<"PGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
+	  GCRLogLevel<<"PGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
 	  GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 	*/
 	return;
      }
    }
-    std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
+    GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
-    assert(0);
+    //    assert(0);
  }
-  RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
+  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
    RealD cp;
    RealD a, b;
@@ -134,9 +143,7 @@ public:
    std::vector<Field> p(mmax,grid);
    std::vector<RealD> qq(mmax);
-    std::cout<<GridLogIterative<< " ************** "<< std::endl;
+    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
    std::cout<<GridLogIterative<< "   GCRnStep("<<nstep<<")"<<std::endl;
    std::cout<<GridLogIterative<< " ************** "<< std::endl;
    //////////////////////////////////
    // initial guess x0 is taken as nonzero.
@@ -150,35 +157,15 @@ public:
    LinalgTimer.Start();
    r=src-Az;
    LinalgTimer.Stop();
-    std::cout<<GridLogIterative<< " GCRnStep true residual r = src - A psi   "<<norm2(r) <<std::endl;
+    GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl;
    /////////////////////
    // p = Prec(r)
    /////////////////////
    std::cout<<GridLogIterative<< " GCRnStep apply preconditioner z= M^-1 r "<< std::endl;
    std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
    PrecTimer.Start();
    Preconditioner(r,z);
    PrecTimer.Stop();
    std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
    std::cout<<GridLogIterative<< " GCRnStep called Preconditioner z "<< norm2(z) <<std::endl;
    //    MatTimer.Start();
    //    Linop.HermOp(z,tmp); 
    //    MatTimer.Stop();
    //    LinalgTimer.Start();
    //    ttmp=tmp;
    //    tmp=tmp-r;
    //    LinalgTimer.Stop();
    /*
      std::cout<<GridLogMessage<<r<<std::endl;
      std::cout<<GridLogMessage<<z<<std::endl;
      std::cout<<GridLogMessage<<ttmp<<std::endl;
      std::cout<<GridLogMessage<<tmp<<std::endl;
    */
    MatTimer.Start();
    Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
@@ -190,7 +177,6 @@ public:
    p[0]= z;
    q[0]= Az;
    qq[0]= zAAz;
    std::cout<<GridLogIterative<< " GCRnStep p0=z, q0 = A p0 " <<std::endl;
    cp =norm2(r);
    LinalgTimer.Stop();
@@ -212,20 +198,16 @@ public:
      cp = axpy_norm(r,-a,q[peri_k],r);
      LinalgTimer.Stop();
-      std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
+      GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
      if((k==nstep-1)||(cp<rsq)){
 	return cp;
      }
      std::cout<<GridLogIterative<< " GCRnStep apply preconditioner z= M^-1 r "<< std::endl;
      std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
      PrecTimer.Start();
      Preconditioner(r,z);// solve Az = r
      PrecTimer.Stop();
      std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
      std::cout<<GridLogIterative<< " GCRnStep called Preconditioner z "<< norm2(z) <<std::endl;
      MatTimer.Start();
      Linop.HermOpAndNorm(z,Az,zAz,zAAz);
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -405,6 +405,70 @@ namespace Grid {
    }
  };
  // Red-black (even/odd) Schur solver wrapper that hands a
  // NonHermitianSchurDiagMooeeOperator to the stored checkerboard solver.
  // The three overrides below build the odd-checkerboard source, run the
  // solve, and reconstruct the full solution from the odd solution.
  template<class Field> class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> 
  {
    public:
      typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
      // All arguments are forwarded unchanged to SchurRedBlackBase.
      NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
          const bool _solnAsInitGuess = false)  
      : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
      //////////////////////////////////////////////////////
      // Override RedBlack specialisation
      //////////////////////////////////////////////////////
      // Split src into its even and odd checkerboards and form the Schur source
      // on the odd checkerboard. src_e and src_o are both outputs.
      virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
      {
        GridBase* grid  = _Matrix.RedBlackGrid();
        // NOTE(review): fgrid is fetched but not used in this function.
        GridBase* fgrid = _Matrix.Grid();
        Field  tmp(grid);
        Field Mtmp(grid);
        pickCheckerboard(Even, src_e, src);
        pickCheckerboard(Odd , src_o, src);
        /////////////////////////////////////////////////////
        // src_o = source_o - Moe MeeInv source_e
        // (no Mdag is applied in this non-Hermitian variant)
        /////////////////////////////////////////////////////
        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
      }
      // Rebuild the full-lattice solution sol from the odd solution sol_o
      // and the even source src_e.
      virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
      {
        GridBase* grid  = _Matrix.RedBlackGrid();
        // NOTE(review): fgrid is fetched but not used in this function.
        GridBase* fgrid = _Matrix.Grid();
        Field     tmp(grid);
        Field   sol_e(grid);
        Field src_e_i(grid);
        ///////////////////////////////////////////////////
        // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
        ///////////////////////////////////////////////////
        _Matrix.Meooe(sol_o, tmp);         assert(     tmp.Checkerboard() == Even );
        src_e_i = src_e - tmp;             assert( src_e_i.Checkerboard() == Even );
        _Matrix.MooeeInv(src_e_i, sol_e);  assert(   sol_e.Checkerboard() == Even );
        // Write both checkerboards into the full solution field.
        setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
        setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd  );
      }
      // Single right-hand-side solve on the odd checkerboard.
      // The member is named _HermitianRBSolver but is called here with the
      // non-Hermitian Schur operator.
      virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
      {
        NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
        this->_HermitianRBSolver(_OpEO, src_o, sol_o);  assert(sol_o.Checkerboard() == Odd);
      }
      // Same solve, taking vectors of source and solution fields.
      virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
      {
        NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
      }
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal is identity, right preconditioned by Mee^inv
  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi = ( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = eta
@@ -482,5 +546,76 @@ namespace Grid {
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
    }
  };
  // Red-black (even/odd) Schur solver wrapper that hands a
  // NonHermitianSchurDiagTwoOperator to the stored checkerboard solver.
  // Compared with the DiagMooee variant, RedBlackSolution applies an extra
  // MooeeInv to the odd solution before reconstructing the even part.
  template<class Field> class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> 
  {
    public:
      typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
      /////////////////////////////////////////////////////
      // Wrap the Schur red-black trick for a non-Hermitian operator.
      // All arguments are forwarded unchanged to SchurRedBlackBase.
      /////////////////////////////////////////////////////
      NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
          const bool _solnAsInitGuess = false)  
      : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
      // Split src into its even and odd checkerboards and form the Schur source
      // on the odd checkerboard. src_e and src_o are both outputs.
      virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
      {
        GridBase* grid  = _Matrix.RedBlackGrid();
        // NOTE(review): fgrid is fetched but not used in this function.
        GridBase* fgrid = _Matrix.Grid();
        Field  tmp(grid);
        Field Mtmp(grid);
        pickCheckerboard(Even, src_e, src);
        pickCheckerboard(Odd , src_o, src);
        /////////////////////////////////////////////////////
        // src_o = source_o - Moe MeeInv source_e
        // (no Mdag is applied in this non-Hermitian variant)
        /////////////////////////////////////////////////////
        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
      }
      // Rebuild the full-lattice solution sol from the solver output sol_o
      // and the even source src_e.
      virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
      {
        GridBase* grid  = _Matrix.RedBlackGrid();
        // NOTE(review): fgrid is fetched but not used in this function.
        GridBase* fgrid = _Matrix.Grid();
        Field sol_o_i(grid);
        Field     tmp(grid);
        Field   sol_e(grid);
        ////////////////////////////////////////////////
        // MooeeInv due to precond: sol_o_i = MooeeInv sol_o
        ////////////////////////////////////////////////
        _Matrix.MooeeInv(sol_o, tmp);
        sol_o_i = tmp;
        ///////////////////////////////////////////////////
        // sol_e = M_ee^-1 * ( src_e - Meo sol_o_i )...
        ///////////////////////////////////////////////////
        _Matrix.Meooe(sol_o_i, tmp);    assert(   tmp.Checkerboard() == Even );
        tmp = src_e - tmp;              assert( src_e.Checkerboard() == Even );
        _Matrix.MooeeInv(tmp, sol_e);   assert( sol_e.Checkerboard() == Even );
        // Write both checkerboards into the full solution field; the odd part
        // written is sol_o_i, not the raw solver output sol_o.
        setCheckerboard(sol, sol_e);    assert(   sol_e.Checkerboard() == Even );
        setCheckerboard(sol, sol_o_i);  assert( sol_o_i.Checkerboard() == Odd  );
      };
      // Single right-hand-side solve on the odd checkerboard.
      // The member is named _HermitianRBSolver but is called here with the
      // non-Hermitian Schur operator.
      virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
      {
        NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
        this->_HermitianRBSolver(_OpEO, src_o, sol_o);
      };
      // Same solve, taking vectors of source and solution fields.
      virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o,  std::vector<Field>& sol_o)
      {
        NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
      }
  };
 }
 #endif
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,6 +6,12 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 #ifdef GRID_NVCC
 #define SMALL_LIMIT (0)
 #else
 #define SMALL_LIMIT (4096)
 #endif
 #ifdef POINTER_CACHE
 int PointerCache::victim;
@@ -13,7 +19,7 @@ PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
 void *PointerCache::Insert(void *ptr,size_t bytes) {
-  if (bytes < 4096 ) return ptr;
+  if (bytes < SMALL_LIMIT ) return ptr;
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
@@ -50,7 +56,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
 void *PointerCache::Lookup(size_t bytes) {
-  if (bytes < 4096 ) return NULL;
+  if (bytes < SMALL_LIMIT ) return NULL;
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -49,8 +49,13 @@ NAMESPACE_BEGIN(Grid);
 #ifdef POINTER_CACHE
 class PointerCache {
 private:
-
+/*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
 #ifdef GRID_NVCC 
  static const int Ncache=128;
 #else
  static const int Ncache=8;
 #endif
  static int victim;
  typedef struct { 
@@ -63,7 +68,6 @@ private:
 public:
  static void *Insert(void *ptr,size_t bytes) ;
  static void *Lookup(size_t bytes) ;
@@ -170,13 +174,14 @@ public:
    // Unified (managed) memory
    ////////////////////////////////////
    if ( ptr == (_Tp *) NULL ) {
      //      printf(" alignedAllocater cache miss %ld bytes ",bytes);      BACKTRACEFP(stdout);
      auto err = cudaMallocManaged((void **)&ptr,bytes);
      if( err != cudaSuccess ) {
 	ptr = (_Tp *) NULL;
 	std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
 	assert(0);
      }
-    }
+    } 
    assert( ptr != (_Tp *)NULL);
 #else 
    //////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -47,20 +47,19 @@ public:
  // Give Lattice access
  template<class object> friend class Lattice;
-  GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {}; 
+  GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;}; 
  GridBase(const Coordinate & processor_grid,
 	   const CartesianCommunicator &parent,
 	   int &split_rank) 
-    : CartesianCommunicator(processor_grid,parent,split_rank) {};
+    : CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;};
  GridBase(const Coordinate & processor_grid,
 	   const CartesianCommunicator &parent) 
-    : CartesianCommunicator(processor_grid,parent,dummy) {};
+    : CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;};
  virtual ~GridBase() = default;
  // Physics Grid information.
  Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
  Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -80,7 +79,8 @@ public:
  Coordinate _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
  Coordinate _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
-    bool _isCheckerBoarded; 
+  bool _isCheckerBoarded; 
  int        LocallyPeriodic;
 public:
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -173,6 +173,7 @@ public:
  ///////////////////////////////////////////////////
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_object scalar_object;
  typedef vobj vector_object;
 private:
--- a/Grid/lattice/Lattice_coordinate.h
+++ b/Grid/lattice/Lattice_coordinate.h
@@ -37,19 +37,18 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
  GridBase *grid = l.Grid();
  int Nsimd = grid->iSites();
  Coordinate gcoor;
  ExtractBuffer<scalar_type> mergebuf(Nsimd);
  vector_type vI;
  auto l_v = l.View();
-  for(int o=0;o<grid->oSites();o++){
+  thread_for( o, grid->oSites(), {
    vector_type vI;
    Coordinate gcoor;
    ExtractBuffer<scalar_type> mergebuf(Nsimd);
    for(int i=0;i<grid->iSites();i++){
      grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
      mergebuf[i]=(Integer)gcoor[mu];
    }
    merge<vector_type,scalar_type>(vI,mergebuf);
    l_v[o]=vI;
-  }
+  });
 };
 // LatticeCoordinate();
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -156,7 +156,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj,class sobj>
-void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
+accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
  GridBase *grid = l.Grid();
@@ -185,7 +185,7 @@ void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
 };
 template<class vobj,class sobj>
-void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
+accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
  GridBase *grid=l.Grid();
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -1,5 +1,4 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_transfer.h
@@ -83,12 +82,35 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
 // Project fineData onto the basis vectors, one coarse-site component at a time.
 //   coarseData : output; component v of every coarse site is overwritten
 //   fineData   : fine-grid field being projected
 //   Basis      : basis fields; the first nbasis entries are read
 // For each v, blockInnerProduct(ip,Basis[v],fineData) fills the coarse
 // field ip, which is then copied into component v of coarseData.
 template<class vobj,class CComplex,int nbasis>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 			  const             Lattice<vobj>   &fineData,
 			  const std::vector<Lattice<vobj> > &Basis)
 {
  // NOTE(review): fine is fetched but not used in this function.
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
  Lattice<CComplex> ip(coarse); 
  //  auto fineData_   = fineData.View();
  auto coarseData_ = coarseData.View();
  // NOTE(review): this view of ip is taken once, before the loop in which
  // blockInnerProduct writes ip - confirm the view stays valid across those calls.
  auto ip_         = ip.View();
  for(int v=0;v<nbasis;v++) {
    blockInnerProduct(ip,Basis[v],fineData);
    // Copy the per-coarse-site inner product into component v.
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
 	coalescedWrite(coarseData_[sc](v),ip_(sc));
      });
  }
 }
 template<class vobj,class CComplex,int nbasis>
 inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 const             Lattice<vobj>   &fineData,
 			 const std::vector<Lattice<vobj> > &Basis)
 {
  typedef iVector<CComplex,nbasis > coarseSiteData;
  coarseSiteData elide;
  typedef decltype(coalescedRead(elide)) ScalarComplex;
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
  int  _ndimension = coarse->_ndimension;
@@ -106,26 +128,40 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
    assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
  }
  int blockVol = fine->oSites()/coarse->oSites();
  coarseData=Zero();
  auto fineData_   = fineData.View();
  auto coarseData_ = coarseData.View();
-  // Loop over coars parallel, and then loop over fine associated with coarse.
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
-  thread_for( sf, fine->oSites(), {
+  // To make this lock free, loop over coarse sites in parallel, and then loop over the fine sites associated with each coarse site.
-    int sc;
+  // Otherwise do fine inner product per site, and make the update atomic
-    Coordinate coor_c(_ndimension);
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
-    Coordinate coor_f(_ndimension);
+  accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
-    thread_critical {
+    auto sc=sci/nbasis;
-      for(int i=0;i<nbasis;i++) {
+    auto i=sci%nbasis;
-	auto Basis_      = Basis[i].View();
+    auto Basis_      = Basis[i].View();
-	coarseData_[sc](i)=coarseData_[sc](i) + innerProduct(Basis_[sf],fineData_[sf]);
+
-      }
+    Coordinate coor_c(_ndimension);
    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
    int sf;
    decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
    for(int sb=0;sb<blockVol;sb++){
      Coordinate coor_b(_ndimension);
      Coordinate coor_f(_ndimension);
      Lexicographic::CoorFromIndex(coor_b,sb,block_r);
      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
      reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
    }
    coalescedWrite(coarseData_[sc](i),reduce);
  });
  return;
 }
@@ -160,7 +196,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
  auto fineY_  = fineY.View();
  auto coarseA_= coarseA.View();
-  thread_for(sf, fine->oSites(), {
+  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
    int sc;
    Coordinate coor_c(_ndimension);
@@ -171,7 +207,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
    // z = A x + y
-    fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf];
+    coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
  });
@@ -196,7 +232,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
-  thread_for(ss, coarse->oSites(),{
+  accelerator_for(ss, coarse->oSites(), 1, {
    CoarseInner_[ss] = coarse_inner_[ss];
  });
 }
@@ -226,23 +262,29 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  for(int d=0 ; d<_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
  }
  int blockVol = fine->oSites()/coarse->oSites();
  // Turn this around to loop threaded over sc and interior loop 
  // over sf would thread better
  coarseData=Zero();
  auto coarseData_ = coarseData.View();
  auto fineData_   = fineData.View();
-  thread_for(sf,fine->oSites(),{
+  accelerator_for(sc,coarse->oSites(),1,{
-    int sc;
+
    // One thread per sub block
    Coordinate coor_c(_ndimension);
-    Coordinate coor_f(_ndimension);
+    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
-    
+    coarseData_[sc]=Zero();
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+    for(int sb=0;sb<blockVol;sb++){
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+      
-    
+      int sf;
-    thread_critical { 
+      Coordinate coor_b(_ndimension);
      Coordinate coor_f(_ndimension);
      Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
      coarseData_[sc]=coarseData_[sc]+fineData_[sf];
    }
@@ -296,6 +338,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  }
 }
 #if 0
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -321,7 +364,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  auto coarseData_ = coarseData.View();
  // Loop with a cache friendly loop ordering
-  thread_for(sf,fine->oSites(),{
+  accelerator_for(sf,fine->oSites(),1,{
    int sc;
    Coordinate coor_c(_ndimension);
    Coordinate coor_f(_ndimension);
@@ -332,13 +375,35 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
    // Accumulate basis vector i, weighted by coarse component i, into fine site sf.
    // The first term assigns and later terms add, so fineData_[sf] needs no prior zeroing.
    // (The new-side lines previously ended in a stray ')' that left the statement unbalanced.)
    for(int i=0;i<nbasis;i++) {
      auto basis_ = Basis[i].View();
-      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf];
+      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf];
-      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
+      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
    }
  });
  return;
 }
 #else
 // Promote a coarse field back to the fine grid: fineData is zeroed and then,
 // for each basis index i, updated through blockZAXPY with coarse component i
 // as the coefficient, Basis[i] as the vector and fineData itself as the addend.
 //   coarseData : input, nbasis components per coarse site
 //   fineData   : output fine-grid field (overwritten)
 //   Basis      : fine-grid basis vectors; entries 0..nbasis-1 are read
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
 			 const std::vector<Lattice<vobj> > &Basis)
 {
  // NOTE(review): 'fine' is not referenced anywhere below.
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
  fineData=Zero();
  for(int i=0;i<nbasis;i++) {
    // Component i of the coarse vector, still wrapped in an iScalar.
    Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
    // Unwrapped copy with the element type handed to blockZAXPY.
    Lattice<CComplex> cip(coarse);
    auto cip_ = cip.View();
    auto  ip_ =  ip.View();
    // NOTE(review): accelerator_forNB is presumably the non-blocking launch; confirm that
    // the blockZAXPY call below cannot read cip before this kernel has finished.
    accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
 	coalescedWrite(cip_[sc], ip_(sc)());
    });
    // fineData is passed as both the output and the addend, so this accumulates term i.
    blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
  }
 }
 #endif
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
@@ -374,6 +439,67 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
  });
 }
 // Copy a rectangular region from one lattice to another using node-local data only.
 //   From, To      : source and destination lattices (same vobj type)
 //   FromLowerLeft : lower corner of the region, compared against From's local coordinates
 //   ToLowerLeft   : lower corner of the region in the destination
 //   RegionSize    : extent of the region in each dimension
 // Both grids must be non-checkerboarded, have the same number of dimensions and
 // the same processor layout; all three conditions are asserted below.
 // NOTE(review): nothing here checks that the translated coordinate lies inside
 // To's local volume - presumably the caller guarantees it; confirm.
 template<class vobj>
 void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  // Number of vector_type words that make up one vobj.
  static const int words=sizeof(vobj)/sizeof(vector_type);
  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
  assert(!Fg->_isCheckerBoarded);
  assert(!Tg->_isCheckerBoarded);
  // Nsimd is read from the source grid and used as the word stride for both lattices.
  int Nsimd = Fg->Nsimd();
  int nF = Fg->_ndimension;
  int nT = Tg->_ndimension;
  int nd = nF;
  assert(nF == nT);
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
  // the above should guarantee that the operations are local
  // Local copies of the grid geometry (f = From grid, t = To grid) used inside the loop below.
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
  Coordinate osf = Fg->_ostride;
  Coordinate rdt = Tg->_rdimensions;
  Coordinate ist = Tg->_istride;
  Coordinate ost = Tg->_ostride;
  auto t_v = To.View();
  auto f_v = From.View();
  // One iteration per local site of the source grid.
  accelerator_for(idx,Fg->lSites(),1,{
    sobj s; // only referenced by the commented-out peek/poke path at the end
    Coordinate Fcoor(nd);
    Coordinate Tcoor(nd);
    Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
    // A site is skipped unless every coordinate lies in
    // [FromLowerLeft, FromLowerLeft+RegionSize); Tcoor is the translated coordinate.
    int in_region=1;
    for(int d=0;d<nd;d++){
      if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){ 
 	in_region=0;
      }
      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
    }
    if (in_region) {
      // idx_* : offset built from the inner strides and coor / rdimension
      // odx_* : offset built from the outer strides and coor % rdimension
      // (presumably the SIMD-lane index and the outer-site index - confirm)
      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
      // Treat each addressed vobj as a flat array of scalars; word w sits at idx + w*Nsimd.
      scalar_type * fp = (scalar_type *)&f_v[odx_f];
      scalar_type * tp = (scalar_type *)&t_v[odx_t];
      for(int w=0;w<words;w++){
 	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME: with an RRII layout this type pun does not work
      }
      //      peekLocalSite(s,From,Fcoor);
      //      pokeLocalSite(s,To  ,Tcoor);
    }
  });
 }
 template<class vobj>
 void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -101,7 +101,8 @@ public:
  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  // Efficient support for multigrid coarsening
-  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void   Meooe5D       (const FermionField &in, FermionField &out);
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -62,14 +62,15 @@ public:
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
-      ///////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
+  // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////
-      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
+  //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
-      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
+  //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
  // Constructors
  ContinuedFractionFermion5D(GaugeField &_Umu,
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -89,6 +89,7 @@ public:
  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -103,6 +103,7 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -86,7 +86,8 @@ public:
  void   MooeeDag    (const FermionField &in, FermionField &out);
  void   MooeeInvDag (const FermionField &in, FermionField &out);
-  void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
  // These can be overridden by fancy 5d chiral action
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -40,6 +40,11 @@ inline void convert(const Fieldi &from,Fieldo &to)
  to=from;
 }
 // Base class for the optional hook MADWF invokes with the current residual.
 // The default implementation ignores the value; derive and override operator()
 // to react to it.
 struct MADWFinnerIterCallbackBase
 {
   // Called with the residual of the iteration that has just finished.
   virtual void operator()(const RealD current_resid)
   {
     (void)current_resid;   // deliberately unused in the base class
   }

   // Virtual so that derived callbacks can be destroyed through a base pointer.
   virtual ~MADWFinnerIterCallbackBase() = default;
 };
 template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser> 
 class MADWF 
 {
@@ -56,24 +61,30 @@ class MADWF
  RealD target_resid;
  int   maxiter;
 public:
  //operator() is called on "callback" at the end of every inner iteration. This allows for example the adjustment of the inner
  //tolerance to speed up subsequent iteration
  MADWFinnerIterCallbackBase* callback;
 public:
  MADWF(Matrixo &_Mato,
-	Matrixi &_Mati, 
+	Matrixi &_Mati,
-	PVinverter &_PauliVillarsSolvero, 
+	PVinverter &_PauliVillarsSolvero,
 	SchurSolver &_SchurSolveri,
 	Guesser & _Guesseri,
 	RealD resid,
-	int _maxiter) :
+	int _maxiter,
 	MADWFinnerIterCallbackBase* _callback = NULL) :
  Mato(_Mato),Mati(_Mati),
    SchurSolveri(_SchurSolveri),
-    PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri)
+    PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri),
-  {   
+    callback(_callback)
-    target_resid=resid;
+    {
-    maxiter     =_maxiter; 
+      target_resid=resid;
-  };
+      maxiter     =_maxiter;
-
+    };
  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
  {
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
@@ -177,6 +188,8 @@ class MADWF
       std::cout << GridLogMessage << "Residual " << i << ": " << resid  << std::endl;
       std::cout << GridLogMessage << "***************************************" <<std::endl;
       if(callback != NULL) (*callback)(resid);       
       if (resid < target_resid) {
 	 return;
       }
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -67,12 +67,13 @@ public:
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
-      ///////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
+  // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
  // Constructors
  PartialFractionFermion5D(GaugeField &_Umu,
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -115,9 +115,10 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
-  void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
+  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out);
-                   int gamma, int dag);
+  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag);
  ///////////////////////////////////////////////////////////////
  // Extra methods added by derived
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -111,15 +111,16 @@ public:
  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
  virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
  // These can be overridden by fancy 5d chiral action
  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@@ -131,6 +132,9 @@ public:
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
  void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
  void DhopDirComms(const FermionField &in);
  void DhopDirCalc(const FermionField &in, FermionField &out,int point);
  ///////////////////////////////////////////////////////////////
  // New methods added 
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -60,13 +60,25 @@ public:
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;
  static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 			  int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
 private:
-  static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
+  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
 				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
  static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  static accelerator_inline void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
  // Specialised variants
  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
--- a/Grid/qcd/action/fermion/g5HermitianLinop.h
+++ b/Grid/qcd/action/fermion/g5HermitianLinop.h
@@ -54,6 +54,14 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    G5R5(out,tmp);
  }
  void OpDirAll(const Field &in, std::vector<Field> &out) {
    Field tmp(in.Grid());
    _Mat.MdirAll(in,out);
    for(int p=0;p<out.size();p++) {
      tmp=out[p];
      G5R5(out[p],tmp);
    }
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@@ -96,6 +104,12 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    out=g5*tmp;
  }
  void OpDirAll(const Field &in, std::vector<Field> &out) {
    _Mat.MdirAll(in,out);
    for(int p=0;p<out.size();p++) {
      out[p]=g5*out[p];
    }
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -389,6 +389,14 @@ void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,in
  Meo5D(psi,tmp);
  this->DhopDir(tmp,chi,dir,disp);
 }
 // All-directions counterpart of Mdir: apply Meo5D to psi, then hand the
 // result to DhopDirAll, which fills 'out'.
 template<class Impl>
 void CayleyFermion5D<Impl>::MdirAll(const FermionField &psi, std::vector<FermionField> &out)
 {
   FermionField meo5d_psi(psi.Grid());   // holds Meo5D applied to psi

   this->Meo5D(psi,meo5d_psi);
   this->DhopDirAll(meo5d_psi,out);
 }
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
 void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -143,6 +143,25 @@ void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFi
  }
 }
 // All-directions version of Mdir: run DhopDirAll once, then rescale every
 // s-slice of each returned field in place with ag5xpby_ssp.
 template<class Impl>
 void ContinuedFractionFermion5D<Impl>::MdirAll(const FermionField &psi, std::vector<FermionField> &chi)
 {
   const int Ls = this->Ls;

   // Dslash on diagonal. g5 Dslash is hermitian
   this->DhopDirAll(psi,chi);

   for(auto &leg : chi){
     // The sign starts at +1 for each field and flips after every slice.
     int sign=1;
     for(int s=0;s<Ls;s++,sign=-sign){
       if ( s!=(Ls-1) ){
         ag5xpby_ssp(leg,cc[s]*Beta[s]*sign*ZoloHiInv,leg,0.0,leg,s,s);
       } else {
         // Last slice: no cc[s] and no sign factor.
         ag5xpby_ssp(leg,Beta[s]*ZoloHiInv,leg,0.0,leg,s,s);
       }
     }
   }
 }
 template<class Impl>
 void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls = this->Ls;
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  DhopDir(in, out, dir, disp);
 }
 // All-directions hop for the 5D improved staggered action: no implementation
 // is provided, so any call trips the assert below. Neither 'in' nor 'out' is touched.
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
  assert(0);
 }
 template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -362,12 +362,19 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  DhopDir(in, out, dir, disp);
 }
 // All-directions hop for the improved staggered action: placeholder only.
 // Any call trips the assert below; neither 'in' nor 'out' is touched.
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
  assert(0); // Not implemented yet
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
@@ -380,6 +387,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
  });
 };
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 						  DoubledGaugeField &U,
@@ -404,7 +412,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 #ifdef GRID_OMP
  Compressor compressor; 
  int len =  U.Grid()->oSites();
  const int LLs =  1;
  DhopTotalTime   -= usecond();
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Impl>
+ template<class Impl>
 void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
  // this does both dag and undag but is trivial; make a common helper routine
  int Ls = this->Ls;
@@ -45,8 +45,25 @@ void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFiel
    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
  }
  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
 }
 // All-directions version of Mdir: run DhopDirAll once, then rescale every
 // s-slice of each returned field in place with ag5xpby_ssp.
 // this does both dag and undag but is trivial; make a common helper routine
 template<class Impl>
 void PartialFractionFermion5D<Impl>::MdirAll(const FermionField &psi, std::vector<FermionField> &chi)
 {
   const int Ls     = this->Ls;
   const int nblock = (Ls-1)/2;   // number of two-slice blocks below the last slice

   this->DhopDirAll(psi,chi);

   for(auto &leg : chi){
     // Slices are handled in pairs: even slice scaled by -scale, odd slice by +scale.
     for(int s=0;s<2*nblock;s+=2){
       ag5xpby_ssp(leg,-scale,leg,0.0,leg,s,s);
       ag5xpby_ssp(leg, scale,leg,0.0,leg,s+1,s+1);
     }
     // The final slice has its own coefficient.
     ag5xpby_ssp(leg,p[nblock]*scale/amax,leg,0.0,leg,Ls-1,Ls-1);
   }
 }
 template<class Impl>
 void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -241,6 +241,15 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
 };
 // Do one halo exchange of 'in' and then let Kernels::DhopDirAll fill 'out'.
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
 {
   // Communication step (non-daggered compressor).
   Compressor halo_compressor(DaggerNo);
   Stencil.HaloExchange(in,halo_compressor);

   // The site count comes from the gauge field's grid, not from 'in'.
   const uint64_t outer_sites = Umu.Grid()->oSites();
   Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,outer_sites,in,out);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -319,28 +319,51 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
 }
 template <class Impl>
-void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  DhopDir(in, out, dir, disp);
 }
 // For the Wilson action the all-directions operator is DhopDirAll itself;
 // this simply forwards to it.
 template <class Impl>
 void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
 {
   this->DhopDirAll(in, out);
 }
 template <class Impl>
 void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in, compressor);
  int skip = (disp == 1) ? 0 : 1;
  int dirdisp = dir + skip * 4;
  int gamma = dir + (1 - skip) * 4;
-  DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
+  DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
 };
 template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
-  Compressor compressor(dag);
+  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in, compressor);
  assert((out.size()==8)||(out.size()==9)); 
  for(int dir=0;dir<Nd;dir++){
    for(int disp=-1;disp<=1;disp+=2){
      int skip = (disp == 1) ? 0 : 1;
      int dirdisp = dir + skip * 4;
      int gamma = dir + (1 - skip) * 4;
      DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
    }
  }
 }
 template <class Impl>
 void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
 {
  int Ls=1;
-  int Nsite=in.oSites();
+  uint64_t Nsite=in.oSites();
  Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma);
 };
@@ -348,7 +371,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
-                                       FermionField &out, int dag) {
+                                       FermionField &out, int dag) 
 {
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  }								\
  synchronise();						
-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
+#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
  if (gamma == Dir) {						\
    if (SE->_is_local ) {					\
      int perm= SE->_permute;					\
      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
@@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    }								\
    synchronise();						\
    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
-    Recon(result, Uchi);					\
+    Recon(result, Uchi);					
-    synchronise();						\
+
 #define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
  if (gamma == Dir) {						\
    GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon);			\
  }
  ////////////////////////////////////////////////////////////////////
  // All legs kernels ; comms then compute
  ////////////////////////////////////////////////////////////////////
@@ -284,7 +287,36 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
  }
 };
-template <class Impl>
+#define DhopDirMacro(Dir,spProj,spRecon)	\
  template <class Impl>							\
  void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
 					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
  {									\
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;		\
  typedef decltype(coalescedRead(in[0]))  calcSpinor;			\
  calcHalfSpinor chi;							\
  calcSpinor result;							\
  calcHalfSpinor Uchi;							\
  StencilEntry *SE;							\
  int ptype;								\
  const int Nsimd = SiteHalfSpinor::Nsimd();				\
  const int lane=SIMTlane(Nsimd);					\
 									\
  SE = st.GetEntry(ptype, dir, sF);					\
  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
  coalescedWrite(out[sF], result,lane);					\
  }									
 DhopDirMacro(Xp,spProjXp,spReconXp);
 DhopDirMacro(Yp,spProjYp,spReconYp);
 DhopDirMacro(Zp,spProjZp,spReconZp);
 DhopDirMacro(Tp,spProjTp,spReconTp);
 DhopDirMacro(Xm,spProjXm,spReconXm);
 DhopDirMacro(Ym,spProjYm,spReconYm);
 DhopDirMacro(Zm,spProjZm,spReconZm);
 DhopDirMacro(Tm,spProjTm,spReconTm);
 template <class Impl> 
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
 {
@@ -299,18 +331,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  const int lane=SIMTlane(Nsimd);
  SE = st.GetEntry(ptype, dir, sF);
-  if (gamma == Xp) {						
+  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
    if (SE->_is_local ) {					
      int perm= SE->_permute;					
      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	
      spProjXp(chi,tmp);						
    } else {							
      chi = coalescedRead(buf[SE->_offset],lane);			
    }								
    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		
    spReconXp(result, Uchi);					
  }
  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
@@ -321,6 +342,38 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  coalescedWrite(out[sF], result,lane);
 }
 // Applies the eight single-direction hopping kernels (Xm,Ym,Zm,Tm,Xp,Yp,Zp,Tp)
 // to every site in one launch, writing each direction's result into its own
 // output field: out[0..3] receive the "m" legs and out[4..7] the "p" legs,
 // and the same index is passed to the kernel as its stencil direction.
 //
 // out must hold at least 8 fields (indices 0..7 are accessed).
 // The buf argument is not used here; the kernels are given st.CommBuf().
 // NOTE(review): Nsite*Ls is evaluated in int -- confirm it cannot overflow
 // for the volumes in use.
 template <class Impl>
 void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 				      int Nsite, const FermionField &in, std::vector<FermionField> &out) 
 {
   auto U_v   = U.View();
   auto in_v  = in.View();
   auto st_v  = st.View();
   // One view per output direction, named after the kernel that fills it.
   auto out_Xm = out[0].View();
   auto out_Ym = out[1].View();
   auto out_Zm = out[2].View();
   auto out_Tm = out[3].View();
   auto out_Xp = out[4].View();
   auto out_Yp = out[5].View();
   auto out_Zp = out[6].View();
   auto out_Tp = out[7].View();
   // NOTE(review): accelerator_forNB looks like the non-blocking launch variant;
   // confirm the caller synchronises before reading out[] or releasing the views.
   accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
      // sss is the flat site index; sU is the matching index into U.
      int sU=sss/Ls;				
      int sF =sss;				
      DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
      DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
      DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
      DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
      DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
      DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
      DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
      DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
   });
 }
 template <class Impl>
 void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) 
@@ -332,13 +385,32 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   auto in_v  = in.View();
   auto out_v = out.View();
   auto st_v  = st.View();
-   accelerator_for(ss,Nsite,Simd::Nsimd(),{
+#define LoopBody(Dir)				\
-    for(int s=0;s<Ls;s++){
+   case Dir :			\
-      int sU=ss;
+     accelerator_forNB(ss,Nsite,Simd::Nsimd(),{	\
-      int sF = s+Ls*sU; 
+       for(int s=0;s<Ls;s++){			\
-      DhopDirK(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp,gamma);
+	 int sU=ss;				\
-    }
+	 int sF = s+Ls*sU;						\
-  });
+	 DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
       }							       \
       });							       \
     break;
   switch(gamma){
   LoopBody(Xp);
   LoopBody(Yp);
   LoopBody(Zp);
   LoopBody(Tp);
   LoopBody(Xm);
   LoopBody(Ym);
   LoopBody(Zm);
   LoopBody(Tm);
   default:
     assert(0);
     break;
   }
 #undef LoopBody
 } 
 #define KERNEL_CALLNB(A) \
--- a/Grid/qcd/modules/Registration.h
+++ b/Grid/qcd/modules/Registration.h
@@ -80,6 +80,8 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
 static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,   
                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); 
 // Registers the BiCGSTAB solver module under the factory key "BiCGSTAB".
 // The registrar object needs its own name: reusing __CGWFmodXMLInit would
 // redefine the ConjugateGradient registrar declared just before it.
 static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,   
                   HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGSTABWFmodXMLInit("BiCGSTAB"); 
 static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,   
                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 
--- a/Grid/qcd/modules/SolverModules.h
+++ b/Grid/qcd/modules/SolverModules.h
@@ -119,6 +119,17 @@ class ConjugateGradientModule: public SolverModule<ConjugateGradient, Field, Sol
  }
 };
 // Solver module that instantiates a BiCGSTAB solver from the module's
 // stored SolverParameters (tolerance and max_iterations).
 template <class Field>
 class BiCGSTABModule : public SolverModule<BiCGSTAB, Field, SolverParameters>
 {
   using SolverBase = SolverModule<BiCGSTAB, Field, SolverParameters>;
   using SolverBase::SolverBase; // inherit the base-class constructors

   // Allocates the solver and hands ownership to SolverPtr.
   virtual void initialize()
   {
     auto *solver = new BiCGSTAB<Field>(this->Par_.tolerance,
                                        this->Par_.max_iterations,
                                        true);
     this->SolverPtr.reset(solver);
   }
 };
 template <class Field >
 class ConjugateResidualModule: public SolverModule<ConjugateResidual, Field, SolverParameters> {
  typedef SolverModule<ConjugateResidual, Field, SolverParameters> SolverBase;
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -92,6 +92,7 @@ public:
  };
  // Mdir, MdirAll and Mdiag are not implemented for this operator: each body is
  // a bare assert(0), so calling one aborts in builds where assert is active.
  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
  void ImportGauge(const GaugeField& _U) {
--- a/Grid/serialisation/BaseIO.h
+++ b/Grid/serialisation/BaseIO.h
@@ -97,6 +97,23 @@ namespace Grid {
    template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};
    template<typename T> struct is_tensor_variable<T, typename std::enable_if<is_tensor<T>::value
        && !is_tensor_fixed<T>::value>::type> : public std::true_type {};
    // Helper functions to get the ultimate scalar inside a tensor, and corresponding size
    // Number of scalars in the tensor: element count times Traits<ET>::count.
    template <typename ET>
    inline typename std::enable_if<is_tensor<ET>::value, const typename ET::Index>::type
    getScalarCount(const ET &eigenTensor)
    {
      const auto elementCount = eigenTensor.size();
      return elementCount * Traits<ET>::count;
    }
    // Read-only pointer to the first scalar of a tensor whose elements are scalars.
    template <typename ET>
    inline typename std::enable_if<is_tensor_of_scalar<ET>::value, const typename ET::Scalar *>::type
    getFirstScalar(const ET &eigenTensor)
    {
      auto firstScalar = eigenTensor.data();
      return firstScalar;
    }
    // Writable pointer to the first scalar of a tensor whose elements are scalars.
    template <typename ET>
    inline typename std::enable_if<is_tensor_of_scalar<ET>::value, typename ET::Scalar *>::type
    getFirstScalar(ET &eigenTensor)
    {
      auto firstScalar = eigenTensor.data();
      return firstScalar;
    }
    // Read-only pointer to the first scalar of a tensor whose elements are
    // containers: taken from begin() of the first element.
    template <typename ET>
    inline typename std::enable_if<is_tensor_of_container<ET>::value, const typename Traits<ET>::scalar_type *>::type
    getFirstScalar(const ET &eigenTensor)
    {
      auto firstElement = eigenTensor.data();
      return firstElement->begin();
    }
    // Writable pointer to the first scalar of a tensor whose elements are
    // containers: taken from begin() of the first element.
    template <typename ET>
    inline typename std::enable_if<is_tensor_of_container<ET>::value, typename Traits<ET>::scalar_type *>::type
    getFirstScalar(ET &eigenTensor)
    {
      auto firstElement = eigenTensor.data();
      return firstElement->begin();
    }
  }
  // Abstract writer/reader classes ////////////////////////////////////////////
@@ -128,23 +145,6 @@ namespace Grid {
    typename std::enable_if<EigenIO::is_tensor<ETensor>::value>::type
    write(const std::string &s, const ETensor &output);
    // Helper functions for Scalar vs Container specialisations
    // Scalar-element tensors: the first scalar is the tensor's data pointer.
    template <typename ETensor>
    inline typename std::enable_if<EigenIO::is_tensor_of_scalar<ETensor>::value,
                                   const typename ETensor::Scalar *>::type
    getFirstScalar(const ETensor &output)
    {
      auto firstScalar = output.data();
      return firstScalar;
    }
    // Container-element tensors: the first scalar is begin() of the first element.
    template <typename ETensor>
    inline typename std::enable_if<EigenIO::is_tensor_of_container<ETensor>::value,
                                   const typename EigenIO::Traits<ETensor>::scalar_type *>::type
    getFirstScalar(const ETensor &output)
    {
      auto firstElement = output.data();
      return firstElement->begin();
    }
    template <typename S>
    inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
    copyScalars(S * &pCopy, const S &Source)
@@ -318,12 +318,12 @@ namespace Grid {
      TotalDims[TensorRank + i] = Traits::Dimension(i);
    // If the Tensor isn't in Row-Major order, then we'll need to copy it's data
-    const bool CopyData{NumElements > 1 && ETensor::Layout != Eigen::StorageOptions::RowMajor};
+    const bool CopyData{NumElements > 1 && static_cast<int>( ETensor::Layout ) != static_cast<int>( Eigen::StorageOptions::RowMajor )};
    const Scalar * pWriteBuffer;
    std::vector<Scalar> CopyBuffer;
    const Index TotalNumElements = NumElements * Traits::count;
    if( !CopyData ) {
-      pWriteBuffer = getFirstScalar( output );
+      pWriteBuffer = EigenIO::getFirstScalar( output );
    } else {
      // Regardless of the Eigen::Tensor storage order, the copy will be Row Major
      CopyBuffer.resize( TotalNumElements );
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -403,6 +403,10 @@ namespace Optimization {
    // Divide for real double vectors, delegating to operator/ of the vector type.
    accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
      auto quotient = a/b;
      return quotient;
    }
    // Divide for integer vectors, delegating to operator/ of the vector type.
    accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
      auto quotient = a/b;
      return quotient;
    }
    // Danger -- element-wise divide for complex, not complex division.
    // See Grid_vector_types.h lines around 735, applied after "toReal"
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -628,6 +628,7 @@ void Grid_debug_handler_init(void)
  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  sigaction(SIGBUS,&sa,NULL);
  sigaction(SIGUSR2,&sa,NULL);
  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -1,777 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/A2AMatrix.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef A2A_Matrix_hpp_
 #define A2A_Matrix_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
 #endif
 #ifndef HADRONS_A2AM_IO_TYPE
 #define HADRONS_A2AM_IO_TYPE ComplexF
 #endif
 #define HADRONS_A2AM_PARALLEL_IO
 BEGIN_HADRONS_NAMESPACE
 // general A2A matrix set based on Eigen tensors and Grid-allocated memory
 // Dimensions:
 //   0 - ext - external field (momentum, EM field, ...)
 //   1 - str - spin-color structure
 //   2 - t   - timeslice
 //   3 - i   - left  A2A mode index
 //   4 - j   - right A2A mode index
 template <typename T>
 using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
 template <typename T>
 using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
 template <typename T>
 using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
 /******************************************************************************
 *                      Abstract class for A2A kernels                        *
 ******************************************************************************/
 // Interface that every A2A kernel implements. All three operations are pure
 // virtual, so the class cannot be instantiated directly.
 template <typename T, typename Field>
 class A2AKernel
 {
 public:
     A2AKernel(void) = default;
     virtual ~A2AKernel(void) = default;
     // Kernel invocation.
     // NOTE(review): presumably fills m from the left/right field arrays along
     // orthogDim and reports timing through `time` -- confirm against the
     // concrete kernels.
     virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
                           const unsigned int orthogDim, double &time) = 0;
     // Cost estimates for one block of the given i/j sizes.
     virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
     virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
 };
 /******************************************************************************
 *                  Class to handle A2A matrix block HDF5 I/O                 *
 ******************************************************************************/
 // Describes one A2A matrix dataset (file name, dataset name and nt x ni x nj
 // extents) and declares the operations to create, write and read it.
 template <typename T>
 class A2AMatrixIo
 {
 public:
     // constructors
     A2AMatrixIo(void) = default;
     // ni and nj default to 0.
     A2AMatrixIo(std::string filename, std::string dataname, 
                 const unsigned int nt, const unsigned int ni = 0,
                 const unsigned int nj = 0);
     // destructor
     ~A2AMatrixIo(void) = default;
     // access
     unsigned int getNi(void) const;
     unsigned int getNj(void) const;
     unsigned int getNt(void) const;
     size_t       getSize(void) const;
     // file allocation
     template <typename MetadataType>
     void initFile(const MetadataType &d, const unsigned int chunkSize);
     // block I/O
     // Raw-pointer overload: data points at the block to store at offset (i, j).
     void saveBlock(const T *data, const unsigned int i, const unsigned int j,
                    const unsigned int blockSizei, const unsigned int blockSizej);
     // Matrix-set overload: stores the (ext, str) slice of m at offset (i, j).
     void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
                    const unsigned int i, const unsigned int j);
     // Reads the dataset into v; tRead and grid both default to nullptr.
     template <template <class> class Vec, typename VecT>
     void load(Vec<VecT> &v, double *tRead = nullptr, GridBase *grid = nullptr);
 private:
     std::string  filename_{""}, dataname_{""};
     unsigned int nt_{0}, ni_{0}, nj_{0};
 };
 /******************************************************************************
 *                  Wrapper for A2A matrix block computation                  *
 ******************************************************************************/
 // Driver for a blocked A2A matrix computation: holds the blocking parameters,
 // a cache buffer of kernel type T, an I/O buffer of type TIo (T by default),
 // and the list of pending per-node I/O requests.
 template <typename T, typename Field, typename MetadataType, typename TIo = T>
 class A2AMatrixBlockComputation
 {
 private:
     // One pending write: the target dataset, its metadata, and the
     // (e, s, i, j) indices of the block.
     struct IoHelper
     {
         A2AMatrixIo<TIo> io;
         MetadataType     md;
         unsigned int     e, s, i, j;
     };
     // Callbacks taking two unsigned indices and returning a name or metadata.
     typedef std::function<std::string(const unsigned int, const unsigned int)>  FilenameFn;
     typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn;
 public:
     // constructor
     // tArray is optional (defaults to nullptr).
     A2AMatrixBlockComputation(GridBase *grid,
                               const unsigned int orthogDim,
                               const unsigned int next,
                               const unsigned int nstr,
                               const unsigned int blockSize,
                               const unsigned int cacheBlockSize,
                               TimerArray *tArray = nullptr);
     // execution
     void execute(const std::vector<Field> &left, 
                  const std::vector<Field> &right,
                  A2AKernel<T, Field> &kernel,
                  const FilenameFn &ionameFn,
                  const FilenameFn &filenameFn,
                  const MetadataFn &metadataFn);
 private:
     // I/O handler
     void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
 private:
     // Members are initialised in this declaration order, whatever order the
     // constructor's initializer list uses.
     TimerArray            *tArray_;
     GridBase              *grid_;
     unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
     Vector<T>             mCache_;
     Vector<TIo>           mBuf_;
     std::vector<IoHelper> nodeIo_;
 };
 /******************************************************************************
 *                       A2A matrix contraction kernels                       *
 ******************************************************************************/
 // Static helpers for contracting A2A matrices: trace of a product, plain
 // matrix product, and flop estimates for both. With USE_MKL defined the
 // work goes to CBLAS routines; otherwise Eigen expressions are used.
 class A2AContraction
 {
 public:
     // accTrMul(acc, a, b): acc += tr(a*b)
     // The trace is built from one dot product per row of a (RowMajor a with
     // ColMajor b) or per column of a (any other layout combination), each
     // added to acc inside a critical section.
     template <typename C, typename MatLeft, typename MatRight>
     static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
     {
         const int RowMajor = Eigen::RowMajor;
         const int ColMajor = Eigen::ColMajor;
         if ((MatLeft::Options  == RowMajor) and
             (MatRight::Options == ColMajor))
         {
   	  thread_for(r,a.rows(),
             {
                 C tmp;
 #ifdef USE_MKL
                 dotuRow(tmp, r, a, b);
 #else
                 // Eigen's dot() conjugates its left operand, so conjugate()
                 // is applied first to obtain the unconjugated product that
                 // the MKL branch computes with *dotu.
                 tmp = a.row(r).conjugate().dot(b.col(r));
 #endif
                 thread_critical
                 {
                     acc += tmp;
                 }
             });
         }
         else
  	  {
             thread_for(c,a.cols(),
             {
                 C tmp;
 #ifdef USE_MKL 
                 dotuCol(tmp, c, a, b);
 #else
                 // Same conjugate() cancellation as in the row branch.
                 tmp = a.col(c).conjugate().dot(b.row(c));
 #endif
                 thread_critical
                 {
                     acc += tmp;
                 }
             });
         }
     }
     // Flop estimate for accTrMul: 8 per element of a. b is not used.
     template <typename MatLeft, typename MatRight>
     static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
     {
         double n = a.rows()*a.cols();
         return 8.*n;
     }
     // mul(res, a, b): res = a*b
 #ifdef USE_MKL
     // Double-precision complex product through cblas_zgemm. res is resized
     // when its shape does not match a.rows() x b.cols(). Nothing is computed
     // if Options equals neither RowMajor nor ColMajor exactly.
     template <template <class, int...> class Mat, int... Opts>
     static inline void mul(Mat<ComplexD, Opts...> &res, 
                            const Mat<ComplexD, Opts...> &a, 
                            const Mat<ComplexD, Opts...> &b)
     {
         static const ComplexD one(1., 0.), zero(0., 0.);
         const int RowMajor = Eigen::RowMajor;
         const int ColMajor = Eigen::ColMajor;
         if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
         {
             res.resize(a.rows(), b.cols());
         }
         if (Mat<ComplexD, Opts...>::Options == RowMajor)
         {
             cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                         res.data(), res.cols());
         }
         else if (Mat<ComplexD, Opts...>::Options == ColMajor)
         {
             cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                         res.data(), res.rows());
         }
     }
     // Single-precision counterpart of the overload above, through cblas_cgemm.
     template <template <class, int...> class Mat, int... Opts>
     static inline void mul(Mat<ComplexF, Opts...> &res, 
                            const Mat<ComplexF, Opts...> &a, 
                            const Mat<ComplexF, Opts...> &b)
     {
         static const ComplexF one(1., 0.), zero(0., 0.);
         const int RowMajor = Eigen::RowMajor;
         const int ColMajor = Eigen::ColMajor;
         if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
         {
             res.resize(a.rows(), b.cols());
         }
         if (Mat<ComplexF, Opts...>::Options == RowMajor)
         {
             cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                         res.data(), res.cols());
         }
         else if (Mat<ComplexF, Opts...>::Options == ColMajor)
         {
             cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                         res.data(), res.rows());
         }
     }
 #else
     // Without MKL the product is left to Eigen.
     template <typename Mat>
     static inline void mul(Mat &res, const Mat &a, const Mat &b)
     {
         res = a*b;
     }
 #endif
     // Flop estimate for mul, computed from the dimensions of a only.
     // NOTE(review): the formula reads as 6 flops per complex multiply and 2 per
     // complex add for an a.rows() x a.rows() result -- confirm that b is
     // always a.cols() x a.rows() where this is used.
     template <typename Mat>
     static inline double mulFlops(const Mat &a, const Mat &b)
     {
         double nr = a.rows(), nc = a.cols();
         return nr*nr*(6.*nc + 2.*(nc - 1.));
     }
 private:
     // Computes start pointers and strides for walking row aRow of a and
     // column aRow of b, according to each matrix's storage order.
     // The outputs are left untouched for an operand whose Options equals
     // neither RowMajor nor ColMajor exactly.
     // This helper sits outside the USE_MKL guard although, within this class,
     // only the MKL-only dotuRow overloads call it.
     template <typename C, typename MatLeft, typename MatRight>
     static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                     unsigned int &bInc, const unsigned int aRow, 
                                     const MatLeft &a, const MatRight &b)
     {
         const int RowMajor = Eigen::RowMajor;
         const int ColMajor = Eigen::ColMajor;
         if (MatLeft::Options == RowMajor)
         {
             aPt  = a.data() + aRow*a.cols();
             aInc = 1;
         }
         else if (MatLeft::Options == ColMajor)
         {
             aPt  = a.data() + aRow;
             aInc = a.rows();
         }
         if (MatRight::Options == RowMajor)
         {
             bPt  = b.data() + aRow;
             bInc = b.cols();
         }
         else if (MatRight::Options == ColMajor)
         {
             bPt  = b.data() + aRow*b.rows();
             bInc = 1;
         }
     }
 #ifdef USE_MKL
     // Column counterpart of makeDotRowPt: pointers and strides for walking
     // column aCol of a and row aCol of b.
     template <typename C, typename MatLeft, typename MatRight>
     static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                     unsigned int &bInc, const unsigned int aCol, 
                                     const MatLeft &a, const MatRight &b)
     {
         const int RowMajor = Eigen::RowMajor;
         const int ColMajor = Eigen::ColMajor;
         if (MatLeft::Options == RowMajor)
         {
             aPt  = a.data() + aCol;
             aInc = a.cols();
         }
         else if (MatLeft::Options == ColMajor)
         {
             aPt  = a.data() + aCol*a.rows();
             aInc = 1;
         }
         if (MatRight::Options == RowMajor)
         {
             bPt  = b.data() + aCol*b.cols();
             bInc = 1;
         }
         else if (MatRight::Options == ColMajor)
         {
             bPt  = b.data() + aCol;
             bInc = b.rows();
         }
     }
     // Unconjugated dot product of row aRow of a with column aRow of b
     // (single precision, a.cols() terms).
     template <typename MatLeft, typename MatRight>
     static inline void dotuRow(ComplexF &res, const unsigned int aRow,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexF *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
         cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
     }
     // Unconjugated dot product of column aCol of a with row aCol of b
     // (single precision, a.rows() terms).
     template <typename MatLeft, typename MatRight>
     static inline void dotuCol(ComplexF &res, const unsigned int aCol,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexF *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
         cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
     }
     // Double-precision version of dotuRow.
     template <typename MatLeft, typename MatRight>
     static inline void dotuRow(ComplexD &res, const unsigned int aRow,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexD *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
         cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
     }
     // Double-precision version of dotuCol.
     template <typename MatLeft, typename MatRight>
     static inline void dotuCol(ComplexD &res, const unsigned int aCol,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexD *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
         cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
     }
 #endif
 };
 /******************************************************************************
 *                     A2AMatrixIo template implementation                    *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Records the file name, dataset name and the three extents; no file is
 // opened or created here.
 template <typename T>
 A2AMatrixIo<T>::A2AMatrixIo(std::string filename,
                             std::string dataname,
                             const unsigned int nt,
                             const unsigned int ni,
                             const unsigned int nj)
 : filename_(filename)
 , dataname_(dataname)
 , nt_(nt)
 , ni_(ni)
 , nj_(nj)
 {
 }
 // access //////////////////////////////////////////////////////////////////////
 // Returns the stored nt extent.
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNt(void) const
 {
     return this->nt_;
 }
 // Returns the stored ni extent.
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNi(void) const
 {
     return this->ni_;
 }
 // Returns the stored nj extent.
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNj(void) const
 {
     return this->nj_;
 }
 // Size in bytes of the full nt x ni x nj matrix set of element type T.
 template <typename T>
 size_t A2AMatrixIo<T>::getSize(void) const
 {
     // Start from sizeof(T) so that every multiplication is carried out in
     // size_t: nt_, ni_ and nj_ are unsigned int, and multiplying them together
     // first would wrap around 32 bits for large matrices.
     return sizeof(T)*nt_*ni_*nj_;
 }
 // file allocation /////////////////////////////////////////////////////////////
 // Creates the output file: first a file holding only the metadata d under
 // dataname_, then an empty nt_ x ni_ x nj_ dataset named HADRONS_A2AM_NAME
 // inside that group, chunked as nt_ x chunkSize x chunkSize with the
 // Fletcher32 checksum filter enabled.
 // Without HAVE_HDF5 this raises an Implementation error.
 // NOTE(review): chunkSize is used as given for both matrix dimensions --
 // confirm callers never pass a value larger than ni_ or nj_.
 template <typename T>
 template <typename MetadataType>
 void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
 {
 #ifdef HAVE_HDF5
     std::vector<hsize_t>    dim = {static_cast<hsize_t>(nt_), 
                                    static_cast<hsize_t>(ni_), 
                                    static_cast<hsize_t>(nj_)},
                             chunk = {static_cast<hsize_t>(nt_), 
                                      static_cast<hsize_t>(chunkSize), 
                                      static_cast<hsize_t>(chunkSize)};
     H5NS::DataSpace         dataspace(dim.size(), dim.data());
     H5NS::DataSet           dataset;
     H5NS::DSetCreatPropList plist;
     // create empty file just with metadata
     // (the writer lives in its own scope, so it is destroyed before the file
     // is opened again below)
     {
         Hdf5Writer writer(filename_);
         write(writer, dataname_, d);
     }
     // create the dataset
     // NOTE(review): the `false` argument presumably requests a writable
     // handle -- confirm against Hdf5Reader.
     Hdf5Reader reader(filename_, false);
     push(reader, dataname_);
     auto &group = reader.getGroup();
     plist.setChunk(chunk.size(), chunk.data());
     plist.setFletcher32();
     dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
     HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 // block I/O ///////////////////////////////////////////////////////////////////
 // Writes one nt_ x blockSizei x blockSizej block, read contiguously from
 // data, into the existing HADRONS_A2AM_NAME dataset at offset (0, i, j).
 // Without HAVE_HDF5 this raises an Implementation error.
 template <typename T>
 void A2AMatrixIo<T>::saveBlock(const T *data, 
                                const unsigned int i, 
                                const unsigned int j,
                                const unsigned int blockSizei,
                                const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
     // NOTE(review): the `false` argument presumably requests a writable
     // handle -- confirm against Hdf5Reader.
     Hdf5Reader           reader(filename_, false);
     // Hyperslab description: extent of the block, its position in the file,
     // and unit stride/block so the selection is dense.
     std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                          offset = {0, static_cast<hsize_t>(i),
                                    static_cast<hsize_t>(j)},
                          stride = {1, 1, 1},
                          block  = {1, 1, 1}; 
     // The in-memory layout has the same extents as the selected region.
     H5NS::DataSpace      memspace(count.size(), count.data()), dataspace;
     H5NS::DataSet        dataset;
     //    size_t               shift;
     push(reader, dataname_);
     auto &group = reader.getGroup();
     dataset     = group.openDataSet(HADRONS_A2AM_NAME);
     dataspace   = dataset.getSpace();
     dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                               stride.data(), block.data());
     dataset.write(data, Hdf5Type<T>::type(), memspace, dataspace);
 #else
     HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 // Writes the (ext, str) slice of the matrix set m to file at offset (i, j).
 // The block sizes are taken from dimensions 3 and 4 of m, and the slice is
 // located inside m's contiguous storage by a linear element offset.
 template <typename T>
 void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
                                const unsigned int ext, const unsigned int str,
                                const unsigned int i, const unsigned int j)
 {
     unsigned int blockSizei = m.dimension(3);
     unsigned int blockSizej = m.dimension(4);
     unsigned int nstr       = m.dimension(1);
     // Widen to size_t before multiplying: every operand is unsigned int, so
     // the product would otherwise wrap around 32 bits for large sets and
     // point at the wrong slice.
     size_t       offset     = (static_cast<size_t>(ext)*nstr + str)*nt_*blockSizei*blockSizej;
     saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
 }
 // Reads the HADRONS_A2AM_NAME dataset into v, one ni_ x nj_ matrix per
 // timeslice, converting each element to VecT.
 //
 //   v     - indexable container; v[t] is assigned for every t in [0, nt_).
 //           It is not resized here.
 //   tRead - optional; when non-null it is reset to zero and then accumulates
 //           the time spent reading and broadcasting.
 //   grid  - optional; when given, only the boss rank touches the file and
 //           every timeslice is broadcast to the other ranks.
 //
 // If ni_ or nj_ is zero they are taken from the file; otherwise the file
 // extents must match nt_ x ni_ x nj_. Size errors are raised on mismatch, and
 // an Implementation error is raised when built without HAVE_HDF5.
 template <typename T>
 template <template <class> class Vec, typename VecT>
 void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, GridBase *grid)
 {
 #ifdef HAVE_HDF5
     std::vector<hsize_t> hdim;
     H5NS::DataSet        dataset;
     H5NS::DataSpace      dataspace;
     H5NS::CompType       datatype;

     if (!(grid) || grid->IsBoss())
     {
         Hdf5Reader reader(filename_);
         push(reader, dataname_);
         auto &group = reader.getGroup();
         dataset = group.openDataSet(HADRONS_A2AM_NAME);
         datatype = dataset.getCompType();
         dataspace = dataset.getSpace();
         hdim.resize(dataspace.getSimpleExtentNdims());
         dataspace.getSimpleExtentDims(hdim.data());
         // hdim[0..2] are indexed below, so anything but rank 3 is rejected.
         if (hdim.size() != 3)
         {
             HADRONS_ERROR(Size, "all-to-all matrix dataset has rank "
                 + std::to_string(hdim.size()) + ", expected 3");
         }
         // The dimensions are tested one by one: a 32-bit product of them
         // could wrap to zero and skip the check.
         if ((nt_ != 0) and (ni_ != 0) and (nj_ != 0) and
             ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
         {
             HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
                 + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
                 + std::to_string(hdim[2]) + ", expected "
                 + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
                 + std::to_string(nj_) + ")");
         }
         else if ((ni_ == 0) or (nj_ == 0))
         {
             if (hdim[0] != nt_)
             {
                 HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
                     + std::to_string(hdim[0]) + ", expected "
                     + std::to_string(nt_) + ")");
             }
             ni_ = hdim[1];
             nj_ = hdim[2];
         }
     }
     // The other ranks learn the matrix extents from the boss.
     if (grid)
     {
         grid->Broadcast(grid->BossRank(), &ni_, sizeof(unsigned int));
         grid->Broadcast(grid->BossRank(), &nj_, sizeof(unsigned int));
     }

     A2AMatrix<T>         buf(ni_, nj_);
     // NOTE(review): the broadcast size is an int, so a single timeslice must
     // stay below 2 GiB -- confirm for the largest matrices in use.
     int broadcastSize =  sizeof(T) * buf.size();
     std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
                                      static_cast<hsize_t>(nj_)},
                          stride   = {1, 1, 1},
                          block    = {1, 1, 1},
                          memCount = {static_cast<hsize_t>(ni_),
                                      static_cast<hsize_t>(nj_)};
     H5NS::DataSpace      memspace(memCount.size(), memCount.data());

     std::cout << "Loading timeslice";
     std::cout.flush();
     // tRead defaults to nullptr, so it is only written when supplied.
     if (tRead) *tRead = 0.;
     // Timeslices are read from the last one down to 0.
     for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
     {
         unsigned int         t      = tp1 - 1;
         std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};

         if (t % 10 == 0)
         {
             std::cout << " " << t;
             std::cout.flush();
         }
         if (!(grid) || grid->IsBoss())
         {
             dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                                       stride.data(), block.data());
         }
         if (tRead) *tRead -= usecond();
         if (!(grid) || grid->IsBoss())
         {
             dataset.read(buf.data(), datatype, memspace, dataspace);
         }
         if (grid)
         {
             grid->Broadcast(grid->BossRank(), buf.data(), broadcastSize);
         }
         if (tRead) *tRead += usecond();
         v[t] = buf.template cast<VecT>();
     }
     std::cout << std::endl;
 #else
     HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 /******************************************************************************
 *               A2AMatrixBlockComputation template implementation            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Builds a block computation helper working on `grid`.
 //   orthogDim      : lattice direction whose global extent defines nt_
 //   next, nstr     : extents of the two leading matrix-set indices
 //   blockSize      : edge length of the blocks stored in mBuf_
 //   cacheBlockSize : edge length of the cache blocks stored in mCache_
 //   tArray         : optional timer array; a null pointer disables timing
 // Both work buffers are allocated once here, sized for a full block.
 template <typename T, typename Field, typename MetadataType, typename TIo>
 A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::A2AMatrixBlockComputation(GridBase *grid,
                            const unsigned int orthogDim,
                            const unsigned int next, 
                            const unsigned int nstr,
                            const unsigned int blockSize, 
                            const unsigned int cacheBlockSize,
                            TimerArray *tArray)
 : grid_(grid), nt_(grid->GlobalDimensions()[orthogDim]), orthogDim_(orthogDim)
 , next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
 , tArray_(tArray)
 {
    // Perform the size arithmetic in std::size_t. The constructor arguments
    // are unsigned int, and a product of five such factors can exceed 2^32
    // for large blocks; it would then wrap and under-allocate the buffers.
    const std::size_t nSlice = static_cast<std::size_t>(nt_)*next_*nstr_;
    mCache_.resize(nSlice*cacheBlockSize_*cacheBlockSize_);
    mBuf_.resize(nSlice*blockSize_*blockSize_);
 }
 #define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
 #define STOP_TIMER(name)  if (tArray_) tArray_->stopTimer(name)
 #define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the N_i x N_j all-to-all matrix set block by block and writes each
 // block to disk as soon as it is complete.
 //   left, right : vectors handed to the kernel (row and column index resp.)
 //   kernel      : contraction kernel, also queried for flop and byte counts
 //   ionameFn    : maps (e, s) to the name passed to A2AMatrixIo
 //                 (presumably the dataset name -- TODO confirm)
 //   filenameFn  : maps (e, s) to the output file name
 //   metadataFn  : maps (e, s) to the metadata used when the file is created
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::execute(const std::vector<Field> &left, const std::vector<Field> &right,
          A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
          const FilenameFn &filenameFn, const MetadataFn &metadataFn)
 {
    //////////////////////////////////////////////////////////////////////////
    // i,j   is first  loop over blockSize_ factors
    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
    // iii,jjj are loops within cacheBlock
    // Total index is sum of these  i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    int    N_i = left.size();
    int    N_j = right.size();
    double flops, bytes, t_kernel;
    double nodes = grid_->NodeCount();
    // number of blocks in each direction, rounding up for a partial last block
    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
    for(int i=0;i<N_i;i+=blockSize_)
    for(int j=0;j<N_j;j+=blockSize_)
    {
        // Get the W and V vectors for this block^2 set of terms
        int N_ii = MIN(N_i-i,blockSize_);
        int N_jj = MIN(N_j-j,blockSize_);
        // view of mBuf_ shaped for the (possibly partial) current block
        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
        LOG(Message) << "All-to-all matrix block " 
                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                     << std::endl;
        // Series of cache blocked chunks of the contractions within this block
        flops    = 0.0;
        bytes    = 0.0;
        t_kernel = 0.0;
        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
        {
            double t;
            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
            START_TIMER("kernel");
            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
            STOP_TIMER("kernel");
            t_kernel += t;
            flops    += kernel.flops(N_iii, N_jjj);
            bytes    += kernel.bytes(N_iii, N_jjj);
            // copy the cache block into its place inside the IO block
            START_TIMER("cache copy");
            thread_for_collapse( 5,e,next_,{
              for(int s =0;s< nstr_;s++)
              for(int t =0;t< nt_;t++)
              for(int iii=0;iii< N_iii;iii++)
              for(int jjj=0;jjj< N_jjj;jjj++)
              {
                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
              }
            });
            STOP_TIMER("cache copy");
        }
        // perf
        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
                     << " Gflop/s/node " << std::endl;
        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
                     << " GB/s/node "  << std::endl;
        // IO
        double       blockSize, ioTime;
        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
        LOG(Message) << "Writing block to disk" << std::endl;
        // ioTime = (write timer after) - (write timer before)
        ioTime = -GET_TIMER("IO: write block");
        START_TIMER("IO: total");
        makeFileDir(filenameFn(0, 0), grid_);
 #ifdef HADRONS_A2AM_PARALLEL_IO
        grid_->Barrier();
        // make task list for current node
        nodeIo_.clear();
        for(int f = myRank; f < next_*nstr_; f += nRank)
        {
            IoHelper h;
            h.i  = i;
            h.j  = j;
            h.e  = f/nstr_;
            h.s  = f % nstr_;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            nodeIo_.push_back(h);
        }
        // parallel IO
        for (auto &h: nodeIo_)
        {
            saveBlock(mBlock, h);
        }
        grid_->Barrier();
 #else
        // serial IO, for testing purposes only
        for(int e = 0; e < next_; e++)
        for(int s = 0; s < nstr_; s++)
        {
            IoHelper h;
            h.i  = i;
            h.j  = j;
            h.e  = e;
            h.s  = s;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            // write the block assembled above (was an undeclared identifier)
            saveBlock(mBlock, h);
        }
 #endif
        STOP_TIMER("IO: total");
        // accumulate in double so the byte count cannot wrap in 32-bit arithmetic
        blockSize  = static_cast<double>(next_)*nstr_*nt_*N_ii*N_jj*sizeof(TIo);
        ioTime    += GET_TIMER("IO: write block");
        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
                     << ioTime  << " us (" 
                     << blockSize/ioTime*1.0e6/1024/1024
                     << " MB/s)" << std::endl;
    }
 }
 // I/O handler /////////////////////////////////////////////////////////////////
 // Writes the block `m` through the I/O handle stored in `h`; the output file
 // is initialised first when `h` designates the block at offset (0, 0).
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
 {
    const bool isOriginBlock = (h.i == 0) && (h.j == 0);
    if (isOriginBlock)
    {
        // file creation is timed separately from the block write
        START_TIMER("IO: file creation");
        h.io.initFile(h.md, blockSize_);
        STOP_TIMER("IO: file creation");
    }
    START_TIMER("IO: write block");
    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
    STOP_TIMER("IO: write block");
 }
 #undef START_TIMER
 #undef STOP_TIMER
 #undef GET_TIMER
 END_HADRONS_NAMESPACE
 #endif // A2A_Matrix_hpp_
--- a/Hadrons/A2AVectors.hpp
+++ b/Hadrons/A2AVectors.hpp
@@ -1,342 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/A2AVectors.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: fionnoh <fionnoh@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef A2A_Vectors_hpp_
 #define A2A_Vectors_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Environment.hpp>
 #include <Hadrons/Solver.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                 Class to generate V & W all-to-all vectors                 *
 ******************************************************************************/
 // Builds the V and W all-to-all vectors from eigenvectors (low modes) and
 // noise sources (high modes), using the fermion action and solver supplied
 // at construction. The action and solver are held by reference and must
 // outlive this object.
 template <typename FImpl>
 class A2AVectorsSchurDiagTwo
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
    A2AVectorsSchurDiagTwo(FMat &action, Solver &solver);
    virtual ~A2AVectorsSchurDiagTwo(void) = default;
    // low-mode V: fills both checkerboards of vout from evec and eval
    void makeLowModeV(FermionField &vout, 
                      const FermionField &evec, const Real &eval);
    // as makeLowModeV into vout_5d, then exports the physical solution
    // into vout_4d
    void makeLowModeV5D(FermionField &vout_4d, FermionField &vout_5d, 
                        const FermionField &evec, const Real &eval);
    // low-mode W: fills both checkerboards of wout from evec
    // (eval is accepted for symmetry but not used by the implementation)
    void makeLowModeW(FermionField &wout, 
                      const FermionField &evec, const Real &eval);
    // as makeLowModeW, followed by DminusDag into wout_5d and export of the
    // physical source into wout_4d
    void makeLowModeW5D(FermionField &wout_4d, FermionField &wout_5d, 
                        const FermionField &evec, const Real &eval);
    // high-mode V: applies the solver to the noise
    void makeHighModeV(FermionField &vout, const FermionField &noise);
    // noise may live on a grid with one dimension fewer than the fermion
    // grid, in which case it is imported as a physical source before solving
    void makeHighModeV5D(FermionField &vout_4d, FermionField &vout_5d, 
                         const FermionField &noise);
    // high-mode W: plain copy of the noise
    void makeHighModeW(FermionField &wout, const FermionField &noise);
    // same noise convention as makeHighModeV5D; fills both the 4D and 5D W
    void makeHighModeW5D(FermionField &wout_4d, FermionField &wout_5d, 
                         const FermionField &noise);
 private:
    FMat                                     &action_;
    Solver                                   &solver_;
    GridBase                                 *fGrid_, *frbGrid_, *gGrid_;
    // NOTE(review): is5d_ is neither initialised by the constructor nor read
    // by any method defined in this header.
    bool                                     is5d_;
    // scratch fields reused by the make* methods (not re-entrant)
    FermionField                             src_o_, sol_e_, sol_o_, tmp_, tmp5_;
    SchurDiagTwoOperator<FMat, FermionField> op_;
 };
 /******************************************************************************
 *                  Methods for V & W all-to-all vectors I/O                  *
 ******************************************************************************/
 // Static helpers to write and read sets of all-to-all vectors, either as one
 // file holding every vector or as one file per vector.
 class A2AVectorsIo
 {
 public:
    // per-vector record stored next to the field: the vector's index
    struct Record: Serializable
    {
        GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
                                        unsigned int, index);
        Record(void): index(0) {}
    };
 public:
    template <typename Field>
    static void write(const std::string fileStem, std::vector<Field> &vec, 
                      const bool multiFile, const int trajectory = -1);
    template <typename Field>
    static void read(std::vector<Field> &vec, const std::string fileStem,
                     const bool multiFile, const int trajectory = -1);
 private:
    // Builds "<stem>[.<traj>]" and appends ".bin" in single-file mode; the
    // trajectory suffix is omitted when traj is negative.
    static inline std::string vecFilename(const std::string stem, const int traj, 
                                          const bool multiFile)
    {
        std::string name = stem;
        if (traj >= 0)
        {
            name += "." + std::to_string(traj);
        }
        if (!multiFile)
        {
            name += ".bin";
        }
        return name;
    }
 };
 /******************************************************************************
 *               A2AVectorsSchurDiagTwo template implementation               *
 ******************************************************************************/
 // Stores references to the action and solver, caches the action's grids and
 // allocates the scratch fields: red-black fields on the fermion red-black
 // grid, tmp5_ on the full fermion grid.
 template <typename FImpl>
 A2AVectorsSchurDiagTwo<FImpl>::A2AVectorsSchurDiagTwo(FMat &action, Solver &solver)
 : action_(action),
   solver_(solver),
   fGrid_(action.FermionGrid()),
   frbGrid_(action.FermionRedBlackGrid()),
   gGrid_(action.GaugeGrid()),
   src_o_(frbGrid_),
   sol_e_(frbGrid_),
   sol_o_(frbGrid_),
   tmp_(frbGrid_),
   tmp5_(fGrid_),
   op_(action_)
 {
 }
 // Low-mode V vector: fills the even and odd checkerboards of vout from the
 // eigenvector evec and its eigenvalue eval, following the two formulas
 // quoted below. Works on the member scratch fields src_o_, sol_e_, sol_o_
 // and tmp_, so statement order matters.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV(FermionField &vout, const FermionField &evec, const Real &eval)
 {
    // the eigenvector is copied and explicitly tagged as an odd-site field
    src_o_ = evec;
    src_o_.Checkerboard() = Odd;
    // sol_e_/sol_o_ are loaded from vout here; both are overwritten below
    // before being written back
    pickCheckerboard(Even, sol_e_, vout);
    pickCheckerboard(Odd, sol_o_, vout);
    /////////////////////////////////////////////////////
    // v_ie = -(1/eval_i) * MeeInv Meo MooInv evec_i
    /////////////////////////////////////////////////////
    action_.MooeeInv(src_o_, tmp_);
    assert(tmp_.Checkerboard() == Odd);
    action_.Meooe(tmp_, sol_e_);
    assert(sol_e_.Checkerboard() == Even);
    action_.MooeeInv(sol_e_, tmp_);
    assert(tmp_.Checkerboard() == Even);
    sol_e_ = (-1.0 / eval) * tmp_;
    assert(sol_e_.Checkerboard() == Even);
    /////////////////////////////////////////////////////
    // v_io = (1/eval_i) * MooInv evec_i
    /////////////////////////////////////////////////////
    // NOTE(review): MooeeInv(src_o_) is recomputed here because tmp_ was
    // reused for the even part above.
    action_.MooeeInv(src_o_, tmp_);
    assert(tmp_.Checkerboard() == Odd);
    sol_o_ = (1.0 / eval) * tmp_;
    assert(sol_o_.Checkerboard() == Odd);
    // write both halves back into the full field
    setCheckerboard(vout, sol_e_);
    assert(sol_e_.Checkerboard() == Even);
    setCheckerboard(vout, sol_o_);
    assert(sol_o_.Checkerboard() == Odd);
 }
 // Low-mode V in 5D: computes the vector into vout_5d, then lets the action
 // export the physical solution into vout_4d.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV5D(FermionField &vout_4d, 
                                                    FermionField &vout_5d, 
                                                    const FermionField &evec, 
                                                    const Real &eval)
 {
    this->makeLowModeV(vout_5d, evec, eval);
    action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
 }
 // Low-mode W vector: fills the even and odd checkerboards of wout from the
 // eigenvector evec, following the two formulas quoted below. The eval
 // argument is not referenced in this body. Works on the member scratch
 // fields src_o_, sol_e_, sol_o_ and tmp_, so statement order matters.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW(FermionField &wout, const FermionField &evec, const Real &eval)
 {
    // the eigenvector is copied and explicitly tagged as an odd-site field
    src_o_ = evec;
    src_o_.Checkerboard() = Odd;
    // sol_e_/sol_o_ are loaded from wout here; both are overwritten below
    // before being written back
    pickCheckerboard(Even, sol_e_, wout);
    pickCheckerboard(Odd, sol_o_, wout);
    /////////////////////////////////////////////////////
    // w_ie = - MeeInvDag MoeDag Doo evec_i
    /////////////////////////////////////////////////////
    op_.Mpc(src_o_, tmp_);
    assert(tmp_.Checkerboard() == Odd);
    action_.MeooeDag(tmp_, sol_e_);
    assert(sol_e_.Checkerboard() == Even);
    action_.MooeeInvDag(sol_e_, tmp_);
    assert(tmp_.Checkerboard() == Even);
    sol_e_ = (-1.0) * tmp_;
    /////////////////////////////////////////////////////
    // w_io = Doo evec_i
    /////////////////////////////////////////////////////
    // NOTE(review): op_.Mpc(src_o_) is applied a second time here because
    // tmp_ was reused for the even part above.
    op_.Mpc(src_o_, sol_o_);
    assert(sol_o_.Checkerboard() == Odd);
    // write both halves back into the full field
    setCheckerboard(wout, sol_e_);
    assert(sol_e_.Checkerboard() == Even);
    setCheckerboard(wout, sol_o_);
    assert(sol_o_.Checkerboard() == Odd);
 }
 // Low-mode W in 5D: the vector is built in the scratch field tmp5_, DminusDag
 // of it is stored in wout_5d, and the physical source is exported to wout_4d.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW5D(FermionField &wout_4d, FermionField &wout_5d, const FermionField &evec, const Real &eval)
 {
    this->makeLowModeW(tmp5_, evec, eval);
    action_.DminusDag(tmp5_, wout_5d);
    action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
 }
 // High-mode V: the result of applying the solver to the noise source.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV(FermionField &vout, const FermionField &noise)
 {
    solver_(vout, noise);
 }
 // High-mode V in 5D. The noise is first brought onto the fermion grid in
 // tmp5_ (imported as a physical source when its grid has one dimension fewer
 // than the fermion grid, copied otherwise), then solved into vout_5d and
 // exported to vout_4d.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV5D(FermionField &vout_4d, 
                                                     FermionField &vout_5d, 
                                                     const FermionField &noise)
 {
    const bool lowerDimNoise = (noise.Grid()->Dimensions() == fGrid_->Dimensions() - 1);
    if (!lowerDimNoise)
    {
        tmp5_ = noise;
    }
    else
    {
        action_.ImportPhysicalFermionSource(noise, tmp5_);
    }
    makeHighModeV(vout_5d, tmp5_);
    action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
 }
 // High-mode W: a plain copy of the noise source.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW(FermionField &wout, const FermionField &noise)
 {
    wout = noise;
 }
 // High-mode W in 5D. When the noise grid has one dimension fewer than the
 // fermion grid, the noise is imported into wout_5d and copied to wout_4d;
 // otherwise it is copied to wout_5d and the physical source is exported to
 // wout_4d.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d, 
                                                     FermionField &wout_5d, 
                                                     const FermionField &noise)
 {
    const bool lowerDimNoise = (noise.Grid()->Dimensions() == fGrid_->Dimensions() - 1);
    if (lowerDimNoise)
    {
        action_.ImportUnphysicalFermion(noise, wout_5d);
        wout_4d = noise;
        return;
    }
    wout_5d = noise;
    action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
 }
 /******************************************************************************
 *               all-to-all vectors I/O template implementation               *
 ******************************************************************************/
 template <typename Field>
 void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec, 
                         const bool multiFile, const int trajectory)
 {
    Record       record;
    GridBase     *grid = vec[0].Grid();
    ScidacWriter binWriter(grid->IsBoss());
    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
    if (multiFile)
    {
        std::string fullFilename;
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
            LOG(Message) << "Writing vector " << i << std::endl;
            makeFileDir(fullFilename, grid);
            binWriter.open(fullFilename);
            record.index = i;
            binWriter.writeScidacFieldRecord(vec[i], record);
            binWriter.close();
        }
    }
    else
    {
        makeFileDir(filename, grid);
        binWriter.open(filename);
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            LOG(Message) << "Writing vector " << i << std::endl;
            record.index = i;
            binWriter.writeScidacFieldRecord(vec[i], record);
        }
        binWriter.close();
    }
 }
 template <typename Field>
 void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem, 
                        const bool multiFile, const int trajectory)
 {
    Record       record;
    ScidacReader binReader;
    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
    if (multiFile)
    {
        std::string fullFilename;
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
            LOG(Message) << "Reading vector " << i << std::endl;
            binReader.open(fullFilename);
            binReader.readScidacFieldRecord(vec[i], record);
            binReader.close();
            if (record.index != i)
            {
                HADRONS_ERROR(Io, "vector index mismatch");
            }
        }
    }
    else
    {
        binReader.open(filename);
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            LOG(Message) << "Reading vector " << i << std::endl;
            binReader.readScidacFieldRecord(vec[i], record);
            if (record.index != i)
            {
                HADRONS_ERROR(Io, "vector index mismatch");
            }
        }
        binReader.close();
    }
 }
 END_HADRONS_NAMESPACE
 #endif // A2A_Vectors_hpp_
--- a/Hadrons/Application.cc
+++ b/Hadrons/Application.cc
@@ -1,287 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Application.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Application.hpp>
 #include <Hadrons/GeneticScheduler.hpp>
 #include <Hadrons/Modules.hpp>
 using namespace Grid;
 using namespace Hadrons;
 #define BIG_SEP "================"
 #define SEP     "----------------"
 /******************************************************************************
 *                       Application implementation                           *
 ******************************************************************************/
 // constructors ////////////////////////////////////////////////////////////////
 #define MACOUT(macro)    macro              << " (" << #macro << ")"
 #define MACOUTS(macro) HADRONS_STR(macro) << " (" << #macro << ")"
 // Default constructor: starts the logger and, when a default lattice is
 // defined, computes the local volume and prints the start-up banner.
 Application::Application(void)
 {
    initLogger();
    auto dim = GridDefaultLatt();
    auto mpi = GridDefaultMpi();
    auto loc = dim;
    // nothing more to do without a default lattice
    if (dim.size() == 0)
    {
        return;
    }
    // local lattice = global lattice / MPI partition, direction by direction
    locVol_ = 1;
    for (unsigned int mu = 0; mu < dim.size(); ++mu)
    {
        loc[mu] /= mpi[mu];
        locVol_ *= loc[mu];
    }
    LOG(Message) << "====== HADRONS APPLICATION INITIALISATION ======" << std::endl;
    LOG(Message) << "** Dimensions" << std::endl;
    LOG(Message) << "Global lattice: " << dim << std::endl;
    LOG(Message) << "MPI partition : " << mpi << std::endl;
    LOG(Message) << "Local lattice : " << loc << std::endl;
    LOG(Message) << std::endl;
    LOG(Message) << "** Default parameters (and associated C macros)" << std::endl;
    LOG(Message) << "ASCII output precision  : " << MACOUT(DEFAULT_ASCII_PREC) << std::endl;
    LOG(Message) << "Fermion implementation  : " << MACOUTS(FIMPLBASE) << std::endl;
    LOG(Message) << "z-Fermion implementation: " << MACOUTS(ZFIMPLBASE) << std::endl;
    LOG(Message) << "Scalar implementation   : " << MACOUTS(SIMPLBASE) << std::endl;
    LOG(Message) << "Gauge implementation    : " << MACOUTS(GIMPLBASE) << std::endl;
    LOG(Message) << "Eigenvector base size   : " 
                 << MACOUT(HADRONS_DEFAULT_LANCZOS_NBASIS) << std::endl;
    LOG(Message) << "Schur decomposition     : " << MACOUTS(HADRONS_DEFAULT_SCHUR) << std::endl;
    LOG(Message) << std::endl;
 }
 // Runs the default initialisation, then stores the global parameters.
 Application::Application(const Application::GlobalPar &par): Application()
 {
    par_ = par;
 }
 // Runs the default initialisation and remembers the parameter file name; the
 // file itself is only parsed later, from run().
 Application::Application(const std::string parameterFileName): Application()
 {
    this->parameterFileName_ = parameterFileName;
 }
 // access //////////////////////////////////////////////////////////////////////
 // Replaces the stored global parameters with a copy of `par`.
 void Application::setPar(const Application::GlobalPar &par)
 {
    this->par_ = par;
 }
 // Read-only access to the stored global parameters.
 const Application::GlobalPar & Application::getPar(void)
 {
    return this->par_;
 }
 // execute /////////////////////////////////////////////////////////////////////
 void Application::run(void)
 {
    LOG(Message) << "====== HADRONS APPLICATION START ======" << std::endl;
    if (!parameterFileName_.empty() and (vm().getNModule() == 0))
    {
        parseParameterFile(parameterFileName_);
    }
    if (getPar().runId.empty())
    {
        HADRONS_ERROR(Definition, "run id is empty");
    }
    LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
    BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
    LOG(Message) << "Attempt(s) for resilient parallel I/O: " 
                 << BinaryIO::latticeWriteMaxRetry << std::endl;
    vm().setRunId(getPar().runId);
    vm().printContent();
    env().printContent();
    if (getPar().saveSchedule or getPar().scheduleFile.empty())
    {
        schedule();
        if (getPar().saveSchedule)
        {
            std::string filename;
            filename = (getPar().scheduleFile.empty()) ? 
                         "hadrons.sched" : getPar().scheduleFile;
            saveSchedule(filename);
        }
    }
    else
    {
        loadSchedule(getPar().scheduleFile);
    }
    printSchedule();
    if (!getPar().graphFile.empty())
    {
        makeFileDir(getPar().graphFile, env().getGrid());
        vm().dumpModuleGraph(getPar().graphFile);
    }
    configLoop();
 }
 // parse parameter file ////////////////////////////////////////////////////////
 // Serialisable (name, type) pair; read from and written to the "id" node of
 // each module entry by the parameter-file functions below.
 class ObjectId: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ObjectId,
                                    std::string, name,
                                    std::string, type);
 };
 void Application::parseParameterFile(const std::string parameterFileName)
 {
    XmlReader reader(parameterFileName);
    GlobalPar par;
    ObjectId  id;
    LOG(Message) << "Building application from '" << parameterFileName << "'..." << std::endl;
    read(reader, "parameters", par);
    setPar(par);
    if (!push(reader, "modules"))
    {
        HADRONS_ERROR(Parsing, "Cannot open node 'modules' in parameter file '" 
                              + parameterFileName + "'");
    }
    if (!push(reader, "module"))
    {
        HADRONS_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '" 
                              + parameterFileName + "'");
    }
    do
    {
        read(reader, "id", id);
        vm().createModule(id.name, id.type, reader);
    } while (reader.nextElement("module"));
    pop(reader);
    pop(reader);
 }
 void Application::saveParameterFile(const std::string parameterFileName, unsigned int prec)
 {
    LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
    if (env().getGrid()->IsBoss())
    {
        XmlWriter          writer(parameterFileName);
        writer.setPrecision(prec);
        ObjectId           id;
        const unsigned int nMod = vm().getNModule();
        write(writer, "parameters", getPar());
        push(writer, "modules");
        for (unsigned int i = 0; i < nMod; ++i)
        {
            push(writer, "module");
            id.name = vm().getModuleName(i);
            id.type = vm().getModule(i)->getRegisteredName();
            write(writer, "id", id);
            vm().getModule(i)->saveParameters(writer, "options");
            pop(writer);
        }
        pop(writer);
        pop(writer);
    }
 }
 // schedule computation ////////////////////////////////////////////////////////
 // Asks the virtual machine for a schedule unless one was already computed or
 // loaded.
 void Application::schedule(void)
 {
    if (scheduled_ || loadedSchedule_)
    {
        return;
    }
    program_   = vm().schedule(par_.genetic);
    scheduled_ = true;
 }
 // Saves the current schedule, as a list of module names, to `filename`.
 // Only the boss rank writes the file.
 // Raises a Definition error when no schedule is available.
 void Application::saveSchedule(const std::string filename)
 {
    // Validate before touching the file system and on every rank: checking
    // inside the boss-only branch opened the output file before failing and
    // raised the error on a single rank only.
    if (!scheduled_)
    {
        HADRONS_ERROR(Definition, "Computation not scheduled");
    }
    LOG(Message) << "Saving current schedule to '" << filename << "'..."
                 << std::endl;
    if (env().getGrid()->IsBoss())
    {
        TextWriter               writer(filename);
        std::vector<std::string> program;
        // translate module addresses into names
        program.reserve(program_.size());
        for (auto address: program_)
        {
            program.push_back(vm().getModuleName(address));
        }
        write(writer, "schedule", program);
    }
 }
 void Application::loadSchedule(const std::string filename)
 {
    TextReader               reader(filename);
    std::vector<std::string> program;
    LOG(Message) << "Loading schedule from '" << filename << "'..."
                 << std::endl;
    read(reader, "schedule", program);
    program_.clear();
    for (auto &name: program)
    {
        program_.push_back(vm().getModuleAddress(name));
    }
    loadedSchedule_ = true;
    scheduled_      = true;
 }
 // Logs the memory needed by the schedule followed by the numbered list of
 // module names. Raises a Definition error when no schedule is available.
 void Application::printSchedule(void)
 {
    if (!(scheduled_ || loadedSchedule_))
    {
        HADRONS_ERROR(Definition, "Computation not scheduled");
    }
    auto memPeak = vm().memoryNeeded(program_);
    LOG(Message) << "Schedule (memory needed: " << sizeString(memPeak) << "):"
                 << std::endl;
    // entries are numbered from 1
    unsigned int position = 1;
    for (auto address: program_)
    {
        LOG(Message) << std::setw(4) << position << ": "
                     << vm().getModuleName(address) << std::endl;
        ++position;
    }
 }
 // loop on configurations //////////////////////////////////////////////////////
 void Application::configLoop(void)
 {
    auto range = par_.trajCounter;
    for (unsigned int t = range.start; t < range.end; t += range.step)
    {
        LOG(Message) << BIG_SEP << " Starting measurement for trajectory " << t
                     << " " << BIG_SEP << std::endl;
        vm().setTrajectory(t);
        vm().executeProgram(program_);
    }
    LOG(Message) << BIG_SEP << " End of measurement " << BIG_SEP << std::endl;
    env().freeAll();
 }
--- a/Hadrons/Application.hpp
+++ b/Hadrons/Application.hpp
@@ -1,126 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Application.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Application_hpp_
 #define Hadrons_Application_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/VirtualMachine.hpp>
 #include <Hadrons/Module.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         Main program manager                               *
 ******************************************************************************/
 // Top-level driver: holds the global parameters, creates modules in the
 // virtual machine, computes/loads/saves the execution schedule and runs the
 // program over a range of trajectories.
 class Application
 {
 public:
    // Trajectory counter: start, end and step of the configuration loop.
    class TrajRange: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
                                        unsigned int, start,
                                        unsigned int, end,
                                        unsigned int, step);
    };
    // Application-wide parameters.
    class GlobalPar: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
                                        TrajRange,                  trajCounter,
                                        VirtualMachine::GeneticPar, genetic,
                                        std::string,                runId,
                                        std::string,                graphFile,
                                        std::string,                scheduleFile,
                                        bool,                       saveSchedule,
                                        int,                        parallelWriteMaxRetry);
        // only parallelWriteMaxRetry gets an explicit default (-1)
        GlobalPar(void): parallelWriteMaxRetry{-1} {}
    };
 public:
    // constructors
    Application(void);
    Application(const GlobalPar &par);
    Application(const std::string parameterFileName);
    // destructor
    virtual ~Application(void) = default;
    // access
    void              setPar(const GlobalPar &par);
    const GlobalPar & getPar(void);
    // module creation (both overloads invalidate the current schedule)
    template <typename M>
    void createModule(const std::string name);
    template <typename M>
    void createModule(const std::string name, const typename M::Par &par);
    // execute
    void run(void);
    // XML parameter file I/O
    void parseParameterFile(const std::string parameterFileName);
    void saveParameterFile(const std::string parameterFileName, unsigned int prec=15);
    // schedule computation
    void schedule(void);
    void saveSchedule(const std::string filename);
    void loadSchedule(const std::string filename);
    void printSchedule(void);
    // loop on configurations
    void configLoop(void);
 private:
    // environment shortcut
    DEFINE_ENV_ALIAS;
    // virtual machine shortcut
    DEFINE_VM_ALIAS;
 private:
    // value-initialised like the other members so it is never read
    // indeterminate before a constructor assigns it
    long unsigned int       locVol_{0};
    std::string             parameterFileName_{""};
    GlobalPar               par_;
    // module addresses in execution order
    VirtualMachine::Program program_;
    // scheduled_: program_ is up to date; loadedSchedule_: program_ came from
    // loadSchedule()
    bool                    scheduled_{false}, loadedSchedule_{false};
 };
 /******************************************************************************
 *                     Application template implementation                    *
 ******************************************************************************/
 // module creation /////////////////////////////////////////////////////////////
 template <typename M>
 void Application::createModule(const std::string name)
 {
    vm().createModule<M>(name);
    scheduled_ = false;
 }
 template <typename M>
 void Application::createModule(const std::string name,
                               const typename M::Par &par)
 {
    vm().createModule<M>(name, par);
    scheduled_ = false;
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Application_hpp_
--- a/Hadrons/Archive/Modules/ScalarVP.cc
+++ b/Hadrons/Archive/Modules/ScalarVP.cc
@@ -1,564 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/ScalarVP.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Hadrons/Modules/MScalar/ScalarVP.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MScalar;
 /*
 * Scalar QED vacuum polarisation up to O(alpha)
 *
 * Conserved vector 2-point function diagram notation:
 *        _______
 *       /       \
 * U_nu *         * U_mu
 *       \_______/
 *
 *                (   adj(S(a\hat{nu}|x)) U_mu(x) S(0|x+a\hat{mu}) U_nu(0)    )
 *          = 2 Re(                             -                             )
 *                ( adj(S(a\hat{nu}|x+a\hat{mu})) adj(U_mu(x)) S(0|x) U_nu(0) )
 *  
 *
 *            _______
 *           /       \
 * free = 1 *         * 1
 *           \_______/
 *
 *
 *
 *             _______
 *            /       \
 * S = iA_nu *         * iA_mu
 *            \_______/
 *
 *
 *         Delta_1
 *         ___*___
 *        /       \
 * X = 1 *         * 1
 *        \___*___/
 *         Delta_1
 *
 *          Delta_1                     Delta_1
 *          ___*___                     ___*___
 *         /       \                   /       \
 *      1 *         * iA_mu  +  iA_nu *         * 1
 *         \_______/                   \_______/
 * 4C =        _______                     _______
 *            /       \                   /       \
 *      +  1 *         * iA_mu  +  iA_nu *         * 1
 *            \___*___/                   \___*___/
 *             Delta_1                     Delta_1
 *
 *     Delta_1   Delta_1
 *          _*___*_             _______
 *         /       \           /       \
 * 2E = 1 *         * 1  +  1 *         * 1
 *         \_______/           \_*___*_/
 *                         Delta_1   Delta_1
 *
 *          Delta_2
 *          ___*___             _______
 *         /       \           /       \
 * 2T = 1 *         * 1  +  1 *         * 1
 *         \_______/           \___*___/
 *                              Delta_2
 *
 *
 *                    _______
 *                   /       \
 * srcT = -A_nu^2/2 *         * 1
 *                   \_______/
 *
 *
 *
 *            _______
 *           /       \
 * snkT = 1 *         * -A_mu^2/2
 *           \_______/
 *
 * Full VP to O(alpha) = free + q^2*(S+X+4C+2E+2T+srcT+snkT)
 */
 /******************************************************************************
 *                  TScalarVP implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forward the module name to the Module<ScalarVPPar> base; object names are
 // only filled in later by getInput() and setup().
 TScalarVP::TScalarVP(const std::string name)
 : Module<ScalarVPPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // List the objects this module depends on: the EM field plus the four
 // propagators derived from the 'scalarProp' name. As a side effect the
 // propagator names are stored in the corresponding members.
 std::vector<std::string> TScalarVP::getInput(void)
 {
    const std::string stem = par().scalarProp;
    prop0Name_   = stem + "_0";
    propQName_   = stem + "_Q";
    propSunName_ = stem + "_Sun";
    propTadName_ = stem + "_Tad";
    std::vector<std::string> in;
    in.push_back(par().emField);
    in.push_back(prop0Name_);
    in.push_back(propQName_);
    in.push_back(propSunName_);
    in.push_back(propTadName_);
    return in;
 }
 // List the objects this module produces: one "<name>_<mu>_<nu>" entry per
 // pair of directions, mu being the slow index.
 std::vector<std::string> TScalarVP::getOutput(void)
 {
    std::vector<std::string> out;
    unsigned int             mu = 0;
    while (mu < env().getNd())
    {
        // the "<name>_propQ_<mu>" objects are deliberately not advertised:
        // out.push_back(getName() + "_propQ_" + std::to_string(mu));
        const std::string rowPrefix = getName() + "_" + std::to_string(mu) + "_";
        unsigned int      nu        = 0;
        while (nu < env().getNd())
        {
            out.push_back(rowPrefix + std::to_string(nu));
            ++nu;
        }
        ++mu;
    }
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Build the names of every object used by execute() and create the module's
 // lattices:
 //  - one "<name>_propQ_<mu>" field and Nd "<name>_<mu>_<nu>" fields per mu,
 //  - when an output file is requested, one cached "_momentumphase_<i>" field
 //    per entry of outputMom,
 //  - the temporaries buf, result, Amu, Usnk and tmpProp.
 // momPhasesDone_ records whether the first momentum phase already existed
 // before this call, in which case makeCaches() does not recompute the phases.
 void TScalarVP::setup(void)
 {
    const unsigned int nd = env().getNd();
    freeMomPropName_ = FREEMOMPROP(static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass);
    GFSrcName_ = par().scalarProp + "_DinvSrc";
    fftName_   = par().scalarProp + "_fft";
    phaseName_.clear();
    muPropQName_.clear();
    vpTensorName_.clear();
    momPhaseName_.clear();
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
        muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu));
        std::vector<std::string> vpTensorName_mu;
        for (unsigned int nu = 0; nu < nd; ++nu)
        {
            vpTensorName_mu.push_back(getName() + "_" + std::to_string(mu)
                                      + "_" + std::to_string(nu));
        }
        vpTensorName_.push_back(vpTensorName_mu);
    }
    if (!par().output.empty())
    {
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
        }
    }
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
        envCreateLat(ScalarField, muPropQName_[mu]);
        for (unsigned int nu = 0; nu < nd; ++nu)
        {
            envCreateLat(ScalarField, vpTensorName_[mu][nu]);
        }
    }
    // always give the flag a defined value, even when no output is requested
    momPhasesDone_ = false;
    if (!par().output.empty())
    {
        // an output file with an empty momentum list leaves momPhaseName_
        // empty: do not index element 0 in that case
        if (!momPhaseName_.empty())
        {
            momPhasesDone_ = env().hasCreatedObject(momPhaseName_[0]);
        }
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            envCacheLat(ScalarField, momPhaseName_[i_p]);
        }
    }
    envTmpLat(ScalarField, "buf");
    envTmpLat(ScalarField, "result");
    envTmpLat(ScalarField, "Amu");
    envTmpLat(ScalarField, "Usnk");
    envTmpLat(ScalarField, "tmpProp");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Compute the vacuum polarisation tensor Pi[mu][nu] up to O(q^2).
 // For every (mu, nu) the free contraction is stored in the
 // "<name>_<mu>_<nu>" field, then q^2 times each of the srcT, snkT, S, 4C, X,
 // 2E and 2T contributions is accumulated on top (see the diagram legend at
 // the top of this file). When par().output is non-empty, every contribution
 // and the total are also momentum-projected with project() and the collected
 // Result is saved with saveResult().
 // The statements inside the (nu, mu) loop rely on the shared temporaries
 // result, buf and tmpProp being overwritten in exactly this order.
 void TScalarVP::execute(void)
 {
    // CACHING ANALYTIC EXPRESSIONS
    makeCaches();
    Complex ci(0.0,1.0);
    // charge read from the charged propagator module named by 'scalarProp'
    Real    q        = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().charge;
    auto    &prop0   = envGet(ScalarField, prop0Name_);
    auto    &propQ   = envGet(ScalarField, propQName_);
    auto    &propSun = envGet(ScalarField, propSunName_);
    auto    &propTad = envGet(ScalarField, propTadName_);
    auto    &GFSrc   = envGet(ScalarField, GFSrcName_);
    auto    &G       = envGet(ScalarField, freeMomPropName_);
    auto    &fft     = envGet(FFT, fftName_);
    // rebuild the table of shift phases used below and by momD1()
    phase_.clear();
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        auto &phmu = envGet(ScalarField, phaseName_[mu]);
        phase_.push_back(&phmu);
    }
    // PROPAGATORS FROM SHIFTED SOURCES
    LOG(Message) << "Computing O(q) charged scalar propagators..."
                 << std::endl;
    std::vector<ScalarField *> muPropQ;
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        auto &propmu = envGet(ScalarField, muPropQName_[mu]);
        // -G*momD1*G*F*tau_mu*Src (momD1 = F*D1*Finv)
        propmu = adj(*phase_[mu])*GFSrc;
        momD1(propmu, fft);
        propmu = -G*propmu;
        // back to position space, in place
        fft.FFT_all_dim(propmu, propmu, FFT::backward);
        muPropQ.push_back(&propmu);
    }
    // CONTRACTIONS
    auto        &A = envGet(EmField, par().emField);
    envGetTmp(ScalarField, buf);
    envGetTmp(ScalarField, result);
    envGetTmp(ScalarField, Amu);
    envGetTmp(ScalarField, Usnk);
    envGetTmp(ScalarField, tmpProp);
    TComplex    Anu0, Usrc;
    // NOTE(review): the origin is hard-coded with 4 components while the
    // loops below use env().getNd() -- confirm Nd is always 4 here
    std::vector<int> coor0 = {0, 0, 0, 0};
    // pointers to the output fields, indexed as vpTensor[mu][nu]
    std::vector<std::vector<ScalarField *> > vpTensor;
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        std::vector<ScalarField *> vpTensor_mu;
        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            auto &vpmunu = envGet(ScalarField, vpTensorName_[mu][nu]);
            vpTensor_mu.push_back(&vpmunu);
        }
        vpTensor.push_back(vpTensor_mu);
    }
    // Prepare output data structure if necessary
    Result outputData;
    if (!par().output.empty())
    {
        outputData.projection.resize(par().outputMom.size());
        outputData.lattice_size = env().getGrid()->FullDimensions().toVector();
        outputData.mass = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass;
        outputData.charge = q;
        // every pi_* member becomes an Nd x Nd table; the innermost vector is
        // sized later by project()
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
            outputData.projection[i_p].pi.resize(env().getNd());
            outputData.projection[i_p].pi_free.resize(env().getNd());
            outputData.projection[i_p].pi_2E.resize(env().getNd());
            outputData.projection[i_p].pi_2T.resize(env().getNd());
            outputData.projection[i_p].pi_S.resize(env().getNd());
            outputData.projection[i_p].pi_4C.resize(env().getNd());
            outputData.projection[i_p].pi_X.resize(env().getNd());
            outputData.projection[i_p].pi_srcT.resize(env().getNd());
            outputData.projection[i_p].pi_snkT.resize(env().getNd());
            for (unsigned int nu = 0; nu < env().getNd(); ++nu)
            {
                outputData.projection[i_p].pi[nu].resize(env().getNd());
                outputData.projection[i_p].pi_free[nu].resize(env().getNd());
                outputData.projection[i_p].pi_2E[nu].resize(env().getNd());
                outputData.projection[i_p].pi_2T[nu].resize(env().getNd());
                outputData.projection[i_p].pi_S[nu].resize(env().getNd());
                outputData.projection[i_p].pi_4C[nu].resize(env().getNd());
                outputData.projection[i_p].pi_X[nu].resize(env().getNd());
                outputData.projection[i_p].pi_srcT[nu].resize(env().getNd());
                outputData.projection[i_p].pi_snkT[nu].resize(env().getNd());
            }
        }
    }
    // Do contractions
    for (unsigned int nu = 0; nu < env().getNd(); ++nu)
    {
        // value of A_nu at the site coor0
        peekSite(Anu0, peekLorentz(A, nu), coor0);
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..."
                         << std::endl;
            Amu = peekLorentz(A, mu);
            // free
            tmpProp = Cshift(prop0, nu, -1);     // S_0(0|x-a\hat{\nu})
                                                 // = S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            vpContraction(result, prop0, tmpProp, Usrc, mu);
            // the free term initialises the output field (no q^2 factor)
            *vpTensor[mu][nu] = result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_free[mu][nu], result,
                            i_p);
                }
            }
            tmpProp = result; // Just using tmpProp as a temporary ScalarField
                              // here (buf is overwritten by calls to project())
            // srcT
            result = tmpProp * (-0.5)*Anu0*Anu0;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_srcT[mu][nu], result,
                            i_p);
                }
            }
            // snkT
            result = tmpProp * (-0.5)*Amu*Amu;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_snkT[mu][nu], result,
                            i_p);
                }
            }
            // S
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = ci*Anu0;
            Usnk    = ci*Amu;
            vpContraction(result, prop0, tmpProp, Usrc, Usnk, mu);
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_S[mu][nu], result,
                            i_p);
                }
            }
            // 4C
            // sum of four contractions accumulated in result through buf
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            Usnk    = ci*Amu;
            vpContraction(result, propQ, tmpProp, Usrc, Usnk, mu);
            Usrc    = ci*Anu0;
            vpContraction(buf, propQ, tmpProp, Usrc, mu);
            result += buf;
            vpContraction(buf, prop0, *muPropQ[nu], Usrc, mu);
            result += buf;
            Usrc = Complex(1.0,0.0);
            Usnk = ci*Amu;
            vpContraction(buf, prop0, *muPropQ[nu], Usrc, Usnk, mu);
            result += buf;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_4C[mu][nu], result,
                            i_p);
                }
            }
            // X
            Usrc = Complex(1.0,0.0);
            vpContraction(result, propQ, *muPropQ[nu], Usrc, mu);
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_X[mu][nu], result,
                            i_p);
                }
            }
            // 2E
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            vpContraction(result, propSun, tmpProp, Usrc, mu);
            tmpProp = Cshift(propSun, nu, -1);     // S_\Sigma(0|x-a\hat{\nu})
                               //(Note: <S(0|x-a\hat{\nu})> = <S(a\hat{\nu}|x)>)
            vpContraction(buf, prop0, tmpProp, Usrc, mu);
            result += buf;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_2E[mu][nu], result,
                            i_p);
                }
            }
            // 2T
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            vpContraction(result, propTad, tmpProp, Usrc, mu);
            tmpProp = Cshift(propTad, nu, -1);     // S_T(0|x-a\hat{\nu})
            vpContraction(buf, prop0, tmpProp, Usrc, mu);
            result += buf;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_2T[mu][nu], result,
                            i_p);
                }
            }
            // Do momentum projections of full VP if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi[mu][nu],
                            *vpTensor[mu][nu], i_p);
                }
            }
        }
    }
    // OUTPUT IF NECESSARY
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected HVP to '"
                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "HVP", outputData);
    }
 }
 // Prepare the momentum-projection phases used by project().
 // When an output file is requested, momPhase_ is rebuilt on every call with
 // one pointer per entry of outputMom. The phase fields themselves,
 // exp(-i sum_j 2*pi*mom[j]/L_j * x_j) over the first Nd-1 directions, are
 // only computed when setup() found that they did not exist yet
 // (momPhasesDone_ false).
 // Previously the pointer table was filled only while computing, so it stayed
 // empty when the phases were already cached (project() then indexed an empty
 // vector) and grew by duplicates on every recomputation.
 void TScalarVP::makeCaches(void)
 {
    envGetTmp(ScalarField, buf);
    momPhase_.clear();
    if (!par().output.empty())
    {
        auto    l = env().getGrid()->FullDimensions();
        Complex ci(0.0,1.0);
        if (!momPhasesDone_)
        {
            LOG(Message) << "Caching phases for momentum projections..."
                         << std::endl;
        }
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
            if (!momPhasesDone_)
            {
                // Calculate phase factors
                std::vector<int> mom = strToVec<int>(par().outputMom[i_p]);
                momph_ip = Zero();
                for (unsigned int j = 0; j < env().getNd()-1; ++j)
                {
                    Real twoPiL = M_PI*2./l[j];
                    LatticeCoordinate(buf, j);
                    buf = mom[j]*twoPiL*buf;
                    momph_ip = momph_ip + buf;
                }
                momph_ip = exp(-ci*momph_ip);
            }
            // always register the field, whether freshly computed or cached
            momPhase_.push_back(&momph_ip);
        }
    }
 }
 // Conserved vector two-point contraction with a gauge link at the sink.
 // Writes into vp:
 //   2 Re( adj(prop_nu_x) * u_snk * prop_0_x(x+mu) * u_src
 //         - adj(prop_nu_x)(x+mu) * adj(u_snk) * prop_0_x * u_src )
 // where (x+mu) denotes Cshift(., mu, 1).
 //  vp        : output field, overwritten
 //  prop_0_x  : propagator shifted in the first term
 //  prop_nu_x : propagator entering through its adjoint
 //  u_src     : site-independent source factor
 //  u_snk     : sink link field
 //  mu        : direction of the shift
 void TScalarVP::vpContraction(ScalarField &vp,
                   ScalarField &prop_0_x, ScalarField &prop_nu_x,
                   TComplex u_src, ScalarField &u_snk, int mu)
 {
    // Note: this function assumes a point source is used.
    vp = adj(prop_nu_x) * u_snk * Cshift(prop_0_x, mu, 1) * u_src;
    vp -= Cshift(adj(prop_nu_x), mu, 1) * adj(u_snk) * prop_0_x * u_src;
    // keep twice the real part
    vp = 2.0*real(vp);
 }
 // Conserved vector two-point contraction with no link factor at the sink.
 // Writes into vp:
 //   2 Re( adj(prop_nu_x) * prop_0_x(x+mu) * u_src
 //         - adj(prop_nu_x)(x+mu) * prop_0_x * u_src )
 // where (x+mu) denotes Cshift(., mu, 1).
 //  vp        : output field, overwritten
 //  prop_0_x  : propagator shifted in the first term
 //  prop_nu_x : propagator entering through its adjoint
 //  u_src     : site-independent source factor
 //  mu        : direction of the shift
 void TScalarVP::vpContraction(ScalarField &vp,
                   ScalarField &prop_0_x, ScalarField &prop_nu_x,
                   TComplex u_src, int mu)
 {
    // Note: this function assumes a point source is used.
    vp = adj(prop_nu_x) * Cshift(prop_0_x, mu, 1) * u_src;
    vp -= Cshift(adj(prop_nu_x), mu, 1) * prop_0_x * u_src;
    // keep twice the real part
    vp = 2.0*real(vp);
 }
 void TScalarVP::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
 {
    std::vector<TComplex>   vecBuf;
    envGetTmp(ScalarField, buf);
    buf = vp*(*momPhase_[i_p]);
    sliceSum(buf, vecBuf, Tp);
    projection.resize(vecBuf.size());
    for (unsigned int t = 0; t < vecBuf.size(); ++t)
    {
        projection[t] = TensorRemove(vecBuf[t]);
    }
 }
 // Momentum-space Delta_1 insertion, applied to s in place.
 //  s   : input field, overwritten with the result; it is also transformed
 //        backward in place between the two loops below
 //  fft : FFT object used for all transforms
 // Requires phase_ to hold one field per direction (execute() fills it before
 // calling this function). Uses the module temporaries buf, result and Amu.
 void TScalarVP::momD1(ScalarField &s, FFT &fft)
 {
    auto        &A = envGet(EmField, par().emField);
    Complex     ci(0.0,1.0);
    envGetTmp(ScalarField, buf);
    envGetTmp(ScalarField, result);
    envGetTmp(ScalarField, Amu);
    result = Zero();
    // first term: phase_mu * s -> backward FFT -> times A_mu -> forward FFT,
    // subtracted with a factor i
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = (*phase_[mu])*s;
        fft.FFT_all_dim(buf, buf, FFT::backward);
        buf = Amu*buf;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result - ci*buf;
    }
    // from here on s holds its backward transform
    fft.FFT_all_dim(s, s, FFT::backward);
    // second term: A_mu * s -> forward FFT -> times adj(phase_mu), added with
    // a factor i
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = Amu*s;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result + ci*adj(*phase_[mu])*buf;
    }
    s = result;
 }
--- a/Hadrons/Archive/Modules/ScalarVP.hpp
+++ b/Hadrons/Archive/Modules/ScalarVP.hpp
@@ -1,129 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/ScalarVP.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MScalar_ScalarVP_hpp_
 #define Hadrons_MScalar_ScalarVP_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         Scalar vacuum polarisation                         *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 // Parameters of the ScalarVP module:
 //  emField    : name of the EM field object
 //  scalarProp : name of the charged scalar propagator module; also the prefix
 //               of the propagator objects read as input ("_0", "_Q", "_Sun",
 //               "_Tad")
 //  output     : result file name stem; when empty no momentum projection is
 //               done and nothing is saved
 //  outputMom  : momenta to project on, one string of integers per momentum
 class ScalarVPPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar,
                                    std::string, emField,
                                    std::string, scalarProp,
                                    std::string, output,
                                    std::vector<std::string>, outputMom);
 };
 // Scalar QED vacuum polarisation module: produces one "<name>_<mu>_<nu>"
 // field per pair of directions and optionally saves momentum-projected
 // results, both in total and per contribution.
 class TScalarVP: public Module<ScalarVPPar>
 {
 public:
    BASIC_TYPE_ALIASES(SIMPL,);
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
    // Serialised output written when an output file is requested.
    class Result: Serializable
    {
    public:
        // Projections for one momentum; each pi_* table is indexed
        // [mu][nu][slice].
        class Projection: Serializable
        {
        public:
            GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
                                            std::vector<int>,     momentum,
                                            std::vector<std::vector<std::vector<Complex>>>, pi,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_free,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_2E,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_2T,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_S,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_4C,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_X,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_srcT,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_snkT);
        };
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        std::vector<int>,        lattice_size,
                                        double,                  mass,
                                        double,                  charge,
                                        std::vector<Projection>, projection);
    };
 public:
    // constructor
    TScalarVP(const std::string name);
    // destructor
    virtual ~TScalarVP(void) {}
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // prepare the momentum-projection phases
    void makeCaches(void);
    // conserved vector two-point contraction
    void vpContraction(ScalarField &vp,
                       ScalarField &prop_0_x, ScalarField &prop_nu_x,
                       TComplex u_src, ScalarField &u_snk, int mu);
    // conserved vector two-point contraction with unit gauge link at sink
    void vpContraction(ScalarField &vp,
                       ScalarField &prop_0_x, ScalarField &prop_nu_x,
                       TComplex u_src, int mu);
    // momentum-project a vacuum polarisation field into 'projection' using
    // the i_p-th momentum phase (no file is written here)
    void project(std::vector<Complex> &projection, const ScalarField &vp,
                 int i_p);
    // momentum-space Delta_1 insertion
    void momD1(ScalarField &s, FFT &fft);
 private:
    // true when the momentum phases already existed at setup time;
    // initialised so it is never read indeterminate
    bool                                        momPhasesDone_{false};
    std::string                                 freeMomPropName_, GFSrcName_,
                                                prop0Name_, propQName_,
                                                propSunName_, propTadName_,
                                                fftName_;
    std::vector<std::string>                    phaseName_, muPropQName_,
                                                momPhaseName_;
    std::vector<std::vector<std::string> >      vpTensorName_;
    // non-owning pointers to environment fields
    std::vector<ScalarField *>                  phase_, momPhase_;
 };
 // Module registration (macro defined outside this file; presumably exposes
 // TScalarVP under the name ScalarVP in the MScalar namespace -- confirm
 // against ModuleFactory.hpp).
 MODULE_REGISTER(ScalarVP, TScalarVP, MScalar);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MScalar_ScalarVP_hpp_
--- a/Hadrons/Archive/Modules/TestSeqConserved.cc
+++ b/Hadrons/Archive/Modules/TestSeqConserved.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqConserved.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MUtilities;
 // Explicit instantiation of the TTestSeqConserved class template for FIMPL.
 template class Grid::Hadrons::MUtilities::TTestSeqConserved<FIMPL>;
--- a/Hadrons/Archive/Modules/TestSeqConserved.hpp
+++ b/Hadrons/Archive/Modules/TestSeqConserved.hpp
@@ -1,186 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqConserved.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MUtilities_TestSeqConserved_hpp_
 #define Hadrons_MUtilities_TestSeqConserved_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /*
  Ward Identity contractions using sequential propagators.
 -----------------------------
 * options:
 - q:      point source propagator, 5D if available (string)
 - qSeq:   result of sequential insertion of conserved current using q (string)
 - action: action used for computation of q (string)
 - origin: string giving point source origin of q (string)
 - t_J:    time at which sequential current is inserted (int)
 - mu:     Lorentz index of current inserted (int)
 - curr:   current type, e.g. vector/axial (Current)
 */
 /******************************************************************************
 *                            TestSeqConserved                                *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MUtilities)
 // Parameters of the TestSeqConserved module (see the option list above):
 //  - q, qSeq, action: names of the objects listed as inputs by getInput();
 //  - origin: coordinate string, parsed with strToVec<int> in execute();
 //  - t_J:    index into the slice-summed check buffer in execute();
 //  - mu:     Lorentz index of the current, also used to index Gamma::gmu;
 //  - curr:   current type, compared against Current::Axial in execute().
 class TestSeqConservedPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqConservedPar,
                                    std::string,  q,
                                    std::string,  qSeq,
                                    std::string,  action,
                                    std::string,  origin,
                                    unsigned int, t_J,
                                    unsigned int, mu,
                                    Current,      curr);
 };
 // Module comparing the sequential conserved-current propagator (qSeq) with
 // the conserved current of q contracted at the sink; results are only logged.
 template <typename FImpl>
 class TTestSeqConserved: public Module<TestSeqConservedPar>
 {
 public:
    // type aliases derived from the FImpl implementation
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // construction / destruction
    TTestSeqConserved(const std::string name);
    virtual ~TTestSeqConserved() {}
    // names of the objects consumed and produced by the module
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
 protected:
    // declaration of temporaries and consistency checks
    virtual void setup();
    // the actual comparison
    virtual void execute();
 };
 // Registers TTestSeqConserved<FIMPL> under the name TestSeqConserved in the
 // MUtilities namespace. NOTE(review): the exact effect is defined by the
 // MODULE_REGISTER_TMP macro, which is not visible here -- confirm.
 MODULE_REGISTER_TMP(TestSeqConserved, TTestSeqConserved<FIMPL>, MUtilities);
 /******************************************************************************
 *                     TTestSeqConserved implementation                       *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<TestSeqConservedPar> base class.
 template <typename FImpl>
 TTestSeqConserved<FImpl>::TTestSeqConserved(const std::string name)
    : Module<TestSeqConservedPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Input objects: the propagator, the sequential propagator and the action,
 // in that order, as named in the module parameters.
 template <typename FImpl>
 std::vector<std::string> TTestSeqConserved<FImpl>::getInput(void)
 {
    return {par().q, par().qSeq, par().action};
 }
 // Output objects: a single entry, the name of the module itself.
 template <typename FImpl>
 std::vector<std::string> TTestSeqConserved<FImpl>::getOutput(void)
 {
    return std::vector<std::string>(1, getName());
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Verify that the propagator and the action were created with the same Ls,
 // then declare the temporaries "tmp" and "c" retrieved later by execute().
 template <typename FImpl>
 void TTestSeqConserved<FImpl>::setup(void)
 {
    const auto propLs   = env().getObjectLs(par().q);
    const auto actionLs = env().getObjectLs(par().action);
    if (propLs != actionLs)
    {
        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
    }
    envTmpLat(PropagatorField, "tmp");
    envTmpLat(LatticeComplex, "c");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Check that the sequential insertion of the conserved current (qSeq, read
 // at the site given by `origin`) gives the same result as the conserved
 // current of q contracted at the sink and slice-summed along Tp. q is
 // assumed to use a point source. The absolute values of both estimates and
 // of their differences are logged; nothing is stored.
 // An out-of-range `mu` or `t_J` raises a Hadrons error instead of indexing
 // past the end of Gamma::gmu or of the slice-sum buffer.
 template <typename FImpl>
 void TTestSeqConserved<FImpl>::execute(void)
 {
    // mu indexes Gamma::gmu and selects the current direction: reject values
    // that are not a valid lattice direction before doing any work.
    if (par().mu >= env().getNd())
    {
        HADRONS_ERROR(Size, "Lorentz index mu = " + std::to_string(par().mu)
                            + " is out of range");
    }
    auto                  &q    = envGet(PropagatorField, par().q);
    auto                  &qSeq = envGet(PropagatorField, par().qSeq);
    auto                  &act  = envGet(FMat, par().action);
    // gamma5 for the axial current, identity otherwise
    Gamma::Algebra        gA = (par().curr == Current::Axial) ?
                                  Gamma::Algebra::Gamma5 :
                                  Gamma::Algebra::Identity;
    Gamma                 g(gA);
    SitePropagator        qSite;
    Complex               test_S, test_V, check_S, check_V;
    std::vector<TComplex> check_buf;
    std::vector<int>      siteCoord;
    envGetTmp(PropagatorField, tmp);
    envGetTmp(LatticeComplex, c);
    // sequential propagator at the origin site
    siteCoord = strToVec<int>(par().origin);
    peekSite(qSite, qSeq, siteCoord);
    test_S = trace(qSite*g);
    test_V = trace(qSite*g*Gamma::gmu[par().mu]);
    // conserved current contraction, then slice sums along Tp
    act.ContractConservedCurrent(q, q, tmp, par().curr, par().mu);
    c = trace(tmp*g);
    sliceSum(c, check_buf, Tp);
    // t_J is user input: make sure it addresses an existing slice
    if (par().t_J >= check_buf.size())
    {
        HADRONS_ERROR(Size, "current insertion time t_J = "
                            + std::to_string(par().t_J) + " is out of range");
    }
    check_S = TensorRemove(check_buf[par().t_J]);
    c = trace(tmp*g*Gamma::gmu[par().mu]);
    sliceSum(c, check_buf, Tp);
    check_V = TensorRemove(check_buf[par().t_J]);
    LOG(Message) << "Test S  = " << abs(test_S)   << std::endl;
    LOG(Message) << "Test V  = " << abs(test_V) << std::endl;
    LOG(Message) << "Check S = " << abs(check_S) << std::endl;
    LOG(Message) << "Check V = " << abs(check_V) << std::endl;
    // Check difference = 0
    check_S -= test_S;
    check_V -= test_V;
    LOG(Message) << "Consistency check for sequential conserved " 
                 << par().curr << " current insertion: " << std::endl; 
    LOG(Message) << "Diff S  = " << abs(check_S) << std::endl;
    LOG(Message) << "Diff V  = " << abs(check_V) << std::endl;
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MUtilities_TestSeqConserved_hpp_
--- a/Hadrons/Archive/Modules/TestSeqGamma.cc
+++ b/Hadrons/Archive/Modules/TestSeqGamma.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqGamma.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MUtilities;
 // Explicit instantiation of TTestSeqGamma for the FIMPL implementation type,
 // so the template's member definitions are emitted in this unit.
 template class Grid::Hadrons::MUtilities::TTestSeqGamma<FIMPL>;
--- a/Hadrons/Archive/Modules/TestSeqGamma.hpp
+++ b/Hadrons/Archive/Modules/TestSeqGamma.hpp
@@ -1,150 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqGamma.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MUtilities_TestSeqGamma_hpp_
 #define Hadrons_MUtilities_TestSeqGamma_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                              TestSeqGamma                                  *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MUtilities)
 // Parameters of the TestSeqGamma module:
 //  - q, qSeq: names of the propagators listed as inputs by getInput();
 //  - origin:  coordinate string, parsed with strToVec<int> in execute();
 //  - gamma:   gamma matrix whose insertion is tested;
 //  - t_g:     index into the slice-summed reference buffer in execute().
 class TestSeqGammaPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqGammaPar,
                                    std::string,    q,
                                    std::string,    qSeq,
                                    std::string,    origin,
                                    Gamma::Algebra, gamma,
                                    unsigned int,   t_g);
 };
 // Module comparing a sequential gamma-matrix insertion (qSeq) with the same
 // gamma inserted at the sink of q; results are only logged.
 template <typename FImpl>
 class TTestSeqGamma: public Module<TestSeqGammaPar>
 {
 public:
    // type aliases derived from the FImpl implementation
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // construction / destruction
    TTestSeqGamma(const std::string name);
    virtual ~TTestSeqGamma() {}
    // names of the objects consumed and produced by the module
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
 protected:
    // declaration of temporaries
    virtual void setup();
    // the actual comparison
    virtual void execute();
 };
 // Registers TTestSeqGamma<FIMPL> under the name TestSeqGamma in the
 // MUtilities namespace. NOTE(review): the exact effect is defined by the
 // MODULE_REGISTER_TMP macro, which is not visible here -- confirm.
 MODULE_REGISTER_TMP(TestSeqGamma, TTestSeqGamma<FIMPL>, MUtilities);
 /******************************************************************************
 *                      TTestSeqGamma implementation                          *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<TestSeqGammaPar> base class.
 template <typename FImpl>
 TTestSeqGamma<FImpl>::TTestSeqGamma(const std::string name)
    : Module<TestSeqGammaPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Input objects: the propagator and the sequential propagator, in that
 // order, as named in the module parameters.
 template <typename FImpl>
 std::vector<std::string> TTestSeqGamma<FImpl>::getInput(void)
 {
    return {par().q, par().qSeq};
 }
 // Output objects: a single entry, the name of the module itself.
 template <typename FImpl>
 std::vector<std::string> TTestSeqGamma<FImpl>::getOutput(void)
 {
    return std::vector<std::string>(1, getName());
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the temporary LatticeComplex named "c" that execute() retrieves
 // with envGetTmp.
 template <typename FImpl>
 void TTestSeqGamma<FImpl>::setup(void)
 {
    envTmpLat(LatticeComplex, "c");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Check that the sequential insertion of a gamma matrix (qSeq, read at the
 // site given by `origin`) gives the same result as inserting that gamma at
 // the sink of q and slice-summing along Tp. q is assumed to use a point
 // source. The absolute values of both estimates and of their difference are
 // logged; nothing is stored.
 // An out-of-range `t_g` raises a Hadrons error instead of indexing past the
 // end of the slice-sum buffer.
 template <typename FImpl>
 void TTestSeqGamma<FImpl>::execute(void)
 {
    auto                  &q    = envGet(PropagatorField, par().q);
    auto                  &qSeq = envGet(PropagatorField, par().qSeq);
    Gamma                 g5(Gamma::Algebra::Gamma5);
    Gamma                 g(par().gamma);
    SitePropagator        qSite;
    Complex               test, check;
    std::vector<TComplex> check_buf;
    std::vector<int>      siteCoord;
    envGetTmp(LatticeComplex, c);
    // sequential propagator at the origin site
    siteCoord = strToVec<int>(par().origin);
    peekSite(qSite, qSeq, siteCoord);
    test = trace(g*qSite);
    // reference: gamma inserted at the sink, then slice sums along Tp
    c = trace(adj(g)*g5*adj(q)*g5*g*q);
    sliceSum(c, check_buf, Tp);
    // t_g is user input: make sure it addresses an existing slice
    if (par().t_g >= check_buf.size())
    {
        HADRONS_ERROR(Size, "gamma insertion time t_g = "
                            + std::to_string(par().t_g) + " is out of range");
    }
    check = TensorRemove(check_buf[par().t_g]);
    LOG(Message) << "Seq Result = " << abs(test)  << std::endl;
    LOG(Message) << "Reference  = " << abs(check) << std::endl;
    // Check difference = 0
    check -= test;
    LOG(Message) << "Consistency check for sequential " << par().gamma  
                 << " insertion = " << abs(check) << std::endl;
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MUtilities_TestSeqGamma_hpp_
--- a/Hadrons/Archive/Modules/VPCounterTerms.cc
+++ b/Hadrons/Archive/Modules/VPCounterTerms.cc
@@ -1,260 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/VPCounterTerms.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MScalar/VPCounterTerms.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MScalar;
 /******************************************************************************
 *                  TVPCounterTerms implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<VPCounterTermsPar> base class.
 TVPCounterTerms::TVPCounterTerms(const std::string name)
    : Module<VPCounterTermsPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Input objects: only the source field named in the module parameters.
 std::vector<std::string> TVPCounterTerms::getInput(void)
 {
    return {par().source};
 }
 // Output objects: none are declared (an empty list is returned).
 std::vector<std::string> TVPCounterTerms::getOutput(void)
 {
    return std::vector<std::string>();
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Build the names of every lattice object used by the module and declare
 // them in the environment: the free momentum-space propagator, one shift
 // phase per direction, the intermediate propagators, the per-momentum
 // projection phases (only when an output file is requested) and three
 // temporaries used by execute() and project().
 // Both name lists are reset first so that calling setup() again does not
 // append duplicate entries.
 void TVPCounterTerms::setup(void)
 {
    freeMomPropName_ = FREEMOMPROP(par().mass);
    phaseName_.clear();
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
    }
    GFSrcName_ = getName() + "_DinvSrc";
    phatsqName_ = getName() + "_pHatSquared";
    prop0Name_ = getName() + "_freeProp";
    twoscalarName_ = getName() + "_2scalarProp";
    psquaredName_ = getName() + "_psquaredProp";
    // reset like phaseName_ above; previously this list kept growing
    momPhaseName_.clear();
    if (!par().output.empty())
    {
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
        }
    }
    envCreateLat(ScalarField, freeMomPropName_);
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        envCreateLat(ScalarField, phaseName_[mu]);
    }
    envCreateLat(ScalarField, phatsqName_);
    envCreateLat(ScalarField, GFSrcName_);
    envCreateLat(ScalarField, prop0Name_);
    envCreateLat(ScalarField, twoscalarName_);
    envCreateLat(ScalarField, psquaredName_);
    if (!par().output.empty())
    {
        // momentum phases are declared with envCacheLat, unlike the objects
        // above (names carry no module prefix)
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            envCacheLat(ScalarField, momPhaseName_[i_p]);
        }
    }
    envTmpLat(ScalarField, "buf");
    envTmpLat(ScalarField, "tmp_vp");
    envTmpLat(ScalarField, "vpPhase");
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TVPCounterTerms::execute(void)
 {
 	auto &source = envGet(ScalarField, par().source);
    Complex     ci(0.0,1.0);
    FFT         fft(env().getGrid());
    envGetTmp(ScalarField, buf);
    envGetTmp(ScalarField, tmp_vp);
    // Momentum-space free scalar propagator
    auto &G = envGet(ScalarField, freeMomPropName_);
    SIMPL::MomentumSpacePropagator(G, par().mass);
    // Phases and hat{p}^2
    auto &phatsq = envGet(ScalarField, phatsqName_);
    Coordinate l = env().getGrid()->FullDimensions();
    LOG(Message) << "Calculating shift phases..." << std::endl;
    phatsq = Zero();
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Real    twoPiL = M_PI*2./l[mu];
        auto &phmu  = envGet(ScalarField, phaseName_[mu]);
        LatticeCoordinate(buf, mu);
        phmu = exp(ci*twoPiL*buf);
        phase_.push_back(&phmu);
        buf = 2.*sin(.5*twoPiL*buf);
 		phatsq = phatsq + buf*buf;
    }
    // G*F*src
    auto &GFSrc       = envGet(ScalarField, GFSrcName_);
    fft.FFT_all_dim(GFSrc, source, FFT::forward);
    GFSrc = G*GFSrc;
    // Position-space free scalar propagator
    auto &prop0       = envGet(ScalarField, prop0Name_);
    prop0 = GFSrc;
    fft.FFT_all_dim(prop0, prop0, FFT::backward);
    // Propagators for counter-terms
    auto &twoscalarProp        = envGet(ScalarField, twoscalarName_);
    auto &psquaredProp         = envGet(ScalarField, psquaredName_);
    twoscalarProp = G*GFSrc;
    fft.FFT_all_dim(twoscalarProp, twoscalarProp, FFT::backward);
    psquaredProp = G*phatsq*GFSrc;
    fft.FFT_all_dim(psquaredProp, psquaredProp, FFT::backward);
    // Prepare output data structure if necessary
    Result outputData;
    if (!par().output.empty())
    {
        outputData.projection.resize(par().outputMom.size());
        outputData.lattice_size = env().getGrid()->FullDimensions().toVector();
        outputData.mass = par().mass;
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
            outputData.projection[i_p].twoScalar.resize(env().getNd());
            outputData.projection[i_p].threeScalar.resize(env().getNd());
            outputData.projection[i_p].pSquaredInsertion.resize(env().getNd());
            for (unsigned int nu = 0; nu < env().getNd(); ++nu)
            {
                outputData.projection[i_p].twoScalar[nu].resize(env().getNd());
                outputData.projection[i_p].threeScalar[nu].resize(env().getNd());
                outputData.projection[i_p].pSquaredInsertion[nu].resize(env().getNd());
            }
            // Calculate phase factors
            auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
            momph_ip = Zero();
            for (unsigned int j = 0; j < env().getNd()-1; ++j)
            {
                Real twoPiL = M_PI*2./l[j];
                LatticeCoordinate(buf, j);
                buf = outputData.projection[i_p].momentum[j]*twoPiL*buf;
                momph_ip = momph_ip + buf;
            }
            momph_ip = exp(-ci*momph_ip);
            momPhase_.push_back(&momph_ip);
        }
    }
    // Contractions
    for (unsigned int nu = 0; nu < env().getNd(); ++nu)
    {
    	buf = adj(Cshift(prop0, nu, -1));
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            // Two-scalar loop
            tmp_vp = buf * Cshift(prop0, mu, 1);
            tmp_vp -= Cshift(buf, mu, 1) * prop0;
            tmp_vp = 2.0*real(tmp_vp);
            // Output if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].twoScalar[mu][nu],
                            tmp_vp, i_p);
                }
            }
        	// Three-scalar loop (no vertex)
    		tmp_vp = buf * Cshift(twoscalarProp, mu, 1);
            tmp_vp -= Cshift(buf, mu, 1) * twoscalarProp;
            tmp_vp = 2.0*real(tmp_vp);
            // Output if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].threeScalar[mu][nu],
                            tmp_vp, i_p);
                }
            }
            // Three-scalar loop (hat{p}^2 insertion)
    		tmp_vp = buf * Cshift(psquaredProp, mu, 1);
            tmp_vp -= Cshift(buf, mu, 1) * psquaredProp;
            tmp_vp = 2.0*real(tmp_vp);
            // Output if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pSquaredInsertion[mu][nu],
                            tmp_vp, i_p);
                }
            }
        }
    }
    // OUTPUT IF NECESSARY
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected correlators to '"
                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "scalar_loops", outputData);
    }
 }
 void TVPCounterTerms::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
 {
    std::vector<TComplex>   vecBuf;
    envGetTmp(ScalarField, vpPhase);
    vpPhase = vp*(*momPhase_[i_p]);
    sliceSum(vpPhase, vecBuf, Tp);
    projection.resize(vecBuf.size());
    for (unsigned int t = 0; t < vecBuf.size(); ++t)
    {
        projection[t] = TensorRemove(vecBuf[t]);
    }
 }
--- a/Hadrons/Archive/Modules/VPCounterTerms.hpp
+++ b/Hadrons/Archive/Modules/VPCounterTerms.hpp
@@ -1,103 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/VPCounterTerms.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MScalar_VPCounterTerms_hpp_
 #define Hadrons_MScalar_VPCounterTerms_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         VPCounterTerms                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 // Parameters of the VPCounterTerms module:
 //  - source:    name of the ScalarField used as source in execute();
 //  - mass:      mass passed to the free momentum-space propagator;
 //  - output:    result file argument passed to saveResult(); when empty, no
 //               momentum projection or output is performed;
 //  - outputMom: momenta to project onto, one string per momentum, each
 //               parsed with strToVec<int>.
 class VPCounterTermsPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(VPCounterTermsPar,
                                    std::string, source,
                                    double,      mass,
                                    std::string, output,
                                    std::vector<std::string>, outputMom);
 };
 // Module computing scalar-loop counter-term correlators from free scalar
 // propagators and, optionally, saving their momentum projections.
 class TVPCounterTerms: public Module<VPCounterTermsPar>
 {
 public:
    BASIC_TYPE_ALIASES(SIMPL,);
    // Serialisable result written by execute() under the name "scalar_loops".
    class Result: Serializable
    {
    public:
        // Correlators projected onto one output momentum. Each of the three
        // tensors is filled as [mu][nu][t], t being the slice index
        // produced by project().
        class Projection: Serializable
        {
        public:
            GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
                                            std::vector<int>,     momentum,
                                            std::vector<std::vector<std::vector<Complex>>>, twoScalar,
                                            std::vector<std::vector<std::vector<Complex>>>, threeScalar,
                                            std::vector<std::vector<std::vector<Complex>>>, pSquaredInsertion);
        };
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        std::vector<int>,        lattice_size,
                                        double,                  mass,
                                        std::vector<Projection>, projection);
    };
 public:
    // constructor
    TVPCounterTerms(const std::string name);
    // destructor
    virtual ~TVPCounterTerms(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // multiply vp by the i_p-th momentum phase and slice-sum into projection
    void project(std::vector<Complex> &projection, const ScalarField &vp, int i_p);
 private:
    // names of the environment objects, filled in setup();
    // twoscalarVertexName_ and psquaredVertexName_ are not referenced in
    // VPCounterTerms.cc
    std::string                freeMomPropName_, GFSrcName_, phatsqName_, prop0Name_,
                               twoscalarName_, twoscalarVertexName_,
                               psquaredName_, psquaredVertexName_;
    std::vector<std::string>   phaseName_, momPhaseName_;
    // non-owning pointers to the phase fields, filled in execute()
    std::vector<ScalarField *> phase_, momPhase_;
 };
 // Registers TVPCounterTerms under the name VPCounterTerms in the MScalar
 // namespace. NOTE(review): the exact effect is defined by the
 // MODULE_REGISTER macro, which is not visible here -- confirm.
 MODULE_REGISTER(VPCounterTerms, TVPCounterTerms, MScalar);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MScalar_VPCounterTerms_hpp_
--- a/Hadrons/Archive/Modules/WardIdentity.cc
+++ b/Hadrons/Archive/Modules/WardIdentity.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WardIdentity.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/WardIdentity.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of TWardIdentity for the FIMPL implementation type,
 // so the template's member definitions are emitted in this unit.
 template class Grid::Hadrons::MContraction::TWardIdentity<FIMPL>;
--- a/Hadrons/Archive/Modules/WardIdentity.hpp
+++ b/Hadrons/Archive/Modules/WardIdentity.hpp
@@ -1,224 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WardIdentity.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WardIdentity_hpp_
 #define Hadrons_MContraction_WardIdentity_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /*
  Ward Identity contractions
 -----------------------------
 * options:
 - q:          propagator, 5D if available (string)
 - action:     action module used for propagator solution (string)
 - mass:       mass of quark (double)
 - test_axial: whether or not to test PCAC relation.
 */
 /******************************************************************************
 *                              WardIdentity                                  *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Parameter set of the WardIdentity module, declared through
 // GRID_SERIALIZABLE_CLASS_MEMBERS:
 //   q          - name of the propagator object (5D if available)
 //   action     - name of the action object used for the propagator solution
 //   mass       - quark mass
 //   test_axial - when true, the axial (PCAC) check is run as well
 class WardIdentityPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WardIdentityPar,
                                    std::string, q,
                                    std::string, action,
                                    double,      mass,
                                    bool,        test_axial);
 };
 // Module running Ward identity checks on a propagator, templated on the
 // fermion implementation FImpl. It declares no output objects (getOutput()
 // returns an empty list); the check results are written to the log.
 template <typename FImpl>
 class TWardIdentity: public Module<WardIdentityPar>
 {
 public:
    // type aliases for FImpl (presumably where PropagatorField and FMat used
    // by the implementation come from -- confirm against FERM_TYPE_ALIASES)
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWardIdentity(const std::string name);
    // destructor
    virtual ~TWardIdentity(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // value of env().getObjectLs(par().q), stored by setup() and read by execute()
    unsigned int Ls_;
 };
 // Registration of the FIMPL instantiation under the name WardIdentity in the
 // MContraction namespace.
 MODULE_REGISTER_TMP(WardIdentity, TWardIdentity<FIMPL>, MContraction);
 /******************************************************************************
 *                     TWardIdentity implementation                           *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Hands the instance name to the Module base; there is no other state to
 // initialise at construction time.
 template <typename FImpl>
 TWardIdentity<FImpl>::TWardIdentity(const std::string name) : Module<WardIdentityPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Input dependencies: the propagator and the action named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TWardIdentity<FImpl>::getInput(void)
 {
    return {par().q, par().action};
 }
 // This module declares no output objects.
 template <typename FImpl>
 std::vector<std::string> TWardIdentity<FImpl>::getOutput(void)
 {
    return {};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Verifies that the propagator and the action report the same Ls and
 // declares the temporaries that execute() fetches.
 template <typename FImpl>
 void TWardIdentity<FImpl>::setup(void)
 {
    // Ls of the propagator is kept for execute(); the action has to match it.
    Ls_ = env().getObjectLs(par().q);
    const auto actionLs = env().getObjectLs(par().action);
    if (actionLs != Ls_)
    {
        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
    }
    // Temporaries of the vector check, always needed.
    envTmpLat(PropagatorField, "tmp");
    envTmpLat(PropagatorField, "vector_WI");
    // Everything below is only used by the axial check.
    if (!par().test_axial)
    {
        return;
    }
    envTmpLat(PropagatorField, "psi");
    envTmpLat(LatticeComplex,  "PP");
    envTmpLat(LatticeComplex,  "axial_defect");
    envTmpLat(LatticeComplex,  "PJ5q");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Logs Ward identity checks for the propagator par().q, using the conserved
 // currents provided by the action par().action.
 //  - Vector check: sums over mu the backward difference of the conserved
 //    vector current and logs norm2 of the result.
 //  - Axial check (only when par().test_axial is set): sums over mu
 //    trace(g5 * backward difference of the conserved axial current), builds
 //    PP and PJ5q, and logs the defect
 //        D_mu A_mu - 2*(mass*PP + PJ5q)
 //    both as a norm2 and summed per timeslice.
 template <typename FImpl>
 void TWardIdentity<FImpl>::execute(void)
 {
    LOG(Message) << "Performing Ward Identity checks for quark '" << par().q
                 << "'." << std::endl;
    auto  &q   = envGet(PropagatorField, par().q);
    auto  &act = envGet(FMat, par().action);
    Gamma g5(Gamma::Algebra::Gamma5);
    // Compute D_mu V_mu, D here is backward derivative.
    envGetTmp(PropagatorField, tmp);
    envGetTmp(PropagatorField, vector_WI);
    vector_WI    = Zero();
    for (unsigned int mu = 0; mu < Nd; ++mu)
    {
        act.ContractConservedCurrent(q, q, tmp, Current::Vector, mu);
        // backward difference along mu
        tmp -= Cshift(tmp, mu, -1);
        vector_WI += tmp;
    }
    // Test ward identity D_mu V_mu = 0;
    LOG(Message) << "Vector Ward Identity check Delta_mu V_mu = "
                 << norm2(vector_WI) << std::endl;
    if (par().test_axial)
    {
        envGetTmp(PropagatorField, psi);
        envGetTmp(LatticeComplex, PP);
        envGetTmp(LatticeComplex, axial_defect);
        envGetTmp(LatticeComplex, PJ5q);
        std::vector<TComplex> axial_buf;
        // Compute <P|D_mu A_mu>, D is backwards derivative.
        axial_defect = Zero();
        for (unsigned int mu = 0; mu < Nd; ++mu)
        {
            act.ContractConservedCurrent(q, q, tmp, Current::Axial, mu);
            tmp -= Cshift(tmp, mu, -1);
            axial_defect += trace(g5*tmp);
        }
        // Get <P|J5q> (left at zero in the 4D case) and <P|P>.
        PJ5q = Zero();
        if (Ls_ > 1)
        {
            // <P|P>: (1 - g5)/2 of slice 0 plus (1 + g5)/2 of slice Ls_ - 1
            ExtractSlice(tmp, q, 0, 0);
            psi  = 0.5 * (tmp - g5*tmp);
            ExtractSlice(tmp, q, Ls_ - 1, 0);
            psi += 0.5 * (tmp + g5*tmp);
            PP = trace(adj(psi)*psi);
            // <P|J5q>: (1 + g5)/2 of slice Ls_/2 - 1 plus (1 - g5)/2 of slice Ls_/2
            ExtractSlice(tmp, q, Ls_/2 - 1, 0);
            psi  = 0.5 * (tmp + g5*tmp);
            ExtractSlice(tmp, q, Ls_/2, 0);
            psi += 0.5 * (tmp - g5*tmp);
            PJ5q = trace(adj(psi)*psi);
        }
        else
        {
            PP = trace(adj(q)*q);
        }
        // Test ward identity <P|D_mu A_mu> = 2m<P|P> + 2<P|J5q>
        LOG(Message) << "|D_mu A_mu|^2 = " << norm2(axial_defect) << std::endl;
        LOG(Message) << "|PP|^2        = " << norm2(PP) << std::endl;
        LOG(Message) << "|PJ5q|^2      = " << norm2(PJ5q) << std::endl;
        // Subtract the right-hand side first, so that the value logged as the
        // defect is the actual left-minus-right difference rather than a
        // repeat of |D_mu A_mu|^2 printed above.
        axial_defect -= 2.*(par().mass*PP + PJ5q);
        LOG(Message) << "Axial Ward Identity defect Delta_mu A_mu = "
                     << norm2(axial_defect) << std::endl;
        // Axial defect by timeslice.
        LOG(Message) << "Check Axial defect by timeslice" << std::endl;
        sliceSum(axial_defect, axial_buf, Tp);
        // unsigned index: axial_buf.size() is unsigned
        for (unsigned int t = 0; t < axial_buf.size(); ++t)
        {
            LOG(Message) << "t = " << t << ": "
                         << TensorRemove(axial_buf[t]) << std::endl;
        }
    }
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WardIdentity_hpp_
--- a/Hadrons/Archive/Modules/WeakHamiltonian.hpp
+++ b/Hadrons/Archive/Modules/WeakHamiltonian.hpp
@@ -1,118 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WeakHamiltonian.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
 #define Hadrons_MContraction_WeakHamiltonian_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         WeakHamiltonian                                    *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 /*******************************************************************************
 * Utilities for contractions involving the Weak Hamiltonian.
 ******************************************************************************/
 //// Sum and store correlator.
 // MAKE_DIAG(exp, buf, res, n): slice-sums `exp` along Tp into `buf`, then
 // stores the name `n` and the TensorRemove'd per-slice values in `res`
 // (res.name / res.corr).
 // The expansion is a single braced compound statement, so the macro behaves
 // as one statement under if/else/for and can still be used without a
 // trailing semicolon. Arguments are parenthesised where they are operated on.
 #define MAKE_DIAG(exp, buf, res, n)\
 {\
    sliceSum(exp, buf, Tp);\
    (res).name = (n);\
    (res).corr.resize((buf).size());\
    for (unsigned int t = 0; t < (buf).size(); ++t)\
    {\
        (res).corr[t] = TensorRemove((buf)[t]);\
    }\
 }
 //// Contraction of mu index: use 'mu' variable in exp.
 // SUM_MU(buf, exp): sets `buf` to Zero() and accumulates `exp` for
 // mu = 0 .. ndim-1. `exp` may refer to the loop variable `mu`; `ndim` must be
 // visible at the point of use.
 // The expansion is a single braced compound statement so that the reset and
 // the loop stay together under if/else/for; it can still be used without a
 // trailing semicolon.
 #define SUM_MU(buf,exp)\
 {\
    (buf) = Zero();\
    for (unsigned int mu = 0; mu < ndim; ++mu)\
    {\
        (buf) += (exp);\
    }\
 }
 // Index constants in the order V, A; n_i is the number of entries.
 enum
 {
  i_V,    // 0
  i_A,    // 1
  n_i     // 2
 };
 // Parameter set shared by the modules built with MAKE_WEAK_MODULE, declared
 // through GRID_SERIALIZABLE_CLASS_MEMBERS:
 //   q1..q4 - names of the four input propagator objects
 //   tSnk   - timeslice index (read as the sink timeslice of q1 by the
 //            Eye-type module)
 //   output - passed as first argument to saveResult() by the modules
 class WeakHamiltonianPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WeakHamiltonianPar,
                                    std::string, q1,
                                    std::string, q2,
                                    std::string, q3,
                                    std::string, q4,
                                    unsigned int, tSnk,
                                    std::string, output);
 };
 // MAKE_WEAK_MODULE(modname): declares the class T<modname>, a
 // Module<WeakHamiltonianPar> using the FIMPL type aliases, and registers it
 // in the MContraction namespace. The generated class contains:
 //   - a nested serializable Result { name, corr },
 //   - constructor, virtual destructor, getInput()/getOutput(),
 //   - a public VA_label member initialised to {"V", "A"},
 //   - protected setup()/execute().
 // Only declarations are generated; the member functions are defined
 // separately for each module.
 #define MAKE_WEAK_MODULE(modname)\
 class T##modname: public Module<WeakHamiltonianPar>\
 {\
 public:\
    FERM_TYPE_ALIASES(FIMPL,)\
    class Result: Serializable\
    {\
    public:\
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,\
                                        std::string, name,\
                                        std::vector<Complex>, corr);\
    };\
 public:\
    /* constructor */ \
    T##modname(const std::string name);\
    /* destructor */ \
    virtual ~T##modname(void) {};\
    /* dependency relation */ \
    virtual std::vector<std::string> getInput(void);\
    virtual std::vector<std::string> getOutput(void);\
 public:\
    std::vector<std::string> VA_label = {"V", "A"};\
 protected:\
    /* setup */ \
    virtual void setup(void);\
    /* execution */ \
    virtual void execute(void);\
 };\
 MODULE_REGISTER(modname, T##modname, MContraction);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WeakHamiltonian_hpp_
--- a/Hadrons/Archive/Modules/WeakHamiltonianEye.cc
+++ b/Hadrons/Archive/Modules/WeakHamiltonianEye.cc
@@ -1,151 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WeakHamiltonianEye.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
 // Bring the Grid, Hadrons and MContraction names into this translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 /*
 * Weak Hamiltonian current-current contractions, Eye-type.
 * 
 * These contractions are generated by the Q1 and Q2 operators in the physical
 * basis (see e.g. Fig 3 of arXiv:1507.03094).
 * 
 * Schematics:        q4                 |                  
 *                  /-<-¬                |                             
 *                 /     \               |             q2           q3
 *                 \     /               |        /----<------*------<----¬                        
 *            q2    \   /    q3          |       /          /-*-¬          \
 *       /-----<-----* *-----<----¬      |      /          /     \          \
 *    i *            H_W           * f   |   i *           \     /  q4      * f
 *       \                        /      |      \           \->-/          /   
 *        \                      /       |       \                        /       
 *         \---------->---------/        |        \----------->----------/        
 *                   q1                  |                   q1                  
 *                                       |
 *                Saucer (S)             |                  Eye (E)
 * 
 * S: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1]*q4*gL[mu][p_2])
 * E: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1])*trace(q4*gL[mu][p_2])
 * 
 * Note q1 must be sink smeared.
 */
 /******************************************************************************
 *                  TWeakHamiltonianEye implementation                        *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Hands the instance name to the Module base; nothing else to initialise.
 TWeakHamiltonianEye::TWeakHamiltonianEye(const std::string name) : Module<WeakHamiltonianPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Input dependencies: the four propagators named in the parameters.
 std::vector<std::string> TWeakHamiltonianEye::getInput(void)
 {
    return {par().q1, par().q2, par().q3, par().q4};
 }
 // This module declares no output objects.
 std::vector<std::string> TWeakHamiltonianEye::getOutput(void)
 {
    return {};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the temporaries fetched by execute(): three lattice work buffers
 // and four per-direction vectors holding one field per mu.
 void TWeakHamiltonianEye::setup(void)
 {
    const unsigned int nDir = env().getNd();
    // lattice work buffers
    envTmpLat(LatticeComplex,  "expbuf");
    envTmpLat(PropagatorField, "tmp1");
    envTmpLat(LatticeComplex,  "tmp2");
    // per-direction vectors, nDir fields each, all built on the same grid
    auto grid = env().getGrid();
    envTmp(std::vector<PropagatorField>, "S_body", 1, nDir, PropagatorField(grid));
    envTmp(std::vector<PropagatorField>, "S_loop", 1, nDir, PropagatorField(grid));
    envTmp(std::vector<LatticeComplex>,  "E_body", 1, nDir, LatticeComplex(grid));
    envTmp(std::vector<LatticeComplex>,  "E_loop", 1, nDir, LatticeComplex(grid));
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TWeakHamiltonianEye::execute(void)
 {
    LOG(Message) << "Computing Weak Hamiltonian (Eye type) contractions '" 
                 << getName() << "' using quarks '" << par().q1 << "', '" 
                 << par().q2 << ", '" << par().q3 << "' and '" << par().q4 
                 << "'." << std::endl;
    auto                   &q1 = envGet(SlicedPropagator, par().q1);
    auto                   &q2 = envGet(PropagatorField, par().q2);
    auto                   &q3 = envGet(PropagatorField, par().q3);
    auto                   &q4 = envGet(PropagatorField, par().q4);
    Gamma                  g5  = Gamma(Gamma::Algebra::Gamma5);
    std::vector<TComplex>  corrbuf;
    std::vector<Result>    result(n_eye_diag);
    unsigned int ndim    = env().getNd();
    envGetTmp(LatticeComplex,               expbuf); 
    envGetTmp(PropagatorField,              tmp1);
    envGetTmp(LatticeComplex,               tmp2);
    envGetTmp(std::vector<PropagatorField>, S_body);
    envGetTmp(std::vector<PropagatorField>, S_loop);
    envGetTmp(std::vector<LatticeComplex>,  E_body);
    envGetTmp(std::vector<LatticeComplex>,  E_loop);
    // Get sink timeslice of q1.
    SitePropagator q1Snk = q1[par().tSnk];
    // Setup for S-type contractions.
    for (int mu = 0; mu < ndim; ++mu)
    {
        S_body[mu] = MAKE_SE_BODY(q1Snk, q2, q3, GammaL(Gamma::gmu[mu]));
        S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu]));
    }
    // Perform S-type contractions.    
    SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu]))
    MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S")
    // Recycle sub-expressions for E-type contractions.
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        E_body[mu] = trace(S_body[mu]);
        E_loop[mu] = trace(S_loop[mu]);
    }
    // Perform E-type contractions.
    SUM_MU(expbuf, E_body[mu]*E_loop[mu])
    MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E")
    // IO
    saveResult(par().output, "HW_Eye", result);
 }
--- a/Hadrons/Archive/Modules/WeakHamiltonianEye.hpp
+++ b/Hadrons/Archive/Modules/WeakHamiltonianEye.hpp
@@ -1,59 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WeakHamiltonianEye.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
 #define Hadrons_MContraction_WeakHamiltonianEye_hpp_
 #include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         WeakHamiltonianEye                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Slots of the Eye-type result vector; n_eye_diag is the number of slots.
 enum
 {
    S_diag,        // 0
    E_diag,        // 1
    n_eye_diag     // 2
 };
 // Saucer and Eye subdiagram contractions.
 // MAKE_SE_BODY: product Q_3*g5*Q_1*adj(Q_2)*g5*gamma (expects `g5` and `adj`
 // to be visible at the point of use).
 // MAKE_SE_LOOP: product Q_loop*gamma.
 // Every argument is parenthesised so that compound expressions passed as
 // arguments keep their own precedence inside the product.
 #define MAKE_SE_BODY(Q_1, Q_2, Q_3, gamma) ((Q_3)*g5*(Q_1)*adj(Q_2)*g5*(gamma))
 #define MAKE_SE_LOOP(Q_loop, gamma) ((Q_loop)*(gamma))
 // Declare and register the module class through MAKE_WEAK_MODULE.
 MAKE_WEAK_MODULE(WeakHamiltonianEye)
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_
--- a/Hadrons/Archive/Modules/WeakHamiltonianNonEye.cc
+++ b/Hadrons/Archive/Modules/WeakHamiltonianNonEye.cc
@@ -1,148 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WeakHamiltonianNonEye.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
 // Bring the Grid, Hadrons and MContraction names into this translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 /*
 * Weak Hamiltonian current-current contractions, Non-Eye-type.
 * 
 * These contractions are generated by the Q1 and Q2 operators in the physical
 * basis (see e.g. Fig 3 of arXiv:1507.03094).
 * 
 * Schematic:     
 *            q2             q3          |           q2              q3
 *          /--<--¬       /--<--¬        |        /--<--¬         /--<--¬       
 *         /       \     /       \       |       /       \       /       \      
 *        /         \   /         \      |      /         \     /         \     
 *       /           \ /           \     |     /           \   /           \    
 *    i *             * H_W         *  f |  i *             * * H_W         * f 
 *      \             *             |    |     \           /   \           /
 *       \           / \           /     |      \         /     \         /    
 *        \         /   \         /      |       \       /       \       /  
 *         \       /     \       /       |        \-->--/         \-->--/      
 *          \-->--/       \-->--/        |          q1               q4 
 *            q1             q4          |
 *                Connected (C)          |                 Wing (W)
 *
 * C: trace(q1*adj(q2)*g5*gL[mu]*q3*adj(q4)*g5*gL[mu])
 * W: trace(q1*adj(q2)*g5*gL[mu])*trace(q3*adj(q4)*g5*gL[mu])
 * 
 */
 /******************************************************************************
 *                  TWeakHamiltonianNonEye implementation                     *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Hands the instance name to the Module base; nothing else to initialise.
 TWeakHamiltonianNonEye::TWeakHamiltonianNonEye(const std::string name) : Module<WeakHamiltonianPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Input dependencies: the four propagators named in the parameters.
 std::vector<std::string> TWeakHamiltonianNonEye::getInput(void)
 {
    return {par().q1, par().q2, par().q3, par().q4};
 }
 // This module declares no output objects.
 std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void)
 {
    return {};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the temporaries fetched by execute(): three lattice work buffers
 // and four per-direction vectors holding one field per mu.
 void TWeakHamiltonianNonEye::setup(void)
 {
    const unsigned int nDir = env().getNd();
    // lattice work buffers
    envTmpLat(LatticeComplex,  "expbuf");
    envTmpLat(PropagatorField, "tmp1");
    envTmpLat(LatticeComplex,  "tmp2");
    // per-direction vectors, nDir fields each, all built on the same grid
    auto grid = env().getGrid();
    envTmp(std::vector<PropagatorField>, "C_i_side_loop", 1, nDir, PropagatorField(grid));
    envTmp(std::vector<PropagatorField>, "C_f_side_loop", 1, nDir, PropagatorField(grid));
    envTmp(std::vector<LatticeComplex>,  "W_i_side_loop", 1, nDir, LatticeComplex(grid));
    envTmp(std::vector<LatticeComplex>,  "W_f_side_loop", 1, nDir, LatticeComplex(grid));
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the two Non-Eye-type contractions and saves them:
 //   result[C_diag] ("HW_C"): sum over mu of
 //                            trace(C_i_side_loop[mu]*C_f_side_loop[mu])
 //   result[W_diag] ("HW_W"): sum over mu of
 //                            trace(C_i_side_loop[mu])*trace(C_f_side_loop[mu])
 // where C_i_side_loop[mu] = q1*adj(q2)*g5*GammaL(gmu[mu]) and
 // C_f_side_loop[mu] = q3*adj(q4)*g5*GammaL(gmu[mu]). Each correlator is
 // slice-summed along Tp by MAKE_DIAG.
 void TWeakHamiltonianNonEye::execute(void)
 {
    LOG(Message) << "Computing Weak Hamiltonian (Non-Eye type) contractions '"
                 << getName() << "' using quarks '" << par().q1 << "', '"
                 << par().q2 << "', '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;
    auto &q1 = envGet(PropagatorField, par().q1);
    auto &q2 = envGet(PropagatorField, par().q2);
    auto &q3 = envGet(PropagatorField, par().q3);
    auto &q4 = envGet(PropagatorField, par().q4);
    Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_noneye_diag);
    unsigned int          ndim = env().getNd();
    envGetTmp(LatticeComplex,               expbuf);
    // tmp1 and tmp2 are declared in setup() and fetched here, but are not
    // used by the contractions below.
    envGetTmp(PropagatorField,              tmp1);
    envGetTmp(LatticeComplex,               tmp2);
    envGetTmp(std::vector<PropagatorField>, C_i_side_loop);
    envGetTmp(std::vector<PropagatorField>, C_f_side_loop);
    envGetTmp(std::vector<LatticeComplex>,  W_i_side_loop);
    envGetTmp(std::vector<LatticeComplex>,  W_f_side_loop);
    // Setup for C-type contractions.
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        C_i_side_loop[mu] = MAKE_CW_SUBDIAG(q1, q2, GammaL(Gamma::gmu[mu]));
        C_f_side_loop[mu] = MAKE_CW_SUBDIAG(q3, q4, GammaL(Gamma::gmu[mu]));
    }
    // Perform C-type contractions.
    SUM_MU(expbuf, trace(C_i_side_loop[mu]*C_f_side_loop[mu]))
    MAKE_DIAG(expbuf, corrbuf, result[C_diag], "HW_C")
    // Recycle sub-expressions for W-type contractions.
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        W_i_side_loop[mu] = trace(C_i_side_loop[mu]);
        W_f_side_loop[mu] = trace(C_f_side_loop[mu]);
    }
    // Perform W-type contractions.
    SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu])
    MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W")
    // IO
    saveResult(par().output, "HW_NonEye", result);
 }
--- a/Hadrons/Archive/Modules/WeakHamiltonianNonEye.hpp
+++ b/Hadrons/Archive/Modules/WeakHamiltonianNonEye.hpp
@@ -1,58 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WeakHamiltonianNonEye.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
 #define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
 #include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         WeakHamiltonianNonEye                              *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Slots of the Non-Eye-type result vector; n_noneye_diag is the number of
 // slots.
 enum
 {
    W_diag,           // 0
    C_diag,           // 1
    n_noneye_diag     // 2
 };
 // Wing and Connected subdiagram contractions
 // MAKE_CW_SUBDIAG: product Q_1*adj(Q_2)*g5*gamma (expects `g5` and `adj` to
 // be visible at the point of use). Arguments are parenthesised so that
 // compound expressions keep their own precedence inside the product.
 #define MAKE_CW_SUBDIAG(Q_1, Q_2, gamma) ((Q_1)*adj(Q_2)*g5*(gamma))
 // Declare and register the module class through MAKE_WEAK_MODULE.
 MAKE_WEAK_MODULE(WeakHamiltonianNonEye)
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
--- a/Hadrons/Archive/Modules/WeakNeutral4ptDisc.cc
+++ b/Hadrons/Archive/Modules/WeakNeutral4ptDisc.cc
@@ -1,142 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WeakNeutral4ptDisc.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
 // Bring the Grid, Hadrons and MContraction names into this translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 /*
 * Weak Hamiltonian + current contractions, disconnected topology for neutral 
 * mesons.
 * 
 * These contractions are generated by operators Q_1,...,10 of the dS=1 Weak
 * Hamiltonian in the physical basis and an additional current J (see e.g. 
 * Fig 11 of arXiv:1507.03094).
 * 
 * Schematic:
 *                        
 *           q2          q4             q3
 *       /--<--¬     /---<--¬       /---<--¬
 *     /         \ /         \     /        \
 *  i *           * H_W      |  J *          * f
 *     \         / \         /     \        /
 *      \--->---/   \-------/       \------/
 *          q1 
 * 
 * options
 * - q1: input propagator 1 (string)
 * - q2: input propagator 2 (string)
 * - q3: input propagator 3 (string), assumed to be sequential propagator 
 * - q4: input propagator 4 (string), assumed to be a loop
 * 
 * type 1: trace(q1*adj(q2)*g5*gL[mu])*trace(loop*gL[mu])*trace(q3*g5)
 * type 2: trace(q1*adj(q2)*g5*gL[mu]*loop*gL[mu])*trace(q3*g5)
 */
 /*******************************************************************************
 *                  TWeakNeutral4ptDisc implementation                         *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Hands the instance name to the Module base; nothing else to initialise.
 TWeakNeutral4ptDisc::TWeakNeutral4ptDisc(const std::string name) : Module<WeakHamiltonianPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Input dependencies: the four propagators named in the parameters.
 std::vector<std::string> TWeakNeutral4ptDisc::getInput(void)
 {
    return {par().q1, par().q2, par().q3, par().q4};
 }
 // This module declares no output objects.
 std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void)
 {
    return {};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the temporaries fetched by execute(): three lattice work buffers
 // and two per-direction vectors holding one propagator field per mu.
 void TWeakNeutral4ptDisc::setup(void)
 {
    const unsigned int nDir = env().getNd();
    // lattice work buffers
    envTmpLat(LatticeComplex,  "expbuf");
    envTmpLat(PropagatorField, "tmp");
    envTmpLat(LatticeComplex,  "curr");
    // per-direction vectors, nDir fields each, both built on the same grid
    auto grid = env().getGrid();
    envTmp(std::vector<PropagatorField>, "meson", 1, nDir, PropagatorField(grid));
    envTmp(std::vector<PropagatorField>, "loop", 1, nDir,  PropagatorField(grid));
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the two neutral disconnected contractions and saves them:
 //   result[neut_disc_1_diag] ("HW_disc0_1"):
 //       (sum over mu of trace(meson[mu]*loop[mu])) * curr
 //   result[neut_disc_2_diag] ("HW_disc0_2"):
 //       (sum over mu of trace(meson[mu])*trace(loop[mu])) * curr
 // where meson[mu] = q1*adj(q2)*g5*GammaL(gmu[mu]),
 // loop[mu] = q4*GammaL(gmu[mu]) and curr = trace(q3*GammaL(Gamma5)).
 // Each correlator is slice-summed along Tp by MAKE_DIAG.
 // NOTE(review): the description at the top of this file lists the
 // triple-trace formula as "type 1" and the single meson*loop trace as
 // "type 2", which is the opposite of the labelling used here -- confirm
 // which labelling is intended.
 // NOTE(review): that description also writes the current factor as
 // trace(q3*g5), whereas the gamma passed below is GammaL(Gamma5) -- confirm.
 void TWeakNeutral4ptDisc::execute(void)
 {
    LOG(Message) << "Computing Weak Hamiltonian neutral disconnected contractions '"
                 << getName() << "' using quarks '" << par().q1 << "', '"
                 << par().q2 << "', '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;
    auto &q1 = envGet(PropagatorField, par().q1);
    auto &q2 = envGet(PropagatorField, par().q2);
    auto &q3 = envGet(PropagatorField, par().q3);
    auto &q4 = envGet(PropagatorField, par().q4);
    Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_neut_disc_diag);
    unsigned int          ndim = env().getNd();
    envGetTmp(LatticeComplex,               expbuf);
    // tmp is declared in setup() and fetched here, but is not used below.
    envGetTmp(PropagatorField,              tmp);
    envGetTmp(LatticeComplex,               curr);
    envGetTmp(std::vector<PropagatorField>, meson);
    envGetTmp(std::vector<PropagatorField>, loop);
    // Setup for type 1 contractions.
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        meson[mu] = MAKE_DISC_MESON(q1, q2, GammaL(Gamma::gmu[mu]));
        loop[mu] = MAKE_DISC_LOOP(q4, GammaL(Gamma::gmu[mu]));
    }
    curr = MAKE_DISC_CURR(q3, GammaL(Gamma::Algebra::Gamma5));
    // Perform type 1 contractions.
    SUM_MU(expbuf, trace(meson[mu]*loop[mu]))
    expbuf *= curr;
    MAKE_DIAG(expbuf, corrbuf, result[neut_disc_1_diag], "HW_disc0_1")
    // Perform type 2 contractions.
    SUM_MU(expbuf, trace(meson[mu])*trace(loop[mu]))
    expbuf *= curr;
    MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2")
    // IO
    saveResult(par().output, "HW_disc0", result);
 }
--- a/Hadrons/Archive/Modules/WeakNeutral4ptDisc.hpp
+++ b/Hadrons/Archive/Modules/WeakNeutral4ptDisc.hpp
@@ -1,60 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WeakNeutral4ptDisc.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
 #define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
 #include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         WeakNeutral4ptDisc                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Indices of the neutral disconnected diagrams inside the result vector;
 // n_neut_disc_diag is the number of diagrams and is used to size that vector.
 enum
 {
    neut_disc_1_diag = 0,
    neut_disc_2_diag = 1,
    n_neut_disc_diag = 2
 };
 // Neutral 4pt disconnected subdiagram contractions.
 // MAKE_DISC_MESON expands to Q_1*adj(Q_2)*g5*gamma and therefore needs a
 // variable named `g5` in scope at the point of use.
 // MAKE_DISC_LOOP expands to Q_LOOP*gamma; MAKE_DISC_CURR to trace(Q_c*gamma).
 // The macro arguments are not parenthesised, so pass simple expressions only.
 #define MAKE_DISC_MESON(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)
 #define MAKE_DISC_LOOP(Q_LOOP, gamma) (Q_LOOP*gamma)
 #define MAKE_DISC_CURR(Q_c, gamma) (trace(Q_c*gamma))
 MAKE_WEAK_MODULE(WeakNeutral4ptDisc)
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
--- a/Hadrons/DilutedNoise.hpp
+++ b/Hadrons/DilutedNoise.hpp
@@ -1,356 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/DilutedNoise.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 Author: Vera Guelpers <vmg1n14@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_DilutedNoise_hpp_
 #define Hadrons_DilutedNoise_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                   Abstract container for diluted noise                     *
 ******************************************************************************/
 // Abstract container holding a vector of noise fermion fields living on one
 // grid. Concrete subclasses fill the vector in generateNoise().
 template <typename FImpl>
 class DilutedNoise
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    // (the one-argument form stores the grid only; the two-argument form
    //  additionally resizes the noise vector to nNoise fields)
    DilutedNoise(GridCartesian *g);
    DilutedNoise(GridCartesian *g, const unsigned int nNoise);
    virtual ~DilutedNoise(void) = default;
    // access
    std::vector<FermionField> &       getNoise(void);
    const std::vector<FermionField> & getNoise(void) const;
    // unchecked element access (no bounds test)
    const FermionField &              operator[](const unsigned int i) const;
    FermionField &                    operator[](const unsigned int i);
    // multiply every stored field by norm
    void                              normalise(Real norm);
    void                              resize(const unsigned int nNoise);
    unsigned int                      size(void) const;
    GridCartesian                     *getGrid(void) const;
    // generate noise (pure virtual)
    virtual void generateNoise(GridParallelRNG &rng) = 0;
 private:
    std::vector<FermionField> noise_;
    GridCartesian             *grid_;
    // last size requested through resize()
    unsigned int              nNoise_;
 };
 // Noise diluted in time, spin and colour: generateNoise() produces one field
 // per (time slice, spin, colour) combination, i.e. nt_*Ns*FImpl::Dimension fields.
 template <typename FImpl>
 class TimeDilutedSpinColorDiagonalNoise: public DilutedNoise<FImpl>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    TimeDilutedSpinColorDiagonalNoise(GridCartesian *g);
    virtual ~TimeDilutedSpinColorDiagonalNoise(void) = default;
    // generate noise
    virtual void generateNoise(GridParallelRNG &rng);
 private:
    // number of time slices looped over in generateNoise()
    unsigned int nt_;
 };
 // Full-volume noise diluted in spin and colour only: the container holds
 // n_src*Ns*FImpl::Dimension fields.
 template <typename FImpl>
 class FullVolumeSpinColorDiagonalNoise: public DilutedNoise<FImpl>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src);
    virtual ~FullVolumeSpinColorDiagonalNoise(void) = default;
    // generate noise
    virtual void generateNoise(GridParallelRNG &rng);
 private:
    // number of sources requested at construction
    unsigned int nSrc_;
 };
 // Spin/colour diluted noise that is additionally masked by a sparse site
 // pattern of period n_sparse (see generateNoise()); the container holds
 // n_src*Ns*FImpl::Dimension fields.
 template <typename FImpl>
 class SparseSpinColorDiagonalNoise: public DilutedNoise<FImpl>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    SparseSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src, unsigned int n_sparse);
    virtual ~SparseSpinColorDiagonalNoise(void) = default;
    // generate noise
    virtual void generateNoise(GridParallelRNG &rng);
 private:
    // number of sources requested at construction
    unsigned int nSrc_;
    // period of the sparse mask; sites are kept when the shifted coordinate
    // sum is a multiple of it
    unsigned int nSparse_;
 };
 /******************************************************************************
 *                    DilutedNoise template implementation                    *
 ******************************************************************************/
 // Stores the grid pointer and starts with an empty noise vector.
 // nNoise_ is zero-initialised so it never holds an indeterminate value
 // before the first call to resize().
 template <typename FImpl>
 DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g)
 : grid_(g), nNoise_(0)
 {}
 // Delegates to the grid-only constructor, then allocates nNoise fields.
 template <typename FImpl>
 DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g, const unsigned int nNoise)
 : DilutedNoise<FImpl>(g)
 {
    this->resize(nNoise);
 }
 // Mutable access to the whole vector of noise fields.
 template <typename FImpl>
 std::vector<typename DilutedNoise<FImpl>::FermionField> &
 DilutedNoise<FImpl>::getNoise(void)
 {
    return this->noise_;
 }
 // Read-only access to the whole vector of noise fields.
 template <typename FImpl>
 const std::vector<typename DilutedNoise<FImpl>::FermionField> &
 DilutedNoise<FImpl>::getNoise(void) const
 {
    return this->noise_;
 }
 // Read-only access to noise field i (no bounds check).
 template <typename FImpl>
 const typename DilutedNoise<FImpl>::FermionField &
 DilutedNoise<FImpl>::operator[](const unsigned int i) const
 {
    const auto &field = noise_[i];

    return field;
 }
 // Mutable access to noise field i (no bounds check).
 template <typename FImpl>
 typename DilutedNoise<FImpl>::FermionField &
 DilutedNoise<FImpl>::operator[](const unsigned int i)
 {
    auto &field = noise_[i];

    return field;
 }
 // Rescales every stored noise field by the factor norm.
 // A range-for replaces the index loop, which compared a signed int
 // against the unsigned container size.
 template <typename FImpl>
 void DilutedNoise<FImpl>::normalise(Real norm)
 {
    for (auto &field: noise_)
    {
        field = norm*field;
    }
 }
 // Records the requested count and resizes the noise vector; fields added
 // by the resize are constructed from the stored grid pointer.
 template <typename FImpl>
 void DilutedNoise<FImpl>::resize(const unsigned int nNoise)
 {
    this->nNoise_ = nNoise;
    this->noise_.resize(nNoise, this->grid_);
 }
 // Number of noise fields currently stored.
 template <typename FImpl>
 unsigned int DilutedNoise<FImpl>::size(void) const
 {
    return static_cast<unsigned int>(noise_.size());
 }
 // Grid pointer given at construction.
 template <typename FImpl>
 GridCartesian * DilutedNoise<FImpl>::getGrid(void) const
 {
    return this->grid_;
 }
 /******************************************************************************
 *        TimeDilutedSpinColorDiagonalNoise template implementation           *
 ******************************************************************************/
 // Allocates one noise field per (time slice, spin, colour) combination.
 // nt_ must be the lattice extent along the last direction, because
 // generateNoise() loops t over [0, nt_) and compares it with the
 // coordinate of direction nd-1. The previous code stored the number of
 // dimensions instead, so only the first nd time slices were diluted.
 template <typename FImpl>
 TimeDilutedSpinColorDiagonalNoise<FImpl>::
 TimeDilutedSpinColorDiagonalNoise(GridCartesian *g)
 : DilutedNoise<FImpl>(g)
 {
    const auto &dims = this->getGrid()->GlobalDimensions();

    nt_ = dims[dims.size() - 1];
    this->resize(nt_*Ns*FImpl::Dimension);
 }
 // Fills the container with time/spin/colour diluted noise.
 // One Bernoulli field is drawn and rescaled as (2*eta - (1+i))/sqrt(2);
 // for each time slice t the field is restricted to sites whose last
 // coordinate equals t, then poked into every spin and colour component
 // in turn. Fields are stored in (t, spin, colour) order, colour fastest.
 template <typename FImpl>
 void TimeDilutedSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
 {
    typedef decltype(peekColour((*this)[0], 0)) SpinField;

    auto                       &self  = *this;
    auto                       grid   = this->getGrid();
    auto                       tDir   = grid->GlobalDimensions().size() - 1;
    auto                       nc     = FImpl::Dimension;
    Complex                    offset(1., 1.);
    Lattice<iScalar<vInteger>> timeCoor(grid);
    LatticeComplex             eta(grid), etaSlice(grid);
    SpinField                  etaSpin(grid);
    unsigned int               idx = 0;

    LatticeCoordinate(timeCoor, tDir);
    bernoulli(rng, eta);
    eta = (2.*eta - offset)*(1./::sqrt(2.));
    for (unsigned int t = 0; t < nt_; ++t)
    {
        // keep the noise on time slice t only
        etaSlice = where((timeCoor == t), eta, 0.*eta);
        for (unsigned int s = 0; s < Ns; ++s)
        {
            etaSpin = Zero();
            pokeSpin(etaSpin, etaSlice, s);
            for (unsigned int c = 0; c < nc; ++c, ++idx)
            {
                self[idx] = Zero();
                pokeColour(self[idx], etaSpin, c);
            }
        }
    }
 }
 /******************************************************************************
 *        FullVolumeSpinColorDiagonalNoise template implementation           *
 ******************************************************************************/
 // Allocates nSrc*Ns*FImpl::Dimension fields on grid g and records nSrc.
 template <typename FImpl>
 FullVolumeSpinColorDiagonalNoise<FImpl>::
 FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc)
 : DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension)
 , nSrc_(nSrc)
 {}
 // Fills the container with full-volume spin/colour diluted noise.
 // One Bernoulli field is drawn and rescaled as (2*eta - (1+i))/sqrt(2),
 // then poked into each spin and colour component for every source index.
 // NOTE(review): the random field is drawn once, outside the source loop,
 // so all nSrc_ sources share the same noise -- confirm this is intended.
 template <typename FImpl>
 void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
 {
    typedef decltype(peekColour((*this)[0], 0)) SpinField;
    auto                       &noise = *this;
    auto                       g      = this->getGrid();
    auto                       nc     = FImpl::Dimension;
    Complex                    shift(1., 1.);
    LatticeComplex             eta(g);
    SpinField                  etas(g);
    unsigned int               i = 0;
    bernoulli(rng, eta);
    eta = (2.*eta - shift)*(1./::sqrt(2.));
    for (unsigned int n = 0; n < nSrc_; ++n)
    {
        for (unsigned int s = 0; s < Ns; ++s)
        {
            etas = Zero();
            pokeSpin(etas, eta, s);
            for (unsigned int c = 0; c < nc; ++c)
            {
                noise[i] = Zero();
                pokeColour(noise[i], etas, c);
                i++;
            }
        }
    }
 }
 /******************************************************************************
 *        SparseSpinColorDiagonalNoise template implementation           *
 ******************************************************************************/
 // Allocates nSrc*Ns*FImpl::Dimension fields on grid g and records the
 // source count and the sparsening period.
 template <typename FImpl>
 SparseSpinColorDiagonalNoise<FImpl>::
 SparseSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc, unsigned int nSparse)
 : DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension)
 , nSrc_(nSrc)
 , nSparse_(nSparse)
 {}
 // Fills the container with sparse spin/colour diluted noise.
 // For every source a fresh Bernoulli field is drawn and rescaled as
 // (2*eta - (1+i))/sqrt(2). When nSparse_ != 1 the field is zeroed on every
 // site where (sum of coordinates + j) is not a multiple of nSparse_; the
 // offset j advances after every nSrc_ec sources. All fields are finally
 // scaled by sqrt(1/nSrc_ec).
 template <typename FImpl>
 void SparseSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
 {
    typedef decltype(peekColour((*this)[0], 0)) SpinField;
    auto                       &noise = *this;
    auto                       g      = this->getGrid();
    auto                       nd     = g->GlobalDimensions().size();
    auto                       nc     = FImpl::Dimension;
    LatticeInteger             coor(g), coorTot(g); coorTot = 0.;
    Complex                    shift(1., 1.);
    LatticeComplex             eta(g);
    SpinField                  etas(g);
    unsigned int               i = 0;
    unsigned int               j = 0;

    // the period is used as a divisor below
    assert(nSparse_ > 0);
    // sources per mask offset; integer division already rounds down, so
    // no separate treatment of the remainder is needed
    const unsigned int nSrc_ec = nSrc_/nSparse_;
    // with fewer sources than offsets the modulo below would divide by zero
    assert((nSrc_ == 0) || (nSrc_ec > 0));
    for (unsigned int n = 0; n < nSrc_; ++n)
    {
        bernoulli(rng, eta);
        eta = (2.*eta - shift)*(1./::sqrt(2.));
        if (nSparse_ != 1)
        {
            assert(g->GlobalDimensions()[1]%nSparse_ == 0);
            // # 0 # 0
            // 0 # 0 #
            // # 0 # 0
            // 0 # 0 #
            coorTot = 0;
            for (unsigned int d = 0; d < nd; ++d)
            {
                LatticeCoordinate(coor, d);
                coorTot = coorTot + coor;
            }
            coorTot = coorTot + j;
            eta = where(mod(coorTot,nSparse_), 0.*eta, eta);
        }
        for (unsigned int s = 0; s < Ns; ++s)
        {
            etas = Zero();
            pokeSpin(etas, eta, s);
            for (unsigned int c = 0; c < nc; ++c)
            {
                noise[i] = Zero();
                pokeColour(noise[i], etas, c);
                i++;
            }
        }
        // move to the next mask offset once nSrc_ec sources used this one
        if ((n + 1)%nSrc_ec == 0)
        {
            j++;
        }
    }
    Real norm = sqrt(1./nSrc_ec);
    this->normalise(norm);
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_DilutedNoise_hpp_
--- a/Hadrons/DiskVector.hpp
+++ b/Hadrons/DiskVector.hpp
@@ -1,511 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/DiskVector.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_DiskVector_hpp_
 #define Hadrons_DiskVector_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <deque>
 #include <sys/stat.h>
 #include <ftw.h>
 #include <unistd.h>
 // DV_DEBUG_MSG(dv, stream): logs `stream` at Debug level, prefixed with the
 // disk vector address `dv`, when DV_DEBUG is defined; expands to nothing
 // otherwise.
 #ifdef DV_DEBUG
 #define DV_DEBUG_MSG(dv, stream) LOG(Debug) << "diskvector " << (dv) << ": " << stream << std::endl
 #else
 #define DV_DEBUG_MSG(dv, stream)
 #endif
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                           Abstract base class                              *
 ******************************************************************************/
 // Vector of T objects stored as one file per element inside a directory,
 // with an in-memory cache of cacheSize slots. The least recently used
 // element is evicted (and saved to disk if modified) when the cache is full.
 // Subclasses provide the actual load()/save() serialisation.
 template <typename T>
 class DiskVectorBase
 {
 public:
    typedef T ObjectType;
    // helper for read/write vector access
    // (returned by the non-const operator[]; assignment writes into the
    //  cache, conversion to const T& reads through the const operator[])
    class RwAccessHelper
    {
    public:
        RwAccessHelper(DiskVectorBase<T> &master, const unsigned int i)
        : master_(master), cmaster_(master), i_(i) {}
        // operator=: somebody is trying to store a vector element
        // write to cache and tag as modified
        T &operator=(const T &obj) const
        {
            auto &cache    = *master_.cachePtr_;
            auto &modified = *master_.modifiedPtr_;
            auto &index    = *master_.indexPtr_;
            DV_DEBUG_MSG(&master_, "writing to " << i_);
            master_.cacheInsert(i_, obj);
            modified[index.at(i_)] = true;
            return cache[index.at(i_)];
        }
        // implicit cast to const object reference and redirection
        // to the const operator[] for read-only operations
        operator const T&() const
        {
            return cmaster_[i_];
        }
    private:
        DiskVectorBase<T>       &master_;
        const DiskVectorBase<T> &cmaster_;
        const unsigned int      i_;
    };
 public:
    // dirname must not exist yet; it is created by the constructor and
    // removed by the destructor when clean is true
    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
                   const unsigned int cacheSize = 1, const bool clean = true,
                   GridBase *grid = nullptr);
    DiskVectorBase(DiskVectorBase<T> &&v) = default;
    virtual ~DiskVectorBase(void);
    // read access: loads the element from disk on a cache miss
    const T & operator[](const unsigned int i) const;
    // read/write access through the proxy above
    RwAccessHelper operator[](const unsigned int i);
    // hits divided by accesses since construction or the last resetStat()
    double hitRatio(void) const;
    void resetStat(void);
    // public copies of the size and grid given at construction, kept in
    // dvSize/dvGrid so that subclasses can read them (size_ and grid_ are private)
    void setSize(unsigned int size_);
    unsigned int getSize() const;
    unsigned int dvSize;
    void setGrid(GridBase *grid_);
    GridBase *getGrid() const;
    GridBase *dvGrid;
 private:
    // serialisation hooks implemented by subclasses
    virtual void load(T &obj, const std::string filename) const = 0;
    virtual void save(const std::string filename, const T &obj) const = 0;
    // path of the file backing element i
    virtual std::string filename(const unsigned int i) const;
    void evict(void) const;
    void fetch(const unsigned int i) const;
    void cacheInsert(const unsigned int i, const T &obj) const;
    void clean(void);
 private:
    std::string                                           dirname_;
    unsigned int                                          size_, cacheSize_;
    double                                                access_{0.}, hit_{0.};
    bool                                                  clean_;
    GridBase                                              *grid_;
    // using pointers to allow modifications when class is const
    // semantic: const means data unmodified, but cache modification allowed
    // cache slots
    std::unique_ptr<std::vector<T>>                       cachePtr_;
    // per-slot flag: content differs from disk
    std::unique_ptr<std::vector<bool>>                    modifiedPtr_;
    // element index -> cache slot
    std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
    // currently unused cache slots
    std::unique_ptr<std::stack<unsigned int>>             freePtr_;
    // cached element indices, least recently used at the front
    std::unique_ptr<std::deque<unsigned int>>             loadsPtr_;                
 };
 /******************************************************************************
 *                   Specialisation for serialisable classes                  *
 ******************************************************************************/
 // Disk vector whose elements are (de)serialised through a Reader/Writer
 // pair; the record name inside each file is the base name of that file.
 template <typename T, typename Reader, typename Writer>
 class SerializableDiskVector: public DiskVectorBase<T>
 {
 public:
    using DiskVectorBase<T>::DiskVectorBase;
 private:
    // read obj from the record stored in filename
    void load(T &obj, const std::string filename) const override
    {
        Reader     reader(filename);
        const auto recordName = basename(filename);

        read(reader, recordName, obj);
    }
    // write obj as a record into filename
    void save(const std::string filename, const T &obj) const override
    {
        Writer     writer(filename);
        const auto recordName = basename(filename);

        write(writer, recordName, obj);
    }
 };
 /******************************************************************************
 *                      Specialisation for Eigen matrices                     *
 ******************************************************************************/
 template <typename T>
 using EigenDiskVectorMat = A2AMatrix<T>;
 // Disk vector of Eigen matrices stored in a raw binary format:
 // crc32 checksum, number of rows, number of columns, matrix data.
 // When a grid is attached only the boss rank touches the file system and
 // the loaded matrix is broadcast to the other ranks.
 template <typename T>
 class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
 {
 public:
    using DiskVectorBase<EigenDiskVectorMat<T>>::DiskVectorBase;
    typedef EigenDiskVectorMat<T> Matrix;
 public:
    // element (j, k) of matrix i
    T operator()(const unsigned int i, const Eigen::Index j,
                 const Eigen::Index k) const
    {
        return (*this)[i](j, k);
    }
    // {vector size, rows, cols}; rows/cols are taken from element 0
    std::vector<int> dimensions() const
    {
        std::vector<int> dims(3);
        dims[0] = (*this).getSize();
        dims[1] = (*this)[0].rows();
        dims[2] = (*this)[0].cols();
        return dims;
    }
 private:
    // Reads a matrix from filename, verifies its checksum and, when a grid
    // is attached, broadcasts it from the boss rank to all other ranks.
    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
    {
        GridBase *loadGrid;
        loadGrid = (*this).getGrid();
        if (!(loadGrid) || loadGrid->IsBoss())
        {
            std::ifstream f(filename, std::ios::binary);
            uint32_t      crc, check;
            Eigen::Index  nRow, nCol;
            size_t        matSize;
            double        tRead, tHash;
            // without this check a failed open would leave crc/nRow/nCol
            // uninitialised
            if (!f.is_open())
            {
                HADRONS_ERROR(Io, "cannot open '" + filename + "'");
            }
            f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
            f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
            f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
            obj.resize(nRow, nCol);
            matSize = nRow*nCol*sizeof(T);
            tRead  = -usecond();
            f.read(reinterpret_cast<char *>(obj.data()), matSize);
            tRead += usecond();
            tHash  = -usecond();
    #ifdef USE_IPP
            check  = GridChecksum::crc32c(obj.data(), matSize);
    #else
            check  = GridChecksum::crc32(obj.data(), matSize);
    #endif
            tHash += usecond();
            DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
            DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
                        << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
            if (crc != check)
            {
                HADRONS_ERROR(Io, "checksum failed")
            }
        }
        if (loadGrid)
        {
            // Only the boss rank knows the matrix shape at this point:
            // share it first so that every rank resizes its buffer and the
            // data broadcast uses the same byte count everywhere.
            Eigen::Index dims[2] = {obj.rows(), obj.cols()};
            loadGrid->Broadcast(loadGrid->BossRank(), dims,
                                static_cast<int>(sizeof(dims)));
            if (!loadGrid->IsBoss())
            {
                obj.resize(dims[0], dims[1]);
            }
            int broadcastSize;
            broadcastSize = sizeof(T)*obj.size();
            loadGrid->Broadcast(loadGrid->BossRank(), obj.data(), broadcastSize);
            loadGrid->Barrier();
        }
    }
    // Writes obj to filename (boss rank only when a grid is attached) and
    // synchronises all ranks afterwards.
    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
    {
        GridBase *saveGrid;
        saveGrid = (*this).getGrid();
        if (!(saveGrid) || saveGrid->IsBoss())
        {
            std::ofstream f(filename, std::ios::binary);
            uint32_t      crc;
            Eigen::Index  nRow, nCol;
            size_t        matSize;
            double        tWrite, tHash;
            // fail loudly instead of silently dropping the element
            if (!f.is_open())
            {
                HADRONS_ERROR(Io, "cannot open '" + filename + "'");
            }
            nRow    = obj.rows();
            nCol    = obj.cols();
            matSize = nRow*nCol*sizeof(T);
            tHash   = -usecond();
    #ifdef USE_IPP
            crc     = GridChecksum::crc32c(obj.data(), matSize);
    #else
            crc     = GridChecksum::crc32(obj.data(), matSize);
    #endif
            tHash  += usecond();
            f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
            f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
            f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
            tWrite = -usecond();
            f.write(reinterpret_cast<const char *>(obj.data()), matSize);
            tWrite += usecond();
            DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
            DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
                        << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
        }
        if (saveGrid)   saveGrid->Barrier();
    }
 };
 /******************************************************************************
 *                       DiskVectorBase implementation                         *
 ******************************************************************************/
 // Creates the backing directory (boss rank only when a grid is given; the
 // directory must not exist yet) and sets up an empty cache with
 // cacheSize free slots.
 // The cache and the dirty flags are sized by cacheSize: slot indices come
 // from the free stack and range over [0, cacheSize), so sizing them by the
 // vector size overran the buffers whenever cacheSize > size.
 template <typename T>
 DiskVectorBase<T>::DiskVectorBase(const std::string dirname, 
                                  const unsigned int size,
                                  const unsigned int cacheSize,
                                  const bool clean,
                                  GridBase *grid)
 : dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean), grid_(grid)
 , cachePtr_(new std::vector<T>(cacheSize))
 , modifiedPtr_(new std::vector<bool>(cacheSize, false))
 , indexPtr_(new std::map<unsigned int, unsigned int>())
 , freePtr_(new std::stack<unsigned int>)
 , loadsPtr_(new std::deque<unsigned int>())
 {
    struct stat s;
    if (!(grid_) || grid_->IsBoss())
    {
        if(stat(dirname.c_str(), &s) == 0)
        {
            HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
        }
        mkdir(dirname);
    }
    // wait until the directory exists before any rank proceeds
    if (grid_)  grid_->Barrier();
    for (unsigned int i = 0; i < cacheSize_; ++i)
    {
        freePtr_->push(i);
    }
    setSize(size_);
    setGrid(grid_);
 }
 // Removes the backing directory when clean-up was requested at construction.
 template <typename T>
 DiskVectorBase<T>::~DiskVectorBase(void)
 {
    if (!clean_)
    {
        return;
    }
    clean();
 }
 // Stores the argument in the public dvSize member (the parameter name
 // shadows the private size_ member, which is left untouched).
 template <typename T>
 void DiskVectorBase<T>::setSize(unsigned int size_)
 {
    this->dvSize = size_;
 }
 // Value last stored through setSize().
 template <typename T>
 unsigned int DiskVectorBase<T>::getSize() const
 {
    return this->dvSize;
 }
 // Stores the argument in the public dvGrid member (the parameter name
 // shadows the private grid_ member, which is left untouched).
 template <typename T>
 void DiskVectorBase<T>::setGrid(GridBase *grid_)
 {
    this->dvGrid = grid_;
 }
 // Grid pointer last stored through setGrid() (may be null).
 template <typename T>
 GridBase *DiskVectorBase<T>::getGrid() const
 {
    return this->dvGrid;
 }
 // Read-only element access. On a cache hit the element becomes the most
 // recently used one; on a miss it is fetched from disk. Access and hit
 // counters are updated for hitRatio().
 template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
    auto &cache = *cachePtr_;
    auto &index = *indexPtr_;
    auto &loads = *loadsPtr_;

    DV_DEBUG_MSG(this, "accessing " << i << " (RO)");
    if (i >= size_)
    {
        HADRONS_ERROR(Size, "index out of range");
    }
    const_cast<double &>(access_)++;
    const bool cached = (index.find(i) != index.end());
    if (cached)
    {
        DV_DEBUG_MSG(this, "cache hit");
        // move i to the back of the usage queue
        auto pos = std::find(loads.begin(), loads.end(), i);
        const_cast<double &>(hit_)++;
        loads.erase(pos);
        loads.push_back(i);
    }
    else
    {
        // cache miss
        DV_DEBUG_MSG(this, "cache miss");
        fetch(i);
    }
 #ifdef DV_DEBUG
    std::string msg;
    for (auto &p: loads)
    {
        msg += std::to_string(p) + " ";
    }
    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
    if (grid_)  grid_->Barrier();
    return cache[index.at(i)];
 }
 // Read/write element access: returns a proxy that writes to the cache on
 // assignment and reads through the const operator[] on conversion.
 template <typename T>
 typename DiskVectorBase<T>::RwAccessHelper
 DiskVectorBase<T>::operator[](const unsigned int i)
 {
    DV_DEBUG_MSG(this, "accessing " << i << " (RW)");
    if (!(i < size_))
    {
        HADRONS_ERROR(Size, "index out of range");
    }

    RwAccessHelper proxy(*this, i);

    return proxy;
 }
 // Fraction of read accesses served from the cache (hit_/access_); the
 // division is not guarded against a zero access count.
 template <typename T>
 double DiskVectorBase<T>::hitRatio(void) const
 {
    const double ratio = hit_/access_;

    return ratio;
 }
 // Resets the access and hit counters used by hitRatio().
 template <typename T>
 void DiskVectorBase<T>::resetStat(void)
 {
    hit_    = 0.;
    access_ = 0.;
 }
 // Path of the file backing element i: "<dirname>/elem_<i>".
 template <typename T>
 std::string DiskVectorBase<T>::filename(const unsigned int i) const
 {
    std::string path = dirname_ + "/elem_";

    path += std::to_string(i);

    return path;
 }
 // When the cache is full, drops the least recently used element: saves
 // it to disk if it was modified, returns its slot to the free stack and
 // removes it from the index. Ends with a barrier when a grid is attached.
 template <typename T>
 void DiskVectorBase<T>::evict(void) const
 {
    auto &cache    = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads    = *loadsPtr_;

    const bool cacheFull = (index.size() >= cacheSize_);
    if (cacheFull)
    {
        const unsigned int oldest = loads.front();
        DV_DEBUG_MSG(this, "evicting " << oldest);
        const unsigned int slot = index.at(oldest);
        if (modified[slot])
        {
            DV_DEBUG_MSG(this, "element " << oldest << " modified, saving to disk");
            save(filename(oldest), cache[slot]);
        }
        freeInd.push(slot);
        index.erase(oldest);
        loads.pop_front();
    }
    if (grid_)  grid_->Barrier();
 }
 // Loads element i from disk into a free cache slot (evicting first if
 // needed) and marks it as most recently used and unmodified. Raises an
 // Io error when the backing file does not exist.
 template <typename T>
 void DiskVectorBase<T>::fetch(const unsigned int i) const
 {
    auto &cache    = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads    = *loadsPtr_;
    struct stat s;

    DV_DEBUG_MSG(this, "loading " << i << " from disk");
    evict();
    const bool missing = (stat(filename(i).c_str(), &s) != 0);
    if (missing)
    {
        HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
    }
    const unsigned int slot = freeInd.top();
    index[i] = slot;
    freeInd.pop();
    load(cache[slot], filename(i));
    loads.push_back(i);
    modified[slot] = false;
 }
 // Stores obj as element i in the cache.
 // Miss: evict if needed, take a free slot and register i as most recently
 // used.
 // Hit: overwrite the existing slot in place and refresh its position in
 // the usage queue. Taking a second slot for an index that is already
 // cached would leak the first slot, duplicate i in the queue and
 // eventually call top() on an empty free stack.
 template <typename T>
 void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 {
    auto &cache    = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads    = *loadsPtr_;

    auto entry = index.find(i);
    if (entry == index.end())
    {
        // cache miss, evict and store
        evict();
        index[i] = freeInd.top();
        freeInd.pop();
        cache[index.at(i)] = obj;
        loads.push_back(i);
        modified[index.at(i)] = false;
    }
    else
    {
        // cache hit, replace the current value; it now differs from disk
        auto pos = std::find(loads.begin(), loads.end(), i);

        cache[entry->second]    = obj;
        modified[entry->second] = true;
        loads.erase(pos);
        loads.push_back(i);
    }
    if (grid_)  grid_->Barrier();
 #ifdef DV_DEBUG
    std::string msg;
    for (auto &p: loads)
    {
        msg += std::to_string(p) + " ";
    }
    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
 }
 #ifdef DV_DEBUG
 #undef DV_DEBUG_MSG
 #endif
 // Recursively removes the backing directory (boss rank only when a grid
 // is attached), then synchronises all ranks. The tree is walked depth
 // first without following symbolic links, so files go before directories.
 template <typename T>
 void DiskVectorBase<T>::clean(void)
 {
    const bool doRemove = (!(grid_) || grid_->IsBoss());

    if (doRemove)
    {
        auto removeEntry = [](const char *fpath, const struct stat *sb,
                              int typeflag, struct FTW *ftwbuf) {
            const int status = remove(fpath);
            if (status)
            {
                HADRONS_ERROR(Io, "cannot remove '" + std::string(fpath) + "': " + std::string(std::strerror(errno)));
            }
            return status;
        };
        nftw(dirname_.c_str(), removeEntry, 64, FTW_DEPTH | FTW_PHYS);
    }
    if (grid_)  grid_->Barrier();
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_DiskVector_hpp_
--- a/Hadrons/EigenPack.hpp
+++ b/Hadrons/EigenPack.hpp
@@ -1,416 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/EigenPack.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_EigenPack_hpp_
 #define Hadrons_EigenPack_hpp_
 #include <Hadrons/Global.hpp>
 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
 BEGIN_HADRONS_NAMESPACE
 // Lanczos type
 #ifndef HADRONS_DEFAULT_LANCZOS_NBASIS
 #define HADRONS_DEFAULT_LANCZOS_NBASIS 60
 #endif
 // Log, at Message level, the operatorXml and solverXml strings of `record`
 // (any object exposing these two members). Expands to several statements, so
 // it must not be used as the unbraced body of an if/for.
 #define HADRONS_DUMP_EP_METADATA(record) \
 LOG(Message) << "Eigenpack metadata:" << std::endl;\
 LOG(Message) << "* operator" << std::endl;\
 LOG(Message) << (record).operatorXml << std::endl;\
 LOG(Message) << "* solver" << std::endl;\
 LOG(Message) << (record).solverXml << std::endl;
 // File-level eigenpack metadata: the XML text stored for the operator and for
 // the solver.
 struct PackRecord
 {
    std::string operatorXml;
    std::string solverXml;
 };
 // Per-vector record written alongside each eigenvector: its position in the
 // pack (index) and the associated eigenvalue (eval).
 struct VecRecord: Serializable
 {
    GRID_SERIALIZABLE_CLASS_MEMBERS(VecRecord,
                                    unsigned int, index,
                                    double,       eval);
    // default record: index 0, eigenvalue 0
    VecRecord(void): index(0), eval(0.) {}
 };
 namespace EigenPackIo
 {
    // Read the file-level XML object from binReader and fill `record` with the
    // two subtrees found under the "eigenPackPar" node: the operator XML
    // first, then the solver XML.
    inline void readHeader(PackRecord &record, ScidacReader &binReader)
    {
        std::string headerXml;
        binReader.readLimeObject(headerXml, SCIDAC_FILE_XML);
        XmlReader parser(headerXml, true, "eigenPackPar");
        // operator description is read first
        parser.push();
        parser.readCurrentSubtree(record.operatorXml);
        // then move on to the solver description
        parser.nextElement();
        parser.readCurrentSubtree(record.solverXml);
    }
    // Read one eigenvector and its eigenvalue from binReader.
    //   evec      : destination field
    //   eval      : receives the eigenvalue stored in the vector record
    //   index     : index the caller expects the next record to carry
    //   binReader : reader positioned on the record to read
    //   ioBuf     : optional buffer of the on-disk type; when non-null the
    //               field is read into it and converted to T with
    //               precisionChange, otherwise evec is read directly
    // Raises an Io error when the index stored in the record differs from the
    // expected one.
    template <typename T, typename TIo = T>
    void readElement(T &evec, RealD &eval, const unsigned int index,
                     ScidacReader &binReader, TIo *ioBuf = nullptr)
    {
        VecRecord vecRecord;
        LOG(Message) << "Reading eigenvector " << index << std::endl;
        if (ioBuf == nullptr)
        {
            binReader.readScidacFieldRecord(evec, vecRecord);
        }
        else
        {
            binReader.readScidacFieldRecord(*ioBuf, vecRecord);
            precisionChange(evec, *ioBuf);
        }
        if (vecRecord.index != index)
        {
            // `index` is what the caller asked for, vecRecord.index is what
            // the file actually contains
            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
                            + " wrong index (expected " + std::to_string(index)
                            + ", found " + std::to_string(vecRecord.index)
                            + ")");
        }
        eval = vecRecord.eval;
    }
    template <typename T, typename TIo = T>
    static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
                         PackRecord &record, const std::string filename, 
                         const unsigned int size, bool multiFile, 
                         GridBase *gridIo = nullptr)
    {
        std::unique_ptr<TIo> ioBuf{nullptr};
        ScidacReader         binReader;
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
        }
        if (multiFile)
        {
            std::string fullFilename;
            for(int k = 0; k < size; ++k) 
            {
                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
                binReader.open(fullFilename);
                readHeader(record, binReader);
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
                binReader.close();
            }
        }
        else
        {
            binReader.open(filename);
            readHeader(record, binReader);
            for(int k = 0; k < size; ++k) 
            {
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
            }
            binReader.close();
        }
    }
    // Write the pack metadata as a file-level XML object: the operator and
    // solver XML strings are pushed, in that order, under an "eigenPackPar"
    // node.
    inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
    {
        XmlWriter header("", "eigenPackPar");
        header.pushXmlString(record.operatorXml);
        header.pushXmlString(record.solverXml);
        binWriter.writeLimeObject(1, 1, header, "parameters", SCIDAC_FILE_XML);
    }
    // Write one eigenvector together with a VecRecord holding its index and
    // eigenvalue. When both ioBuf and testBuf are provided, the vector is
    // converted to the I/O type first and the squared norm of the round-trip
    // difference is logged; otherwise evec is written as is.
    template <typename T, typename TIo = T>
    void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval, 
                      const unsigned int index, TIo *ioBuf, 
                      T *testBuf = nullptr)
    {
        VecRecord meta;
        LOG(Message) << "Writing eigenvector " << index << std::endl;
        meta.eval  = eval;
        meta.index = index;
        const bool convert = (ioBuf != nullptr) && (testBuf != nullptr);
        if (convert)
        {
            // evec -> I/O type -> back to T, to measure the conversion loss
            precisionChange(*ioBuf, evec);
            precisionChange(*testBuf, *ioBuf);
            *testBuf -= evec;
            LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
            binWriter.writeScidacFieldRecord(*ioBuf, meta, DEFAULT_ASCII_PREC);
        }
        else
        {
            binWriter.writeScidacFieldRecord(evec, meta, DEFAULT_ASCII_PREC);
        }
    }
    template <typename T, typename TIo = T>
    static void writePack(const std::string filename, std::vector<T> &evec, 
                          std::vector<RealD> &eval, PackRecord &record, 
                          const unsigned int size, bool multiFile, 
                          GridBase *gridIo = nullptr)
    {
        GridBase             *grid = evec[0].Grid();
        std::unique_ptr<TIo> ioBuf{nullptr}; 
        std::unique_ptr<T>   testBuf{nullptr};
        ScidacWriter         binWriter(grid->IsBoss());
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
            testBuf.reset(new T(grid));
        }
        if (multiFile)
        {
            std::string fullFilename;
            for(int k = 0; k < size; ++k) 
            {
                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
                makeFileDir(fullFilename, grid);
                binWriter.open(fullFilename);
                writeHeader(binWriter, record);
                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
                binWriter.close();
            }
        }
        else
        {
            makeFileDir(filename, grid);
            binWriter.open(filename);
            writeHeader(binWriter, record);
            for(int k = 0; k < size; ++k) 
            {
                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
            }
            binWriter.close();
        }
    }
 }
 // In-memory eigenpack: eigenvalues, eigenvectors of field type F, and the
 // metadata record. No I/O is provided at this level.
 template <typename F>
 class BaseEigenPack
 {
 public:
    using Field = F;
    // data
    std::vector<RealD> eval;
    std::vector<F>     evec;
    PackRecord         record;
    // construction / destruction
    BaseEigenPack()          = default;
    // allocate `size` eigenvalues and `size` vectors built from `grid`
    BaseEigenPack(const size_t size, GridBase *grid)
    {
        resize(size, grid);
    }
    virtual ~BaseEigenPack() = default;
    // resize both containers; new vectors are constructed from `grid`
    void resize(const size_t size, GridBase *grid)
    {
        eval.resize(size);
        evec.resize(size, grid);
    }
 };
 // Eigenpack with disk I/O. F is the in-memory field type and FIo the type
 // used on disk; when they differ an I/O grid is needed to allocate the
 // conversion buffer.
 template <typename F, typename FIo = F>
 class EigenPack: public BaseEigenPack<F>
 {
 public:
    typedef F   Field;
    typedef FIo FieldIo;
 public:
    EigenPack(void)          = default;
    virtual ~EigenPack(void) = default;
    // Allocate `size` vectors on `grid`. gridIo must be non-null when FIo
    // differs from F, otherwise a Definition error is raised.
    EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
    : BaseEigenPack<F>(size, grid)
    {
        if (typeHash<F>() != typeHash<FIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
        }
        gridIo_ = gridIo;
    }
    // Read evec.size() vectors from the file(s) derived from fileStem and
    // traj (see evecFilename), then log the metadata that was read.
    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record, 
                                      evecFilename(fileStem, traj, multiFile), 
                                      this->evec.size(), multiFile, gridIo_);
        HADRONS_DUMP_EP_METADATA(this->record);
    }
    // Write all vectors to the file(s) derived from fileStem and traj.
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile), 
                                       this->evec, this->eval, this->record, 
                                       this->evec.size(), multiFile, gridIo_);
    }
 protected:
    // Build the path: "<stem>[.<traj>]" for multi-file packs (a directory),
    // "<stem>[.<traj>].bin" for single-file packs. The trajectory suffix is
    // omitted when traj is negative.
    std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
    {
        std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
        if (multiFile)
        {
            return stem + t;
        }
        else
        {
            return stem + t + ".bin";
        }
    }
 protected:
    // Null by default so that a default-constructed pack never hands an
    // indeterminate pointer to readPack/writePack.
    GridBase *gridIo_{nullptr};
 };
 // Eigenpack holding both fine-grid vectors (inherited from EigenPack) and
 // coarse-grid vectors with their own eigenvalues. Fine and coarse data are
 // stored in separate files, suffixed "_fine" and "_coarse".
 template <typename FineF, typename CoarseF, 
          typename FineFIo = FineF, typename CoarseFIo = CoarseF>
 class CoarseEigenPack: public EigenPack<FineF, FineFIo>
 {
 public:
    typedef CoarseF   CoarseField;
    typedef CoarseFIo CoarseFieldIo;
 public:      
    std::vector<CoarseF> evecCoarse;
    std::vector<RealD>   evalCoarse;
 public:
    CoarseEigenPack(void)          = default;
    virtual ~CoarseEigenPack(void) = default;
    // Allocate sizeFine fine vectors and sizeCoarse coarse vectors. Each I/O
    // grid must be non-null when the corresponding I/O type differs from the
    // in-memory type, otherwise a Definition error is raised.
    CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse, 
                    GridBase *gridFine, GridBase *gridCoarse,
                    GridBase *gridFineIo = nullptr, 
                    GridBase *gridCoarseIo = nullptr)
    {
        if (typeHash<FineF>() != typeHash<FineFIo>())
        {
            if (gridFineIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "Fine I/O type different from vector type but null fine I/O grid passed");
            }
        }
        if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
        {
            if (gridCoarseIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "Coarse I/O type different from vector type but null coarse I/O grid passed");
            }
        }
        this->gridIo_ = gridFineIo;
        gridCoarseIo_ = gridCoarseIo;
        resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
    }
    // Resize the fine containers (base class) and the coarse containers.
    void resize(const size_t sizeFine, const size_t sizeCoarse, 
                GridBase *gridFine, GridBase *gridCoarse)
    {
        EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
        evalCoarse.resize(sizeCoarse);
        evecCoarse.resize(sizeCoarse, gridCoarse);
    }
    // Read the fine vectors from "<fileStem>_fine".
    void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
    }
    // Read the coarse vectors from "<fileStem>_coarse"; the header metadata
    // of the coarse file is read into a local record and discarded.
    void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        PackRecord dummy;
        EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy, 
                              this->evecFilename(fileStem + "_coarse", traj, multiFile), 
                              evecCoarse.size(), multiFile, gridCoarseIo_);
    }
    // Read fine then coarse vectors.
    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        readFine(fileStem, multiFile, traj);
        readCoarse(fileStem, multiFile, traj);
    }
    // Write the fine vectors to "<fileStem>_fine".
    void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
    }
    // Write the coarse vectors to "<fileStem>_coarse", with the same metadata
    // record as the fine pack.
    void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile), 
                                                   evecCoarse, evalCoarse, this->record, 
                                                   evecCoarse.size(), multiFile, gridCoarseIo_);
    }
    // Write fine then coarse vectors.
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
        writeFine(fileStem, multiFile, traj);
        writeCoarse(fileStem, multiFile, traj);
    }
 private:
    // Null by default so that a default-constructed pack never hands an
    // indeterminate pointer to readPack/writePack.
    GridBase *gridCoarseIo_{nullptr};
 };
 // In-memory eigenpack of FImpl::FermionField vectors (no I/O).
 template <typename FImpl>
 using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>;
 // Eigenpack of FImpl::FermionField vectors, using FImplIo::FermionField as
 // the on-disk type (same as FImpl by default).
 template <typename FImpl, typename FImplIo = FImpl>
 using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
 // Fine + coarse eigenpack for fermion fields. The coarse field types are the
 // CoarseField of LocalCoherenceLanczos instantiated with nBasis, for FImpl
 // (in memory) and FImplIo (on disk) respectively.
 template <typename FImpl, int nBasis, typename FImplIo = FImpl>
 using CoarseFermionEigenPack = CoarseEigenPack<
    typename FImpl::FermionField,
    typename LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                   typename FImpl::SiteComplex, 
                                   nBasis>::CoarseField,
    typename FImplIo::FermionField,
    typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor, 
                                   typename FImplIo::SiteComplex, 
                                   nBasis>::CoarseField>;
 #undef HADRONS_DUMP_EP_METADATA
 END_HADRONS_NAMESPACE
 #endif // Hadrons_EigenPack_hpp_
--- a/Hadrons/Environment.cc
+++ b/Hadrons/Environment.cc
@@ -1,347 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Environment.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Environment.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 using namespace Grid;
 using namespace Hadrons;
 // Report an ObjectDefinition error for an address that does not refer to a
 // usable object. The expansion already ends with a semicolon.
 #define ERROR_NO_ADDRESS(address)\
 HADRONS_ERROR_REF(ObjectDefinition, "no object with address " + std::to_string(address), address);
 /******************************************************************************
 *                       Environment implementation                           *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Initialise the lattice geometry from the Grid defaults: dimensions, number
 // of dimensions, and volume (product of all extents, stored as a double).
 Environment::Environment(void)
 {
    dim_ = GridDefaultLatt().toVector();
    nd_  = dim_.size();
    vol_ = 1.;
    for (unsigned int mu = 0; mu < dim_.size(); ++mu)
    {
        vol_ *= dim_[mu];
    }
 }
 // grids ///////////////////////////////////////////////////////////////////////
 // Number of lattice dimensions (size of the dimension vector set in the
 // constructor).
 unsigned int Environment::getNd(void) const
 {
    return nd_;
 }
 // Lattice dimensions, returned by value.
 std::vector<int> Environment::getDim(void) const
 {
    return dim_;
 }
 // Lattice extent in direction mu; mu is not range-checked.
 int Environment::getDim(const unsigned int mu) const
 {
    return dim_[mu];
 }
 // Lattice volume: product of all extents, computed in the constructor.
 double Environment::getVolume(void) const
 {
    return vol_;
 }
 // random number generator /////////////////////////////////////////////////////
 // Parallel RNG built on getGrid(); created on first call and owned by the
 // environment.
 GridParallelRNG * Environment::get4dRng(void)
 {
    if (!rng4d_)
    {
        rng4d_.reset(new GridParallelRNG(getGrid()));
    }
    return rng4d_.get();
 }
 // Serial RNG; created on first call and owned by the environment.
 GridSerialRNG * Environment::getSerialRng(void)
 {
    if (!rngSerial_)
    {
        rngSerial_.reset(new GridSerialRNG());
    }
    return rngSerial_.get();
 }
 // general memory management ///////////////////////////////////////////////////
 void Environment::addObject(const std::string name, const int moduleAddress)
 {
    if (!hasObject(name))
    {
        ObjInfo info;
        info.name   = name;
        info.module = moduleAddress;
        info.data   = nullptr;
        object_.push_back(std::move(info));
        objectAddress_[name] = static_cast<unsigned int>(object_.size() - 1);
    }
    else
    {
        HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already exists",
                          getObjectAddress(name));
    }
 }
 void Environment::setObjectModule(const unsigned int objAddress,
                                  const int modAddress)
 {
    object_[objAddress].module = modAddress;
 }
 // Number of registered objects, i.e. one past the largest valid address.
 unsigned int Environment::getMaxAddress(void) const
 {
    return static_cast<unsigned int>(object_.size());
 }
 // Address of the object called `name`. Reports a Definition error when no
 // such object is registered.
 unsigned int Environment::getObjectAddress(const std::string name) const
 {
    if (!hasObject(name))
    {
        HADRONS_ERROR(Definition, "no object with name '" + name + "'");
    }
    else
    {
        return objectAddress_.at(name);
    }
 }
 // Name of the object at `address`. Reports an ObjectDefinition error when
 // the address is not registered.
 std::string Environment::getObjectName(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    else
    {
        return object_[address].name;
    }
 }
 // Type name of the object at `address`, or "<no type>" when no type
 // information is stored for it. Reports an ObjectDefinition error when the
 // address is not registered.
 std::string Environment::getObjectType(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    else
    {
        const bool typed = (object_[address].type != nullptr);
        if (!typed)
        {
            return "<no type>";
        }
        return typeName(object_[address].type);
    }
 }
 // Same as above, looking the object up by name.
 std::string Environment::getObjectType(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectType(address);
 }
 // Size recorded for the object at `address`. Reports an ObjectDefinition
 // error when the address is not registered.
 Environment::Size Environment::getObjectSize(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    else
    {
        return object_[address].size;
    }
 }
 // Same as above, looking the object up by name.
 Environment::Size Environment::getObjectSize(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectSize(address);
 }
 // Storage class recorded for the object at `address`. Reports an
 // ObjectDefinition error when the address is not registered.
 Environment::Storage Environment::getObjectStorage(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    else
    {
        return object_[address].storage;
    }
 }
 // Same as above, looking the object up by name.
 Environment::Storage Environment::getObjectStorage(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectStorage(address);
 }
 // Address of the module recorded as owner of the object at `address`.
 // Reports an ObjectDefinition error when the address is not registered.
 int Environment::getObjectModule(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    else
    {
        return object_[address].module;
    }
 }
 // Same as above, looking the object up by name.
 int Environment::getObjectModule(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectModule(address);
 }
 // Ls recorded for the object at `address`. The object must have been
 // created (non-null data): the ObjectDefinition error is reported both for
 // unregistered addresses and for registered objects that hold no data.
 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
    if (!hasCreatedObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    else
    {
        return object_[address].Ls;
    }
 }
 // Same as above, looking the object up by name.
 unsigned int Environment::getObjectLs(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectLs(address);
 }
 // True when `address` indexes an entry of the object store.
 bool Environment::hasObject(const unsigned int address) const
 {
    const bool inRange = (address < object_.size());
    return inRange;
 }
 // True when `name` is registered and maps to a valid address.
 bool Environment::hasObject(const std::string name) const
 {
    const auto entry = objectAddress_.find(name);
    if (entry == objectAddress_.end())
    {
        return false;
    }
    return hasObject(entry->second);
 }
 // True when `address` is registered and the object holds data.
 bool Environment::hasCreatedObject(const unsigned int address) const
 {
    return hasObject(address) && (object_[address].data != nullptr);
 }
 // True when `name` is registered and the object holds data.
 bool Environment::hasCreatedObject(const std::string name) const
 {
    return hasObject(name) && hasCreatedObject(getObjectAddress(name));
 }
 // True when the Ls recorded for the object at `address` is larger than 1.
 // Goes through getObjectLs, so the same error is reported for objects that
 // are not registered or not created.
 bool Environment::isObject5d(const unsigned int address) const
 {
    return (getObjectLs(address) > 1);
 }
 // Same as above, looking the object up by name.
 bool Environment::isObject5d(const std::string name) const
 {
    return (getObjectLs(name) > 1);
 }
 // Sum of the recorded sizes of all registered objects.
 Environment::Size Environment::getTotalSize(void) const
 {
    Environment::Size total = 0;
    for (unsigned int a = 0; a < object_.size(); ++a)
    {
        total += object_[a].size;
    }
    return total;
 }
 void Environment::freeObject(const unsigned int address)
 {
    if (hasCreatedObject(address))
    {
        LOG(Message) << "Destroying object '" << object_[address].name
                     << "'" << std::endl;
    }
    object_[address].size = 0;
    object_[address].type = nullptr;
    object_[address].data.reset(nullptr);
 }
 void Environment::freeObject(const std::string name)
 {
    freeObject(getObjectAddress(name));
 }
 void Environment::freeAll(void)
 {
    for (unsigned int i = 0; i < object_.size(); ++i)
    {
        freeObject(i);
    }
 }
 // Set the object protection flag. The flag is only stored here; nothing in
 // this file acts on it.
 void Environment::protectObjects(const bool protect)
 {
    protect_ = protect;
 }
 // Current value of the object protection flag.
 bool Environment::objectsProtected(void) const
 {
    return protect_;
 }
 // print environment content ///////////////////////////////////////////////////
 // Log, at Debug level, one line per registered object: address, name and
 // formatted size.
 void Environment::printContent(void) const
 {
    LOG(Debug) << "Objects: " << std::endl;
    const unsigned int nObject = object_.size();
    for (unsigned int address = 0; address < nObject; ++address)
    {
        LOG(Debug) << std::setw(4) << address << ": "
                   << getObjectName(address) << " ("
                   << sizeString(getObjectSize(address)) << ")" << std::endl;
    }
 }
--- a/Hadrons/Environment.hpp
+++ b/Hadrons/Environment.hpp
@@ -1,588 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Environment.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Environment_hpp_
 #define Hadrons_Environment_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         Global environment                                 *
 ******************************************************************************/
 // Empty polymorphic base class; the object store keeps its entries as
 // std::unique_ptr<Object>.
 class Object
 {
 public:
    Object(void) = default;
    virtual ~Object(void) = default;
 };
 // Object wrapping a uniquely-owned T, so that values of any type can be kept
 // through an Object pointer.
 template <typename T>
 class Holder: public Object
 {
 public:
    Holder(void) = default;
    // take ownership of pt
    Holder(T *pt);
    virtual ~Holder(void) = default;
    // reference to the held object (must not be called on an empty holder)
    T &       get(void) const;
    // pointer to the held object, null if empty
    T *       getPt(void) const;
    // replace the held object by pt, destroying the previous one
    void      reset(T *pt);
 private:
    std::unique_ptr<T> objPt_{nullptr};
 };
 // Defines, inside a class, a const member function env() returning the
 // Environment singleton.
 #define DEFINE_ENV_ALIAS \
 inline Environment & env(void) const\
 {\
    return Environment::getInstance();\
 }
 // Declares a local lambda named env returning the Environment singleton.
 // The expansion has no trailing semicolon; the user supplies it.
 #define DEFINE_ENV_LAMBDA \
 auto env = [](void)->Environment &{return Environment::getInstance();}
 // Global environment: holds the lattice geometry, the grids created on
 // demand for each SIMD type, the random number generators, and the store of
 // named objects addressed by integer.
 class Environment
 {
    // NOTE(review): presumably declares getInstance() and makes the class
    // non-copyable; the macro is defined outside this file - confirm
    SINGLETON(Environment);
 public:
    typedef SITE_SIZE_TYPE                         Size;
    typedef std::unique_ptr<GridCartesian>         GridPt;
    typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt;
    typedef std::unique_ptr<GridParallelRNG>       RngPt;
    typedef std::unique_ptr<GridSerialRNG>         SerialRngPt;
    enum class Storage {object, cache, temporary};
 private:
    // bookkeeping for one entry of the object store
    struct ObjInfo
    {
        Size                    size{0};
        Storage                 storage{Storage::object};
        unsigned int            Ls{0};
        const std::type_info    *type{nullptr}, *derivedType{nullptr};
        std::string             name;
        // address of the owning module, -1 when none
        int                     module{-1};
        // null until the object is created
        std::unique_ptr<Object> data{nullptr};
    };
    // grid map keys: (type hash, Ls) and (type hash, block size [+ Ls])
    typedef std::pair<size_t, unsigned int>     FineGridKey;
    typedef std::pair<size_t, std::vector<int>> CoarseGridKey;
 public:
    // grids
    template <typename VType = vComplex>
    void                    createGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    void                    createCoarseGrid(const std::vector<int> &blockSize,
                                             const unsigned int Ls);
    template <typename VType = vComplex>
    GridCartesian *         getGrid(void);
    template <typename VType = vComplex>
    GridRedBlackCartesian * getRbGrid(void);
    template <typename VType = vComplex>
    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize);
    template <typename VType = vComplex>
    GridCartesian *         getGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    GridRedBlackCartesian * getRbGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize,
                                          const unsigned int Ls);
    std::vector<int>        getDim(void) const;
    int                     getDim(const unsigned int mu) const;
    unsigned int            getNd(void) const;
    double                  getVolume(void) const;
    // random number generator
    GridParallelRNG *       get4dRng(void);
    GridSerialRNG *         getSerialRng(void);
    // general memory management
    void                    addObject(const std::string name,
                                      const int moduleAddress = -1);
    template <typename B, typename T, typename ... Ts>
    void                    createDerivedObject(const std::string name,
                                                const Environment::Storage storage,
                                                const unsigned int Ls,
                                                Ts && ... args);
    template <typename T, typename ... Ts>
    void                    createObject(const std::string name,
                                         const Environment::Storage storage,
                                         const unsigned int Ls,
                                         Ts && ... args);
    void                    setObjectModule(const unsigned int objAddress,
                                            const int modAddress);
    template <typename B, typename T>
    T *                     getDerivedObject(const unsigned int address) const;
    template <typename B, typename T>
    T *                     getDerivedObject(const std::string name) const;
    template <typename T>
    T *                     getObject(const unsigned int address) const;
    template <typename T>
    T *                     getObject(const std::string name) const;
    unsigned int            getMaxAddress(void) const;
    unsigned int            getObjectAddress(const std::string name) const;
    std::string             getObjectName(const unsigned int address) const;
    std::string             getObjectType(const unsigned int address) const;
    std::string             getObjectType(const std::string name) const;
    Size                    getObjectSize(const unsigned int address) const;
    Size                    getObjectSize(const std::string name) const;
    Storage                 getObjectStorage(const unsigned int address) const;
    Storage                 getObjectStorage(const std::string name) const;
    int                     getObjectModule(const unsigned int address) const;
    int                     getObjectModule(const std::string name) const;
    unsigned int            getObjectLs(const unsigned int address) const;
    unsigned int            getObjectLs(const std::string name) const;
    bool                    hasObject(const unsigned int address) const;
    bool                    hasObject(const std::string name) const;
    bool                    hasCreatedObject(const unsigned int address) const;
    bool                    hasCreatedObject(const std::string name) const;
    bool                    isObject5d(const unsigned int address) const;
    bool                    isObject5d(const std::string name) const;
    template <typename T>
    bool                    isObjectOfType(const unsigned int address) const;
    template <typename T>
    bool                    isObjectOfType(const std::string name) const;
    Environment::Size       getTotalSize(void) const;
    void                    freeObject(const unsigned int address);
    void                    freeObject(const std::string name);
    void                    freeAll(void);
    void                    protectObjects(const bool protect);
    bool                    objectsProtected(void) const;
    // print environment content
    void                    printContent(void) const;
 private:
    // general
    double                              vol_;
    bool                                protect_{true};
    // grids
    std::vector<int>                    dim_;
    std::map<FineGridKey, GridPt>       grid4d_;
    std::map<FineGridKey, GridPt>       grid5d_;
    std::map<FineGridKey, GridRbPt>     gridRb4d_;
    std::map<FineGridKey, GridRbPt>     gridRb5d_;
    std::map<CoarseGridKey, GridPt>     gridCoarse4d_;
    std::map<CoarseGridKey, GridPt>     gridCoarse5d_;
    unsigned int                        nd_;
    // random number generator
    RngPt                               rng4d_{nullptr};
    SerialRngPt                         rngSerial_{nullptr};
    // object store: entries indexed by address, plus the name -> address map
    std::vector<ObjInfo>                object_;
    std::map<std::string, unsigned int> objectAddress_;
 };
 /******************************************************************************
 *                       Holder template implementation                       *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Take ownership of pt.
 template <typename T>
 Holder<T>::Holder(T *pt)
 : objPt_(pt)
 {}
 // access //////////////////////////////////////////////////////////////////////
 // Reference to the held object; the holder must not be empty.
 template <typename T>
 T & Holder<T>::get(void) const
 {
    return *objPt_;
 }
 // Raw pointer to the held object (null when empty); ownership is retained.
 template <typename T>
 T * Holder<T>::getPt(void) const
 {
    T *raw = objPt_.get();
    return raw;
 }
 // Destroy the currently held object, if any, and take ownership of pt.
 template <typename T>
 void Holder<T>::reset(T *pt)
 {
    objPt_.reset(pt);
 }
 /******************************************************************************
 *                     Environment template implementation                    *
 ******************************************************************************/
 // grids ///////////////////////////////////////////////////////////////////////
 // Debug helper: log the address of a grid and its checkerboarding flag and
 // f/g/l/r dimension members. The argument must be a pointer expression; it is
 // pasted (and therefore evaluated) once per logged line, so it should be
 // free of side effects. The macro expands to several statements and is only
 // meant to be used as a full statement inside a braced block.
 #define HADRONS_DUMP_GRID(...)\
 LOG(Debug) << "New grid " << (__VA_ARGS__) << std::endl;\
 LOG(Debug) << " - cb  : " << (__VA_ARGS__)->_isCheckerBoarded << std::endl;\
 LOG(Debug) << " - fdim: " << (__VA_ARGS__)->_fdimensions << std::endl;\
 LOG(Debug) << " - gdim: " << (__VA_ARGS__)->_gdimensions << std::endl;\
 LOG(Debug) << " - ldim: " << (__VA_ARGS__)->_ldimensions << std::endl;\
 LOG(Debug) << " - rdim: " << (__VA_ARGS__)->_rdimensions << std::endl;
 template <typename VType>
 void Environment::createGrid(const unsigned int Ls)
 {
    size_t hash = typeHash<VType>();
    if (grid4d_.find({hash, 1}) == grid4d_.end())
    {
        grid4d_[{hash, 1}].reset(
            SpaceTimeGrid::makeFourDimGrid(getDim(), 
                                        GridDefaultSimd(getNd(), VType::Nsimd()),
                                        GridDefaultMpi()));
        HADRONS_DUMP_GRID(grid4d_[{hash, 1}].get());
        gridRb4d_[{hash, 1}].reset(
            SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_[{hash, 1}].get()));
        HADRONS_DUMP_GRID(gridRb4d_[{hash, 1}].get());
    }
    if (grid5d_.find({hash, Ls}) == grid5d_.end())
    {
        auto g = grid4d_[{hash, 1}].get();
        grid5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimGrid(Ls, g));
        HADRONS_DUMP_GRID(grid5d_[{hash, Ls}].get());
        gridRb5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, g));
        HADRONS_DUMP_GRID(gridRb5d_[{hash, Ls}].get());
    }
 }
 template <typename VType>
 void Environment::createCoarseGrid(const std::vector<int> &blockSize,
                                   const unsigned int Ls)
 {
    int              nd      = getNd();
    std::vector<int> fineDim = getDim(), coarseDim(nd);
    unsigned int     cLs;
    auto             key4d = blockSize, key5d = blockSize;
    size_t           hash  = typeHash<VType>();
    createGrid(Ls);
    for (int d = 0; d < coarseDim.size(); d++)
    {
        coarseDim[d] = fineDim[d]/blockSize[d];
        if (coarseDim[d]*blockSize[d] != fineDim[d])
        {
            HADRONS_ERROR(Size, "Fine dimension " + std::to_string(d) 
                         + " (" + std::to_string(fineDim[d]) 
                         + ") not divisible by coarse dimension ("
                         + std::to_string(coarseDim[d]) + ")"); 
        }
    }
    if (blockSize.size() > nd)
    {
        cLs = Ls/blockSize[nd];
        if (cLs*blockSize[nd] != Ls)
        {
            HADRONS_ERROR(Size, "Fine Ls (" + std::to_string(Ls) 
                         + ") not divisible by coarse Ls ("
                         + std::to_string(cLs) + ")");
        }
    }
    else
    {
        cLs = Ls;
    }
    key4d.resize(nd);
    key5d.push_back(Ls);
    CoarseGridKey hkey4d = {hash, key4d}, hkey5d = {hash, key5d};
    if (gridCoarse4d_.find(hkey4d) == gridCoarse4d_.end())
    {
        gridCoarse4d_[hkey4d].reset(
            SpaceTimeGrid::makeFourDimGrid(coarseDim, 
                GridDefaultSimd(nd, VType::Nsimd()), GridDefaultMpi()));
        HADRONS_DUMP_GRID(gridCoarse4d_[hkey4d].get());
    }
    if (gridCoarse5d_.find(hkey5d) == gridCoarse5d_.end())
    {
        gridCoarse5d_[hkey5d].reset(
            SpaceTimeGrid::makeFiveDimGrid(cLs, gridCoarse4d_[hkey4d].get()));
        HADRONS_DUMP_GRID(gridCoarse5d_[hkey5d].get());
    }
 }
 #undef HADRONS_DUMP_GRID
 // Return the 4D grid for vector type VType, creating it on first request.
 template <typename VType>
 GridCartesian * Environment::getGrid(void)
 {
    const FineGridKey key = {typeHash<VType>(), 1};
    if (grid4d_.find(key) == grid4d_.end())
    {
        createGrid<VType>(1);
    }
    return grid4d_.at(key).get();
 }
 // Return the 4D red-black grid for vector type VType, creating it on first
 // request.
 template <typename VType>
 GridRedBlackCartesian * Environment::getRbGrid(void)
 {
    const FineGridKey key = {typeHash<VType>(), 1};
    if (gridRb4d_.find(key) == gridRb4d_.end())
    {
        createGrid<VType>(1);
    }
    return gridRb4d_.at(key).get();
 }
 // Return the coarse 4D grid for vector type VType and the given block size,
 // creating it on first request. Only the first getNd() block entries take
 // part in the lookup key.
 template <typename VType>
 GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize)
 {
    std::vector<int> block4d(blockSize);
    block4d.resize(getNd());
    const CoarseGridKey key = {typeHash<VType>(), block4d};
    if (gridCoarse4d_.find(key) == gridCoarse4d_.end())
    {
        createCoarseGrid<VType>(blockSize, 1);
    }
    return gridCoarse4d_.at(key).get();
 }
 // Return the 5D grid for vector type VType and fifth-dimension extent Ls,
 // creating it on first request.
 template <typename VType>
 GridCartesian * Environment::getGrid(const unsigned int Ls)
 {
    const FineGridKey key = {typeHash<VType>(), Ls};
    if (grid5d_.find(key) == grid5d_.end())
    {
        createGrid<VType>(Ls);
    }
    return grid5d_.at(key).get();
 }
 // Return the 5D red-black grid for vector type VType and fifth-dimension
 // extent Ls, creating it on first request.
 template <typename VType>
 GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls)
 {
    const FineGridKey key = {typeHash<VType>(), Ls};
    if (gridRb5d_.find(key) == gridRb5d_.end())
    {
        createGrid<VType>(Ls);
    }
    return gridRb5d_.at(key).get();
 }
 // Return the coarse 5D grid for vector type VType, the given block size and
 // fine fifth-dimension extent Ls, creating it on first request. The lookup
 // key is the block size with Ls appended.
 template <typename VType>
 GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize,
                                            const unsigned int Ls)
 {
    std::vector<int> block5d(blockSize);
    block5d.push_back(Ls);
    const CoarseGridKey key = {typeHash<VType>(), block5d};
    if (gridCoarse5d_.find(key) == gridCoarse5d_.end())
    {
        createCoarseGrid<VType>(blockSize, Ls);
    }
    return gridCoarse5d_.at(key).get();
 }
 // general memory management ///////////////////////////////////////////////////
 // Create an object of concrete type T, stored through a Holder<B>, under the
 // given name.
 //   name   : object name; it is registered with addObject() if unknown.
 //   storage: storage class recorded for the object.
 //   Ls     : fifth-dimension extent recorded for the object.
 //   args   : forwarded to T's constructor.
 // The object is (re)allocated when no data is attached to the name yet or
 // when objects are not protected. Otherwise the call is a no-op for a cache
 // object requested again with the same storage, name and types, and throws
 // Exceptions::ObjectDefinition in any other case.
 // The recorded size is the profiler's maxAllocated after construction minus
 // currentlyAllocated before it.
 template <typename B, typename T, typename ... Ts>
 void Environment::createDerivedObject(const std::string name,
                                       const Environment::Storage storage,
                                       const unsigned int Ls,
                                       Ts && ... args)
 {
    if (!hasObject(name))
    {
        addObject(name);
    }
    unsigned int address = getObjectAddress(name);
    if (!object_[address].data or !objectsProtected())
    {
        // If no profiler is active, install a local one for the duration of
        // the allocation. The guard removes it on every exit path, including
        // an exception thrown by T's constructor, so the global pointer can
        // never be left pointing at the stack-local memStats.
        struct LocalStatsGuard
        {
            MemoryStats *local_;
            explicit LocalStatsGuard(MemoryStats *local)
            : local_(local)
            {
                if (!MemoryProfiler::stats)
                {
                    MemoryProfiler::stats = local_;
                }
            }
            ~LocalStatsGuard(void)
            {
                if (MemoryProfiler::stats == local_)
                {
                    MemoryProfiler::stats = nullptr;
                }
            }
            LocalStatsGuard(const LocalStatsGuard &) = delete;
            LocalStatsGuard & operator=(const LocalStatsGuard &) = delete;
        };
        // memStats is declared first so that it outlives the guard
        MemoryStats     memStats;
        LocalStatsGuard guard(&memStats);
        size_t initMem               = MemoryProfiler::stats->currentlyAllocated;
        object_[address].storage     = storage;
        object_[address].Ls          = Ls;
        object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...)));
        object_[address].size        = MemoryProfiler::stats->maxAllocated - initMem;
        object_[address].type        = typeIdPt<B>();
        object_[address].derivedType = typeIdPt<T>();
    }
    // object already exists, no error if it is a cache, error otherwise
    else if ((object_[address].storage               != Storage::cache) or 
             (object_[address].storage               != storage)        or
             (object_[address].name                  != name)           or
             (typeHash(object_[address].type)        != typeHash<B>())  or
             (typeHash(object_[address].derivedType) != typeHash<T>()))
    {
        HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already allocated", address);
    }
 }
 // Create an object whose holder type and concrete type are both T: forwards
 // all arguments unchanged to createDerivedObject<T, T>.
 template <typename T, typename ... Ts>
 void Environment::createObject(const std::string name, 
                                const Environment::Storage storage,
                                const unsigned int Ls,
                                Ts && ... args)
 {
    createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...);
 }
 template <typename B, typename T>
 T * Environment::getDerivedObject(const unsigned int address) const
 {
    if (hasObject(address))
    {
        if (hasCreatedObject(address))
        {
            if (auto h = dynamic_cast<Holder<B> *>(object_[address].data.get()))
            {
                if (&typeid(T) == &typeid(B))
                {
                    return dynamic_cast<T *>(h->getPt());
                }
                else
                {
                    if (auto hder = dynamic_cast<T *>(h->getPt()))
                    {
                        return hder;
                    }
                    else
                    {
                        HADRONS_ERROR_REF(ObjectType, "object with address " +
                            std::to_string(address) +
                            " cannot be casted to '" + typeName(&typeid(T)) +
                            "' (has type '" + typeName(&typeid(h->get())) + "')", address);
                    }
                }
            }
            else
            {
                HADRONS_ERROR_REF(ObjectType, "object with address " + 
                            std::to_string(address) +
                            " does not have type '" + typeName(&typeid(B)) +
                            "' (has type '" + getObjectType(address) + "')", address);
            }
        }
        else
        {
            HADRONS_ERROR_REF(ObjectDefinition, "object with address " + 
                              std::to_string(address) + " is empty", address);
        }
    }
    else
    {
        HADRONS_ERROR_REF(ObjectDefinition, "no object with address " + 
                          std::to_string(address), address);
    }
 }
 // Name-based overload: resolves `name` with getObjectAddress() and delegates
 // to the address-based getDerivedObject<B, T>.
 template <typename B, typename T>
 T * Environment::getDerivedObject(const std::string name) const
 {
    return getDerivedObject<B, T>(getObjectAddress(name));
 }
 // Return the object at `address` when holder type and requested type are
 // both T: delegates to getDerivedObject<T, T>.
 template <typename T>
 T * Environment::getObject(const unsigned int address) const
 {
    return getDerivedObject<T, T>(address);
 }
 // Name-based overload: resolves `name` with getObjectAddress() and delegates
 // to the address-based getObject<T>.
 template <typename T>
 T * Environment::getObject(const std::string name) const
 {
    return getObject<T>(getObjectAddress(name));
 }
 template <typename T>
 bool Environment::isObjectOfType(const unsigned int address) const
 {
    if (hasObject(address))
    {
        if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
        {
            return true;
        }
        else
        {
            return false;
        }
    }
    else
    {
        HADRONS_ERROR_REF(ObjectDefinition, "no object with address " 
                          + std::to_string(address), address);
    }
 }
 // Name-based overload: resolves `name` with getObjectAddress() and delegates
 // to the address-based isObjectOfType<T>.
 template <typename T>
 bool Environment::isObjectOfType(const std::string name) const
 {
    return isObjectOfType<T>(getObjectAddress(name));
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Environment_hpp_
--- a/Hadrons/Exceptions.cc
+++ b/Hadrons/Exceptions.cc
@@ -1,102 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Exceptions.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Exceptions.hpp>
 #include <Hadrons/VirtualMachine.hpp>
 #include <Hadrons/Module.hpp>
 // Suffix appended to exception messages: the source location `loc` (a
 // constructor parameter at the expansion site) between parentheses. It can be
 // overridden by defining ERR_SUFF before this point.
 #ifndef ERR_SUFF
 #define ERR_SUFF " (" + loc + ")"
 #endif
 // Define the (msg, loc) constructor of exception class `name`; `init` is the
 // member-initializer expression forwarding to its base class.
 #define CTOR_EXC(name, init) \
 name::name(std::string msg, std::string loc)\
 :init\
 {}
 // Same as CTOR_EXC for exception classes that also carry an object address.
 #define CTOR_EXC_REF(name, init) \
 name::name(std::string msg, std::string loc, const unsigned int address)\
 :init\
 {}
 using namespace Grid;
 using namespace Hadrons;
 using namespace Exceptions;
 // backtrace cache: definition of the variable declared extern in
 // Exceptions.hpp; it is printed by abort() below when it is not empty
 std::vector<std::string> Grid::Hadrons::Exceptions::backtraceStr;
 // logic errors
 // Logic is the root of this family: it appends the source location to the
 // message; each derived class prefixes the message with its category
 CTOR_EXC(Logic, logic_error(msg + ERR_SUFF))
 CTOR_EXC(Definition, Logic("definition error: " + msg, loc))
 CTOR_EXC(Implementation, Logic("implementation error: " + msg, loc))
 CTOR_EXC(Range, Logic("range error: " + msg, loc))
 CTOR_EXC(Size, Logic("size error: " + msg, loc))
 // runtime errors
 // same scheme, rooted at Runtime
 CTOR_EXC(Runtime, runtime_error(msg + ERR_SUFF))
 CTOR_EXC(Argument, Runtime("argument error: " + msg, loc))
 CTOR_EXC(Io, Runtime("IO error: " + msg, loc))
 CTOR_EXC(Memory, Runtime("memory error: " + msg, loc))
 CTOR_EXC(Parsing, Runtime("parsing error: " + msg, loc))
 CTOR_EXC(Program, Runtime("program error: " + msg, loc))
 CTOR_EXC(System, Runtime("system error: " + msg, loc))
 // virtual machine errors
 // these forward the object address to RuntimeRef
 CTOR_EXC_REF(ObjectDefinition, RuntimeRef("object definition error: " + msg, loc, address));
 CTOR_EXC_REF(ObjectType, RuntimeRef("object type error: " + msg, loc, address));
 // abort functions
 void Grid::Hadrons::Exceptions::abort(const std::exception& e)
 {
    auto &vm = VirtualMachine::getInstance();
    int  mod = vm.getCurrentModule();
    LOG(Error) << "FATAL ERROR -- Exception " << typeName(&typeid(e)) 
               << std::endl;
    if (mod >= 0)
    {
        LOG(Error) << "During execution of module '"
                    << vm.getModuleName(mod) << "' (address " << mod << ")"
                    << std::endl;
    }
    LOG(Error) << e.what() << std::endl;
    if (!backtraceStr.empty())
    {
        LOG(Error) << "-- BACKTRACE --------------" << std::endl;
        for (auto &s: backtraceStr)
        {
            LOG(Error) << s << std::endl;
        }
        LOG(Error) << "---------------------------" << std::endl;
    }
    LOG(Error) << "Aborting program" << std::endl;
    Grid_finalize();
    exit(EXIT_FAILURE);
 }
--- a/Hadrons/Exceptions.hpp
+++ b/Hadrons/Exceptions.hpp
@@ -1,129 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Exceptions.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Exceptions_hpp_
 #define Hadrons_Exceptions_hpp_
 #include <stdexcept>
 #include <execinfo.h>
 #ifndef Hadrons_Global_hpp_
 #include <Hadrons/Global.hpp>
 #endif
 // Source location string of the expansion site: "<function> at <file>:<line>".
 // Parenthesized so that it can be embedded safely in a larger expression.
 #define HADRONS_SRC_LOC (std::string(__FUNCTION__) + " at " \
                         + std::string(__FILE__) + ":" + std::to_string(__LINE__))
 // Maximum number of stack frames captured in a backtrace.
 #define HADRONS_BACKTRACE_MAX 128
 // HADRONS_CACHE_BACKTRACE: replace the content of
 // Grid::Hadrons::Exceptions::backtraceStr with the current call stack (one
 // string per frame) when <execinfo.h> is available, with a placeholder
 // otherwise. Both variants expand to a single braced block.
 #ifdef HAVE_EXECINFO_H
 #define HADRONS_CACHE_BACKTRACE \
 {\
    void* _callstack[HADRONS_BACKTRACE_MAX];\
    int _i, _frames = backtrace(_callstack, HADRONS_BACKTRACE_MAX);\
    char** _strs = backtrace_symbols(_callstack, _frames);\
    Grid::Hadrons::Exceptions::backtraceStr.clear();\
    /* backtrace_symbols returns NULL on failure: do not dereference it */\
    if (_strs != nullptr)\
    {\
        for (_i = 0; _i < _frames; ++_i)\
        {\
            Grid::Hadrons::Exceptions::backtraceStr.push_back(std::string(_strs[_i]));\
        }\
        free(_strs);\
    }\
    else\
    {\
        Grid::Hadrons::Exceptions::backtraceStr.push_back("<backtrace symbols unavailable>");\
    }\
 }
 #else
 #define HADRONS_CACHE_BACKTRACE \
 {\
    Grid::Hadrons::Exceptions::backtraceStr.clear();\
    Grid::Hadrons::Exceptions::backtraceStr.push_back("<backtrace not supported>");\
 }
 #endif
 // HADRONS_ERROR(exc, msg): cache the backtrace, then throw Exceptions::exc
 // built from `msg` and the source location of the expansion site.
 // The expansion is one braced block, so the throw stays under the control of
 // an enclosing unbraced `if`/`for`/`while`.
 #define HADRONS_ERROR(exc, msg)\
 {\
    HADRONS_CACHE_BACKTRACE \
    throw(Exceptions::exc(msg, HADRONS_SRC_LOC));\
 }
 // HADRONS_ERROR_REF(exc, msg, address): same as HADRONS_ERROR for exception
 // classes that also carry an object address.
 #define HADRONS_ERROR_REF(exc, msg, address)\
 {\
    HADRONS_CACHE_BACKTRACE \
    throw(Exceptions::exc(msg, HADRONS_SRC_LOC, address));\
 }
 // Declare exception class `name`, publicly derived from `base`, with a
 // (message, source location) constructor. The expansion has no trailing
 // semicolon: the invocation must supply it.
 #define DECL_EXC(name, base) \
 class name: public base\
 {\
 public:\
    name(std::string msg, std::string loc);\
 }
 // Same as DECL_EXC with an additional object address constructor argument.
 #define DECL_EXC_REF(name, base) \
 class name: public base\
 {\
 public:\
    name(std::string msg, std::string loc, const unsigned int address);\
 }
 BEGIN_HADRONS_NAMESPACE
 // Hadrons exception hierarchy. Two families are declared: Logic (derived from
 // std::logic_error) and Runtime (derived from std::runtime_error), plus
 // RuntimeRef-based exceptions that additionally carry an object address.
 namespace Exceptions
 {
    // backtrace cache
    // filled by HADRONS_CACHE_BACKTRACE right before an exception is thrown
    extern std::vector<std::string> backtraceStr;
    // logic errors
    DECL_EXC(Logic, std::logic_error);
    DECL_EXC(Definition, Logic);
    DECL_EXC(Implementation, Logic);
    DECL_EXC(Range, Logic);
    DECL_EXC(Size, Logic);
    // runtime errors
    DECL_EXC(Runtime, std::runtime_error);
    DECL_EXC(Argument, Runtime);
    DECL_EXC(Io, Runtime);
    DECL_EXC(Memory, Runtime);
    DECL_EXC(Parsing, Runtime);
    DECL_EXC(Program, Runtime);
    DECL_EXC(System, Runtime);
    // virtual machine errors
    // runtime error that remembers the address of the object it refers to
    class RuntimeRef: public Runtime
    {
    public:
        // msg and loc are forwarded to Runtime, address is stored
        RuntimeRef(std::string msg, std::string loc, const unsigned int address)
        : Runtime(msg, loc), address_(address)
        {}
        // address given at construction
        unsigned int getAddress(void) const
        {
            return address_;
        }
    private:
        unsigned int address_;
    };
    DECL_EXC_REF(ObjectDefinition, RuntimeRef);
    DECL_EXC_REF(ObjectType, RuntimeRef);
    // abort functions
    // report the exception `e` and terminate the program
    void abort(const std::exception& e);
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Exceptions_hpp_
--- a/Hadrons/Factory.hpp
+++ b/Hadrons/Factory.hpp
@@ -1,105 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Factory.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Factory_hpp_
 #define Hadrons_Factory_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                        abstract factory class                              *
 ******************************************************************************/
 // Generic factory: maps a type name (string) to a builder function which
 // creates an object derived from T given an instance name.
 template <typename T>
 class Factory
 {
 public:
    // builder signature: takes the instance name, returns the new object
    typedef std::function<std::unique_ptr<T>(const std::string)> Func;
 public:
    // constructor
    Factory(void) = default;
    // destructor
    virtual ~Factory(void) = default;
    // registration
    // associates builder f with the type name `type`
    void registerBuilder(const std::string type, const Func &f);
    // get builder list
    // returns the registered type names
    std::vector<std::string> getBuilderList(void) const;
    // factory
    // builds an object of registered type `type` named `name`
    std::unique_ptr<T> create(const std::string type,
                              const std::string name) const;
 private:
    // registered builders, indexed by type name
    std::map<std::string, Func> builder_;
 };
 /******************************************************************************
 *                         template implementation                            *
 ******************************************************************************/
 // registration ////////////////////////////////////////////////////////////////
 // Register builder f under the type name `type`. A builder previously
 // registered under the same name is overwritten (plain map assignment).
 template <typename T>
 void Factory<T>::registerBuilder(const std::string type, const Func &f)
 {
    builder_[type] = f;
 }
 // get builder list ////////////////////////////////////////////////////////////
 // Return the names of all registered builders, in the iteration order of the
 // underlying std::map.
 template <typename T>
 std::vector<std::string> Factory<T>::getBuilderList(void) const
 {
    std::vector<std::string> names;
    for (auto it = builder_.begin(); it != builder_.end(); ++it)
    {
        names.push_back(it->first);
    }
    return names;
 }
 // factory /////////////////////////////////////////////////////////////////////
 // Build an object of registered type `type` with instance name `name`.
 // Throws Exceptions::Argument if no builder is registered for `type`.
 template <typename T>
 std::unique_ptr<T> Factory<T>::create(const std::string type,
                                      const std::string name) const
 {
    const auto entry = builder_.find(type);
    if (entry == builder_.end())
    {
        HADRONS_ERROR(Argument, "object of type '" + type + "' unknown");
    }
    // call a copy of the builder, as the registry entry itself is const here
    Func builder = entry->second;
    return builder(name);
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Factory_hpp_
--- a/Hadrons/GeneticScheduler.hpp
+++ b/Hadrons/GeneticScheduler.hpp
@@ -1,321 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/GeneticScheduler.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_GeneticScheduler_hpp_
 #define Hadrons_GeneticScheduler_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Graph.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                   Scheduler based on a genetic algorithm                   *
 ******************************************************************************/
 // Genetic-algorithm scheduler.
 //   T: vertex type of the dependency graph; a Gene is an ordering of vertices.
 //   V: value type of the objective function evaluated on a Gene.
 // The population is kept in a multimap sorted by objective value, so its
 // first entry is the schedule with the smallest value.
 template <typename V, typename T>
 class GeneticScheduler
 {
 public:
    typedef std::vector<T>                 Gene;
    typedef std::pair<Gene *, Gene *>      GenePair;
    typedef std::function<V(const Gene &)> ObjFunc;
    struct Parameters
    {
        // probability used by each mutation attempt
        double       mutationRate;
        // population size and random generator seed
        unsigned int popSize, seed;
    };
 public:
    // constructor
    GeneticScheduler(Graph<T> &graph, const ObjFunc &func,
                     const Parameters &par);
    // destructor
    virtual ~GeneticScheduler(void) = default;
    // access
    // (both read the first population entry: the population must not be empty)
    const Gene & getMinSchedule(void);
    V            getMinValue(void);
    // reset population
    void initPopulation(void);
    // breed a new generation
    void nextGeneration(void);
    // heuristic benchmarks
    void benchmarkCrossover(const unsigned int nIt);
    // print population
    // (prints the objective value of every individual)
    friend std::ostream & operator<<(std::ostream &out,
                                     const GeneticScheduler<V, T> &s)
    {
        out << "[";
        for (auto &p: s.population_)
        {
            out << p.first << ", ";
        }
        out << "\b\b]";
        return out;
    }
 private:
    void doCrossover(void);
    void doMutation(void);
    // genetic operators
    GenePair selectPair(void);
    void     crossover(Gene &c1, Gene &c2, const Gene &p1, const Gene &p2);
    void     mutation(Gene &m, const Gene &c);
 private:
    // the graph is referenced, not copied: it must outlive the scheduler
    Graph<T>               &graph_;
    // the objective function is stored by value: keeping a reference would
    // dangle whenever the constructor is given a temporary (e.g. a lambda
    // implicitly converted to ObjFunc)
    const ObjFunc          func_;
    const Parameters       par_;
    std::multimap<V, Gene> population_;
    std::mt19937           gen_;
 };
 /******************************************************************************
 *                       template implementation                              *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Bind the scheduler to `graph` and `func`, copy the parameters and seed the
 // random generator with par.seed.
 template <typename V, typename T>
 GeneticScheduler<V, T>::GeneticScheduler(Graph<T> &graph, const ObjFunc &func,
                                      const Parameters &par)
 : graph_(graph)
 , func_(func)
 , par_(par)
 , gen_(par_.seed)
 {}
 // access //////////////////////////////////////////////////////////////////////
 // Return the schedule stored in the first population entry, i.e. the one with
 // the smallest objective value. The population must not be empty.
 template <typename V, typename T>
 const typename GeneticScheduler<V, T>::Gene &
 GeneticScheduler<V, T>::getMinSchedule(void)
 {
    auto best = population_.begin();
    return best->second;
 }
 // Return the key of the first population entry, i.e. the smallest objective
 // value. The population must not be empty.
 template <typename V, typename T>
 V GeneticScheduler<V, T>::getMinValue(void)
 {
    auto best = population_.begin();
    return best->first;
 }
 // breed a new generation //////////////////////////////////////////////////////
 // Breed one generation: (re)initialize the population if its size differs
 // from popSize, run popSize mutation attempts and popSize/2 crossovers, then
 // trim the population back to its popSize smallest-valued entries.
 template <typename V, typename T>
 void GeneticScheduler<V, T>::nextGeneration(void)
 {
    const unsigned int nInd = par_.popSize;
    // random initialization of the population if necessary
    if (population_.size() != nInd)
    {
        initPopulation();
    }
    // random mutations
    for (unsigned int left = nInd; left > 0; --left)
    {
        doMutation();
    }
    // mating
    for (unsigned int left = nInd/2; left > 0; --left)
    {
        doCrossover();
    }
    // grim reaper: drop everything past the first popSize entries
    population_.erase(std::next(population_.begin(), nInd), population_.end());
 }
 // evolution steps /////////////////////////////////////////////////////////////
 // Discard the current population and fill it with popSize schedules, each
 // obtained from graph_.topoSort(gen_) and keyed by its objective value.
 template <typename V, typename T>
 void GeneticScheduler<V, T>::initPopulation(void)
 {
    population_.clear();
    for (unsigned int left = par_.popSize; left > 0; --left)
    {
        auto schedule = graph_.topoSort(gen_);
        population_.emplace(func_(schedule), schedule);
    }
 }
 // Pick two parents with selectPair(), cross them over and add both children
 // to the population, keyed by their objective values.
 template <typename V, typename T>
 void GeneticScheduler<V, T>::doCrossover(void)
 {
    const GenePair parents = selectPair();
    Gene           child1, child2;
    crossover(child1, child2, *parents.first, *parents.second);
    thread_critical
    {
        population_.emplace(func_(child1), child1);
        population_.emplace(func_(child2), child2);
    }
 }
 // With probability mutationRate, mutate one uniformly chosen individual and
 // add the mutant to the population, keyed by its objective value. The
 // original individual is kept.
 template <typename V, typename T>
 void GeneticScheduler<V, T>::doMutation(void)
 {
    std::uniform_real_distribution<double> mdis(0., 1.);
    // first draw decides whether a mutation happens at all
    if (!(mdis(gen_) < par_.mutationRate))
    {
        return;
    }
    // second draw picks the individual to mutate
    std::uniform_int_distribution<unsigned int> pdis(0, population_.size() - 1);
    Gene       mutant;
    const auto source = std::next(population_.begin(), pdis(gen_));
    mutation(mutant, source->second);
    thread_critical
    {
        population_.emplace(func_(mutant), mutant);
    }
 }
 // genetic operators ///////////////////////////////////////////////////////////
 // Draw two individuals from the population and return pointers to their
 // genes. Each individual gets the weight exp((value - 1)/max), where max is
 // the largest objective value in the population; the weight of the first pick
 // is zeroed before the second draw.
 // NOTE(review): the population must not be empty (rbegin() is dereferenced);
 // behaviour for max == 0 or for a single-individual population (all weights
 // zero on the second draw) should be checked against callers.
 template <typename V, typename T>
 typename GeneticScheduler<V, T>::GenePair GeneticScheduler<V, T>::selectPair(void)
 {
    std::vector<double> prob;
    unsigned int        ind;
    Gene                *p1, *p2;
    // population_ is a multimap ordered by objective value: its last entry
    // carries the largest key
    const double        max = population_.rbegin()->first;
    // one weight per individual, in population order
    for (auto &c: population_)
    {
        prob.push_back(std::exp((c.first-1.)/max));
    }        
    // first parent: index drawn according to the weights
    std::discrete_distribution<unsigned int> dis1(prob.begin(), prob.end());
    auto rIt = population_.begin();
    ind = dis1(gen_);
    std::advance(rIt, ind);
    p1 = &(rIt->second);
    // give the first parent a zero weight for the second draw
    prob[ind] = 0.;
    std::discrete_distribution<unsigned int> dis2(prob.begin(), prob.end());
    rIt = population_.begin();
    std::advance(rIt, dis2(gen_));
    p2 = &(rIt->second);
    return std::make_pair(p1, p2);
 }
 // Build two children from parents p1 and p2 around a random cut position in
 // [0, p1.size() - 1]:
 //   c1 = first `cut` elements of p1, then the other elements in p2 order;
 //   c2 = elements of p2 not in the tail of p1 (in p2 order), then that tail.
 // Every element of p1 is expected to be present in p2.
 template <typename V, typename T>
 void GeneticScheduler<V, T>::crossover(Gene &c1, Gene &c2, const Gene &p1,
                                    const Gene &p2)
 {
    std::uniform_int_distribution<unsigned int> cutDis(0, p1.size() - 1);
    const unsigned int                          cut = cutDis(gen_);
    Gene                                        rest;
    // first child: head of p1 + what remains of p2 once the head is removed
    c1.assign(p1.begin(), p1.begin() + cut);
    rest = p2;
    for (auto it = p1.begin(); it != p1.begin() + cut; ++it)
    {
        rest.erase(std::find(rest.begin(), rest.end(), *it));
    }
    c1.insert(c1.end(), rest.begin(), rest.end());
    // second child: what remains of p2 once the tail is removed + tail of p1
    c2.clear();
    rest = p2;
    for (auto it = p1.begin() + cut; it != p1.end(); ++it)
    {
        rest.erase(std::find(rest.begin(), rest.end(), *it));
    }
    c2.insert(c2.end(), rest.begin(), rest.end());
    c2.insert(c2.end(), p1.begin() + cut, p1.end());
 }
 // Build a mutated gene m from gene c: c is split at a random cut position in
 // [0, c.size() - 1] and each side is re-sorted with topoSort on a copy of the
 // graph from which the vertices of the other side were removed.
 // NOTE(review): this assumes c lists every vertex of graph_ exactly once, so
 // that g1 holds the c.size() - cut vertices read from buf below -- confirm
 // against Graph<T>. When cut == 0, m is left as passed in by the caller.
 template <typename V, typename T>
 void GeneticScheduler<V, T>::mutation(Gene &m, const Gene &c)
 {
    Gene                                        buf;
    std::uniform_int_distribution<unsigned int> dis(0, c.size() - 1);
    unsigned int                                cut = dis(gen_);
    Graph<T>                                    g1 = graph_, g2 = graph_;
    // g1: graph without the first `cut` vertices of c
    for (unsigned int i = 0; i < cut; ++i)
    {
        g1.removeVertex(c[i]);
    }
    // g2: graph without the vertices of c from position `cut` onwards
    for (unsigned int i = cut; i < c.size(); ++i)
    {
        g2.removeVertex(c[i]);
    }
    // the random generator is consumed by g1 first, then by g2
    if (g1.size() > 0)
    {
        buf = g1.topoSort(gen_);
    }
    if (g2.size() > 0)
    {
        m = g2.topoSort(gen_);
    }
    // append the re-sorted second part after the re-sorted first part
    for (unsigned int i = cut; i < c.size(); ++i)
    {
        m.push_back(buf[i - cut]);
    }
 }
 // Benchmark the crossover operator: for nIt random pairs of parents, compare
 // the summed objective value of the two children with that of the parents,
 // and log the fractions of iterations where the difference (children minus
 // parents, halved) is negative, zero and positive.
 // The difference is evaluated in double precision: computing it in V and
 // storing it in an int wraps around when V is an unsigned type and rounds
 // small differences down to zero.
 // NOTE(review): this assumes V converts to double, as it already had to
 // convert to int -- confirm for non-arithmetic V.
 template <typename V, typename T>
 void GeneticScheduler<V, T>::benchmarkCrossover(const unsigned int nIt)
 {
    Gene   p1, p2, c1, c2;
    double neg = 0., eq = 0., pos = 0., total;
    double improvement;
    LOG(Message) << "Benchmarking crossover..." << std::endl;
    for (unsigned int i = 0; i < nIt; ++i)
    {
        p1 = graph_.topoSort(gen_);
        p2 = graph_.topoSort(gen_);
        crossover(c1, c2, p1, p2);
        improvement = (static_cast<double>(func_(c1))
                       + static_cast<double>(func_(c2))
                       - static_cast<double>(func_(p1))
                       - static_cast<double>(func_(p2)))/2.;
        if (improvement < 0) neg++; else if (improvement == 0) eq++; else pos++;
    }
    total = neg + eq + pos;
    LOG(Message) << "  -: " << neg/total << "  =: " << eq/total
                 << "  +: " << pos/total << std::endl;
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_GeneticScheduler_hpp_
--- a/Hadrons/Global.cc
+++ b/Hadrons/Global.cc
@@ -1,213 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Global.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Global.hpp>
 using namespace Grid;
 using namespace Hadrons;
 // Definitions of the Hadrons log channels declared in Global.hpp. Each one is
 // constructed with its first argument (`on`) set to 1 and with its channel
 // name; initLogger() below sets the active state of every channel.
 HadronsLogger Hadrons::HadronsLogError(1,"Error");
 HadronsLogger Hadrons::HadronsLogWarning(1,"Warning");
 HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
 HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
 HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
 HadronsLogger Hadrons::HadronsLogIRL(1,"IRL");
 // Configure the Grid and Hadrons log channels.
 // Grid channels get a top field as wide as the string "Hadrons" and a channel
 // field of 8 characters. Hadrons Error/Warning channels are always active,
 // the other Hadrons channels copy the active state of the Grid channel with
 // the same name, and every Hadrons channel gets the same channel width.
 void Hadrons::initLogger(void)
 {
    auto topWidth  = std::string("Hadrons").length();
    int  chanWidth = 8;

    // Grid channels: field widths
    GridLogError.setTopWidth(topWidth);
    GridLogError.setChanWidth(chanWidth);
    GridLogWarning.setTopWidth(topWidth);
    GridLogWarning.setChanWidth(chanWidth);
    GridLogMessage.setTopWidth(topWidth);
    GridLogMessage.setChanWidth(chanWidth);
    GridLogIterative.setTopWidth(topWidth);
    GridLogIterative.setChanWidth(chanWidth);
    GridLogDebug.setTopWidth(topWidth);
    GridLogDebug.setChanWidth(chanWidth);
    GridLogIRL.setTopWidth(topWidth);
    GridLogIRL.setChanWidth(chanWidth);

    // Hadrons channels: activation and channel width
    HadronsLogError.Active(true);
    HadronsLogError.setChanWidth(chanWidth);
    HadronsLogWarning.Active(true);
    HadronsLogWarning.setChanWidth(chanWidth);
    HadronsLogMessage.Active(GridLogMessage.isActive());
    HadronsLogMessage.setChanWidth(chanWidth);
    HadronsLogIterative.Active(GridLogIterative.isActive());
    HadronsLogIterative.setChanWidth(chanWidth);
    HadronsLogDebug.Active(GridLogDebug.isActive());
    HadronsLogDebug.setChanWidth(chanWidth);
    HadronsLogIRL.Active(GridLogIRL.isActive());
    HadronsLogIRL.setChanWidth(chanWidth);
 }
 // type utilities //////////////////////////////////////////////////////////////
 // Return the hash code of the type described by *info.
 size_t Hadrons::typeHash(const std::type_info *info)
 {
    const std::type_info &type = *info;

    return type.hash_code();
 }
 //constexpr unsigned int maxNameSize = 1024u;
 // Return the demangled name of the type described by *info.
 // If demangling fails, the raw name reported by std::type_info::name() is
 // returned instead.
 std::string Hadrons::typeName(const std::type_info *info)
 {
    int         status = 0;
    char        *buf;
    std::string name;

    buf = abi::__cxa_demangle(info->name(), nullptr, nullptr, &status);
    if ((status == 0) and (buf != nullptr))
    {
        name = buf;
    }
    else
    {
        // __cxa_demangle returns a null pointer on failure; building a
        // std::string from it would be undefined behaviour
        name = info->name();
    }
    // the demangler allocates with malloc; free(nullptr) is a no-op
    free(buf);

    return name;
 }
 // default writers/readers /////////////////////////////////////////////////////
 // extension of result files: "h5" when HAVE_HDF5 is defined, "xml" otherwise
 #ifdef HAVE_HDF5
 const std::string Hadrons::resultFileExt = "h5";
 #else
 const std::string Hadrons::resultFileExt = "xml";
 #endif
 // recursive mkdir /////////////////////////////////////////////////////////////
 // Create directory `dirName` and all its missing parents (mode 755).
 // Nothing is done, and 0 is returned, when `dirName` is empty or already
 // accessible with read/write/execute permissions. Otherwise the return value
 // is the one of the final ::mkdir call (0 on success, non-zero on error with
 // errno set); failures on intermediate components are ignored, so components
 // that already exist do not abort the creation.
 int Hadrons::mkdir(const std::string dirName)
 {
    if (!dirName.empty() and access(dirName.c_str(), R_OK|W_OK|X_OK))
    {
        const mode_t mode755 = S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
        // work on a std::string copy: a fixed-size buffer would silently
        // truncate long paths and create the wrong directory
        std::string  path    = dirName;

        // drop a single trailing separator
        if (path.back() == '/')
        {
            path.pop_back();
        }
        // create every parent prefix; index 0 is skipped so that a leading
        // '/' does not produce an empty prefix
        for (std::string::size_type i = 1; i < path.size(); ++i)
        {
            if (path[i] == '/')
            {
                ::mkdir(path.substr(0, i).c_str(), mode755);
            }
        }

        return ::mkdir(path.c_str(), mode755);
    }
    else
    {
        return 0;
    }
 }
 // Return the part of `s` that follows the last '/', or `s` itself when it
 // contains no '/'.
 std::string Hadrons::basename(const std::string &s)
 {
    const std::string::size_type lastSep = s.find_last_of('/');

    return (lastSep == std::string::npos) ? s : s.substr(lastSep + 1);
 }
 // Return the part of `s` that precedes the last '/', or an empty string when
 // `s` contains no '/'.
 std::string Hadrons::dirname(const std::string &s)
 {
    const std::string::size_type lastSep = s.find_last_of('/');

    if (lastSep == std::string::npos)
    {
        return "";
    }

    return s.substr(0, lastSep);
 }
 // Create the directory that contains `filename`.
 // When a grid `g` is given, only the process for which g->IsBoss() is true
 // creates the directory; without a grid every caller does.
 // Raises an Io error when the directory cannot be created.
 void Hadrons::makeFileDir(const std::string filename, GridBase *g)
 {
    if (g and !g->IsBoss())
    {
        return;
    }

    std::string dir = dirname(filename);

    if (mkdir(dir))
    {
        HADRONS_ERROR(Io, "cannot create directory '" + dir
                      + "' ( " + std::strerror(errno) + ")");
    }
 }
 // Log the timers in `timing` on the Message channel, sorted by decreasing
 // duration, one per line: the right-aligned timer name, its tick count
 // (printed with the unit "us") and its share of `total` in percent.
 // The flags and precision of std::cout are restored before returning.
 void Hadrons::printTimeProfile(const std::map<std::string, GridTime> &timing, 
                               GridTime total)
 {
    typedef decltype(total.count()) Count;
    // multimap: several timers can have exactly the same count, a plain map
    // keyed on the count would keep only one of them
    std::multimap<Count, std::string, std::greater<Count>> rtiming;
    const double dtotal = static_cast<double>(total.count());
    auto cf = std::cout.flags();
    auto p  = std::cout.precision();
    unsigned int width = 0;

    // invert the table (duration -> name) and find the longest name
    for (const auto &t: timing)
    {
        width = std::max(width, static_cast<unsigned int>(t.first.length()));
        rtiming.emplace(t.second.count(), t.first);
    }
    for (const auto &rt: rtiming)
    {
        LOG(Message) << std::setw(width) << rt.second << ": " 
                     << rt.first << " us (" << std::fixed 
                     << std::setprecision(1) 
                     << static_cast<double>(rt.first)/dtotal*100 << "%)"
                     << std::endl;
    }
    std::cout.flags(cf);
    std::cout.precision(p);
 }
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -1,282 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Global.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Global_hpp_
 #define Hadrons_Global_hpp_
 #include <set>
 #include <stack>
 #include <regex>
 #include <Grid/Grid.h>
 #include <cxxabi.h>
 // SITE_SIZE_TYPE defaults to size_t unless it is already defined
 #ifndef SITE_SIZE_TYPE
 #define SITE_SIZE_TYPE size_t
 #endif
 // DEFAULT_ASCII_PREC defaults to 16 unless it is already defined
 // (presumably the number of digits for ASCII output -- confirm at use sites)
 #ifndef DEFAULT_ASCII_PREC
 #define DEFAULT_ASCII_PREC 16
 #endif
 // forwards its arguments unchanged; lets an argument containing commas
 // (e.g. a template-id) be passed as a single macro argument
 #define ARG(...) __VA_ARGS__
 /* the 'using Grid::operator<<;' statement prevents a very nasty compilation
 * error with GCC 5 (clang & GCC 6 compile fine without it).
 */
 // open namespace Grid::Hadrons and import Grid's stream operators into it
 #define BEGIN_HADRONS_NAMESPACE \
 namespace Grid {\
 namespace Hadrons {\
 using Grid::operator<<;\
 using Grid::operator>>;
 // close the two namespaces opened by BEGIN_HADRONS_NAMESPACE
 #define END_HADRONS_NAMESPACE }}
 // open a module namespace `name` and import Grid's stream operators into it
 #define BEGIN_MODULE_NAMESPACE(name)\
 namespace name {\
 using Grid::operator<<;\
 using Grid::operator>>;
 // close the namespace opened by BEGIN_MODULE_NAMESPACE
 #define END_MODULE_NAMESPACE }
 // token pasting in two steps so that `impl` is macro-expanded before being
 // concatenated with the suffix `sub`
 #define _HADRONS_IMPL(impl, sub) impl##sub
 #define HADRONS_IMPL(impl, sub)   _HADRONS_IMPL(impl, sub)
 // Each implementation family below has an overridable base name and three
 // derived names obtained by appending the suffixes R, F and D
 // (presumably default, single and double precision -- confirm in Grid).
 // FIMPL*: base defaults to WilsonImpl
 #ifndef FIMPLBASE
 #define FIMPLBASE WilsonImpl
 #endif
 #define FIMPL  HADRONS_IMPL(FIMPLBASE, R)
 #define FIMPLF HADRONS_IMPL(FIMPLBASE, F)
 #define FIMPLD HADRONS_IMPL(FIMPLBASE, D)
 // ZFIMPL*: base defaults to ZWilsonImpl
 #ifndef ZFIMPLBASE
 #define ZFIMPLBASE ZWilsonImpl
 #endif
 #define ZFIMPL  HADRONS_IMPL(ZFIMPLBASE, R)
 #define ZFIMPLF HADRONS_IMPL(ZFIMPLBASE, F)
 #define ZFIMPLD HADRONS_IMPL(ZFIMPLBASE, D)
 // SIMPL*: base defaults to ScalarImplC
 #ifndef SIMPLBASE
 #define SIMPLBASE ScalarImplC
 #endif
 #define SIMPL  HADRONS_IMPL(SIMPLBASE, R)
 #define SIMPLF HADRONS_IMPL(SIMPLBASE, F)
 #define SIMPLD HADRONS_IMPL(SIMPLBASE, D)
 // GIMPL*: base defaults to PeriodicGimpl
 #ifndef GIMPLBASE
 #define GIMPLBASE PeriodicGimpl
 #endif
 #define GIMPL  HADRONS_IMPL(GIMPLBASE, R)
 #define GIMPLF HADRONS_IMPL(GIMPLBASE, F)
 #define GIMPLD HADRONS_IMPL(GIMPLBASE, D)
 BEGIN_HADRONS_NAMESPACE
 // type aliases
 // Each macro below declares typedefs whose names end with `suffix`, so that
 // several implementations can be aliased side by side in the same scope.
 // The typedefs use `typename`, i.e. they expect Impl/FImpl/GImpl to be
 // usable as dependent types.
 // aliases common to all implementations: fields, site propagator and the
 // vector types holding one site object / complex number per slice entry
 #define BASIC_TYPE_ALIASES(Impl, suffix)\
 typedef typename Impl::Field                         ScalarField##suffix;\
 typedef typename Impl::PropagatorField               PropagatorField##suffix;\
 typedef typename Impl::SitePropagator::scalar_object SitePropagator##suffix;\
 typedef typename Impl::ComplexField                  ComplexField##suffix;\
 typedef std::vector<SitePropagator##suffix>          SlicedPropagator##suffix;\
 typedef std::vector<typename ComplexField##suffix::vector_object::scalar_object> SlicedComplex##suffix;
 // BASIC_TYPE_ALIASES plus the fermion operator, fermion and gauge field types
 #define FERM_TYPE_ALIASES(FImpl, suffix)\
 BASIC_TYPE_ALIASES(FImpl, suffix);\
 typedef FermionOperator<FImpl>                     FMat##suffix;\
 typedef typename FImpl::FermionField               FermionField##suffix;\
 typedef typename FImpl::GaugeField                 GaugeField##suffix;\
 typedef typename FImpl::DoubledGaugeField          DoubledGaugeField##suffix;\
 typedef Lattice<iSpinMatrix<typename FImpl::Simd>> SpinMatrixField##suffix;
 // gauge field type of a gauge implementation
 #define GAUGE_TYPE_ALIASES(GImpl, suffix)\
 typedef typename GImpl::GaugeField GaugeField##suffix;
 // solver type of a fermion implementation
 #define SOLVER_TYPE_ALIASES(FImpl, suffix)\
 typedef Solver<FImpl> Solver##suffix;
 // sink function type: maps a propagator field to a sliced propagator; needs
 // the PropagatorField/SlicedPropagator aliases with the same suffix
 #define SINK_TYPE_ALIASES(suffix)\
 typedef std::function<SlicedPropagator##suffix\
                       (const PropagatorField##suffix &)> SinkFn##suffix;
 // logger
 // Logger whose first constructor argument is fixed to "Hadrons", using the
 // GridLogColours table and the "BLACK" colour name.
 class HadronsLogger: public Logger
 {
 public:
    // on: forwarded to Logger as its activation argument
    // nm: forwarded to Logger as the channel name
    HadronsLogger(int on, std::string nm): Logger("Hadrons", on, nm,
                                                  GridLogColours, "BLACK"){};
 };
 // LOG(Message) << ... writes to std::cout through the HadronsLogMessage
 // channel; `channel` is pasted after "HadronsLog"
 #define LOG(channel) std::cout << HadronsLog##channel
 // print "<expression>= <value>" on the Debug channel
 #define HADRONS_DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
 // log channels, defined in Global.cc
 extern HadronsLogger HadronsLogError;
 extern HadronsLogger HadronsLogWarning;
 extern HadronsLogger HadronsLogMessage;
 extern HadronsLogger HadronsLogIterative;
 extern HadronsLogger HadronsLogDebug;
 extern HadronsLogger HadronsLogIRL;
 // set the field widths and the active state of the log channels
 void initLogger(void);
 // singleton pattern
 // To be used inside the definition of class `name`: deletes the copy
 // constructor and copy assignment, adds a static getInstance() returning a
 // reference to a function-local static instance, and declares a private
 // default constructor that the class has to define.
 // The macro ends in a `private:` section.
 #define SINGLETON(name)\
 public:\
    name(const name &e) = delete;\
    void operator=(const name &e) = delete;\
    static name & getInstance(void)\
    {\
        static name e;\
        return e;\
    }\
 private:\
    name(void);
 // same as SINGLETON, but the private default constructor is defaulted
 #define SINGLETON_DEFCTOR(name)\
 public:\
    name(const name &e) = delete;\
    void operator=(const name &e) = delete;\
    static name & getInstance(void)\
    {\
        static name e;\
        return e;\
    }\
 private:\
    name(void) = default;
 // type utilities
 // Return the address of the std::type_info given by typeid(x).
 template <typename T>
 const std::type_info * typeIdPt(const T &x)
 {
    const std::type_info &info = typeid(x);

    return &info;
 }
 // Return the address of the std::type_info given by typeid(T).
 template <typename T>
 const std::type_info * typeIdPt(void)
 {
    const std::type_info &info = typeid(T);

    return &info;
 }
 // hash code of the type described by *info (defined in Global.cc)
 size_t typeHash(const std::type_info *info);
 // hash code of the type reported by typeid(x)
 template <typename T>
 size_t typeHash(const T &x)
 {
    return typeHash(typeIdPt(x));
 }
 // hash code of type T
 template <typename T>
 size_t typeHash(void)
 {
    return typeHash(typeIdPt<T>());
 }
 // demangled name of the type described by *info (defined in Global.cc)
 std::string typeName(const std::type_info *info);
 // demangled name of the type reported by typeid(x)
 template <typename T>
 std::string typeName(const T &x)
 {
    return typeName(typeIdPt(x));
 }
 // demangled name of type T
 template <typename T>
 std::string typeName(void)
 {
    return typeName(typeIdPt<T>());
 }
 // default writers/readers
 // file extension matching the selected format (defined in Global.cc)
 extern const std::string resultFileExt;
 // HDF5 reader/writer when HAVE_HDF5 is defined, XML ones otherwise
 #ifdef HAVE_HDF5
 typedef Hdf5Reader ResultReader;
 typedef Hdf5Writer ResultWriter;
 #else
 typedef XmlReader ResultReader;
 typedef XmlWriter ResultWriter;
 #endif
 // expands to the expression "<name>.<traj>.<resultFileExt>"; the expansion
 // is not parenthesised, so wrap it when combining with other operators
 #define RESULT_FILE_NAME(name, traj) \
 name + "." + std::to_string(traj) + "." + resultFileExt
 // recursive mkdir
 #define MAX_PATH_LENGTH 512u
 // create dirName and its missing parents; returns 0 on success or when
 // nothing had to be done, non-zero on failure
 int         mkdir(const std::string dirName);
 // part of s after the last '/', or s itself when there is no '/'
 std::string basename(const std::string &s);
 // part of s before the last '/', or "" when there is no '/'
 std::string dirname(const std::string &s);
 // create the directory containing filename; when a grid g is given only the
 // process for which g->IsBoss() is true does it
 void        makeFileDir(const std::string filename, GridBase *g = nullptr);
 // default Schur convention
 // The convention name (DiagTwo unless HADRONS_DEFAULT_SCHUR is already
 // defined) is pasted into three identifiers: Schur<conv>Operator,
 // SchurRedBlack<conv>Solve and A2AVectorsSchur<conv>. Each paste goes through
 // two macro levels so that the convention macro is expanded first.
 #ifndef HADRONS_DEFAULT_SCHUR 
 #define HADRONS_DEFAULT_SCHUR DiagTwo
 #endif
 #define _HADRONS_SCHUR_OP_(conv) Schur##conv##Operator
 #define HADRONS_SCHUR_OP(conv) _HADRONS_SCHUR_OP_(conv)
 #define HADRONS_DEFAULT_SCHUR_OP HADRONS_SCHUR_OP(HADRONS_DEFAULT_SCHUR)
 #define _HADRONS_SCHUR_SOLVE_(conv) SchurRedBlack##conv##Solve
 #define HADRONS_SCHUR_SOLVE(conv) _HADRONS_SCHUR_SOLVE_(conv)
 #define HADRONS_DEFAULT_SCHUR_SOLVE HADRONS_SCHUR_SOLVE(HADRONS_DEFAULT_SCHUR)
 #define _HADRONS_SCHUR_A2A_(conv) A2AVectorsSchur##conv
 #define HADRONS_SCHUR_A2A(conv) _HADRONS_SCHUR_A2A_(conv)
 #define HADRONS_DEFAULT_SCHUR_A2A HADRONS_SCHUR_A2A(HADRONS_DEFAULT_SCHUR)
 // stringify macro
 // two levels so that x is macro-expanded before being turned into a string
 #define _HADRONS_STR(x) #x
 #define HADRONS_STR(x) _HADRONS_STR(x)
 // pretty print time profile
 void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
 // token replacement utility
 // Replace the first occurrence of <mark><token><mark> in `str` by
 // std::to_string(x); `str` is left untouched when the token is absent.
 template <typename T>
 void tokenReplace(std::string &str, const std::string token,
                  const T &x, const std::string mark = "@")
 {
    const std::string            needle = mark + token + mark;
    const std::string::size_type where  = str.find(needle);

    if (where == std::string::npos)
    {
        return;
    }
    str.replace(where, needle.size(), std::to_string(x));
 }
 // generic correlator class
 // Serializable pair of a metadata object (`info`) and a vector of values
 // (`corr`); the element type defaults to Complex. ARG() protects the comma
 // of the template-id when it is passed to the serialisation macro.
 template <typename Metadata, typename Scalar = Complex>
 struct Correlator: Serializable
 {
    GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
                                    Metadata,             info,
                                    std::vector<Scalar>, corr);
 };
 END_HADRONS_NAMESPACE
 #include <Hadrons/Exceptions.hpp>
 #endif // Hadrons_Global_hpp_
--- a/Hadrons/Graph.hpp
+++ b/Hadrons/Graph.hpp
@@ -1,759 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Graph.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Graph_hpp_
 #define Hadrons_Graph_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                          Oriented graph class                              *
 ******************************************************************************/
 // I/O for edges
 // Print an edge as "first" -> "second" (both ends between double quotes).
 template <typename T>
 std::ostream & operator<<(std::ostream &out, const std::pair<T, T> &e)
 {
    out << "\""  << e.first;
    out << "\" -> \"" << e.second;
    out << "\"";

    return out;
 }
 // main class
 // Oriented graph with vertices of type T. The vertex set is the key set of
 // isMarked_ (each vertex carries a boolean mark used by the traversal
 // algorithms) and the edges are stored as (start, end) pairs in edgeSet_.
 // T has to be usable as a std::map key and as a std::set element.
 template <typename T>
 class Graph
 {
 public:
    typedef std::pair<T, T> Edge;
 public:
    // constructor
    Graph(void);
    // destructor
    virtual ~Graph(void) = default;
    // access
    // add a vertex (its mark is reset if it already exists)
    void           addVertex(const T &value);
    // add an edge; both ends are added as vertices
    void           addEdge(const Edge &e);
    void           addEdge(const T &start, const T &end);
    // copy of the vertex set
    std::vector<T> getVertices(void) const;
    // remove a vertex and all edges touching it; error if it does not exist
    void           removeVertex(const T &value);
    // remove an edge; error if it does not exist
    void           removeEdge(const Edge &e);
    void           removeEdge(const T &start, const T &end);
    // number of vertices
    unsigned int   size(void) const;
    // tests
    // true if value is a vertex of the graph
    bool gotValue(const T &value) const;
    // graph topological manipulations
    // vertices sharing an edge with value, whatever the orientation
    std::vector<T>              getAdjacentVertices(const T &value) const;
    // ends of the edges starting at value
    std::vector<T>              getChildren(const T &value) const;
    // starts of the edges ending at value
    std::vector<T>              getParents(const T &value) const;
    // vertices without parents
    std::vector<T>              getRoots(void) const;
    // one sub-graph per connected component
    std::vector<Graph<T>>       getConnectedComponents(void) const;
    // topological sorts; they use the vertex marks and are therefore not const
    std::vector<T>              topoSort(void);
    template <typename Gen>
    std::vector<T>              topoSort(Gen &gen);
    std::vector<std::vector<T>> allTopoSort(void);
    // I/O
    // print the edge list as {"a" -> "b", "c" -> "d"}
    friend std::ostream & operator<<(std::ostream &out, const Graph<T> &g)
    {
        bool first = true;

        out << "{";
        for (auto &e: g.edgeSet_)
        {
            // write the separator before every edge but the first one,
            // instead of erasing a trailing ", " with backspace characters
            // (which end up as control bytes in files and pipes)
            if (!first)
            {
                out << ", ";
            }
            out << e;
            first = false;
        }
        out << "}";
        return out;
    }
 private:
    // vertex marking
    void      mark(const T &value, const bool doMark = true);
    void      markAll(const bool doMark = true);
    void      unmark(const T &value);
    void      unmarkAll(void);
    bool      isMarked(const T &value) const;
    // pointer to a vertex with the requested mark, nullptr if there is none
    const T * getFirstMarked(const bool isMarked = true) const;
    template <typename Gen>
    const T * getRandomMarked(const bool isMarked, Gen &gen);
    const T * getFirstUnmarked(void) const;
    template <typename Gen>
    const T * getRandomUnmarked(Gen &gen);
    // prune marked/unmarked vertices
    void removeMarked(const bool isMarked = true);
    void removeUnmarked(void);
    // depth-first search marking
    void depthFirstSearch(void);
    void depthFirstSearch(const T &root);
 private:
    // vertex -> mark flag; the keys are the vertices of the graph
    std::map<T, bool>  isMarked_;
    // oriented edges (start, end)
    std::set<Edge>     edgeSet_;
 };
 // build dependency matrix from topological sorts
 template <typename T>
 std::map<T, std::map<T, bool>>
 makeDependencyMatrix(const std::vector<std::vector<T>> &topSort);
 /******************************************************************************
 *                       template implementation                              *
 ******************************************************************************
 * in all the following, V is the number of vertices and E is the number of
 * edges; in the worst case E = V^2
 */
 // constructor /////////////////////////////////////////////////////////////////
 // build an empty graph (no vertices, no edges)
 template <typename T>
 Graph<T>::Graph(void)
 {}
 // access //////////////////////////////////////////////////////////////////////
 // Add vertex `value` as unmarked; if it is already in the graph its mark is
 // reset to false.
 // complexity: O(log(V))
 template <typename T>
 void Graph<T>::addVertex(const T &value)
 {
    auto inserted = isMarked_.insert(std::make_pair(value, false));

    if (!inserted.second)
    {
        // the vertex was already there: only clear its mark
        inserted.first->second = false;
    }
 }
 // Add the oriented edge `e`; both of its ends are (re-)added as unmarked
 // vertices.
 // complexity: O(log(V))
 template <typename T>
 void Graph<T>::addEdge(const Edge &e)
 {
    edgeSet_.insert(e);
    addVertex(e.first);
    addVertex(e.second);
 }
 // Add the oriented edge start -> end (see addEdge(const Edge &)).
 // complexity: O(log(V))
 template <typename T>
 void Graph<T>::addEdge(const T &start, const T &end)
 {
    addEdge(Edge(start, end));
 }
 // Return a copy of all the vertices, in the iteration order of the
 // vertex map.
 template <typename T>
 std::vector<T> Graph<T>::getVertices(void) const
 {
    std::vector<T> vertex;

    vertex.reserve(isMarked_.size());
    for (auto it = isMarked_.begin(); it != isMarked_.end(); ++it)
    {
        vertex.push_back(it->first);
    }

    return vertex;
 }
 // Remove vertex `value` and every edge that starts or ends at it.
 // Raises a Range error if the vertex is not in the graph.
 // complexity: O(log(V) + E), one pass over the edge set
 template <typename T>
 void Graph<T>::removeVertex(const T &value)
 {
    // remove vertex from the mark table
    auto vIt = isMarked_.find(value);

    if (vIt != isMarked_.end())
    {
        isMarked_.erase(vIt);
    }
    else
    {
        HADRONS_ERROR(Range, "vertex does not exists");
    }

    // remove all edges containing the vertex in a single pass: erase()
    // returns the iterator following the removed edge, so the scan never has
    // to restart from the beginning
    auto eIt = edgeSet_.begin();

    while (eIt != edgeSet_.end())
    {
        if ((eIt->first == value) or (eIt->second == value))
        {
            eIt = edgeSet_.erase(eIt);
        }
        else
        {
            ++eIt;
        }
    }
 }
 // Remove the oriented edge `e`; raises a Range error if it is not in the
 // graph. The vertices are left untouched.
 // complexity: O(log(V))
 template <typename T>
 void Graph<T>::removeEdge(const Edge &e)
 {
    if (edgeSet_.erase(e) == 0)
    {
        HADRONS_ERROR(Range, "edge does not exists");
    }
 }
 // Remove the oriented edge start -> end (see removeEdge(const Edge &)).
 // complexity: O(log(V))
 template <typename T>
 void Graph<T>::removeEdge(const T &start, const T &end)
 {
    removeEdge(Edge(start, end));
 }
 // Number of vertices in the graph.
 // complexity: O(1)
 template <typename T>
 unsigned int Graph<T>::size(void) const
 {
    return static_cast<unsigned int>(isMarked_.size());
 }
 // tests ///////////////////////////////////////////////////////////////////////
 // True if `value` is a vertex of the graph.
 // complexity: O(log(V))
 template <typename T>
 bool Graph<T>::gotValue(const T &value) const
 {
    return (isMarked_.find(value) != isMarked_.end());
 }
 // vertex marking //////////////////////////////////////////////////////////////
 // Set the mark of vertex `value` to `doMark`; raises a Range error if the
 // vertex is not in the graph.
 // complexity: O(log(V))
 template <typename T>
 void Graph<T>::mark(const T &value, const bool doMark)
 {
    auto vIt = isMarked_.find(value);

    if (vIt == isMarked_.end())
    {
        HADRONS_ERROR(Range, "vertex does not exists");
    }
    else
    {
        vIt->second = doMark;
    }
 }
 // Set the mark of every vertex to `doMark`.
 // complexity: O(V)
 template <typename T>
 void Graph<T>::markAll(const bool doMark)
 {
    for (auto vIt = isMarked_.begin(); vIt != isMarked_.end(); ++vIt)
    {
        vIt->second = doMark;
    }
 }
 // Clear the mark of vertex `value` (mark(value, false)).
 // complexity: O(log(V))
 template <typename T>
 void Graph<T>::unmark(const T &value)
 {
    mark(value, false);
 }
 // Clear the mark of every vertex (markAll(false)).
 // complexity: O(V*log(V))
 template <typename T>
 void Graph<T>::unmarkAll(void)
 {
    markAll(false);
 }
 // Return the mark of vertex `value`; raises a Range error if the vertex is
 // not in the graph.
 // complexity: O(log(V))
 template <typename T>
 bool Graph<T>::isMarked(const T &value) const
 {
    const auto vIt = isMarked_.find(value);

    if (vIt != isMarked_.end())
    {
        return vIt->second;
    }
    else
    {
        HADRONS_ERROR(Range, "vertex does not exists");
        return false;
    }
 }
 // Return a pointer to the first vertex (in map order) whose mark equals
 // `isMarked`, or nullptr if there is none. The pointer refers to the key
 // stored in the vertex map.
 // complexity: O(V), linear scan of the vertices
 template <typename T>
 const T * Graph<T>::getFirstMarked(const bool isMarked) const
 {
    for (const auto &v: isMarked_)
    {
        if (v.second == isMarked)
        {
            return &(v.first);
        }
    }

    return nullptr;
 }
 // Return a pointer to a vertex whose mark equals `isMarked`, or nullptr if
 // there is none (in particular when the graph is empty).
 // A start position is drawn uniformly among the vertices with `gen`; the
 // result is the first matching vertex at or after that position, wrapping
 // around to the beginning of the map if needed. The pointer refers to the
 // key stored in the vertex map.
 // complexity: O(V)
 template <typename T>
 template <typename Gen>
 const T * Graph<T>::getRandomMarked(const bool isMarked, Gen &gen)
 {
    // empty graph: size() - 1 would underflow and std::advance would walk
    // past end()
    if (isMarked_.empty())
    {
        return nullptr;
    }

    auto pred = [&isMarked](const typename std::map<T, bool>::value_type &v)
    {
        return (v.second == isMarked);
    };
    std::uniform_int_distribution<unsigned int> dis(0, size() - 1);
    auto                                        rIt = isMarked_.begin();

    std::advance(rIt, dis(gen));
    // search [rIt, end) first, then [begin, rIt)
    auto vIt = std::find_if(rIt, isMarked_.end(), pred);
    if (vIt != isMarked_.end())
    {
        return &(vIt->first);
    }
    else
    {
        vIt = std::find_if(isMarked_.begin(), rIt, pred);
        if (vIt != rIt)
        {
            return &(vIt->first);
        }
        else
        {
            return nullptr;
        }
    }
 }
 // First unmarked vertex, or nullptr if all are marked
 // (getFirstMarked(false)).
 // complexity: O(log(V))
 template <typename T>
 const T * Graph<T>::getFirstUnmarked(void) const
 {
    return getFirstMarked(false);
 }
 // Randomly chosen unmarked vertex, or nullptr if all are marked
 // (getRandomMarked(false, gen)).
 // complexity: O(log(V))
 template <typename T>
 template <typename Gen>
 const T * Graph<T>::getRandomUnmarked(Gen &gen)
 {
    return getRandomMarked(false, gen);
 }
 // prune marked/unmarked vertices //////////////////////////////////////////////
 // Remove every vertex whose mark equals `isMarked`, together with the edges
 // touching it.
 // complexity: O(V^2*log(V))
 template <typename T>
 void Graph<T>::removeMarked(const bool isMarked)
 {
    // collect the vertices first: removeVertex modifies the vertex map, so it
    // cannot be called while iterating over that map
    std::vector<T> toRemove;

    for (const auto &v: isMarked_)
    {
        if (v.second == isMarked)
        {
            toRemove.push_back(v.first);
        }
    }
    for (const auto &v: toRemove)
    {
        removeVertex(v);
    }
 }
 // Remove every unmarked vertex (removeMarked(false)).
 // complexity: O(V^2*log(V))
 template <typename T>
 void Graph<T>::removeUnmarked(void)
 {
    removeMarked(false);
 }
 // depth-first search marking //////////////////////////////////////////////////
 // Depth-first marking starting from the first vertex of the vertex map.
 // NOTE(review): dereferences isMarked_.begin(), so the graph must not be
 // empty; the caller visible in this file checks size() > 0 first.
 // complexity: O(V*log(V))
 template <typename T>
 void Graph<T>::depthFirstSearch(void)
 {
    depthFirstSearch(isMarked_.begin()->first);
 }
 // Mark `root` and, recursively, every vertex reachable from it through
 // edges taken in either direction. Vertices that are already marked are not
 // visited again.
 // complexity: O(V*log(V))
 template <typename T>
 void Graph<T>::depthFirstSearch(const T &root)
 {
    mark(root);

    const std::vector<T> neighbours = getAdjacentVertices(root);

    for (auto it = neighbours.begin(); it != neighbours.end(); ++it)
    {
        if (isMarked(*it))
        {
            continue;
        }
        depthFirstSearch(*it);
    }
 }
 // graph topological manipulations /////////////////////////////////////////////
 // Return the vertices connected to `value` by an edge, whatever its
 // orientation: for each edge touching `value`, the other end is reported
 // (for a self-loop, `value` itself). Edges are visited in set order.
 // complexity: O(V*log(V))
 template <typename T>
 std::vector<T> Graph<T>::getAdjacentVertices(const T &value) const
 {
    std::vector<T> adjacentVertex;

    for (const auto &e: edgeSet_)
    {
        if (e.first == value)
        {
            adjacentVertex.push_back(e.second);
        }
        else if (e.second == value)
        {
            adjacentVertex.push_back(e.first);
        }
    }

    return adjacentVertex;
 }
 // Return the ends of all the edges starting at `value`, in set order.
 // complexity: O(V*log(V))
 template <typename T>
 std::vector<T> Graph<T>::getChildren(const T &value) const
 {
    std::vector<T> child;

    for (const auto &e: edgeSet_)
    {
        if (e.first == value)
        {
            child.push_back(e.second);
        }
    }

    return child;
 }
 // Return the starts of all the edges ending at `value`, in set order.
 // complexity: O(V*log(V))
 template <typename T>
 std::vector<T> Graph<T>::getParents(const T &value) const
 {
    std::vector<T> parent;

    for (const auto &e: edgeSet_)
    {
        if (e.second == value)
        {
            parent.push_back(e.first);
        }
    }

    return parent;
 }
 // Return the vertices that have no parent, i.e. that are not the end of
 // any edge.
 // complexity: O(V^2*log(V))
 template <typename T>
 std::vector<T> Graph<T>::getRoots(void) const
 {
    std::vector<T> root;

    for (auto vIt = isMarked_.begin(); vIt != isMarked_.end(); ++vIt)
    {
        if (getParents(vIt->first).empty())
        {
            root.push_back(vIt->first);
        }
    }

    return root;
 }
 // Split the graph into its connected components (edge orientation is
 // ignored, as in the depth-first search). Each component is returned as an
 // independent graph with all marks cleared; *this is not modified.
 // complexity: O(V^2*log(V))
 template <typename T>
 std::vector<Graph<T>> Graph<T>::getConnectedComponents(void) const
 {
    std::vector<Graph<T>> res;
    Graph<T>              remaining(*this);

    while (remaining.size() > 0)
    {
        // mark one component of what is left
        remaining.depthFirstSearch();

        // the component is the marked part...
        Graph<T> component(remaining);

        component.removeUnmarked();
        component.unmarkAll();
        res.push_back(component);
        // ...and the search goes on with the unmarked part
        remaining.removeMarked();
        remaining.unmarkAll();
    }

    return res;
 }
 // topological sort using a directed DFS algorithm
 // Returns the vertices ordered so that every vertex comes before its
 // children. Raises a Range error if the graph has a cycle.
 // The vertex marks are used as "permanent" marks and are all cleared before
 // and after the sort, which is why the method is not const.
 // complexity: O(V*log(V))
 template <typename T>
 std::vector<T> Graph<T>::topoSort(void)
 {
    std::stack<T>     buf;
    std::vector<T>    res;
    const T           *vPt;
    // temporary marks: vertices on the current DFS path
    std::map<T, bool> tmpMarked(isMarked_);
    // visit function
    std::function<void(const T &)> visit = [&](const T &v)
    {
        // reaching a vertex that is still on the current path means a cycle
        if (tmpMarked.at(v))
        {
            HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
        }
        if (!isMarked(v))
        {
            std::vector<T> child = getChildren(v);
            tmpMarked[v] = true;
            for (auto &c: child)
            {
                visit(c);
            }
            // all the children are done: v is pushed after them
            mark(v);
            tmpMarked[v] = false;
            buf.push(v);
        }
    };
    // reset temporary marks
    for (auto &v: tmpMarked)
    {
        tmpMarked.at(v.first) = false;
    }
    // loop on unmarked vertices
    unmarkAll();
    vPt = getFirstUnmarked();
    while (vPt)
    {
        visit(*vPt);
        vPt = getFirstUnmarked();
    }
    unmarkAll();
    // create result vector
    // (popping the stack reverses the order in which vertices were finished)
    while (!buf.empty())
    {
        res.push_back(buf.top());
        buf.pop();
    }
    return res;
 }
 // random version of the topological sort
 // Same algorithm as topoSort(void), except that the next unmarked start
 // vertex is picked with getRandomUnmarked(gen) and the children of each
 // vertex are shuffled with `gen` before being visited.
 // Raises a Range error if the graph has a cycle. The vertex marks are all
 // cleared before and after the sort.
 // complexity: O(V*log(V))
 template <typename T>
 template <typename Gen>
 std::vector<T> Graph<T>::topoSort(Gen &gen)
 {
    std::stack<T>     buf;
    std::vector<T>    res;
    const T           *vPt;
    // temporary marks: vertices on the current DFS path
    std::map<T, bool> tmpMarked(isMarked_);
    // visit function
    std::function<void(const T &)> visit = [&](const T &v)
    {
        // reaching a vertex that is still on the current path means a cycle
        if (tmpMarked.at(v))
        {
            HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
        }
        if (!isMarked(v))
        {
            std::vector<T> child = getChildren(v);
            tmpMarked[v] = true;
            // random visiting order of the children
            std::shuffle(child.begin(), child.end(), gen);
            for (auto &c: child)
            {
                visit(c);
            }
            // all the children are done: v is pushed after them
            mark(v);
            tmpMarked[v] = false;
            buf.push(v);
        }
    };
    // reset temporary marks
    for (auto &v: tmpMarked)
    {
        tmpMarked.at(v.first) = false;
    }
    // loop on unmarked vertices
    unmarkAll();
    vPt = getRandomUnmarked(gen);
    while (vPt)
    {
        visit(*vPt);
        vPt = getRandomUnmarked(gen);
    }
    unmarkAll();
    // create result vector
    // (popping the stack reverses the order in which vertices were finished)
    while (!buf.empty())
    {
        res.push_back(buf.top());
        buf.pop();
    }
    return res;
 }
 // generate all possible topological sorts
 // Y. L. Varol & D. Rotem, Comput. J. 24(1), pp. 83–84, 1981
 // http://comjnl.oupjournals.org/cgi/doi/10.1093/comjnl/24.1.83
 // The first element of the result is the sort returned by topoSort(); the
 // others are generated from it by successive permutations. For an empty
 // graph the result contains a single empty sort.
 // complexity: O(V*log(V)) (from the paper, but really ?)
 template <typename T>
 std::vector<std::vector<T>> Graph<T>::allTopoSort(void)
 {
    std::vector<std::vector<T>>    res;
    std::map<T, std::map<T, bool>> iMat;
    // create incidence matrix
    // (iMat[a][b] is true when there is an edge a -> b)
    for (auto &v1: isMarked_)
    for (auto &v2: isMarked_)
    {
        iMat[v1.first][v2.first] = false;
    }
    for (auto &v: isMarked_)
    {
        auto cVec = getChildren(v.first);
        for (auto &c: cVec)
        {
            iMat[v.first][c] = true;
        }
    }
    // generate initial topological sort
    res.push_back(topoSort());
    // generate all other topological sorts by permutation
    std::vector<T>            p = res[0];
    const unsigned int        n = size();
    std::vector<unsigned int> loc(n);
    unsigned int              i, k, k1;
    T                         obj_k, obj_k1;
    bool                      isFinal;
    for (unsigned int j = 0; j < n; ++j)
    {
        loc[j] = j;
    }
    i = 0;
    // written as i + 1 < n rather than i < n - 1: n is unsigned, so n - 1
    // wraps around for an empty graph and the loop would index empty vectors
    while (i + 1 < n)
    {
        k      = loc[i];
        k1     = k + 1;
        obj_k  = p[k];
        if (k1 >= n)
        {
            isFinal = true;
            obj_k1  = obj_k;
        }
        else
        {
            isFinal = false;
            obj_k1  = p[k1];
        }
        if (iMat[res[0][i]][obj_k1] or isFinal)
        {
            // the element cannot move further right: rotate it back to
            // position i and go on with the next element
            for (unsigned int l = k; l >= i + 1; --l)
            {
                p[l]   = p[l-1];
            }
            p[i]   = obj_k;
            loc[i] = i;
            i++;
        }
        else
        {
            // swap with the right neighbour: this is a new topological sort
            p[k]   = obj_k1;
            p[k1]  = obj_k;
            loc[i] = k1;
            i      = 0;
            res.push_back(p);
        }
    }
    return res;
 }
 // build dependency matrix from topological sorts //////////////////////////////
 // Build the matrix m such that m[v1][v2] is true if and only if v1 comes
 // strictly after v2 in every sort of `topSort`. The vertex list is taken
 // from the first sort; an empty `topSort` gives an empty matrix.
 // complexity: something like O(V^2*log(V!))
 template <typename T>
 std::map<T, std::map<T, bool>>
 makeDependencyMatrix(const std::vector<std::vector<T>> &topSort)
 {
    std::map<T, std::map<T, bool>> m;

    // no sort at all: topSort[0] below would be out of bounds
    if (topSort.empty())
    {
        return m;
    }

    const std::vector<T>           &vList = topSort[0];

    for (auto &v1: vList)
    for (auto &v2: vList)
    {
        bool dep = true;
        for (auto &t: topSort)
        {
            auto i1 = std::find(t.begin(), t.end(), v1);
            auto i2 = std::find(t.begin(), t.end(), v2);
            dep = dep and (i1 - i2 > 0);
            // one counter-example is enough
            if (!dep) break;
        }
        m[v1][v2] = dep;
    }
    return m;
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Graph_hpp_
--- a/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -1,38 +0,0 @@
 SUBDIRS = . Utilities
 lib_LIBRARIES = libHadrons.a
 include modules.inc
 libHadrons_a_SOURCES = \
    Application.cc     \
    Environment.cc     \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
 	TimerArray.cc      \
 	VirtualMachine.cc  \
 	$(modules_cc)
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
 	A2AVectors.hpp            \
 	A2AMatrix.hpp             \
 	Application.hpp           \
 	DilutedNoise.hpp          \
 	DiskVector.hpp            \
 	EigenPack.hpp             \
 	Environment.hpp           \
 	Exceptions.hpp            \
 	Factory.hpp               \
 	GeneticScheduler.hpp      \
 	Global.hpp                \
 	Graph.hpp                 \
 	Module.hpp                \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
        NamedTensor.hpp           \
 	Solver.hpp                \
 	TimerArray.hpp            \
 	VirtualMachine.hpp        \
 	$(modules_hpp)
--- a/Hadrons/Module.cc
+++ b/Hadrons/Module.cc
@@ -1,110 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Module.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Module.hpp>
 using namespace Grid;
 using namespace Hadrons;
 /******************************************************************************
 *                       ModuleBase implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Build a module called `name`; the name is what getName() hands back.
 ModuleBase::ModuleBase(const std::string name): name_(name) {}
 // access //////////////////////////////////////////////////////////////////////
 // Return a copy of the name given at construction.
 std::string ModuleBase::getName(void) const { return name_; }
 // get factory registration name if available
 // Default implementation for modules without a factory registration: it
 // only reports a Definition error and never returns a name (HADRONS_ERROR
 // presumably throws - confirm against its definition). Classes generated by
 // MODULE_REGISTER override this method.
 std::string ModuleBase::getRegisteredName(void)
 {
    const std::string msg = "module '" + getName() + "' has no registered type"
                            + " in the factory";
    HADRONS_ERROR(Definition, msg);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Run the module once: reset the timers, call setup() under the "_setup"
 // timer, then execute() under the "_execute" timer. The "_total" timer is
 // started before setup() and is still running when stopAllTimers() is hit.
 void ModuleBase::operator()(void)
 {
    resetTimers();
    startTimer("_total");
    startTimer("_setup");
    setup();
    stopTimer("_setup");
    startTimer("_execute");
    execute();
    // "_execute" and "_total" are never stopped individually; presumably this
    // call closes every timer still running
    stopAllTimers();
 }
 // Build the string used to seed the module RNGs:
 // "<run id>-<module name>-<trajectory>", or "<module name>-<trajectory>"
 // when the virtual machine has an empty run id.
 std::string ModuleBase::makeSeedString(void)
 {
    const std::string runId  = vm().getRunId();
    const std::string suffix = getName() + "-"
                               + std::to_string(vm().getTrajectory());
    return runId.empty() ? suffix : (runId + "-" + suffix);
 }
 // Return the environment 4D parallel RNG, reseeding it with the module seed
 // string whenever that string differs from the cached seed_.
 // NOTE(review): seed_ is also the cache tested by rngSerial(), so the RNG
 // requested second with an unchanged seed string is not reseeded - confirm
 // this is intended.
 GridParallelRNG & ModuleBase::rng4d(void)
 {
    auto              &rng    = *env().get4dRng();
    const std::string wanted = makeSeedString();
    if (wanted == seed_)
    {
        return rng;
    }
    seed_ = wanted;
    LOG(Message) << "Seeding 4D RNG " << &rng << " with string '"
                 << seed_ << "'" << std::endl;
    rng.SeedUniqueString(seed_);
    return rng;
 }
 // Return the environment serial RNG, reseeding it with the module seed
 // string whenever that string differs from the cached seed_.
 // NOTE(review): seed_ is also the cache tested by rng4d(), so the RNG
 // requested second with an unchanged seed string is not reseeded - confirm
 // this is intended.
 GridSerialRNG & ModuleBase::rngSerial(void)
 {
    auto              &rng    = *env().getSerialRng();
    const std::string wanted = makeSeedString();
    if (wanted == seed_)
    {
        return rng;
    }
    seed_ = wanted;
    LOG(Message) << "Seeding Serial RNG " << &rng << " with string '"
                 << seed_ << "'" << std::endl;
    rng.SeedUniqueString(seed_);
    return rng;
 }
--- a/Hadrons/Module.hpp
+++ b/Hadrons/Module.hpp
@@ -1,295 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Module.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Module_hpp_
 #define Hadrons_Module_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Hadrons/VirtualMachine.hpp>
 BEGIN_HADRONS_NAMESPACE
 // module registration macros
 // MODULE_REGISTER(mod, base, ns) generates two things:
 //  - a class `mod` deriving from `base`, inheriting its constructors and
 //    overriding getRegisteredName() to return the string "ns::mod";
 //  - a registrar class with one static instance, whose constructor registers
 //    under the key "ns::mod" a ModuleFactory builder returning a
 //    std::unique_ptr to a new ns::mod built from the module name.
 // The builder refers to `ns::mod`, so the macro presumably has to be expanded
 // inside namespace `ns` - confirm against BEGIN_MODULE_NAMESPACE.
 #define MODULE_REGISTER(mod, base, ns)\
 class mod: public base\
 {\
 public:\
    typedef base Base;\
    using Base::Base;\
    virtual std::string getRegisteredName(void)\
    {\
        return std::string(#ns "::" #mod);\
    }\
 };\
 class ns##mod##ModuleRegistrar\
 {\
 public:\
    ns##mod##ModuleRegistrar(void)\
    {\
        ModuleFactory &modFac = ModuleFactory::getInstance();\
        modFac.registerBuilder(#ns "::" #mod, [&](const std::string name)\
                              {\
                                  return std::unique_ptr<ns::mod>(new ns::mod(name));\
                              });\
    }\
 };\
 static ns##mod##ModuleRegistrar ns##mod##ModuleRegistrarInstance;
 // Registration of a module whose base is a class template specialisation:
 // `extern template` declares that `base` is explicitly instantiated in
 // another translation unit, then the module is registered as usual. ARG is
 // defined elsewhere; presumably it shields commas inside `base` - confirm.
 #define MODULE_REGISTER_TMP(mod, base, ns)\
 extern template class base;\
 MODULE_REGISTER(mod, ARG(base), ns);
 // Argument-count dispatch helpers: _12 expands to its third argument and _23
 // to its fourth. Called as REDIRECT(__VA_ARGS__, longMacro, shortMacro), the
 // result is shortMacro for the smaller argument count (1 resp. 2) and
 // longMacro for the larger one (2 resp. 3).
 #define HADRONS_MACRO_REDIRECT_12(arg1, arg2, macro, ...) macro
 #define HADRONS_MACRO_REDIRECT_23(arg1, arg2, arg3, macro, ...) macro
 // Grid accessors, all keyed on latticeType::vector_type. The *4 forms call
 // the environment getter without Ls, the *5 forms pass Ls through.
 // envGetGrid(type[, Ls]) -> env().getGrid<...>([Ls])
 #define envGetGrid4(latticeType)\
 env().template getGrid<typename latticeType::vector_type>()
 #define envGetGrid5(latticeType, Ls)\
 env().template getGrid<typename latticeType::vector_type>(Ls)
 #define envGetGrid(...)\
 HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetGrid5, envGetGrid4)(__VA_ARGS__)
 // envGetCoarseGrid(type, blockSize[, Ls]) -> env().getCoarseGrid<...>(blockSize[, Ls])
 #define envGetCoarseGrid4(latticeType, blockSize)\
 env().template getCoarseGrid<typename latticeType::vector_type>(blockSize)
 #define envGetCoarseGrid5(latticeType, blockSize, Ls)\
 env().template getCoarseGrid<typename latticeType::vector_type>(blockSize, Ls)
 #define envGetCoarseGrid(...)\
 HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envGetCoarseGrid5, envGetCoarseGrid4)(__VA_ARGS__)
 // envGetRbGrid(type[, Ls]) -> env().getRbGrid<...>([Ls])
 #define envGetRbGrid4(latticeType)\
 env().template getRbGrid<typename latticeType::vector_type>()
 #define envGetRbGrid5(latticeType, Ls)\
 env().template getRbGrid<typename latticeType::vector_type>(Ls)
 #define envGetRbGrid(...)\
 HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetRbGrid5, envGetRbGrid4)(__VA_ARGS__)
 // Object access. envGet and envGetDerived dereference the pointer returned
 // by the environment, so they expand to an lvalue expression.
 #define envGet(type, name)\
 *env().template getObject<type>(name)
 #define envGetDerived(base, type, name)\
 *env().template getDerivedObject<base, type>(name)
 // Declares a local reference `var` bound to the object named
 // getName() + "_tmp_" + "var", the naming scheme used by envTmp.
 #define envGetTmp(type, var)\
 type &var = *env().template getObject<type>(getName() + "_tmp_" + #var)
 // Expands to env().isObjectOfType<type>(name).
 #define envHasType(type, name)\
 env().template isObjectOfType<type>(name)
 // Object creation with Environment::Storage::object storage; the variadic
 // arguments are forwarded to createObject / createDerivedObject after Ls.
 #define envCreate(type, name, Ls, ...)\
 env().template createObject<type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
 #define envCreateDerived(base, type, name, Ls, ...)\
 env().template createDerivedObject<base, type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
 // Lattice shortcuts: envCreateLat(type, name) passes Ls = 1 and the grid
 // from envGetGrid(type); envCreateLat(type, name, Ls) passes Ls and the grid
 // from envGetGrid(type, Ls).
 #define envCreateLat4(type, name)\
 envCreate(type, name, 1, envGetGrid(type))
 #define envCreateLat5(type, name, Ls)\
 envCreate(type, name, Ls, envGetGrid(type, Ls))
 #define envCreateLat(...)\
 HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
 // Same as envCreate / envCreateLat but with Environment::Storage::cache
 // storage.
 #define envCache(type, name, Ls, ...)\
 env().template createObject<type>(name, Environment::Storage::cache, Ls, __VA_ARGS__)
 #define envCacheLat4(type, name)\
 envCache(type, name, 1, envGetGrid(type))
 #define envCacheLat5(type, name, Ls)\
 envCache(type, name, Ls, envGetGrid(type, Ls))
 #define envCacheLat(...)\
 HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
 // Module-local temporaries: the object is created with
 // Environment::Storage::temporary storage under the name
 // getName() + "_tmp_" + name, which is the name envGetTmp looks up.
 #define envTmp(type, name, Ls, ...)\
 env().template createObject<type>(getName() + "_tmp_" + name,         \
                                   Environment::Storage::temporary, Ls, __VA_ARGS__)
 // Lattice shortcuts, same Ls/grid conventions as envCreateLat.
 #define envTmpLat4(type, name)\
 envTmp(type, name, 1, envGetGrid(type))
 #define envTmpLat5(type, name, Ls)\
 envTmp(type, name, Ls, envGetGrid(type, Ls))
 #define envTmpLat(...)\
 HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
 // Write `result` under the tag `name` to the result file derived from ioStem
 // and the current trajectory. Nothing happens unless env().getGrid()->IsBoss()
 // is true and ioStem is not empty. The inner braces scope the writer so that
 // it is destroyed right after the write.
 // NOTE(review): the expansion is a bare `if` statement and ioStem is pasted
 // unparenthesised, so a use directly followed by `else`, or an ioStem that
 // is a compound expression, may not parse as intended.
 #define saveResult(ioStem, name, result)\
 if (env().getGrid()->IsBoss() and !ioStem.empty())\
 {\
    makeFileDir(ioStem, env().getGrid());\
    {\
        ResultWriter _writer(RESULT_FILE_NAME(ioStem, vm().getTrajectory()));\
        write(_writer, name, result);\
    }\
 }
 /******************************************************************************
 *                            Module class                                    *
 ******************************************************************************/
 // base class
 // Abstract interface shared by all modules. A module has a name, lists the
 // names of the objects it takes as input/reference/output, can read and
 // write its parameters as XML, and is run through operator(), which calls
 // setup() then execute().
 class ModuleBase: public TimerArray
 {
 public:
    // construction/destruction
    ModuleBase(const std::string name);
    virtual ~ModuleBase(void) = default;
    // module instance name
    std::string getName(void) const;
    // name under which the module type is registered in the factory
    virtual std::string getRegisteredName(void);
    // object names the module depends on and produces; a module references
    // nothing unless it overrides getReference()
    virtual std::vector<std::string> getInput(void) = 0;
    virtual std::vector<std::string> getReference(void)
    {
        return {};
    }
    virtual std::vector<std::string> getOutput(void) = 0;
    // parameter I/O
    virtual void parseParameters(XmlReader &reader, const std::string name) = 0;
    virtual void saveParameters(XmlWriter &writer, const std::string name) = 0;
    // parameters as a string
    virtual std::string parString(void) const = 0;
    // the two stages run by operator(); setup() does nothing by default
    virtual void setup(void) {}
    virtual void execute(void) = 0;
    // run setup() then execute()
    void operator()(void);
 protected:
    // environment shortcut
    DEFINE_ENV_ALIAS;
    // virtual machine shortcut
    DEFINE_VM_ALIAS;
    // RNGs seeded from the string built by makeSeedString()
    GridParallelRNG &rng4d(void);
    GridSerialRNG &rngSerial(void);
 private:
    std::string makeSeedString(void);
    // seed_ caches the last seed string used by rng4d()/rngSerial()
    std::string                          name_;
    std::string                          currentTimer_;
    std::string                          seed_;
    std::map<std::string, GridStopWatch> timer_;
 };
 // derived class, templating the parameter class
 // Module owning a parameter object of type P; parameter I/O is implemented
 // through read()/write() calls on that object.
 template <typename P>
 class Module: public ModuleBase
 {
 public:
    using Par = P;
    // construction/destruction
    Module(const std::string name);
    virtual ~Module(void) = default;
    // parameter I/O
    virtual void parseParameters(XmlReader &reader, const std::string name);
    virtual void saveParameters(XmlWriter &writer, const std::string name);
    // parameters as a string
    virtual std::string parString(void) const;
    // read/replace the stored parameters
    const P &   par(void) const;
    void        setPar(const P &par);
 private:
    P par_;
 };
 // no parameter type
 // Tag type for modules that take no parameter.
 class NoPar {};
 // Specialisation for parameterless modules: nothing is parsed, an empty
 // "options" element is saved and the parameter string is empty.
 template <>
 class Module<NoPar>: public ModuleBase
 {
 public:
    // construction/destruction
    Module(const std::string name): ModuleBase(name) {}
    virtual ~Module(void) = default;
    // nothing to read
    virtual void parseParameters(XmlReader &reader, const std::string name) {}
    // emit an "options" element with no content
    virtual void saveParameters(XmlWriter &writer, const std::string name)
    {
        push(writer, "options");
        pop(writer);
    }
    // no parameter, hence an empty string
    virtual std::string parString(void) const
    {
        return "";
    }
 };
 /******************************************************************************
 *                           Template implementation                          *
 ******************************************************************************/
 // Forward the module name to ModuleBase; par_ is default-initialised.
 template <typename P>
 Module<P>::Module(const std::string name): ModuleBase(name) {}
 // Fill par_ from the entry called `name` of the XML reader.
 template <typename P>
 void Module<P>::parseParameters(XmlReader &reader, const std::string name)
 {
    auto &target = par_;
    read(reader, name, target);
 }
 // Write par_ to the XML writer as the entry called `name`.
 template <typename P>
 void Module<P>::saveParameters(XmlWriter &writer, const std::string name)
 {
    auto &source = par_;
    write(writer, name, source);
 }
 // Serialise par_ through a scratch XmlWriter, tagged with the name returned
 // by par_.SerialisableClassName(), and return the writer content.
 template <typename P>
 std::string Module<P>::parString(void) const
 {
    XmlWriter xml("", "");
    write(xml, par_.SerialisableClassName(), par_);
    return xml.string();
 }
 // Read-only access to the stored parameters.
 template <typename P>
 const P & Module<P>::par(void) const { return par_; }
 // Replace the stored parameters with a copy of `par`.
 template <typename P>
 void Module<P>::setPar(const P &par) { par_ = par; }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Module_hpp_
--- a/Hadrons/ModuleFactory.hpp
+++ b/Hadrons/ModuleFactory.hpp
@@ -1,48 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/ModuleFactory.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_ModuleFactory_hpp_
 #define Hadrons_ModuleFactory_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Factory.hpp>
 #include <Hadrons/Module.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                            ModuleFactory                                   *
 ******************************************************************************/
 // Factory specialised for ModuleBase objects. The class body is produced
 // entirely by SINGLETON_DEFCTOR (defined elsewhere); presumably it supplies
 // the getInstance() accessor used by MODULE_REGISTER - confirm.
 class ModuleFactory: public Factory<ModuleBase>
 {
    SINGLETON_DEFCTOR(ModuleFactory)
 };
 END_HADRONS_NAMESPACE
 #endif // Hadrons_ModuleFactory_hpp_
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,87 +0,0 @@
 #include <Hadrons/Modules/MAction/DWF.hpp>
 #include <Hadrons/Modules/MAction/MobiusDWF.hpp>
 #include <Hadrons/Modules/MAction/ScaledDWF.hpp>
 #include <Hadrons/Modules/MAction/WilsonClover.hpp>
 #include <Hadrons/Modules/MAction/Wilson.hpp>
 #include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 #include <Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp>
 #include <Hadrons/Modules/MContraction/A2ALoop.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
 #include <Hadrons/Modules/MContraction/Baryon.hpp>
 #include <Hadrons/Modules/MContraction/DiscLoop.hpp>
 #include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
 #include <Hadrons/Modules/MContraction/Meson.hpp>
 #include <Hadrons/Modules/MContraction/SigmaToNucleonEye.hpp>
 #include <Hadrons/Modules/MContraction/SigmaToNucleonNonEye.hpp>
 #include <Hadrons/Modules/MContraction/WeakEye3pt.hpp>
 #include <Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp>
 #include <Hadrons/Modules/MContraction/WeakNonEye3pt.hpp>
 #include <Hadrons/Modules/MDistil/Distil.hpp>
 #include <Hadrons/Modules/MDistil/DistilPar.hpp>
 #include <Hadrons/Modules/MDistil/DistilVectors.hpp>
 #include <Hadrons/Modules/MDistil/LapEvec.hpp>
 #include <Hadrons/Modules/MDistil/Noises.hpp>
 #include <Hadrons/Modules/MDistil/PerambFromSolve.hpp>
 #include <Hadrons/Modules/MDistil/Perambulator.hpp>
 #include <Hadrons/Modules/MFermion/EMLepton.hpp>
 #include <Hadrons/Modules/MFermion/FreeProp.hpp>
 #include <Hadrons/Modules/MFermion/GaugeProp.hpp>
 #include <Hadrons/Modules/MGauge/Electrify.hpp>
 #include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
 #include <Hadrons/Modules/MGauge/GaugeFix.hpp>
 #include <Hadrons/Modules/MGauge/Random.hpp>
 #include <Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
 #include <Hadrons/Modules/MGauge/UnitEm.hpp>
 #include <Hadrons/Modules/MGauge/Unit.hpp>
 #include <Hadrons/Modules/MIO/LoadA2AMatrixDiskVector.hpp>
 #include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
 #include <Hadrons/Modules/MIO/LoadBinary.hpp>
 #include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
 #include <Hadrons/Modules/MIO/LoadDistilNoise.hpp>
 #include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadNersc.hpp>
 #include <Hadrons/Modules/MIO/LoadPerambulator.hpp>
 #include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MNoise/SparseSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MNPR/Amputate.hpp>
 #include <Hadrons/Modules/MNPR/Bilinear.hpp>
 #include <Hadrons/Modules/MNPR/FourQuark.hpp>
 #include <Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 #include <Hadrons/Modules/MScalarSUN/Div.hpp>
 #include <Hadrons/Modules/MScalarSUN/EMT.hpp>
 #include <Hadrons/Modules/MScalarSUN/Grad.hpp>
 #include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
 #include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
 #include <Hadrons/Modules/MScalarSUN/Utils.hpp>
 #include <Hadrons/Modules/MSink/Point.hpp>
 #include <Hadrons/Modules/MSink/Smear.hpp>
 #include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
 #include <Hadrons/Modules/MSolver/Guesser.hpp>
 #include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
 #include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
 #include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Hadrons/Modules/MSource/Convolution.hpp>
 #include <Hadrons/Modules/MSource/Gauss.hpp>
 #include <Hadrons/Modules/MSource/JacobiSmear.hpp>
 #include <Hadrons/Modules/MSource/Momentum.hpp>
 #include <Hadrons/Modules/MSource/MomentumPhase.hpp>
 #include <Hadrons/Modules/MSource/Point.hpp>
 #include <Hadrons/Modules/MSource/SeqAslash.hpp>
 #include <Hadrons/Modules/MSource/SeqConserved.hpp>
 #include <Hadrons/Modules/MSource/SeqGamma.hpp>
 #include <Hadrons/Modules/MSource/Wall.hpp>
 #include <Hadrons/Modules/MSource/Z2.hpp>
 #include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
 #include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
--- a/Hadrons/Modules/MAction/DWF.cc
+++ b/Hadrons/Modules/MAction/DWF.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/DWF.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/DWF.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiations matching the `extern template` declarations
 // emitted by MODULE_REGISTER_TMP in DWF.hpp. The FIMPLF variant is only
 // built when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 template class Grid::Hadrons::MAction::TDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/DWF.hpp
+++ b/Hadrons/Modules/MAction/DWF.hpp
@@ -1,155 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/DWF.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_DWF_hpp_
 #define Hadrons_MAction_DWF_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                     Domain wall quark action                               *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Parameters of the DWF module, as used by TDWF::setup():
 //  - gauge   : name of the gauge field object fetched from the environment
 //  - Ls      : passed to the 5D grid getters and to the operator creation
 //  - mass, M5: forwarded to the DomainWallFermion constructor
 //  - boundary: string converted with strToVec<Complex> into the boundary
 //              phases; when empty the ImplParams value is left untouched
 //  - twist   : string converted with strToVec<Real> into the twists; when
 //              empty the ImplParams value is left untouched
 // setup() rejects boundary/twist vectors whose size differs from
 // env().getNd().
 class DWFPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(DWFPar,
                                    std::string, gauge,
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
                                    std::string , boundary,
                                    std::string , twist);
 };
 // Module creating a DomainWallFermion<FImpl> operator from a DWFPar
 // parameter set; all the work is done in setup().
 template <typename FImpl>
 class TDWF: public Module<DWFPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    // construction/destruction
    TDWF(const std::string name);
    virtual ~TDWF(void) {}
    // names of the objects read and produced by the module
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // operator creation
    virtual void setup(void);
    // empty for this module
    virtual void execute(void);
 };
 // Register TDWF<FIMPL> under the name "MAction::DWF"; TDWF<FIMPLF> is
 // registered as "MAction::DWFF" only when GRID_DEFAULT_PRECISION_DOUBLE is
 // defined.
 MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                        DWF template implementation                         *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forward the module name to the Module<DWFPar> base.
 template <typename FImpl>
 TDWF<FImpl>::TDWF(const std::string name): Module<DWFPar>(name) {}
 // dependencies/products ///////////////////////////////////////////////////////
 // The only input is the gauge field named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TDWF<FImpl>::getInput(void)
 {
    return std::vector<std::string>{par().gauge};
 }
 // The only output is an object carrying the module name.
 template <typename FImpl>
 std::vector<std::string> TDWF<FImpl>::getOutput(void)
 {
    return std::vector<std::string>{getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Create, under the module name, a DomainWallFermion<FImpl> object built
 // from the gauge field, the 4D/5D full and red-black grids and the action
 // parameters. Boundary phases and twists given as non-empty strings replace
 // the ImplParams values; both vectors must have env().getNd() entries or a
 // Size error is raised.
 template <typename FImpl>
 void TDWF<FImpl>::setup(void)
 {
    const auto &p = par();
    LOG(Message) << "Setting up domain wall fermion matrix with m= "
                 << p.mass << ", M5= " << p.M5 << " and Ls= "
                 << p.Ls << " using gauge field '" << p.gauge << "'"
                 << std::endl;
    // gauge field and grids handed to the operator constructor
    auto &Umu      = envGet(GaugeField, p.gauge);
    auto &grid4d   = *envGetGrid(FermionField);
    auto &rbGrid4d = *envGetRbGrid(FermionField);
    auto &grid5d   = *envGetGrid(FermionField, p.Ls);
    auto &rbGrid5d = *envGetRbGrid(FermionField, p.Ls);
    // implementation parameters, overridden only by non-empty strings
    typename DomainWallFermion<FImpl>::ImplParams implPar;
    if (!p.boundary.empty())
    {
        implPar.boundary_phases = strToVec<Complex>(p.boundary);
    }
    if (!p.twist.empty())
    {
        implPar.twist_n_2pi_L = strToVec<Real>(p.twist);
    }
    LOG(Message) << "Fermion boundary conditions: " << implPar.boundary_phases
                 << std::endl;
    LOG(Message) << "Twists: " << implPar.twist_n_2pi_L
                 << std::endl;
    // one boundary phase and one twist per dimension
    if (implPar.boundary_phases.size() != env().getNd())
    {
        HADRONS_ERROR(Size, "Wrong number of boundary phase");
    }
    if (implPar.twist_n_2pi_L.size() != env().getNd())
    {
        HADRONS_ERROR(Size, "Wrong number of twist");
    }
    envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), p.Ls, Umu,
                     grid5d, rbGrid5d, grid4d, rbGrid4d, p.mass, p.M5,
                     implPar);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Nothing to execute: the operator is created in setup().
 template <typename FImpl>
 void TDWF<FImpl>::execute(void) {}
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_DWF_hpp_
--- a/Hadrons/Modules/MAction/MobiusDWF.cc
+++ b/Hadrons/Modules/MAction/MobiusDWF.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/MobiusDWF.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/MobiusDWF.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiations matching the `extern template` declarations
 // emitted by MODULE_REGISTER_TMP in MobiusDWF.hpp. The FIMPLF variant is
 // only built when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/MobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/MobiusDWF.hpp
@@ -1,156 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/MobiusDWF.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_MobiusDWF_hpp_
 #define Hadrons_MAction_MobiusDWF_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                      Mobius domain-wall fermion action                     *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Parameters of the MobiusDWF module, as used by TMobiusDWF::setup():
 //  - gauge         : name of the gauge field object fetched from the
 //                    environment
 //  - Ls            : passed to the 5D grid getters and to the operator
 //                    creation
 //  - mass, M5, b, c: forwarded to the MobiusFermion constructor
 //  - boundary      : string converted with strToVec<Complex> into the
 //                    boundary phases; when empty the ImplParams value is
 //                    left untouched
 //  - twist         : string converted with strToVec<Real> into the twists;
 //                    when empty the ImplParams value is left untouched
 // setup() rejects boundary/twist vectors whose size differs from
 // env().getNd().
 class MobiusDWFPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(MobiusDWFPar,
                                    std::string , gauge,
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
                                    double      , b,
                                    double      , c,
                                    std::string , boundary,
                                    std::string , twist);
 };
 // Module creating a MobiusFermion<FImpl> operator from a MobiusDWFPar
 // parameter set; all the work is done in setup().
 template <typename FImpl>
 class TMobiusDWF: public Module<MobiusDWFPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    // construction/destruction
    TMobiusDWF(const std::string name);
    virtual ~TMobiusDWF(void) {}
    // names of the objects read and produced by the module
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // operator creation
    virtual void setup(void);
    // empty for this module
    virtual void execute(void);
 };
 // Register TMobiusDWF<FIMPL> under the name "MAction::MobiusDWF";
 // TMobiusDWF<FIMPLF> is registered as "MAction::MobiusDWFF" only when
 // GRID_DEFAULT_PRECISION_DOUBLE is defined.
 MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                      TMobiusDWF implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forward the module name to the Module<MobiusDWFPar> base.
 template <typename FImpl>
 TMobiusDWF<FImpl>::TMobiusDWF(const std::string name): Module<MobiusDWFPar>(name) {}
 // dependencies/products ///////////////////////////////////////////////////////
 // The only input is the gauge field named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TMobiusDWF<FImpl>::getInput(void)
 {
    return std::vector<std::string>{par().gauge};
 }
 // The only output is an object carrying the module name.
 template <typename FImpl>
 std::vector<std::string> TMobiusDWF<FImpl>::getOutput(void)
 {
    return std::vector<std::string>{getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Create, under the module name, a MobiusFermion<FImpl> object built from
 // the gauge field, the 4D/5D full and red-black grids and the action
 // parameters. Boundary phases and twists given as non-empty strings replace
 // the ImplParams values; both vectors must have env().getNd() entries or a
 // Size error is raised.
 template <typename FImpl>
 void TMobiusDWF<FImpl>::setup(void)
 {
    const auto &p = par();
    LOG(Message) << "Setting up Mobius domain wall fermion matrix with m= "
                 << p.mass << ", M5= " << p.M5 << ", Ls= " << p.Ls
                 << ", b= " << p.b << ", c= " << p.c
                 << " using gauge field '" << p.gauge << "'"
                 << std::endl;
    // gauge field and grids handed to the operator constructor
    auto &Umu      = envGet(GaugeField, p.gauge);
    auto &grid4d   = *envGetGrid(FermionField);
    auto &rbGrid4d = *envGetRbGrid(FermionField);
    auto &grid5d   = *envGetGrid(FermionField, p.Ls);
    auto &rbGrid5d = *envGetRbGrid(FermionField, p.Ls);
    // implementation parameters, overridden only by non-empty strings
    typename MobiusFermion<FImpl>::ImplParams implPar;
    if (!p.boundary.empty())
    {
        implPar.boundary_phases = strToVec<Complex>(p.boundary);
    }
    if (!p.twist.empty())
    {
        implPar.twist_n_2pi_L = strToVec<Real>(p.twist);
    }
    LOG(Message) << "Fermion boundary conditions: " << implPar.boundary_phases
                 << std::endl;
    LOG(Message) << "Twists: " << implPar.twist_n_2pi_L
                 << std::endl;
    // one boundary phase and one twist per dimension
    if (implPar.boundary_phases.size() != env().getNd())
    {
        HADRONS_ERROR(Size, "Wrong number of boundary phase");
    }
    if (implPar.twist_n_2pi_L.size() != env().getNd())
    {
        HADRONS_ERROR(Size, "Wrong number of twist");
    }
    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), p.Ls, Umu,
                     grid5d, rbGrid5d, grid4d, rbGrid4d, p.mass, p.M5,
                     p.b, p.c, implPar);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Nothing to execute: the operator is created in setup().
 template <typename FImpl>
 void TMobiusDWF<FImpl>::execute(void) {}
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_MobiusDWF_hpp_
--- a/Hadrons/Modules/MAction/ScaledDWF.cc
+++ b/Hadrons/Modules/MAction/ScaledDWF.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/ScaledDWF.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/ScaledDWF.hpp>
 // Namespaces opened for this translation unit only.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of the module template for the FIMPL implementation.
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
 // The FIMPLF instantiation is only compiled when GRID_DEFAULT_PRECISION_DOUBLE
 // is defined (presumably FIMPLF is the single-precision variant - confirm).
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/ScaledDWF.hpp
+++ b/Hadrons/Modules/MAction/ScaledDWF.hpp
@@ -1,155 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/ScaledDWF.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_ScaledDWF_hpp_
 #define Hadrons_MAction_ScaledDWF_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                      Scaled domain wall fermion                            *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Serialisable parameter set of the ScaledDWF module.
 //   gauge    : name of the gauge field object fetched in TScaledDWF::setup
 //   Ls       : passed to envGetGrid/envGetRbGrid and to the matrix creation
 //   mass     : forwarded to the ScaledShamirFermion object created in setup
 //   M5       : forwarded to the ScaledShamirFermion object created in setup
 //   scale    : forwarded to the ScaledShamirFermion object created in setup
 //   boundary : parsed with strToVec<Complex> into the boundary phases;
 //              when empty the ImplParams default is kept
 //   twist    : parsed with strToVec<Real> into the twists;
 //              when empty the ImplParams default is kept
 class ScaledDWFPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ScaledDWFPar,
                                    std::string , gauge,
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
                                    double      , scale,
                                    std::string , boundary,
                                    std::string , twist);
 };
 // Module creating a scaled domain wall fermion matrix
 // (ScaledShamirFermion<FImpl>) from the parameters in ScaledDWFPar.
 // NOTE(review): setup() and execute() are public here, whereas TWilson in
 // Wilson.hpp declares them protected - confirm whether this is intentional.
 template <typename FImpl>
 class TScaledDWF: public Module<ScaledDWFPar>
 {
 public:
    // type aliases for the fermion implementation (presumably provides the
    // FermionField, GaugeField and FMat names used in setup - confirm)
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor: 'name' is handed to the Module base class
    TScaledDWF(const std::string name);
    // destructor
    virtual ~TScaledDWF(void) {};
    // dependencies/products: input is the gauge field name, output is the
    // module name
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup: creates the fermion matrix object
    virtual void setup(void);
    // execution: empty for this module
    virtual void execute(void);
 };
 // Module registration under the names ScaledDWF (FIMPL) and, only when
 // GRID_DEFAULT_PRECISION_DOUBLE is defined, ScaledDWFF (FIMPLF).
 // NOTE(review): the macro is defined outside this file (presumably in
 // ModuleFactory.hpp) - confirm its exact effect there.
 MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                      TScaledDWF implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<ScaledDWFPar> base; no other state
 // is initialised here.
 template <typename FImpl>
 TScaledDWF<FImpl>::TScaledDWF(const std::string name) : Module<ScaledDWFPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Returns the single input dependency: the gauge field named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TScaledDWF<FImpl>::getInput(void)
 {
     return std::vector<std::string>{par().gauge};
 }
 // Returns the single product of the module: an object named after the module.
 template <typename FImpl>
 std::vector<std::string> TScaledDWF<FImpl>::getOutput(void)
 {
     return std::vector<std::string>{getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Creates the scaled domain wall fermion matrix for this module.
 // Reads the gauge field named by par().gauge, fetches the grids (default and
 // Ls variants), fills the implementation parameters from the optional
 // 'boundary' and 'twist' strings, and creates a ScaledShamirFermion<FImpl>
 // object named getName(). A Size error is raised through HADRONS_ERROR when
 // the number of boundary phases or twists differs from env().getNd().
 template <typename FImpl>
 void TScaledDWF<FImpl>::setup(void)
 {
     const auto &p = par();

     LOG(Message) << "Setting up scaled domain wall fermion matrix with m= "
                  << p.mass << ", M5= " << p.M5 << ", Ls= " << p.Ls
                  << ", scale= " << p.scale
                  << " using gauge field '" << p.gauge << "'"
                  << std::endl;

     // gauge field and grids (default ones, then the ones built with Ls)
     auto &gaugeField = envGet(GaugeField, p.gauge);
     auto &grid4d     = *envGetGrid(FermionField);
     auto &rbGrid4d   = *envGetRbGrid(FermionField);
     auto &grid5d     = *envGetGrid(FermionField, p.Ls);
     auto &rbGrid5d   = *envGetRbGrid(FermionField, p.Ls);

     // an empty parameter string leaves the corresponding ImplParams default
     typename ScaledShamirFermion<FImpl>::ImplParams implParams;
     const bool hasBoundary = !p.boundary.empty();
     if (hasBoundary)
     {
         implParams.boundary_phases = strToVec<Complex>(p.boundary);
     }
     const bool hasTwist = !p.twist.empty();
     if (hasTwist)
     {
         implParams.twist_n_2pi_L = strToVec<Real>(p.twist);
     }

     LOG(Message) << "Fermion boundary conditions: "
                  << implParams.boundary_phases << std::endl;
     LOG(Message) << "Twists: "
                  << implParams.twist_n_2pi_L << std::endl;

     // both vectors must have exactly env().getNd() entries
     const auto nBoundary = implParams.boundary_phases.size();
     if (nBoundary != env().getNd())
     {
         HADRONS_ERROR(Size, "Wrong number of boundary phase");
     }
     const auto nTwist = implParams.twist_n_2pi_L.size();
     if (nTwist != env().getNd())
     {
         HADRONS_ERROR(Size, "Wrong number of twist");
     }

     envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), p.Ls,
                      gaugeField, grid5d, rbGrid5d, grid4d, rbGrid4d,
                      p.mass, p.M5, p.scale, implParams);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // No work is done at execution time; the fermion matrix is created in setup().
 template <typename FImpl>
 void TScaledDWF<FImpl>::execute(void)
 {
     // intentionally empty
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_ScaledDWF_hpp_
--- a/Hadrons/Modules/MAction/Wilson.cc
+++ b/Hadrons/Modules/MAction/Wilson.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/Wilson.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/Wilson.hpp>
 // Namespaces opened for this translation unit only.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of the module template for the FIMPL implementation.
 template class Grid::Hadrons::MAction::TWilson<FIMPL>;
 // The FIMPLF instantiation is only compiled when GRID_DEFAULT_PRECISION_DOUBLE
 // is defined (presumably FIMPLF is the single-precision variant - confirm).
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/Wilson.hpp
+++ b/Hadrons/Modules/MAction/Wilson.hpp
@@ -1,148 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/Wilson.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_Wilson_hpp_
 #define Hadrons_MAction_Wilson_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                            TWilson quark action                            *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Serialisable parameter set of the Wilson module.
 //   gauge    : name of the gauge field object fetched in TWilson::setup
 //   mass     : forwarded to the WilsonFermion object created in setup
 //   boundary : parsed with strToVec<Complex> into the boundary phases;
 //              when empty the ImplParams default is kept
 //   string   : not read by TWilson::setup in this file
 //              NOTE(review): confirm whether this member is still needed;
 //              removing it would change the serialised parameter set
 //   twist    : parsed with strToVec<Real> into the twists;
 //              when empty the ImplParams default is kept
 class WilsonPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
                                    std::string, gauge,
                                    double     , mass,
                                    std::string, boundary,
                                    std::string, string,
                                    std::string, twist);
 };
 // Module creating a Wilson fermion matrix (WilsonFermion<FImpl>) from the
 // parameters in WilsonPar.
 template <typename FImpl>
 class TWilson: public Module<WilsonPar>
 {
 public:
    // type aliases for the fermion implementation (presumably provides the
    // FermionField, GaugeField and FMat names used in setup - confirm)
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor: 'name' is handed to the Module base class
    TWilson(const std::string name);
    // destructor
    virtual ~TWilson(void) {};
    // dependencies/products: input is the gauge field name, output is the
    // module name
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup: creates the fermion matrix object
    virtual void setup(void);
    // execution: empty for this module
    virtual void execute(void);
 };
 // Module registration under the names Wilson (FIMPL) and, only when
 // GRID_DEFAULT_PRECISION_DOUBLE is defined, WilsonF (FIMPLF).
 // NOTE(review): the macro is defined outside this file (presumably in
 // ModuleFactory.hpp) - confirm its exact effect there.
 MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                     TWilson template implementation                        *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<WilsonPar> base; no other state is
 // initialised here.
 template <typename FImpl>
 TWilson<FImpl>::TWilson(const std::string name) : Module<WilsonPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Returns the single input dependency: the gauge field named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TWilson<FImpl>::getInput(void)
 {
     return std::vector<std::string>{par().gauge};
 }
 // Returns the single product of the module: an object named after the module.
 template <typename FImpl>
 std::vector<std::string> TWilson<FImpl>::getOutput(void)
 {
     return std::vector<std::string>{getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Creates the Wilson fermion matrix for this module.
 // Reads the gauge field named by par().gauge, fetches the default grids,
 // fills the implementation parameters from the optional 'boundary' and
 // 'twist' strings, and creates a WilsonFermion<FImpl> object named getName().
 // A Size error is raised through HADRONS_ERROR when the number of boundary
 // phases or twists differs from env().getNd().
 template <typename FImpl>
 void TWilson<FImpl>::setup(void)
 {
     const auto &p = par();

     LOG(Message) << "Setting up Wilson fermion matrix with m= " << p.mass
                  << " using gauge field '" << p.gauge << "'" << std::endl;

     // gauge field and the two grids handed to the matrix
     auto &gaugeField = envGet(GaugeField, p.gauge);
     auto &fullGrid   = *envGetGrid(FermionField);
     auto &rbGrid     = *envGetRbGrid(FermionField);

     // an empty parameter string leaves the corresponding ImplParams default
     typename WilsonFermion<FImpl>::ImplParams implParams;
     const bool hasBoundary = !p.boundary.empty();
     if (hasBoundary)
     {
         implParams.boundary_phases = strToVec<Complex>(p.boundary);
     }
     const bool hasTwist = !p.twist.empty();
     if (hasTwist)
     {
         implParams.twist_n_2pi_L = strToVec<Real>(p.twist);
     }

     LOG(Message) << "Fermion boundary conditions: "
                  << implParams.boundary_phases << std::endl;
     LOG(Message) << "Twists: "
                  << implParams.twist_n_2pi_L << std::endl;

     // both vectors must have exactly env().getNd() entries
     const auto nBoundary = implParams.boundary_phases.size();
     if (nBoundary != env().getNd())
     {
         HADRONS_ERROR(Size, "Wrong number of boundary phase");
     }
     const auto nTwist = implParams.twist_n_2pi_L.size();
     if (nTwist != env().getNd())
     {
         HADRONS_ERROR(Size, "Wrong number of twist");
     }

     envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1,
                      gaugeField, fullGrid, rbGrid, p.mass, implParams);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // No work is done at execution time; the fermion matrix is created in setup().
 template <typename FImpl>
 void TWilson<FImpl>::execute(void)
 {
     // intentionally empty
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_Wilson_hpp_
--- a/Hadrons/Modules/MAction/WilsonClover.cc
+++ b/Hadrons/Modules/MAction/WilsonClover.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/WilsonClover.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/WilsonClover.hpp>
 // Namespaces opened for this translation unit only.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of the module template for the FIMPL implementation.
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
 // The FIMPLF instantiation is only compiled when GRID_DEFAULT_PRECISION_DOUBLE
 // is defined (presumably FIMPLF is the single-precision variant - confirm).
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
 #endif
--- a/Show More
+++ b/Show More