
Merge branch 'develop' of https://github.com/paboyle/Grid into develop

Commit c2c3cad20d by Peter Boyle, 2020-04-23 04:35:42 -04:00
307 changed files with 4394 additions and 31968 deletions

View File

@ -35,17 +35,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/JacobiPolynomial.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/MinimalResidual.h>

View File

@ -1,3 +1,14 @@
// blockZaxpy in blockPromote - 3s, 5%
// noncoalesced linalg in Preconditioner ~ 3s 5%
// Lanczos tuning or replace 10-20s ~ 25%, open ended
// setup tuning 5s ~ 8%
// -- e.g. ordermin, orderstep tunables.
// MdagM path without norm in LinOp code. few seconds
// Mdir calc blocking kernels
// Fuse kernels in blockMaskedInnerProduct
// preallocate Vectors in Cayley 5D ~ few percent few seconds
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -34,8 +45,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class vobj,class CComplex>
inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask,
const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY)
{
typedef decltype(innerProduct(vobj(),vobj())) dotp;
GridBase *coarse(CoarseInner.Grid());
GridBase *fine (fineX.Grid());
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> fine_inner_msk(fine);
// Multiply could be fused with innerProduct
// Single block sum kernel could do both masks.
fine_inner = localInnerProduct(fineX,fineY);
mult(fine_inner_msk, fine_inner,FineMask);
blockSum(CoarseInner,fine_inner_msk);
}
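// Illustrative standalone sketch (plain C++ scalars, one-dimensional blocking; not Grid code)
// of what blockMaskedInnerProduct computes: per coarse block, the sum over fine sites of
// mask(x) * conj(X(x)) * Y(x).  Here the local inner product, the mask multiply and the block
// sum are fused into a single pass, as suggested by the comments above.
#include <complex>
#include <vector>

std::vector<std::complex<double>>
maskedBlockInnerProduct(const std::vector<double>               &mask,   // 0/1 mask per fine site
                        const std::vector<std::complex<double>> &fineX,
                        const std::vector<std::complex<double>> &fineY,
                        int block)                                       // fine sites per coarse block
{
  int ncoarse = fineX.size()/block;
  std::vector<std::complex<double>> coarse(ncoarse);                     // zero-initialised
  for(int s=0; s<(int)fineX.size(); s++){
    // localInnerProduct, mask multiply and blockSum in one loop
    coarse[s/block] += mask[s] * std::conj(fineX[s]) * fineY[s];
  }
  return coarse;
}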
class Geometry {
// int dimension;
public:
int npoint;
std::vector<int> directions ;
@ -52,10 +84,10 @@ public:
directions.resize(npoint);
displacements.resize(npoint);
for(int d=0;d<_d;d++){
directions[2*d ] = d+base;
directions[2*d+1] = d+base;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
}
directions [2*_d]=0;
displacements[2*_d]=0;
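// For example, with _d=4 and base=0 the ordering built above is
//   point         : 0  1  2  3  4  5  6  7  8
//   directions    : 0  1  2  3  0  1  2  3  0
//   displacements :+1 +1 +1 +1 -1 -1 -1 -1  0
// i.e. all forward hops first, then all backward hops, with the self-coupling point last.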
@ -63,7 +95,7 @@ public:
//// report back
std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout <<std::endl;
std::cout<<std::endl;
std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl;
@ -115,10 +147,10 @@ public:
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
// blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
}
@ -128,7 +160,7 @@ public:
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=Zero();
thread_for(ss, CoarseGrid->oSites(),{
accelerator_for(ss, CoarseGrid->oSites(),1,{
eProj[ss](i)=CComplex(1.0);
});
eProj=eProj - iProj;
@ -146,61 +178,9 @@ public:
void CreateSubspaceRandom(GridParallelRNG &RNG){
for(int i=0;i<nbasis;i++){
random(RNG,subspace[i]);
std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
}
Orthogonalise();
}
/*
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
// Run a Lanczos with sloppy convergence
const int Nstop = nn;
const int Nk = nn+20;
const int Np = nn+20;
const int Nm = Nk+Np;
const int MaxIt= 10000;
RealD resid = 1.0e-3;
Chebyshev<FineField> Cheb(0.5,64.0,21);
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
// IRL.lock = 1;
FineField noise(FineGrid); gaussian(RNG,noise);
FineField tmp(FineGrid);
std::vector<RealD> eval(Nm);
std::vector<FineField> evec(Nm,FineGrid);
int Nconv;
IRL.calc(eval,evec,
noise,
Nconv);
// pull back nn vectors
for(int b=0;b<nn;b++){
subspace[b] = evec[b];
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
noise = tmp - sqrt(eval[b])*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
noise = tmp + eval[b]*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
}
Orthogonalise();
for(int b=0;b<nn;b++){
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
}
*/
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
@ -232,54 +212,316 @@ public:
subspace[b] = noise;
}
Orthogonalise();
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
////////////////////////////////////////////////////////////////////////////////////////////////
// There is a world of possibilities here, but after quite a lot of experiments (250+ jobs run on Summit)
// this is the best approach found.
////////////////////////////////////////////////////////////////////////////////////////////////
#if 1
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
Chebyshev<FineField> Cheb(0.1,64.0,900);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
auto y_v = y.View();
auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possibly more fine-grained control is needed than a linear sweep,
// but there is a huge productivity gain if this remains a simple algorithm and not a tunable.
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
#endif
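// Standalone scalar sketch (plain doubles, not Grid fields) of the shifted Chebyshev recurrence
// used above: the operator is mapped to y = (2x-(hi+lo))/(hi-lo) in [-1,1] and T_{n+1} = 2 y T_n - T_{n-1};
// in the code, every orderstep-th term beyond ordermin is normalised and kept as a subspace vector.
#include <vector>

std::vector<double> shiftedChebyshev(double x, double lo, double hi, int N)
{
  double xscale = 2.0/(hi-lo);
  double mscale = -(hi+lo)/(hi-lo);
  double y = xscale*x + mscale;          // y is in [-1,1] when x is in [lo,hi]
  std::vector<double> T(N+1);
  T[0] = 1.0;                            // T0 = input
  if (N >= 1) T[1] = y;                  // T1 = (xscale*M + mscale) input
  for(int n=2;n<=N;n++) T[n] = 2.0*y*T[n-1] - T[n-2];
  return T;
}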
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
for(int b=0;b<nn;b++){
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
Cheb(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,noise); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(noise)<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
#define FILTERb(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
Orthogonalise();
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
RealD alpha=-0.8;
RealD beta =-0.8;
#define FILTER(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
#define FILTERc(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,combined); \
}
double node = 0.000;
FILTERb(lo,hi,orderfilter);// 0
// FILTERc(node,hi,51);// 0
noise = Mn;
int base = 0;
int mult = 100;
FILTER(node,hi,base+1*mult);
FILTER(node,hi,base+2*mult);
FILTER(node,hi,base+3*mult);
FILTER(node,hi,base+4*mult);
FILTER(node,hi,base+5*mult);
FILTER(node,hi,base+6*mult);
FILTER(node,hi,base+7*mult);
FILTER(node,hi,base+8*mult);
FILTER(node,hi,base+9*mult);
FILTER(node,hi,base+10*mult);
FILTER(node,hi,base+11*mult);
FILTER(node,hi,base+12*mult);
FILTER(node,hi,base+13*mult);
FILTER(node,hi,base+14*mult);
FILTER(node,hi,base+15*mult);
assert(b==nn);
}
#endif
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
JacobiPoly(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
}
#define FILTER(lambda) \
{ \
hermop.HermOp(subspace[0],tmp); \
tmp = tmp - lambda *subspace[0]; \
scale = std::pow(norm2(tmp),-0.5); \
tmp=tmp*scale; \
subspace[b] = tmp; \
hermop.Op(subspace[b],tmp); \
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
FILTER(2.0e-5);
FILTER(2.0e-4);
FILTER(4.0e-4);
FILTER(8.0e-4);
FILTER(8.0e-4);
FILTER(2.0e-3);
FILTER(3.0e-3);
FILTER(4.0e-3);
FILTER(5.0e-3);
FILTER(6.0e-3);
FILTER(2.5e-3);
FILTER(3.5e-3);
FILTER(4.5e-3);
FILTER(5.5e-3);
FILTER(6.5e-3);
// FILTER(6.0e-5);//6
// FILTER(7.0e-5);//8
// FILTER(8.0e-5);//9
// FILTER(9.0e-5);//3
/*
// FILTER(1.0e-4);//10
FILTER(2.0e-4);//11
// FILTER(3.0e-4);//12
// FILTER(4.0e-4);//13
FILTER(5.0e-4);//14
FILTER(6.0e-3);//4
FILTER(7.0e-4);//1
FILTER(8.0e-4);//7
FILTER(9.0e-4);//15
FILTER(1.0e-3);//2
FILTER(2.0e-3);//2
FILTER(3.0e-3);//2
FILTER(4.0e-3);//2
FILTER(5.0e-3);//2
FILTER(6.0e-3);//2
FILTER(7.0e-3);//2
FILTER(8.0e-3);//2
FILTER(1.0e-2);//2
*/
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
assert(b==nn);
}
#endif
};
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<CComplex > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
@ -294,7 +536,6 @@ public:
std::vector<CoarseMatrix> A;
///////////////////////
// Interface
///////////////////////
@ -305,33 +546,71 @@ public:
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
RealD Nin = norm2(in);
// RealD Nin = norm2(in);
SimpleCompressor<siteVector> compressor;
double comms_usec = -usecond();
Stencil.HaloExchange(in,compressor);
comms_usec += usecond();
auto in_v = in.View();
auto out_v = out.View();
thread_for(ss,Grid()->oSites(),{
siteVector res = Zero();
siteVector nbr;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
GridStopWatch ArithmeticTimer;
int osites=Grid()->oSites();
// double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
// double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
double usecs =-usecond();
// assert(geom.npoint==9);
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
int lane=SIMTlane(Nsimd);
for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in_v[SE->_offset];
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
} else {
nbr = Stencil.CommBuf()[SE->_offset];
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
}
synchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
auto A_point = A[point].View();
res = res + A_point[ss]*nbr;
}
vstream(out_v[ss],res);
coalescedWrite(out_v[ss](b),res,lane);
});
usecs +=usecond();
double nrm_usec=-usecond();
RealD Nout= norm2(out);
nrm_usec+=usecond();
/*
std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" <<std::endl;
std::cout << GridLogMessage << "\tHalo " << comms_usec << " us" <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << usecs << " us" <<std::endl;
std::cout << GridLogMessage << "\t mflop/s " << flops/usecs<<std::endl;
std::cout << GridLogMessage << "\t MB/s " << bytes/usecs<<std::endl;
*/
return Nout;
};
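// Standalone sketch (plain arrays, a toy 1-D periodic coarse lattice; not Grid code) of the stencil
// contraction performed in M() above: out_b(x) = sum_points sum_bb A_point(x)_{b,bb} * in_bb(x+delta_point).
#include <complex>
#include <vector>
using cplx = std::complex<double>;

// A is indexed [point][site][b*nbasis+bb]; toy stencil: p=0 -> +1 hop, p=1 -> -1 hop, p=2 -> self coupling.
void coarseM(std::vector<std::vector<cplx>>                    &out,
             const std::vector<std::vector<std::vector<cplx>>> &A,
             const std::vector<std::vector<cplx>>              &in,
             int nbasis)
{
  const int L = in.size();
  const int disp[3] = {+1,-1,0};
  for(int x=0;x<L;x++){
    for(int b=0;b<nbasis;b++){
      cplx res = 0.0;
      for(int p=0;p<3;p++){
        int xn = (x + disp[p] + L) % L;   // periodic neighbour, standing in for the halo exchange
        for(int bb=0;bb<nbasis;bb++) res += A[p][x][b*nbasis+bb] * in[xn][bb];
      }
      out[x][b] = res;
    }
  }
}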
@ -349,25 +628,54 @@ public:
return norm2(out);
}
};
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
void MdirComms(const CoarseVector &in)
{
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
}
void MdirCalc(const CoarseVector &in, CoarseVector &out, int point)
{
conformable(_grid,in.Grid());
conformable(_grid,out.Grid());
auto point = [dir, disp](){
if(dir == 0 and disp == 0)
return 8;
else
return (4 * dir + 1 - disp) / 2;
}();
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
Aview *Aview_p = & AcceleratorViewContainer[0];
auto out_v = out.View();
auto in_v = in.View();
thread_for(ss,Grid()->oSites(),{
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
int lane=SIMTlane(Nsimd);
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
} else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
}
synchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res,lane);
});
#if 0
accelerator_for(ss,Grid()->oSites(),1,{
siteVector res = Zero();
siteVector nbr;
int ptype;
@ -382,16 +690,65 @@ public:
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
synchronise();
auto A_point = A[point].View();
res = res + A_point[ss]*nbr;
res = res + Aview_p[point][ss]*nbr;
vstream(out_v[ss],res);
out_v[ss]=res;
});
#endif
}
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{
this->MdirComms(in);
int ndir=geom.npoint-1;
if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
std::cout <<"MdirAll out size "<< out.size()<<std::endl;
std::cout <<"MdirAll ndir "<< ndir<<std::endl;
assert(0);
}
for(int p=0;p<ndir;p++){
MdirCalc(in,out[p],p);
}
};
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
this->MdirComms(in);
int ndim = in.Grid()->Nd();
//////////////
// 4D action like wilson
// 0+ => 0
// 0- => 1
// 1+ => 2
// 1- => 3
// etc..
//////////////
// 5D action like DWF
// 1+ => 0
// 1- => 1
// 2+ => 2
// 2- => 3
// etc..
auto point = [dir, disp, ndim](){
if(dir == 0 and disp == 0)
return 8;
else if ( ndim==4 ) {
return (4 * dir + 1 - disp) / 2;
} else {
return (4 * (dir-1) + 1 - disp) / 2;
}
}();
MdirCalc(in,out,point);
};
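// Worked evaluations of the point formula above, exactly as written (illustration only):
//   4D (ndim==4): dir=0,disp=+1 -> (0+1-1)/2 = 0 ;  dir=0,disp=-1 -> (0+1+1)/2 = 1 ;
//                 dir=3,disp=-1 -> (12+1+1)/2 = 7 ;  dir=0,disp=0  -> 8 (self coupling).
//   5D (ndim==5): dir=1,disp=+1 -> (0+1-1)/2 = 0 ;  dir=4,disp=-1 -> (12+1+1)/2 = 7.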
void Mdiag(const CoarseVector &in, CoarseVector &out){
Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
void Mdiag(const CoarseVector &in, CoarseVector &out)
{
int point=geom.npoint-1;
MdirCalc(in, out, point); // No comms
};
@ -401,25 +758,44 @@ public:
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid)
A(geom.npoint,&CoarseGrid)
{
};
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace){
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
typedef typename Fobj::scalar_type scalar_type;
FineField iblock(FineGrid); // contributions from within this block
FineField oblock(FineGrid); // contributions from outwith this block
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
std::vector<FineComplexField> masks(geom.npoint,FineGrid);
FineComplexField imask(FineGrid); // contributions from within this block
FineComplexField omask(FineGrid); // contributions from outwith this block
FineComplexField evenmask(FineGrid);
FineComplexField oddmask(FineGrid);
FineField phi(FineGrid);
FineField tmp(FineGrid);
FineField zz(FineGrid); zz=Zero();
FineField Mphi(FineGrid);
FineField Mphie(FineGrid);
FineField Mphio(FineGrid);
std::vector<FineField> Mphi_p(geom.npoint,FineGrid);
Lattice<iScalar<vInteger> > coor(FineGrid);
Lattice<iScalar<vInteger> > coor (FineGrid);
Lattice<iScalar<vInteger> > bcoor(FineGrid);
Lattice<iScalar<vInteger> > bcb (FineGrid); bcb = Zero();
CoarseVector iProj(Grid());
CoarseVector oProj(Grid());
CoarseVector SelfProj(Grid());
CoarseComplexField iZProj(Grid());
CoarseComplexField oZProj(Grid());
CoarseScalar InnerProd(Grid());
// Orthogonalise the subblocks over the basis
@ -428,69 +804,117 @@ public:
// Compute the matrix elements of linop between this orthonormal
// set of vectors.
int self_stencil=-1;
for(int p=0;p<geom.npoint;p++){
for(int p=0;p<geom.npoint;p++)
{
int dir = geom.directions[p];
int disp = geom.displacements[p];
A[p]=Zero();
if( geom.displacements[p]==0){
self_stencil=p;
}
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
LatticeCoordinate(coor,dir);
///////////////////////////////////////////////////////
// Work out even and odd block checkerboarding for fast diagonal term
///////////////////////////////////////////////////////
if ( disp==1 ) {
bcb = bcb + div(coor,block);
}
if ( disp==0 ) {
masks[p]= Zero();
} else if ( disp==1 ) {
masks[p] = where(mod(coor,block)==(block-1),one,zero);
} else if ( disp==-1 ) {
masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
}
}
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask;
assert(self_stencil!=-1);
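// Illustration of the masks built above, for fine extent 8 coarsened by block=4 in one direction
// (fine coordinates 0..7, coarse blocks {0..3} and {4..7}):
//   disp=+1 mask : 0 0 0 1 0 0 0 1   (face coupling to the +1 neighbouring block)
//   disp=-1 mask : 1 0 0 0 1 0 0 0   (face coupling to the -1 neighbouring block)
// bcb accumulates div(coor,block) over the +1 points, so mod(bcb,2) checkerboards the blocks;
// evenmask/oddmask then select fine sites belonging to even/odd coarse blocks.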
for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i];
std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
// std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
linop.OpDirAll(phi,Mphi_p);
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
for(int p=0;p<geom.npoint;p++){
Mphi = Mphi_p[p];
int dir = geom.directions[p];
int disp = geom.displacements[p];
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
if ( (disp==-1) || (!hermitian ) ) {
LatticeCoordinate(coor,dir);
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
omask = masks[p];
imask = one-omask;
if ( disp==0 ){
linop.OpDiag(phi,Mphi);
}
else {
linop.OpDir(phi,Mphi,dir,disp);
}
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
if ( disp==0 ) {
iblock = Mphi;
oblock = Zero();
} else if ( disp==1 ) {
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
} else if ( disp==-1 ) {
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
} else {
assert(0);
}
Subspace.ProjectToSubspace(iProj,iblock);
Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace);
auto iProj_v = iProj.View() ;
auto oProj_v = oProj.View() ;
auto A_p = A[p].View();
auto A_self = A[self_stencil].View();
thread_for(ss, Grid()->oSites(),{
for(int j=0;j<nbasis;j++){
if( disp!= 0 ) {
A_p[ss](j,i) = oProj_v[ss](j);
}
A_self[ss](j,i) = A_self[ss](j,i) + iProj_v[ss](j);
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
auto iZProj_v = iZProj.View() ;
auto oZProj_v = oZProj.View() ;
auto A_p = A[p].View();
auto A_self = A[self_stencil].View();
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
// if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
// accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
}
}
}
///////////////////////////////////////////
// Faster alternate self coupling.. use hermiticity to save 2x
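// The idea appears to be: adjacent blocks differ by one in a single block coordinate, so they sit on
// opposite block checkerboards.  For a nearest-neighbour fine operator, applying it to the even-block
// part of phi can only leak into odd blocks (and vice versa), so re-masking the result with the same
// parity keeps just the within-block piece; two operator applications then give the self-coupling
// term for all blocks at once.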
///////////////////////////////////////////
{
mult(tmp,phi,evenmask); linop.Op(tmp,Mphie);
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
{
auto tmp_ = tmp.View();
auto evenmask_ = evenmask.View();
auto oddmask_ = oddmask.View();
auto Mphie_ = Mphie.View();
auto Mphio_ = Mphio.View();
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
});
}
blockProject(SelfProj,tmp,Subspace.subspace);
auto SelfProj_ = SelfProj.View();
auto A_self = A[self_stencil].View();
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
for(int j=0;j<nbasis;j++){
coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
}
});
}
}
if(hermitian) {
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
ForceHermitian();
}
// AssertHermitian();
// ForceDiagonal();
}
#if 0
///////////////////////////
@ -513,17 +937,26 @@ public:
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
// ForceHermitian();
// AssertHermitian();
// ForceDiagonal();
}
void ForceHermitian(void) {
for(int d=0;d<4;d++){
int dd=d+1;
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
CoarseMatrix Diff (Grid());
for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p];
int disp = geom.displacements[p];
if(disp==-1) {
// Find the opposite link
for(int pp=0;pp<geom.npoint;pp++){
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
// Diff = adj(Cshift(A[p],dir,1)) - A[pp];
// std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
A[pp] = adj(Cshift(A[p],dir,1));
}
}
}
}
// A[8] = 0.5*(A[8] + adj(A[8]));
}
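// This enforces the coarse hermiticity relation A_{+mu}(x) = A_{-mu}(x+mu)^dagger: each forward leg
// is rebuilt as the adjoint of the shifted backward leg.  The self-coupling (last point) would
// additionally need A -> (A + A^dagger)/2, cf. the commented line above.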
void AssertHermitian(void) {
CoarseMatrix AA (Grid());

View File

@ -47,6 +47,7 @@ public:
// Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
virtual void OpDirAll (const Field &in, std::vector<Field> &out) = 0; // Abstract base
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
@ -83,6 +84,9 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
@ -93,8 +97,7 @@ public:
_Mat.MdagM(in,out,n1,n2);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
_Mat.MdagM(in,out);
}
};
@ -116,6 +119,9 @@ public:
_Mat.Mdir(in,out,dir,disp);
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
assert(0);
@ -154,6 +160,9 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
@ -162,7 +171,6 @@ public:
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
@ -183,6 +191,9 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
@ -234,6 +245,9 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
};
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
@ -320,9 +334,135 @@ public:
return axpy_norm(out,-1.0,tmp,in);
}
};
template<class Field>
class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
{
public:
virtual RealD Mpc (const Field& in, Field& out) = 0;
virtual RealD MpcDag (const Field& in, Field& out) = 0;
virtual void MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) {
Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard();
ni = Mpc(in,tmp);
no = MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
assert(0);
}
virtual void HermOp(const Field& in, Field& out) {
assert(0);
}
void Op(const Field& in, Field& out) {
Mpc(in, out);
}
void AdjOp(const Field& in, Field& out) {
MpcDag(in, out);
}
// Support for coarsening to a multigrid
void OpDiag(const Field& in, Field& out) {
assert(0); // must coarsen the unpreconditioned system
}
void OpDir(const Field& in, Field& out, int dir, int disp) {
assert(0);
}
};
template<class Matrix, class Field>
class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase<Field>
{
public:
Matrix& _Mat;
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
virtual RealD Mpc(const Field& in, Field& out) {
Field tmp(in.Grid());
tmp.Checkerboard() = !in.Checkerboard();
_Mat.Meooe(in, tmp);
_Mat.MooeeInv(tmp, out);
_Mat.Meooe(out, tmp);
_Mat.Mooee(in, out);
return axpy_norm(out, -1.0, tmp, out);
}
virtual RealD MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MeooeDag(in, tmp);
_Mat.MooeeInvDag(tmp, out);
_Mat.MeooeDag(out, tmp);
_Mat.MooeeDag(in, out);
return axpy_norm(out, -1.0, tmp, out);
}
};
template<class Matrix,class Field>
class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
Matrix &_Mat;
public:
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
virtual RealD Mpc(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.Meooe(in, out);
_Mat.MooeeInv(out, tmp);
_Mat.Meooe(tmp, out);
_Mat.MooeeInv(out, tmp);
return axpy_norm(out, -1.0, tmp, in);
}
virtual RealD MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MooeeInvDag(in, out);
_Mat.MeooeDag(out, tmp);
_Mat.MooeeInvDag(tmp, out);
_Mat.MeooeDag(out, tmp);
return axpy_norm(out, -1.0, tmp, in);
}
};
template<class Matrix, class Field>
class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
Matrix& _Mat;
public:
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
virtual RealD Mpc(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MooeeInv(in, out);
_Mat.Meooe(out, tmp);
_Mat.MooeeInv(tmp, out);
_Mat.Meooe(out, tmp);
return axpy_norm(out, -1.0, tmp, in);
}
virtual RealD MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MeooeDag(in, out);
_Mat.MooeeInvDag(out, tmp);
_Mat.MeooeDag(tmp, out);
_Mat.MooeeInvDag(out, tmp);
return axpy_norm(out, -1.0, tmp, in);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
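// A sketch of the standard red-black solve these Schur operators support, in the DiagMooee form
// Mpc = Moo - Moe Mee^{-1} Meo (the DiagOne/DiagTwo variants only move the Moo^{-1} factors around,
// as in the Left/Right handed comment above):
//   1. src_o' = eta_o - Moe Mee^{-1} eta_e       (prepare the odd-checkerboard source)
//   2. solve   Mpc psi_o = src_o'                (Krylov solve on the reduced system)
//   3. psi_e  = Mee^{-1} ( eta_e - Meo psi_o )   (reconstruct the even checkerboard)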

View File

@ -45,8 +45,13 @@ public:
ni=M(in,tmp);
no=Mdag(tmp,out);
}
virtual void MdagM(const Field &in, Field &out) {
RealD ni, no;
MdagM(in,out,ni,no);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
@ -56,12 +61,12 @@ template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrix
public:
virtual GridBase *RedBlackGrid(void)=0;
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
// half checkerboard operations
virtual void Meooe (const Field &in, Field &out)=0;

View File

@ -95,6 +95,24 @@ public:
Coeffs[order-1] = 1.;
};
// PB - a more efficient low pass: it drops the high modes above the cut, whereas approximating 1/x uses all Chebyshevs.
// Similar kick effect below the threshold as the Lanczos filter approach.
void InitLowPass(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD k=(order-1.0);
RealD s=std::cos( j*M_PI*(k+0.5)/order );
Coeffs[j] = s * 2.0/order;
}
};
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
@ -234,20 +252,20 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
axpby(T1,xscale,mscale,y,in);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
// out = ()*T0 + Coeffs[1]*T1;
axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// y=xscale*y+mscale*(*Tn);
// *Tnp=2.0*y-(*Tnm);
// out=out+Coeffs[n]* (*Tnp);
axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
axpy(out,Coeffs[n],*Tnp,out);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;

View File

@ -0,0 +1,129 @@
#ifndef GRID_JACOBIPOLYNOMIAL_H
#define GRID_JACOBIPOLYNOMIAL_H
#include <Grid/algorithms/LinearOperator.h>
NAMESPACE_BEGIN(Grid);
template<class Field>
class JacobiPolynomial : public OperatorFunction<Field> {
private:
using OperatorFunction<Field>::operator();
int order;
RealD hi;
RealD lo;
RealD alpha;
RealD beta;
public:
void csv(std::ostream &out){
csv(out,lo,hi);
}
void csv(std::ostream &out,RealD llo,RealD hhi){
RealD diff = hhi-llo;
RealD delta = diff*1.0e-5;
for (RealD x=llo-delta; x<=hhi; x+=delta) {
RealD f = approx(x);
out<< x<<" "<<f <<std::endl;
}
return;
}
JacobiPolynomial(){};
JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
{
lo=_lo;
hi=_hi;
alpha=_alpha;
beta=_beta;
order=_order;
};
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1.0;
RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
Tn =T1;
Tnm=T0;
for(int n=2;n<=order;n++){
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
Tnm=Tn;
Tn =Tnp;
}
return Tnp;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
Field T0(grid);
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// RealD T0=1.0;
T0=in;
// RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
// = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
y=y*xscale+in*mscale;
// RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
RealD halfAmB = (alpha-beta)*0.5;
RealD halfApBp2= (alpha+beta+2.0)*0.5;
T1 = halfAmB * in + halfApBp2*y;
for(int n=2;n<=order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
// Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
// normalise each recurrence coefficient by cnp once
cny=cny/cnp;
cn1=cn1/cnp;
cnm=cnm/cnp;
*Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
out=*Tnp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,473 @@
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<string>
#include<iostream>
#include<iomanip>
#include<cassert>
#include<Grid/algorithms/approx/RemezGeneral.h>
// Constructor
AlgRemezGeneral::AlgRemezGeneral(double lower, double upper, long precision,
bigfloat (*f)(bigfloat x, void *data), void *data): f(f),
data(data),
prec(precision),
apstrt(lower), apend(upper), apwidt(upper - lower),
n(0), d(0), pow_n(0), pow_d(0)
{
bigfloat::setDefaultPrecision(prec);
std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
}
//Determine the properties of the numerator and denominator polynomials
void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in){
pow_n = num_degree;
pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
num_type = num_type_in;
den_type = den_type_in;
num_pows.resize(pow_n+1);
den_pows.resize(pow_d+1);
int n_in = 0;
bool odd = num_type == PolyType::Full || num_type == PolyType::Odd;
bool even = num_type == PolyType::Full || num_type == PolyType::Even;
for(int i=0;i<=pow_n;i++){
num_pows[i] = -1;
if(i % 2 == 0 && even) num_pows[i] = n_in++;
if(i % 2 == 1 && odd) num_pows[i] = n_in++;
}
std::cout << n_in << " terms in numerator" << std::endl;
--n_in; //power is 1 less than the number of terms, eg pow=1 a x^1 + b x^0
int d_in = 0;
odd = den_type == PolyType::Full || den_type == PolyType::Odd;
even = den_type == PolyType::Full || den_type == PolyType::Even;
for(int i=0;i<=pow_d;i++){
den_pows[i] = -1;
if(i % 2 == 0 && even) den_pows[i] = d_in++;
if(i % 2 == 1 && odd) den_pows[i] = d_in++;
}
std::cout << d_in << " terms in denominator" << std::endl;
--d_in;
n = n_in;
d = d_in;
}
//Setup algorithm
void AlgRemezGeneral::reinitializeAlgorithm(){
spread = 1.0e37;
iter = 0;
neq = n + d + 1; //not +2 because highest-power term in denominator is fixed to 1
param.resize(neq);
yy.resize(neq+1);
//Initialize linear equation temporaries
A.resize(neq*neq);
B.resize(neq);
IPS.resize(neq);
//Initialize maximum and minimum errors
xx.resize(neq+2);
mm.resize(neq+1);
initialGuess();
//Initialize search steps
step.resize(neq+1);
stpini();
}
double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degree,
const PolyType num_type_in, const PolyType den_type_in,
const double _tolerance, const int report_freq){
//Setup the properties of the polynomial
setupPolyProperties(num_degree, den_degree, num_type_in, den_type_in);
//Setup the algorithm
reinitializeAlgorithm();
bigfloat tolerance = _tolerance;
//Iterate until convergence
while (spread > tolerance) {
if (iter++ % report_freq==0)
std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta << std::endl;
equations();
if (delta < tolerance) {
std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
assert(0);
};
assert( delta>= tolerance );
search();
}
int sign;
double error = (double)getErr(mm[0],&sign);
std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
// Return the maximum error in the approximation
return error;
}
// Initial values of maximal and minimal errors
void AlgRemezGeneral::initialGuess(){
// Supply initial guesses for solution points
long ncheb = neq; // Degree of Chebyshev error estimate
// Find ncheb+1 extrema of Chebyshev polynomial
bigfloat a = ncheb;
bigfloat r;
mm[0] = apstrt;
for (long i = 1; i < ncheb; i++) {
r = 0.5 * (1 - cos((M_PI * i)/(double) a));
//r *= sqrt_bf(r);
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
mm[i] = apstrt + r * apwidt;
}
mm[ncheb] = apend;
a = 2.0 * ncheb;
for (long i = 0; i <= ncheb; i++) {
r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a));
//r *= sqrt_bf(r); // Squeeze to low end of interval
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
xx[i] = apstrt + r * apwidt;
}
}
// Initialise step sizes
void AlgRemezGeneral::stpini(){
xx[neq+1] = apend;
delta = 0.25;
step[0] = xx[0] - apstrt;
for (int i = 1; i < neq; i++) step[i] = xx[i] - xx[i-1];
step[neq] = step[neq-1];
}
// Search for error maxima and minima
void AlgRemezGeneral::search(){
bigfloat a, q, xm, ym, xn, yn, xx1;
int emsign, ensign, steps;
int meq = neq + 1;
bigfloat eclose = 1.0e30;
bigfloat farther = 0l;
bigfloat xx0 = apstrt;
for (int i = 0; i < meq; i++) {
steps = 0;
xx1 = xx[i]; // Next zero
if (i == meq-1) xx1 = apend;
xm = mm[i];
ym = getErr(xm,&emsign);
q = step[i];
xn = xm + q;
if (xn < xx0 || xn >= xx1) { // Cannot skip over adjacent boundaries
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
} else {
yn = getErr(xn,&ensign);
if (yn < ym) {
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
}
}
while(yn >= ym) { // March until error becomes smaller.
if (++steps > 10)
break;
ym = yn;
xm = xn;
emsign = ensign;
a = xm + q;
if (a == xm || a <= xx0 || a >= xx1)
break;// Must not skip over the zeros either side.
xn = a;
yn = getErr(xn,&ensign);
}
mm[i] = xm; // Position of maximum
yy[i] = ym; // Value of maximum
if (eclose > ym) eclose = ym;
if (farther < ym) farther = ym;
xx0 = xx1; // Walk to next zero.
} // end of search loop
q = (farther - eclose); // Decrease step size if error spread increased
if (eclose != 0.0) q /= eclose; // Relative error spread
if (q >= spread)
delta *= 0.5; // Spread is increasing; decrease step size
spread = q;
for (int i = 0; i < neq; i++) {
q = yy[i+1];
if (q != 0.0) q = yy[i] / q - (bigfloat)1l;
else q = 0.0625;
if (q > (bigfloat)0.25) q = 0.25;
q *= mm[i+1] - mm[i];
step[i] = q * delta;
}
step[neq] = step[neq-1];
for (int i = 0; i < neq; i++) { // Insert new locations for the zeros.
xm = xx[i] - step[i];
if (xm <= apstrt)
continue;
if (xm >= apend)
continue;
if (xm <= mm[i])
xm = (bigfloat)0.5 * (mm[i] + xx[i]);
if (xm >= mm[i+1])
xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
xx[i] = xm;
}
}
// Solve the equations
void AlgRemezGeneral::equations(){
bigfloat x, y, z;
bigfloat *aa;
for (int i = 0; i < neq; i++) { // set up the equations for solution by simq()
int ip = neq * i; // offset to 1st element of this row of matrix
x = xx[i]; // the guess for this row
y = func(x); // right-hand-side vector
z = (bigfloat)1l;
aa = A.data()+ip;
int t = 0;
for (int j = 0; j <= pow_n; j++) {
if(num_pows[j] != -1){ *aa++ = z; t++; }
z *= x;
}
assert(t == n+1);
z = (bigfloat)1l;
t = 0;
for (int j = 0; j < pow_d; j++) {
if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
z *= x;
}
assert(t == d);
B[i] = y * z; // Right hand side vector
}
// Solve the simultaneous linear equations.
if (simq()){
std::cout<<"simq failed\n";
exit(0);
}
}
// Evaluate the rational form P(x)/Q(x) using coefficients
// from the solution vector param
bigfloat AlgRemezGeneral::approx(const bigfloat x) const{
// Work backwards toward the constant term.
int c = n;
bigfloat yn = param[c--]; // Highest order numerator coefficient
for (int i = pow_n-1; i >= 0; i--) yn = x * yn + (num_pows[i] != -1 ? param[c--] : bigfloat(0l));
c = n+d;
bigfloat yd = 1l; //Highest degree coefficient is 1.0
for (int i = pow_d-1; i >= 0; i--) yd = x * yd + (den_pows[i] != -1 ? param[c--] : bigfloat(0l));
return(yn/yd);
}
// Compute size and sign of the approximation error at x
bigfloat AlgRemezGeneral::getErr(bigfloat x, int *sign) const{
bigfloat f = func(x);
bigfloat e = approx(x) - f;
if (f != 0) e /= f;
if (e < (bigfloat)0.0) {
*sign = -1;
e = -e;
}
else *sign = 1;
return(e);
}
// Solve the system AX=B
int AlgRemezGeneral::simq(){
int ip, ipj, ipk, ipn;
int idxpiv;
int kp, kp1, kpk, kpn;
int nip, nkp;
bigfloat em, q, rownrm, big, size, pivot, sum;
bigfloat *aa;
bigfloat *X = param.data();
int n = neq;
int nm1 = n - 1;
// Initialize IPS and X
int ij = 0;
for (int i = 0; i < n; i++) {
IPS[i] = i;
rownrm = 0.0;
for(int j = 0; j < n; j++) {
q = abs_bf(A[ij]);
if(rownrm < q) rownrm = q;
++ij;
}
if (rownrm == (bigfloat)0l) {
std::cout<<"simq rownrm=0\n";
return(1);
}
X[i] = (bigfloat)1.0 / rownrm;
}
for (int k = 0; k < nm1; k++) {
big = 0.0;
idxpiv = 0;
for (int i = k; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
size = abs_bf(A[ipk]) * X[ip];
if (size > big) {
big = size;
idxpiv = i;
}
}
if (big == (bigfloat)0l) {
std::cout<<"simq big=0\n";
return(2);
}
if (idxpiv != k) {
int j = IPS[k];
IPS[k] = IPS[idxpiv];
IPS[idxpiv] = j;
}
kp = IPS[k];
kpk = n*kp + k;
pivot = A[kpk];
kp1 = k+1;
for (int i = kp1; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
em = -A[ipk] / pivot;
A[ipk] = -em;
nip = n*ip;
nkp = n*kp;
aa = A.data()+nkp+kp1;
for (int j = kp1; j < n; j++) {
ipj = nip + j;
A[ipj] = A[ipj] + em * *aa++;
}
}
}
kpn = n * IPS[n-1] + n - 1; // last element of IPS[n] th row
if (A[kpn] == (bigfloat)0l) {
std::cout<<"simq A[kpn]=0\n";
return(3);
}
ip = IPS[0];
X[0] = B[ip];
for (int i = 1; i < n; i++) {
ip = IPS[i];
ipj = n * ip;
sum = 0.0;
for (int j = 0; j < i; j++) {
sum += A[ipj] * X[j];
++ipj;
}
X[i] = B[ip] - sum;
}
ipn = n * IPS[n-1] + n - 1;
X[n-1] = X[n-1] / A[ipn];
for (int iback = 1; iback < n; iback++) {
//i goes (n-1),...,1
int i = nm1 - iback;
ip = IPS[i];
nip = n*ip;
sum = 0.0;
aa = A.data()+nip+i+1;
for (int j= i + 1; j < n; j++)
sum += *aa++ * X[j];
X[i] = (X[i] - sum) / A[nip+i];
}
return(0);
}
void AlgRemezGeneral::csv(std::ostream & os) const{
os << "Numerator" << std::endl;
for(int i=0;i<=pow_n;i++){
os << getCoeffNum(i) << "*x^" << i;
if(i!=pow_n) os << " + ";
}
os << std::endl;
os << "Denominator" << std::endl;
for(int i=0;i<=pow_d;i++){
os << getCoeffDen(i) << "*x^" << i;
if(i!=pow_d) os << " + ";
}
os << std::endl;
//For a true minimax solution the errors should all be equal and the signs should oscillate +-+-+- etc
int sign;
os << "Errors at maxima: coordinate, error, (sign)" << std::endl;
for(int i=0;i<neq+1;i++){
os << mm[i] << " " << getErr(mm[i],&sign) << " (" << sign << ")" << std::endl;
}
os << "Scan over range:" << std::endl;
int npt = 60;
bigfloat dlt = (apend - apstrt)/bigfloat(npt-1);
for (bigfloat x=apstrt; x<=apend; x = x + dlt) {
double f = evaluateFunc(x);
double r = evaluateApprox(x);
os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
}
return;
}

View File

@ -0,0 +1,170 @@
/*
C.Kelly Jan 2020 based on implementation by M. Clark May 2005
AlgRemezGeneral is an implementation of the Remez algorithm for approximating an arbitrary function by a rational polynomial
It includes optional restriction to odd/even polynomials for the numerator and/or denominator
*/
#ifndef INCLUDED_ALG_REMEZ_GENERAL_H
#define INCLUDED_ALG_REMEZ_GENERAL_H
#include <stddef.h>
#include <Grid/GridStd.h>
#ifdef HAVE_LIBGMP
#include "bigfloat.h"
#else
#include "bigfloat_double.h"
#endif
class AlgRemezGeneral{
public:
enum PolyType { Even, Odd, Full };
private:
// In GSL-style, pass the function as a function pointer. Any data required to evaluate the function is passed in as a void pointer
bigfloat (*f)(bigfloat x, void *data);
void *data;
// The approximation parameters
std::vector<bigfloat> param;
bigfloat norm;
// The number of non-zero terms in the numerator and denominator
int n, d;
// The numerator and denominator degree (i.e. the largest power)
int pow_n, pow_d;
// Specify if the numerator and/or denominator are odd/even polynomials
PolyType num_type;
PolyType den_type;
std::vector<int> num_pows; //contains the mapping, with -1 if not present
std::vector<int> den_pows;
// The bounds of the approximation
bigfloat apstrt, apwidt, apend;
// Variables used to calculate the approximation
int nd1, iter;
std::vector<bigfloat> xx;
std::vector<bigfloat> mm;
std::vector<bigfloat> step;
bigfloat delta, spread;
// Variables used in search
std::vector<bigfloat> yy;
// Variables used in solving linear equations
std::vector<bigfloat> A;
std::vector<bigfloat> B;
std::vector<int> IPS;
// The number of equations we must solve at each iteration (n+d+1)
int neq;
// The precision of the GNU MP library
long prec;
// Initialize member variables associated with the polynomial's properties
void setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in);
// Initial values of maximal and minimal errors
void initialGuess();
// Initialise step sizes
void stpini();
// Initialize the algorithm
void reinitializeAlgorithm();
// Solve the equations
void equations();
// Search for error maxima and minima
void search();
// Calculate function required for the approximation
inline bigfloat func(bigfloat x) const{
return f(x, data);
}
// Compute size and sign of the approximation error at x
bigfloat getErr(bigfloat x, int *sign) const;
// Solve the system AX=B where X = param
int simq();
// Evaluate the rational form P(x)/Q(x) using coefficients from the solution vector param
bigfloat approx(bigfloat x) const;
public:
AlgRemezGeneral(double lower, double upper, long prec,
bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{
assert(n==d);
return n;
}
// Reset the bounds of the approximation
inline void setBounds(double lower, double upper) {
apstrt = lower;
apend = upper;
apwidt = apend - apstrt;
}
// Get the bounds of the approximation
inline void getBounds(double &lower, double &upper) const{
lower=(double)apstrt;
upper=(double)apend;
}
// Run the algorithm to generate the rational approximation
double generateApprox(int num_degree, int den_degree,
PolyType num_type, PolyType den_type,
const double tolerance = 1e-15, const int report_freq = 1000);
inline double generateApprox(int num_degree, int den_degree,
const double tolerance = 1e-15, const int report_freq = 1000){
return generateApprox(num_degree, den_degree, Full, Full, tolerance, report_freq);
}
// Evaluate the rational form P(x)/Q(x) using coefficients from the
// solution vector param
inline double evaluateApprox(double x) const{
return (double)approx((bigfloat)x);
}
// Evaluate the rational form Q(x)/P(x) using coefficients from the solution vector param
inline double evaluateInverseApprox(double x) const{
return 1.0/(double)approx((bigfloat)x);
}
// Calculate function required for the approximation
inline double evaluateFunc(double x) const{
return (double)func((bigfloat)x);
}
// Calculate inverse function required for the approximation
inline double evaluateInverseFunc(double x) const{
return 1.0/(double)func((bigfloat)x);
}
// Dump csv of function, approx and error
void csv(std::ostream &os = std::cout) const;
// Get the coefficient of the term x^i in the numerator
inline double getCoeffNum(const int i) const{
return num_pows[i] == -1 ? 0. : double(param[num_pows[i]]);
}
// Get the coefficient of the term x^i in the denominator
inline double getCoeffDen(const int i) const{
if(i == pow_d) return 1.0;
else return den_pows[i] == -1 ? 0. : double(param[den_pows[i]+n+1]);
}
};
#endif
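// A minimal usage sketch of AlgRemezGeneral (illustrative only: the target function 1/sqrt(x),
// the interval and the degrees are examples, and sqrt_bf is assumed from the bundled bigfloat headers).
#include <iostream>
#include <Grid/algorithms/approx/RemezGeneral.h>

// Example target: 1/sqrt(x).  Any data needed by the function rides in through the void*.
static bigfloat inv_sqrt(bigfloat x, void *data)
{
  return bigfloat(1.0) / sqrt_bf(x);
}

int main(void)
{
  // Rational approximation of degree (12,12) to 1/sqrt(x) on [1e-3,1], 64-bit mantissa precision.
  AlgRemezGeneral remez(1.0e-3, 1.0, 64, &inv_sqrt, nullptr);
  double err = remez.generateApprox(12, 12, AlgRemezGeneral::Full, AlgRemezGeneral::Full);
  std::cout << "maximum error " << err << std::endl;
  remez.csv(std::cout);   // dump numerator/denominator coefficients and an error scan
  return 0;
}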

View File

@ -0,0 +1,183 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/ZMobius.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/algorithms/approx/ZMobius.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
//Compute the tanh approximation
inline double epsilonMobius(const double x, const std::vector<ComplexD> &w){
int Ls = w.size();
ComplexD fxp = 1., fmp = 1.;
for(int i=0;i<Ls;i++){
fxp = fxp * ( w[i] + x );
fmp = fmp * ( w[i] - x );
}
return ((fxp - fmp)/(fxp + fmp)).real();
}
inline double epsilonMobius(const double x, const std::vector<RealD> &w){
int Ls = w.size();
double fxp = 1., fmp = 1.;
for(int i=0;i<Ls;i++){
fxp = fxp * ( w[i] + x );
fmp = fmp * ( w[i] - x );
}
return (fxp - fmp)/(fxp + fmp);
}
//Compute the tanh approximation in a form suitable for the Remez
bigfloat epsilonMobius(bigfloat x, void* data){
const std::vector<RealD> &omega = *( (std::vector<RealD> const*)data );
bigfloat fxp(1.0);
bigfloat fmp(1.0);
for(int i=0;i<omega.size();i++){
fxp = fxp * ( bigfloat(omega[i]) + x);
fmp = fmp * ( bigfloat(omega[i]) - x);
}
return (fxp - fmp)/(fxp + fmp);
}
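// Note: for constant w_i = w this product form reduces to tanh( Ls * atanh(x/w) ), the usual
// Mobius/tanh approximation to sign(x); unequal w_i generalise it to a Zolotarev-like form.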
//Compute the Zmobius Omega parameters suitable for eigenvalue range -lambda_bound <= lambda <= lambda_bound
//Note omega_i = 1/(b_i + c_i) where b_i and c_i are the Mobius parameters
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound){
assert(omega_in.size() == Ls_in);
omega_out.resize(Ls_out);
//Use the Remez algorithm to generate the appropriate rational polynomial
//For odd polynomial, to satisfy Haar condition must take either positive or negative half of range (cf https://arxiv.org/pdf/0803.0439.pdf page 6)
AlgRemezGeneral remez(0, lambda_bound, 64, &epsilonMobius, (void*)&omega_in);
remez.generateApprox(Ls_out-1, Ls_out,AlgRemezGeneral::Odd, AlgRemezGeneral::Even, 1e-15, 100);
remez.csv(std::cout);
//The rational approximation has the form [ f(x) - f(-x) ] / [ f(x) + f(-x) ] where f(x) = \Prod_{i=0}^{L_s-1} ( \omega_i + x )
//cf https://academiccommons.columbia.edu/doi/10.7916/D8T72HD7 pg 102
//omega_i are therefore the negative of the complex roots of f(x)
//We can find the roots by recognizing that the eigenvalues of a matrix A are the roots of the characteristic polynomial
// \rho(\lambda) = det( A - \lambda I ) where I is the unit matrix
//The matrix whose characteristic polynomial is an arbitrary monic polynomial a0 + a1 x + a2 x^2 + ... x^n is the companion matrix
// A = | 0 1 0 0 0 .... 0 |
// | 0 0 1 0 0 .... 0 |
// | : : : : : : |
// | 0 0 0 0 0 1
// | -a0 -a1 -a2 ... ... -an|
//Note the Remez defines the largest power to have unit coefficient
std::vector<RealD> coeffs(Ls_out+1);
for(int i=0;i<Ls_out+1;i+=2) coeffs[i] = remez.getCoeffDen(i); //even powers
for(int i=1;i<Ls_out+1;i+=2) coeffs[i] = remez.getCoeffNum(i); //odd powers
std::vector<std::complex<RealD> > roots(Ls_out);
//Form the companion matrix
Eigen::MatrixXd compn(Ls_out,Ls_out);
for(int i=0;i<Ls_out-1;i++) compn(i,0) = 0.;
compn(Ls_out - 1, 0) = -coeffs[0];
for(int j=1;j<Ls_out;j++){
for(int i=0;i<Ls_out-1;i++) compn(i,j) = i == j-1 ? 1. : 0.;
compn(Ls_out - 1, j) = -coeffs[j];
}
//Eigensolve
Eigen::EigenSolver<Eigen::MatrixXd> slv(compn, false);
const auto & ev = slv.eigenvalues();
for(int i=0;i<Ls_out;i++)
omega_out[i] = -ev(i);
//Sort ascending (smallest at start of vector!)
std::sort(omega_out.begin(), omega_out.end(),
[&](const ComplexD &a, const ComplexD &b){ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag()); });
//McGlynn thesis pg 122 suggests improved iteration counts if the magnitude of omega diminishes towards the center of the 5th dimension
std::vector<ComplexD> omega_tmp = omega_out;
int s_low=0, s_high=Ls_out-1, ss=0;
for(int s_from = Ls_out-1; s_from >= 0; s_from--){ //loop from largest omega
int s_to;
if(ss % 2 == 0){
s_to = s_low++;
}else{
s_to = s_high--;
}
omega_out[s_to] = omega_tmp[s_from];
++ss;
}
std::cout << "Resulting omega_i:" << std::endl;
for(int i=0;i<Ls_out;i++)
std::cout << omega_out[i] << std::endl;
std::cout << "Test result matches the approximate polynomial found by the Remez" << std::endl;
std::cout << "<x> <remez approx> <poly approx> <diff poly approx remez approx> <exact> <diff poly approx exact>\n";
int npt = 60;
double dlt = lambda_bound/double(npt-1);
for (int i =0; i<npt; i++){
double x = i*dlt;
double r = remez.evaluateApprox(x);
double p = epsilonMobius(x, omega_out);
double e = epsilonMobius(x, omega_in);
std::cout << x<< " " << r << " " << p <<" " <<r-p << " " << e << " " << e-p << std::endl;
}
}
//mobius_param = b+c with b-c=1
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
std::vector<RealD> omega_in(Ls_in, 1./mobius_param);
computeZmobiusOmega(omega_out, Ls_out, omega_in, Ls_in, lambda_bound);
}
//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out,
const RealD mobius_param_out, const int Ls_out,
const RealD mobius_param_in, const int Ls_in,
const RealD lambda_bound){
computeZmobiusOmega(gamma_out, Ls_out, mobius_param_in, Ls_in, lambda_bound);
for(int i=0;i<Ls_out;i++) gamma_out[i] = gamma_out[i] * mobius_param_out;
}
//Assumes mobius_param_out == mobius_param_in
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
computeZmobiusGamma(gamma_out, mobius_param, Ls_out, mobius_param, Ls_in, lambda_bound);
}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
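For orientation, a minimal standalone sketch of the companion-matrix trick used in computeZmobiusOmega above: the roots of a monic polynomial are the eigenvalues of its companion matrix. It assumes only Eigen; the cubic and its coefficients are made up for the test and are not Grid data.
// Standalone sketch: roots of p(x) = x^3 - 6x^2 + 11x - 6 = (x-1)(x-2)(x-3)
// via the eigenvalues of its companion matrix (same construction as above).
#include <Eigen/Dense>
#include <iostream>
#include <vector>
int main(void)
{
  std::vector<double> a = { -6., 11., -6. };      // a0, a1, a2; leading coefficient is 1
  const int n = (int)a.size();
  Eigen::MatrixXd compn = Eigen::MatrixXd::Zero(n,n);
  for(int i=0;i<n-1;i++) compn(i,i+1) = 1.;       // ones on the superdiagonal
  for(int j=0;j<n;j++)   compn(n-1,j) = -a[j];    // last row carries -a_j
  Eigen::EigenSolver<Eigen::MatrixXd> slv(compn, false);
  std::cout << "roots:" << std::endl << slv.eigenvalues() << std::endl;  // expect 1, 2, 3
  return 0;
}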

View File

@ -0,0 +1,57 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/ZMobius.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ZMOBIUS_APPROX_H
#define GRID_ZMOBIUS_APPROX_H
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
//Compute the Zmobius Omega parameters suitable for eigenvalue range -lambda_bound <= lambda <= lambda_bound
//Note omega_i = 1/(b_i + c_i) where b_i and c_i are the Mobius parameters
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound);
//mobius_param = b+c with b-c=1
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out,
const RealD mobius_param_out, const int Ls_out,
const RealD mobius_param_in, const int Ls_in,
const RealD lambda_bound);
//Assumes mobius_param_out == mobius_param_in
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@ -25,6 +25,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef INCLUDED_BIGFLOAT_DOUBLE_H
#define INCLUDED_BIGFLOAT_DOUBLE_H
#include <math.h>
typedef double mfloat;
@ -186,4 +190,6 @@ public:
// friend bigfloat& random(void);
};
#endif

View File

@ -0,0 +1,222 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BiCGSTAB.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: juettner <juettner@soton.ac.uk>
Author: David Murphy <djmurphy@mit.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BICGSTAB_H
#define GRID_BICGSTAB_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class BiCGSTAB : public OperatorFunction<Field>
{
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) :
Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field>& Linop, const Field& src, Field& psi)
{
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1);
RealD a(0), bo(0), b(0), ssq(0);
Field p(src);
Field r(src);
Field rhat(src);
Field v(src);
Field s(src);
Field t(src);
Field h(src);
v = Zero();
p = Zero();
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.Op(psi, v);
b = norm2(v);
r = src - v;
rhat = r;
a = norm2(r);
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: mp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: r " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if(a <= rsq){ return; }
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
GridStopWatch LinearCombTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++)
{
rho_prev = rho;
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Crho = innerProduct(rhat,r);
InnerTimer.Stop();
rho = Crho.real();
beta = (rho / rho_prev) * (alpha / omega);
LinearCombTimer.Start();
bo = beta * omega;
auto p_v = p.View();
auto r_v = r.View();
auto v_v = v.View();
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
LinearCombTimer.Stop();
LinalgTimer.Stop();
MatrixTimer.Start();
Linop.Op(p,v);
MatrixTimer.Stop();
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Calpha = innerProduct(rhat,v);
InnerTimer.Stop();
alpha = rho / Calpha.real();
LinearCombTimer.Start();
auto h_v = h.View();
auto psi_v = psi.View();
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
});
auto s_v = s.View();
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
LinearCombTimer.Stop();
LinalgTimer.Stop();
MatrixTimer.Start();
Linop.Op(s,t);
MatrixTimer.Stop();
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Comega = innerProduct(t,s);
InnerTimer.Stop();
omega = Comega.real() / norm2(t);
LinearCombTimer.Start();
auto t_v = t.View();
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
LinearCombTimer.Stop();
cp = norm2(r);
LinalgTimer.Stop();
std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
// Stopping condition
if(cp <= rsq)
{
SolverTimer.Stop();
Linop.Op(psi, v);
p = v - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown " << std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
if(ErrorOnNoConverge){ assert(0); }
IterationsToComplete = k;
}
};
NAMESPACE_END(Grid);
#endif
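As a plain reference for the recurrence the class above implements, here is a self-contained toy BiCGSTAB on a small dense non-symmetric system. It is a sketch only: the 3x3 matrix and right-hand side are invented, and none of the Grid field machinery is used.
// Toy dense BiCGSTAB (same recurrence as the Grid class above, scalar precision only).
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>
using Vec = std::vector<double>;
static Vec matmul(const std::vector<Vec>& A, const Vec& x){
  Vec y(A.size(), 0.);
  for(size_t i=0;i<A.size();i++)
    for(size_t j=0;j<x.size();j++) y[i] += A[i][j]*x[j];
  return y;
}
static double dot(const Vec& a, const Vec& b){
  double s=0.; for(size_t i=0;i<a.size();i++) s += a[i]*b[i]; return s;
}
int main(void)
{
  std::vector<Vec> A = {{4.,1.,0.},{2.,5.,1.},{0.,1.,3.}};  // made-up non-symmetric matrix
  Vec b = {1.,2.,3.};
  Vec x(3,0.), r = b, rhat = b, p(3,0.), v(3,0.);
  double rho = 1., alpha = 1., omega = 1.;
  for(int k=1;k<=100;k++){
    double rho_new = dot(rhat,r);
    double beta = (rho_new/rho)*(alpha/omega);
    rho = rho_new;
    for(int i=0;i<3;i++) p[i] = r[i] + beta*(p[i] - omega*v[i]);
    v = matmul(A,p);
    alpha = rho/dot(rhat,v);
    Vec h(3), s(3);
    for(int i=0;i<3;i++){ h[i] = x[i] + alpha*p[i]; s[i] = r[i] - alpha*v[i]; }
    if(std::sqrt(dot(s,s)) < 1e-14){ x = h; std::printf("converged at iteration %d\n",k); break; }
    Vec t = matmul(A,s);
    omega = dot(t,s)/dot(t,t);
    for(int i=0;i<3;i++){ x[i] = h[i] + omega*s[i]; r[i] = s[i] - omega*t[i]; }
    if(std::sqrt(dot(r,r)) < 1e-12){ std::printf("converged at iteration %d\n",k); break; }
  }
  Vec Ax = matmul(A,x);
  for(int i=0;i<3;i++) assert(std::fabs(Ax[i]-b[i]) < 1e-8);  // check the solve
  std::printf("x = %.6f %.6f %.6f\n", x[0], x[1], x[2]);
  return 0;
}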

View File

@ -0,0 +1,158 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BiCGSTABMixedPrec.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
Author: David Murphy <djmurphy@mit.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BICGSTAB_MIXED_PREC_H
#define GRID_BICGSTAB_MIXED_PREC_H
NAMESPACE_BEGIN(Grid);
// Mixed precision restarted defect correction BiCGSTAB
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{
public:
RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; // Grid for single-precision fields
RealD OuterLoopNormMult; // Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid,
LinearOperatorBase<FieldF>& _Linop_f, LinearOperatorBase<FieldD>& _Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d), Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.), guesser(NULL) {};
void useGuesser(LinearFunction<FieldF>& g){
guesser = &g;
}
void operator() (const FieldD& src_d_in, FieldD& sol_d)
{
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.Checkerboard();
sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb;
BiCGSTAB<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++)
{
// Compute double precision rsd and also new RHS vector.
Linop_d.Op(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
sol_f = Zero();
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL){ (*guesser)(src_f, sol_f); }
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
//Final trial CG
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl;
BiCGSTAB<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop();
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif
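The class above is a restarted defect-correction scheme; the essential control flow fits in a few lines of plain C++. The sketch below is only illustrative: the 2x2 system is made up and a few single-precision Jacobi sweeps stand in for the inner BiCGSTAB.
// Toy mixed-precision defect correction: double-precision residual, inexact float inner solve.
#include <cmath>
#include <cstdio>
int main(void)
{
  const double A[2][2] = {{4.,1.},{1.,3.}};   // made-up diagonally dominant system
  const double b[2]    = {1.,2.};
  double x[2] = {0.,0.};
  for(int outer=0; outer<20; outer++){
    double r[2];                                               // r = b - A x in double
    for(int i=0;i<2;i++) r[i] = b[i] - (A[i][0]*x[0] + A[i][1]*x[1]);
    double rnorm = std::sqrt(r[0]*r[0] + r[1]*r[1]);
    std::printf("outer %d residual %.3e\n", outer, rnorm);
    if(rnorm < 1e-12) break;
    float Af[2][2] = {{(float)A[0][0],(float)A[0][1]},{(float)A[1][0],(float)A[1][1]}};
    float rf[2] = {(float)r[0],(float)r[1]};
    float ef[2] = {0.f,0.f};
    for(int sweep=0; sweep<20; sweep++){                       // inexact inner solve of A e = r
      float e0 = (rf[0] - Af[0][1]*ef[1]) / Af[0][0];
      float e1 = (rf[1] - Af[1][0]*ef[0]) / Af[1][1];
      ef[0] = e0; ef[1] = e1;
    }
    x[0] += (double)ef[0];                                     // accumulate correction in double
    x[1] += (double)ef[1];
  }
  std::printf("x = %.12f %.12f\n", x[0], x[1]);
  return 0;
}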

View File

@ -52,6 +52,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer PrintInterval; //GridLogMessages or Iterative
RealD TrueResidual;
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
@ -306,7 +307,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
TrueResidual = std::sqrt(norm2(AD)/norm2(B));
std::cout << GridLogMessage <<"\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@ -442,7 +444,8 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
TrueResidual = std::sqrt(norm2(AP)/norm2(Src));
std::cout <<GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@ -653,7 +656,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
@ -668,7 +671,8 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
TrueResidual = std::sqrt(normv(AD)/normv(B));
std::cout << GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;

View File

@ -49,6 +49,7 @@ public:
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
RealD TrueResidual;
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
@ -72,7 +73,6 @@ public:
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
@ -82,6 +82,14 @@ public:
cp = a;
ssq = norm2(src);
// Handle trivial case of zero src
if (ssq == 0.){
psi = Zero();
IterationsToComplete = 1;
TrueResidual = 0.;
return;
}
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
@ -93,6 +101,7 @@ public:
// Check if guess is really REALLY good :)
if (cp <= rsq) {
TrueResidual = std::sqrt(a/ssq);
std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
IterationsToComplete = 0;
return;
@ -142,7 +151,7 @@ public:
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
// Stopping condition
if (cp <= rsq) {
@ -154,26 +163,33 @@ public:
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
TrueResidual = true_residual;
return;
}
}
// Failed. Calculate true residual before giving up
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
TrueResidual = sqrt(norm2(p)/ssq);
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
if (ErrorOnNoConverge) assert(0);

View File

@ -46,15 +46,19 @@ public:
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
@ -125,6 +129,17 @@ public:
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
@ -270,6 +285,7 @@ public:
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
@ -299,7 +315,8 @@ public:
axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<< TrueResidualShift[s] <<std::endl;
}
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;

View File

@ -43,6 +43,11 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
// If we assume basis[j] are already orthonormal,
// we can take all inner products in parallel, saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
@ -54,16 +59,15 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
{
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
std::vector<View> basis_v(basis.size(),tmp_v);
Vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
#if 0
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region
{
vobj* B = Bt.data() + Nm * thread_num();
@ -81,24 +85,89 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
}
});
}
#else
int nrot = j1-j0;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of Eigen matrix
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
for(int k=0;k<Nm;++k){
for(int j=0;j<Nm;++j){
Qt_p[j*Nm+k]=Qt(j,k);
}
}
// Block the loop to keep storage footprint down
vobj zz=Zero();
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
auto z=coalescedRead(zz);
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
}
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.View();
thread_for(ss, grid->oSites(),{
vobj B = Zero();
Vector<View> basis_v(basis.size(),result_v);
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
auto basis_k = basis[k].View();
B +=Qt(j,k) * basis_k[ss];
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
result_v[ss] = B;
coalescedWrite(result_v[ss], B);
});
}
@ -282,7 +351,7 @@ public:
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@ -298,7 +367,7 @@ public:
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@ -347,7 +416,7 @@ until convergence
GridBase *grid = src.Grid();
assert(grid == evec[0].Grid());
GridLogIRL.TimingMode(1);
// GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@ -372,14 +441,17 @@ until convergence
{
auto src_n = src;
auto tmp = src;
std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n);
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@ -577,11 +649,11 @@ until convergence
/* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do:
3. wk := A vk − βk v_{k−1}
4. αk := (wk, vk)
5. wk := wk − αk vk     // wk orthog vk
6. βk+1 := ∥wk∥_2. If βk+1 = 0 then Stop
7. vk+1 := wk / βk+1
3. wk:=Avk - b_k v_{k-1}
4. ak:=(wk,vk) //
5. wk:=wk-akvk // wk orthog vk
6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop
7. vk+1 := wk/b_k+1
8. EndDo
*/
void step(std::vector<RealD>& lmd,
@ -589,6 +661,7 @@ until convergence
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20;
assert( k< Nm );
@ -600,20 +673,20 @@ until convergence
if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
ComplexD zalph = innerProduct(evec_k,w);
RealD alph = real(zalph);
w = w - alph * evec_k;// 5. wk := wk - αk vk
w = w - alph * evec_k;
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
RealD beta = normalise(w);
lmd[k] = alph;
lme[k] = beta;
if (k>0 && k % orth_period == 0) {
if ( (k>0) && ( (k % orth_period) == 0 )) {
std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
@ -621,6 +694,8 @@ until convergence
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
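A compact standalone version of the three-term recurrence in step() (Saad's steps 1-8 quoted above), run on an invented 3x3 real symmetric matrix; it just prints the tridiagonal (alpha, beta) entries rather than restarting or deflating.
// Toy Lanczos recurrence on a made-up dense symmetric matrix (no restarting, no convergence tests).
#include <cmath>
#include <cstdio>
#include <vector>
using Vec = std::vector<double>;
static Vec matmul(const double A[3][3], const Vec& x){
  Vec y(3,0.);
  for(int i=0;i<3;i++) for(int j=0;j<3;j++) y[i] += A[i][j]*x[j];
  return y;
}
static double dot(const Vec& a, const Vec& b){ double s=0.; for(int i=0;i<3;i++) s += a[i]*b[i]; return s; }
int main(void)
{
  const double A[3][3] = {{2.,1.,0.},{1.,2.,1.},{0.,1.,2.}};
  std::vector<Vec> v(5, Vec(3,0.));
  v[1] = {1.,0.,0.};                                   // 1. unit-norm start vector; v0 = 0, beta1 = 0
  double beta_prev = 0.;
  for(int k=1;k<=3;k++){                               // 2. for k = 1..m
    Vec w = matmul(A, v[k]);                           // 3. w = A v_k - beta_k v_{k-1}
    for(int i=0;i<3;i++) w[i] -= beta_prev * v[k-1][i];
    double alpha = dot(w, v[k]);                       // 4. alpha_k = (w, v_k)
    for(int i=0;i<3;i++) w[i] -= alpha * v[k][i];      // 5. w = w - alpha_k v_k
    double beta = std::sqrt(dot(w,w));                 // 6. beta_{k+1} = ||w||_2
    std::printf("k=%d alpha=%.6f beta=%.6f\n", k, alpha, beta);
    if(beta == 0.) break;                              //    stop if beta vanishes
    for(int i=0;i<3;i++) v[k+1][i] = w[i]/beta;        // 7. v_{k+1} = w / beta_{k+1}
    beta_prev = beta;
  }
  return 0;
}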

View File

@ -33,25 +33,77 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public OperatorFunction<Field>{
template<class Field> class NormalEquations {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver) {};
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field src(in.Grid());
Field tmp(in.Grid());
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in
_Guess(src,out);
_HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in
}
};
template<class Field> class HPDSolver {
private:
LinearOperatorBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
HPDSolver(LinearOperatorBase<Field> &Matrix,
OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
_Guess(in,out);
_HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in
}
};
template<class Field> class MdagMSolver {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Guess(in,out);
_HermitianSolver(MdagMOp,in,out); // Mdag M out = Mdag in
}
};
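For reference, the normal-equations trick these wrappers rely on, written out (standard linear algebra, nothing Grid-specific):
\[
M x = b \;\Longrightarrow\; M^\dagger M\, x = M^\dagger b ,
\]
M^\dagger M is Hermitian and positive semi-definite, so a Hermitian solver such as CG applies; NormalEquations builds the right-hand side with Mdag and wraps M^\dagger M in the MdagMLinearOperator used above.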

View File

@ -30,12 +30,12 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) {
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
assert(0);
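The same estimator, as a standalone sketch on an invented 2x2 symmetric matrix (largest eigenvalue (5+sqrt(5))/2, roughly 3.618), using the stopping test shown above.
// Toy power method: Rayleigh-quotient estimate of the largest eigenvalue.
#include <cmath>
#include <cstdio>
int main(void)
{
  const double A[2][2] = {{2.,1.},{1.,3.}};   // made-up symmetric matrix
  double v[2] = {1.,1.};
  double evalMaxApprox = 0.;
  for(int i=0;i<100;i++){
    double n = std::sqrt(v[0]*v[0] + v[1]*v[1]);
    v[0] /= n; v[1] /= n;                                 // normalise src_n
    double t[2] = { A[0][0]*v[0] + A[0][1]*v[1],
                    A[1][0]*v[0] + A[1][1]*v[1] };        // tmp = A src_n
    double vnum = v[0]*t[0] + v[1]*t[1];                  // (src_n, A src_n)
    double vden = v[0]*v[0] + v[1]*v[1];
    double na = vnum/vden;
    if (std::fabs(evalMaxApprox/na - 1.0) < 0.001) {      // same relative-change test as above
      std::printf("largest eigenvalue ~ %.6f after %d iterations\n", na, i);
      return 0;
    }
    evalMaxApprox = na;
    v[0] = t[0]; v[1] = t[1];                             // src_n = tmp
  }
  return 0;
}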

View File

@ -38,10 +38,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field>
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
using OperatorFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
@ -49,23 +50,29 @@ public:
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
@ -84,9 +91,9 @@ public:
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(Linop,src,psi,rsq);
cp=GCRnStep(src,psi,rsq);
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
@ -95,24 +102,26 @@ public:
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
/*
GCRLogLevel<<"PGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
*/
return;
}
}
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
RealD a, b;
@ -134,9 +143,7 @@ public:
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
std::cout<<GridLogIterative<< " ************** "<< std::endl;
std::cout<<GridLogIterative<< " GCRnStep("<<nstep<<")"<<std::endl;
std::cout<<GridLogIterative<< " ************** "<< std::endl;
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
@ -150,35 +157,15 @@ public:
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
std::cout<<GridLogIterative<< " GCRnStep true residual r = src - A psi "<<norm2(r) <<std::endl;
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
std::cout<<GridLogIterative<< " GCRnStep apply preconditioner z= M^-1 r "<< std::endl;
std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
std::cout<<GridLogIterative<< " GCRnStep called Preconditioner z "<< norm2(z) <<std::endl;
// MatTimer.Start();
// Linop.HermOp(z,tmp);
// MatTimer.Stop();
// LinalgTimer.Start();
// ttmp=tmp;
// tmp=tmp-r;
// LinalgTimer.Stop();
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
@ -190,7 +177,6 @@ public:
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
std::cout<<GridLogIterative<< " GCRnStep p0=z, q0 = A p0 " <<std::endl;
cp =norm2(r);
LinalgTimer.Stop();
@ -212,20 +198,16 @@ public:
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogIterative<< " GCRnStep apply preconditioner z= M^-1 r "<< std::endl;
std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
std::cout<<GridLogIterative<< " GCRnStep called Preconditioner z "<< norm2(z) <<std::endl;
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);

View File

@ -405,6 +405,70 @@ namespace Grid {
}
};
template<class Field> class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field>
{
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd , src_o, src);
/////////////////////////////////////////////////////
// src_o = source_o - Moe MeeInv source_e  (no Mdag here: non-Hermitian variant)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e_i(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd );
}
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd);
}
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
{
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
@ -482,5 +546,76 @@ namespace Grid {
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
template<class Field> class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field>
{
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd , src_o, src);
/////////////////////////////////////////////////////
// src_o = source_o - Moe MeeInv source_e  (no Mdag here: non-Hermitian variant)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field sol_o_i(grid);
Field tmp(grid);
Field sol_e(grid);
////////////////////////////////////////////////
// MooeeInv due to precond
////////////////////////////////////////////////
_Matrix.MooeeInv(sol_o, tmp);
sol_o_i = tmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even );
tmp = src_e - tmp; assert( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd );
};
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{
NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
};
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
{
NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
}
};
}
#endif
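As read off the solvers above, the even-odd Schur decomposition they implement, in equations (a sketch for orientation only):
\[
M=\begin{pmatrix} M_{ee} & M_{eo}\\ M_{oe} & M_{oo}\end{pmatrix},
\qquad
\hat M_{oo}\equiv M_{oo}-M_{oe}M_{ee}^{-1}M_{eo},
\]
\[
\hat M_{oo}\,\psi_o=\eta_o-M_{oe}M_{ee}^{-1}\eta_e ,
\qquad
\psi_e=M_{ee}^{-1}\!\left(\eta_e-M_{eo}\psi_o\right).
\]
The DiagMooee classes hand \hat M_{oo} (or its normal equations, in the Hermitian case) to the red-black solver; the DiagTwo classes solve for M_{oo}\psi_o instead, i.e. they right-precondition by M_{oo}^{-1}, which is why their RedBlackSolution applies the extra MooeeInv.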

View File

@ -6,6 +6,12 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
#ifdef GRID_NVCC
#define SMALL_LIMIT (0)
#else
#define SMALL_LIMIT (4096)
#endif
#ifdef POINTER_CACHE
int PointerCache::victim;
@ -13,7 +19,7 @@ PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return ptr;
if (bytes < SMALL_LIMIT ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
@ -50,7 +56,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
if (bytes < SMALL_LIMIT ) return NULL;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);

View File

@ -49,8 +49,13 @@ NAMESPACE_BEGIN(Grid);
#ifdef POINTER_CACHE
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef GRID_NVCC
static const int Ncache=128;
#else
static const int Ncache=8;
#endif
static int victim;
typedef struct {
@ -63,7 +68,6 @@ private:
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
@ -170,6 +174,7 @@ public:
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;

View File

@ -47,20 +47,19 @@ public:
// Give Lattice access
template<class object> friend class Lattice;
GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;};
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
: CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;};
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
: CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;};
virtual ~GridBase() = default;
// Physics Grid information.
Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
@ -80,7 +79,8 @@ public:
Coordinate _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded;
int LocallyPeriodic;
public:

View File

@ -173,6 +173,7 @@ public:
///////////////////////////////////////////////////
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef vobj vector_object;
private:

View File

@ -37,19 +37,18 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid();
int Nsimd = grid->iSites();
Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd);
vector_type vI;
auto l_v = l.View();
for(int o=0;o<grid->oSites();o++){
thread_for( o, grid->oSites(), {
vector_type vI;
Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd);
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu];
}
merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI;
}
});
};
// LatticeCoordinate();

View File

@ -156,7 +156,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
GridBase *grid = l.Grid();
@ -185,7 +185,7 @@ void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
};
template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
GridBase *grid=l.Grid();

View File

@ -1,5 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_transfer.h
@ -83,12 +82,35 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
});
}
template<class vobj,class CComplex,int nbasis>
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
Lattice<CComplex> ip(coarse);
// auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
auto ip_ = ip.View();
for(int v=0;v<nbasis;v++) {
blockInnerProduct(ip,Basis[v],fineData);
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
coalescedWrite(coarseData_[sc](v),ip_(sc));
});
}
}
template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
typedef iVector<CComplex,nbasis > coarseSiteData;
coarseSiteData elide;
typedef decltype(coalescedRead(elide)) ScalarComplex;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension;
@ -106,26 +128,40 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
}
int blockVol = fine->oSites()/coarse->oSites();
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
// Loop over coarse sites in parallel, and then loop over the fine sites associated with each coarse site.
thread_for( sf, fine->oSites(), {
int sc;
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coarse sites in parallel, and then loop over the fine sites associated with each coarse site.
// Otherwise do fine inner product per site, and make the update atomic
////////////////////////////////////////////////////////////////////////////////////////////////////////
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
thread_critical {
for(int i=0;i<nbasis;i++) {
auto Basis_ = Basis[i].View();
coarseData_[sc](i)=coarseData_[sc](i) + innerProduct(Basis_[sf],fineData_[sf]);
}
auto sc=sci/nbasis;
auto i=sci%nbasis;
auto Basis_ = Basis[i].View();
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
int sf;
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
for(int sb=0;sb<blockVol;sb++){
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
}
coalescedWrite(coarseData_[sc](i),reduce);
});
return;
}
@ -160,7 +196,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
auto fineY_ = fineY.View();
auto coarseA_= coarseA.View();
thread_for(sf, fine->oSites(), {
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
int sc;
Coordinate coor_c(_ndimension);
@ -171,7 +207,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y
fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf];
coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
});
@ -196,7 +232,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner);
thread_for(ss, coarse->oSites(),{
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss];
});
}
@ -226,23 +262,29 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
}
int blockVol = fine->oSites()/coarse->oSites();
// Turning this around, to thread over sc with an interior loop
// over sf, would thread better
coarseData=Zero();
auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View();
thread_for(sf,fine->oSites(),{
int sc;
accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
coarseData_[sc]=Zero();
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int sb=0;sb<blockVol;sb++){
int sf;
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
thread_critical {
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
}
@ -296,6 +338,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
}
}
#if 0
template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData,
@ -321,7 +364,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
auto coarseData_ = coarseData.View();
// Loop with a cache friendly loop ordering
thread_for(sf,fine->oSites(),{
accelerator_for(sf,fine->oSites(),1,{
int sc;
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
@ -332,13 +375,35 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View();
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf];
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
}
});
return;
}
#else
template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
fineData=Zero();
for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
Lattice<CComplex> cip(coarse);
auto cip_ = cip.View();
auto ip_ = ip.View();
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
coalescedWrite(cip_[sc], ip_(sc)());
});
blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
}
}
#endif
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
@ -374,6 +439,67 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
});
}
template<class vobj>
void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
static const int words=sizeof(vobj)/sizeof(vector_type);
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
int nd = nF;
assert(nF == nT);
for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]);
}
// the above should guarantee that the operations are local
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
Coordinate osf = Fg->_ostride;
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
auto t_v = To.View();
auto f_v = From.View();
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
int in_region=1;
for(int d=0;d<nd;d++){
if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
in_region=0;
}
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME: if RRII layout, this type pun won't work
}
// peekLocalSite(s,From,Fcoor);
// pokeLocalSite(s,To ,Tcoor);
}
});
}
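// Minimal usage sketch (the helper name below is hypothetical and not used
// elsewhere): copy the whole local volume of "From" into "To" at offset
// ToLowerLeft, relying on the matching processor-grid assertions inside
// localCopyRegion.
template<class vobj>
inline void localCopyAll(const Lattice<vobj> &From,Lattice<vobj> &To,Coordinate ToLowerLeft)
{
  int nd = From.Grid()->_ndimension;
  Coordinate FromLowerLeft(nd); for(int d=0;d<nd;d++) FromLowerLeft[d]=0;
  Coordinate RegionSize = From.Grid()->_ldimensions;
  localCopyRegion(From,To,FromLowerLeft,ToLowerLeft,RegionSize);
}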
template<class vobj>
void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)

View File

@ -101,7 +101,8 @@ public:
virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void Meooe5D (const FermionField &in, FermionField &out);
void MeooeDag5D (const FermionField &in, FermionField &out);

View File

@ -62,14 +62,15 @@ public:
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
// virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case
// virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
// virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case
// virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
// Constructors
ContinuedFractionFermion5D(GaugeField &_Umu,

View File

@ -89,6 +89,7 @@ public:
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

View File

@ -103,6 +103,7 @@ public:
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////

View File

@ -86,7 +86,8 @@ public:
void MooeeDag (const FermionField &in, FermionField &out);
void MooeeInvDag (const FermionField &in, FermionField &out);
void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
// These can be overridden by fancy 5d chiral action

View File

@ -40,6 +40,11 @@ inline void convert(const Fieldi &from,Fieldo &to)
to=from;
}
struct MADWFinnerIterCallbackBase{
virtual void operator()(const RealD current_resid){}
virtual ~MADWFinnerIterCallbackBase(){}
};
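// Illustrative concrete callback (the class and member names here are
// hypothetical, not part of the interface): adjust an externally owned inner
// tolerance based on the current outer residual.
struct AdaptiveInnerTolerance : public MADWFinnerIterCallbackBase{
  RealD &inner_tol;
  RealD floor_tol;
  AdaptiveInnerTolerance(RealD &tol,RealD floor_) : inner_tol(tol), floor_tol(floor_) {}
  void operator()(const RealD current_resid){
    RealD want = 0.1*current_resid; // heuristic: one order of magnitude below the outer residual
    inner_tol = (want > floor_tol) ? want : floor_tol;
  }
};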
template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser>
class MADWF
{
@ -56,23 +61,29 @@ class MADWF
RealD target_resid;
int maxiter;
public:
//operator() is called on "callback" at the end of every inner iteration. This allows, for example,
//the inner tolerance to be adjusted to speed up subsequent iterations.
MADWFinnerIterCallbackBase* callback;
public:
MADWF(Matrixo &_Mato,
Matrixi &_Mati,
PVinverter &_PauliVillarsSolvero,
SchurSolver &_SchurSolveri,
Guesser & _Guesseri,
RealD resid,
int _maxiter) :
int _maxiter,
MADWFinnerIterCallbackBase* _callback = NULL) :
Mato(_Mato),Mati(_Mati),
SchurSolveri(_SchurSolveri),
PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri)
{
target_resid=resid;
maxiter =_maxiter;
};
PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri),
callback(_callback)
{
target_resid=resid;
maxiter =_maxiter;
};
void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
{
@ -177,6 +188,8 @@ class MADWF
std::cout << GridLogMessage << "Residual " << i << ": " << resid << std::endl;
std::cout << GridLogMessage << "***************************************" <<std::endl;
if(callback != NULL) (*callback)(resid);
if (resid < target_resid) {
return;
}

View File

@ -67,12 +67,13 @@ public:
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
// Constructors
PartialFractionFermion5D(GaugeField &_Umu,

View File

@ -115,9 +115,10 @@ public:
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
int gamma, int dag);
void DhopDirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag);
///////////////////////////////////////////////////////////////
// Extra methods added by derived

View File

@ -111,15 +111,16 @@ public:
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
// These can be overridden by fancy 5d chiral action
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
// Implement the non-hermitian hopping term; half checkerboard or both
// Implement s-diagonal DW
@ -131,6 +132,9 @@ public:
// add a DhopComm
// -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
void DhopDirComms(const FermionField &in);
void DhopDirCalc(const FermionField &in, FermionField &out,int point);
///////////////////////////////////////////////////////////////
// New methods added

View File

@ -60,14 +60,26 @@ public:
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
private:
static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
// Specialised variants
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

View File

@ -54,6 +54,14 @@ public:
_Mat.Mdir(in,tmp,dir,disp);
G5R5(out,tmp);
}
void OpDirAll(const Field &in, std::vector<Field> &out) {
Field tmp(in.Grid());
_Mat.MdirAll(in,out);
for(int p=0;p<out.size();p++) {
tmp=out[p];
G5R5(out[p],tmp);
}
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@ -96,6 +104,12 @@ public:
_Mat.Mdir(in,tmp,dir,disp);
out=g5*tmp;
}
void OpDirAll(const Field &in, std::vector<Field> &out) {
_Mat.MdirAll(in,out);
for(int p=0;p<out.size();p++) {
out[p]=g5*out[p];
}
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){

View File

@ -389,6 +389,14 @@ void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,in
Meo5D(psi,tmp);
this->DhopDir(tmp,chi,dir,disp);
}
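// All 2*Nd direction/displacement hops in one call: a single Meo5D followed by
// one DhopDirAll, so the halo exchange is shared by every direction.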
template<class Impl>
void CayleyFermion5D<Impl>::MdirAll(const FermionField &psi, std::vector<FermionField> &out)
{
FermionField tmp(psi.Grid());
Meo5D(psi,tmp);
this->DhopDirAll(tmp,out);
}
// force terms; five routines; default to Dhop on diagonal
template<class Impl>
void CayleyFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)

View File

@ -143,6 +143,25 @@ void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFi
}
}
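// As Mdir, but acting on every direction/displacement output of DhopDirAll;
// the s-dependent gamma_5 scalings below are the same for each direction.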
template<class Impl>
void ContinuedFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi)
{
int Ls = this->Ls;
this->DhopDirAll(psi,chi); // Dslash on diagonal. g5 Dslash is hermitian
for(int p=0;p<chi.size();p++){
int sign=1;
for(int s=0;s<Ls;s++){
if ( s==(Ls-1) ){
ag5xpby_ssp(chi[p],Beta[s]*ZoloHiInv,chi[p],0.0,chi[p],s,s);
} else {
ag5xpby_ssp(chi[p],cc[s]*Beta[s]*sign*ZoloHiInv,chi[p],0.0,chi[p],s,s);
}
sign=-sign;
}
}
}
template<class Impl>
void ContinuedFractionFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
{
int Ls = this->Ls;

View File

@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
// Implement the general interface. Here we use SAME mass on all slices
/////////////////////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
assert(0);
}
template <class Impl>
RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo);

View File

@ -362,12 +362,19 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
assert(0); // Not implemented yet
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
Compressor compressor;
Stencil.HaloExchange(in, compressor);
@ -380,6 +387,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
});
};
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
@ -404,7 +412,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
#ifdef GRID_OMP
Compressor compressor;
int len = U.Grid()->oSites();
const int LLs = 1;
DhopTotalTime -= usecond();

View File

@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class Impl>
template<class Impl>
void PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
// this does both dag and undag but is trivial; make a common helper routine
int Ls = this->Ls;
@ -45,8 +45,25 @@ void PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFiel
ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1);
}
ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
}
template<class Impl>
void PartialFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi){
// this does both dag and undag but is trivial; make a common helper routine
int Ls = this->Ls;
this->DhopDirAll(psi,chi);
for(int point=0;point<chi.size();point++){
int nblock=(Ls-1)/2;
for(int b=0;b<nblock;b++){
int s = 2*b;
ag5xpby_ssp(chi[point],-scale,chi[point],0.0,chi[point],s,s);
ag5xpby_ssp(chi[point], scale,chi[point],0.0,chi[point],s+1,s+1);
}
ag5xpby_ssp(chi[point],p[nblock]*scale/amax,chi[point],0.0,chi[point],Ls-1,Ls-1);
}
}
template<class Impl>
void PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
{

View File

@ -241,6 +241,15 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
};
template<class Impl>
void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
{
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor);
uint64_t Nsite = Umu.Grid()->oSites();
Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
};
template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,

View File

@ -319,28 +319,51 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
}
template <class Impl>
void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
DhopDirAll(in, out);
}
template <class Impl>
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in, compressor);
int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * 4;
DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
};
template <class Impl>
void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag)
void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
{
Compressor compressor(dag);
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in, compressor);
assert((out.size()==8)||(out.size()==9));
for(int dir=0;dir<Nd;dir++){
for(int disp=-1;disp<=1;disp+=2){
int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * 4;
DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
}
}
}
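// Usage sketch (variable name hypothetical): the caller owns the output storage,
//   std::vector<FermionField> dchi(8,in.Grid());
//   DhopDirAll(in,dchi); // one field per (direction,displacement) pair, indexed by dirdisp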
template <class Impl>
void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag)
{
int Ls=1;
int Nsite=in.oSites();
uint64_t Nsite=in.oSites();
Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma);
};
@ -348,7 +371,8 @@ template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
FermionField &out, int dag)
{
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);

View File

@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} \
synchronise();
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
if (gamma == Dir) { \
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
if (SE->_is_local ) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} \
synchronise(); \
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
Recon(result, Uchi); \
synchronise(); \
Recon(result, Uchi);
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
if (gamma == Dir) { \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \
}
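// The _BODY form is shared: wrapped in a gamma test here for the run-time
// dispatch in DhopDirK, and used directly by the fixed-direction DhopDirXp..Tm
// kernels generated by DhopDirMacro below.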
////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute
////////////////////////////////////////////////////////////////////
@ -284,6 +287,35 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
}
};
#define DhopDirMacro(Dir,spProj,spRecon) \
template <class Impl> \
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
{ \
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; \
typedef decltype(coalescedRead(in[0])) calcSpinor; \
calcHalfSpinor chi; \
calcSpinor result; \
calcHalfSpinor Uchi; \
StencilEntry *SE; \
int ptype; \
const int Nsimd = SiteHalfSpinor::Nsimd(); \
const int lane=SIMTlane(Nsimd); \
\
SE = st.GetEntry(ptype, dir, sF); \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
coalescedWrite(out[sF], result,lane); \
}
DhopDirMacro(Xp,spProjXp,spReconXp);
DhopDirMacro(Yp,spProjYp,spReconYp);
DhopDirMacro(Zp,spProjZp,spReconZp);
DhopDirMacro(Tp,spProjTp,spReconTp);
DhopDirMacro(Xm,spProjXm,spReconXm);
DhopDirMacro(Ym,spProjYm,spReconYm);
DhopDirMacro(Zm,spProjZm,spReconZm);
DhopDirMacro(Tm,spProjTm,spReconTm);
template <class Impl>
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
@ -299,18 +331,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
const int lane=SIMTlane(Nsimd);
SE = st.GetEntry(ptype, dir, sF);
if (gamma == Xp) {
if (SE->_is_local ) {
int perm= SE->_permute;
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);
spProjXp(chi,tmp);
} else {
chi = coalescedRead(buf[SE->_offset],lane);
}
Impl::multLink(Uchi, U[sU], chi, dir, SE, st);
spReconXp(result, Uchi);
}
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
@ -321,6 +342,38 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
coalescedWrite(out[sF], result,lane);
}
template <class Impl>
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out)
{
auto U_v = U.View();
auto in_v = in.View();
auto st_v = st.View();
auto out_Xm = out[0].View();
auto out_Ym = out[1].View();
auto out_Zm = out[2].View();
auto out_Tm = out[3].View();
auto out_Xp = out[4].View();
auto out_Yp = out[5].View();
auto out_Zp = out[6].View();
auto out_Tp = out[7].View();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls;
int sF =sss;
DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
});
}
template <class Impl>
void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
@ -332,13 +385,32 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
accelerator_for(ss,Nsite,Simd::Nsimd(),{
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
DhopDirK(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp,gamma);
}
});
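// Dispatch on gamma once, outside the accelerator loop: each case instantiates
// a direction-specialised kernel so there is no per-site branch on gamma.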
#define LoopBody(Dir) \
case Dir : \
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \
int sU=ss; \
int sF = s+Ls*sU; \
DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
} \
}); \
break;
switch(gamma){
LoopBody(Xp);
LoopBody(Yp);
LoopBody(Zp);
LoopBody(Tp);
LoopBody(Xm);
LoopBody(Ym);
LoopBody(Zm);
LoopBody(Tm);
default:
assert(0);
break;
}
#undef LoopBody
}
#define KERNEL_CALLNB(A) \

View File

@ -80,6 +80,8 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB");
static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");

View File

@ -119,6 +119,17 @@ class ConjugateGradientModule: public SolverModule<ConjugateGradient, Field, Sol
}
};
template <class Field >
class BiCGSTABModule: public SolverModule<BiCGSTAB, Field, SolverParameters> {
typedef SolverModule<BiCGSTAB, Field, SolverParameters> SolverBase;
using SolverBase::SolverBase; // for constructors
// acquire resource
virtual void initialize(){
this->SolverPtr.reset(new BiCGSTAB<Field>(this->Par_.tolerance, this->Par_.max_iterations, true));
}
};
template <class Field >
class ConjugateResidualModule: public SolverModule<ConjugateResidual, Field, SolverParameters> {
typedef SolverModule<ConjugateResidual, Field, SolverParameters> SolverBase;

View File

@ -92,6 +92,7 @@ public:
};
void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
void ImportGauge(const GaugeField& _U) {

View File

@ -97,6 +97,23 @@ namespace Grid {
template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};
template<typename T> struct is_tensor_variable<T, typename std::enable_if<is_tensor<T>::value
&& !is_tensor_fixed<T>::value>::type> : public std::true_type {};
// Helper functions to get the ultimate scalar inside a tensor, and corresponding size
template <typename ET>
inline typename std::enable_if<is_tensor<ET>::value, const typename ET::Index>::type
getScalarCount(const ET &eigenTensor) { return eigenTensor.size() * Traits<ET>::count; }
template <typename ET>
inline typename std::enable_if<is_tensor_of_scalar<ET>::value, const typename ET::Scalar *>::type
getFirstScalar(const ET &eigenTensor) { return eigenTensor.data(); }
template <typename ET>
inline typename std::enable_if<is_tensor_of_scalar<ET>::value, typename ET::Scalar *>::type
getFirstScalar(ET &eigenTensor) { return eigenTensor.data(); }
template <typename ET>
inline typename std::enable_if<is_tensor_of_container<ET>::value, const typename Traits<ET>::scalar_type *>::type
getFirstScalar(const ET &eigenTensor) { return eigenTensor.data()->begin(); }
template <typename ET>
inline typename std::enable_if<is_tensor_of_container<ET>::value, typename Traits<ET>::scalar_type *>::type
getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); }
}
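// Usage sketch (the tensor shape here is arbitrary):
//   Eigen::Tensor<ComplexD,3> t(2,3,4);
//   const ComplexD *p = EigenIO::getFirstScalar(t); // first underlying scalar
//   auto n = EigenIO::getScalarCount(t); // 2*3*4 = 24 scalars in this example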
// Abstract writer/reader classes ////////////////////////////////////////////
@ -128,23 +145,6 @@ namespace Grid {
typename std::enable_if<EigenIO::is_tensor<ETensor>::value>::type
write(const std::string &s, const ETensor &output);
// Helper functions for Scalar vs Container specialisations
template <typename ETensor>
inline typename std::enable_if<EigenIO::is_tensor_of_scalar<ETensor>::value,
const typename ETensor::Scalar *>::type
getFirstScalar(const ETensor &output)
{
return output.data();
}
template <typename ETensor>
inline typename std::enable_if<EigenIO::is_tensor_of_container<ETensor>::value,
const typename EigenIO::Traits<ETensor>::scalar_type *>::type
getFirstScalar(const ETensor &output)
{
return output.data()->begin();
}
template <typename S>
inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
copyScalars(S * &pCopy, const S &Source)
@ -318,12 +318,12 @@ namespace Grid {
TotalDims[TensorRank + i] = Traits::Dimension(i);
// If the Tensor isn't in Row-Major order, then we'll need to copy it's data
const bool CopyData{NumElements > 1 && ETensor::Layout != Eigen::StorageOptions::RowMajor};
const bool CopyData{NumElements > 1 && static_cast<int>( ETensor::Layout ) != static_cast<int>( Eigen::StorageOptions::RowMajor )};
const Scalar * pWriteBuffer;
std::vector<Scalar> CopyBuffer;
const Index TotalNumElements = NumElements * Traits::count;
if( !CopyData ) {
pWriteBuffer = getFirstScalar( output );
pWriteBuffer = EigenIO::getFirstScalar( output );
} else {
// Regardless of the Eigen::Tensor storage order, the copy will be Row Major
CopyBuffer.resize( TotalNumElements );

View File

@ -403,6 +403,10 @@ namespace Optimization {
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
return a/b;
}
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
return a/b;
}
// Danger -- element-wise divide for complex, not complex division.
// See Grid_vector_types.h lines around 735, applied after "toReal"
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){

View File

@ -628,6 +628,7 @@ void Grid_debug_handler_init(void)
sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL);
sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);

View File

@ -1,777 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/A2AMatrix.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef A2A_Matrix_hpp_
#define A2A_Matrix_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/TimerArray.hpp>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
#ifdef USE_MKL
#include "mkl.h"
#include "mkl_cblas.h"
#endif
#ifndef HADRONS_A2AM_NAME
#define HADRONS_A2AM_NAME "a2aMatrix"
#endif
#ifndef HADRONS_A2AM_IO_TYPE
#define HADRONS_A2AM_IO_TYPE ComplexF
#endif
#define HADRONS_A2AM_PARALLEL_IO
BEGIN_HADRONS_NAMESPACE
// general A2A matrix set based on Eigen tensors and Grid-allocated memory
// Dimensions:
// 0 - ext - external field (momentum, EM field, ...)
// 1 - str - spin-color structure
// 2 - t - timeslice
// 3 - i - left A2A mode index
// 4 - j - right A2A mode index
template <typename T>
using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
template <typename T>
using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
template <typename T>
using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
/******************************************************************************
* Abstract class for A2A kernels *
******************************************************************************/
template <typename T, typename Field>
class A2AKernel
{
public:
A2AKernel(void) = default;
virtual ~A2AKernel(void) = default;
virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
const unsigned int orthogDim, double &time) = 0;
virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
};
/******************************************************************************
* Class to handle A2A matrix block HDF5 I/O *
******************************************************************************/
template <typename T>
class A2AMatrixIo
{
public:
// constructors
A2AMatrixIo(void) = default;
A2AMatrixIo(std::string filename, std::string dataname,
const unsigned int nt, const unsigned int ni = 0,
const unsigned int nj = 0);
// destructor
~A2AMatrixIo(void) = default;
// access
unsigned int getNi(void) const;
unsigned int getNj(void) const;
unsigned int getNt(void) const;
size_t getSize(void) const;
// file allocation
template <typename MetadataType>
void initFile(const MetadataType &d, const unsigned int chunkSize);
// block I/O
void saveBlock(const T *data, const unsigned int i, const unsigned int j,
const unsigned int blockSizei, const unsigned int blockSizej);
void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
const unsigned int i, const unsigned int j);
template <template <class> class Vec, typename VecT>
void load(Vec<VecT> &v, double *tRead = nullptr, GridBase *grid = nullptr);
private:
std::string filename_{""}, dataname_{""};
unsigned int nt_{0}, ni_{0}, nj_{0};
};
/******************************************************************************
* Wrapper for A2A matrix block computation *
******************************************************************************/
template <typename T, typename Field, typename MetadataType, typename TIo = T>
class A2AMatrixBlockComputation
{
private:
struct IoHelper
{
A2AMatrixIo<TIo> io;
MetadataType md;
unsigned int e, s, i, j;
};
typedef std::function<std::string(const unsigned int, const unsigned int)> FilenameFn;
typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn;
public:
// constructor
A2AMatrixBlockComputation(GridBase *grid,
const unsigned int orthogDim,
const unsigned int next,
const unsigned int nstr,
const unsigned int blockSize,
const unsigned int cacheBlockSize,
TimerArray *tArray = nullptr);
// execution
void execute(const std::vector<Field> &left,
const std::vector<Field> &right,
A2AKernel<T, Field> &kernel,
const FilenameFn &ionameFn,
const FilenameFn &filenameFn,
const MetadataFn &metadataFn);
private:
// I/O handler
void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
private:
TimerArray *tArray_;
GridBase *grid_;
unsigned int orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
Vector<T> mCache_;
Vector<TIo> mBuf_;
std::vector<IoHelper> nodeIo_;
};
/******************************************************************************
* A2A matrix contraction kernels *
******************************************************************************/
class A2AContraction
{
public:
// accTrMul(acc, a, b): acc += tr(a*b)
template <typename C, typename MatLeft, typename MatRight>
static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
{
const int RowMajor = Eigen::RowMajor;
const int ColMajor = Eigen::ColMajor;
if ((MatLeft::Options == RowMajor) and
(MatRight::Options == ColMajor))
{
thread_for(r,a.rows(),
{
C tmp;
#ifdef USE_MKL
dotuRow(tmp, r, a, b);
#else
tmp = a.row(r).conjugate().dot(b.col(r));
#endif
thread_critical
{
acc += tmp;
}
});
}
else
{
thread_for(c,a.cols(),
{
C tmp;
#ifdef USE_MKL
dotuCol(tmp, c, a, b);
#else
tmp = a.col(c).conjugate().dot(b.row(c));
#endif
thread_critical
{
acc += tmp;
}
});
}
}
template <typename MatLeft, typename MatRight>
static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
{
double n = a.rows()*a.cols();
return 8.*n;
}
// mul(res, a, b): res = a*b
#ifdef USE_MKL
template <template <class, int...> class Mat, int... Opts>
static inline void mul(Mat<ComplexD, Opts...> &res,
const Mat<ComplexD, Opts...> &a,
const Mat<ComplexD, Opts...> &b)
{
static const ComplexD one(1., 0.), zero(0., 0.);
const int RowMajor = Eigen::RowMajor;
const int ColMajor = Eigen::ColMajor;
if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
{
res.resize(a.rows(), b.cols());
}
if (Mat<ComplexD, Opts...>::Options == RowMajor)
{
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
}
else if (Mat<ComplexD, Opts...>::Options == ColMajor)
{
cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
}
}
template <template <class, int...> class Mat, int... Opts>
static inline void mul(Mat<ComplexF, Opts...> &res,
const Mat<ComplexF, Opts...> &a,
const Mat<ComplexF, Opts...> &b)
{
static const ComplexF one(1., 0.), zero(0., 0.);
const int RowMajor = Eigen::RowMajor;
const int ColMajor = Eigen::ColMajor;
if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
{
res.resize(a.rows(), b.cols());
}
if (Mat<ComplexF, Opts...>::Options == RowMajor)
{
cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
}
else if (Mat<ComplexF, Opts...>::Options == ColMajor)
{
cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
}
}
#else
template <typename Mat>
static inline void mul(Mat &res, const Mat &a, const Mat &b)
{
res = a*b;
}
#endif
template <typename Mat>
static inline double mulFlops(const Mat &a, const Mat &b)
{
double nr = a.rows(), nc = a.cols();
return nr*nr*(6.*nc + 2.*(nc - 1.));
}
private:
template <typename C, typename MatLeft, typename MatRight>
static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt,
unsigned int &bInc, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
const int RowMajor = Eigen::RowMajor;
const int ColMajor = Eigen::ColMajor;
if (MatLeft::Options == RowMajor)
{
aPt = a.data() + aRow*a.cols();
aInc = 1;
}
else if (MatLeft::Options == ColMajor)
{
aPt = a.data() + aRow;
aInc = a.rows();
}
if (MatRight::Options == RowMajor)
{
bPt = b.data() + aRow;
bInc = b.cols();
}
else if (MatRight::Options == ColMajor)
{
bPt = b.data() + aRow*b.rows();
bInc = 1;
}
}
#ifdef USE_MKL
template <typename C, typename MatLeft, typename MatRight>
static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt,
unsigned int &bInc, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const int RowMajor = Eigen::RowMajor;
const int ColMajor = Eigen::ColMajor;
if (MatLeft::Options == RowMajor)
{
aPt = a.data() + aCol;
aInc = a.cols();
}
else if (MatLeft::Options == ColMajor)
{
aPt = a.data() + aCol*a.rows();
aInc = 1;
}
if (MatRight::Options == RowMajor)
{
bPt = b.data() + aCol*b.cols();
bInc = 1;
}
else if (MatRight::Options == ColMajor)
{
bPt = b.data() + aCol;
bInc = b.rows();
}
}
template <typename MatLeft, typename MatRight>
static inline void dotuRow(ComplexF &res, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
const ComplexF *aPt, *bPt;
unsigned int aInc, bInc;
makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void dotuCol(ComplexF &res, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const ComplexF *aPt, *bPt;
unsigned int aInc, bInc;
makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void dotuRow(ComplexD &res, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void dotuCol(ComplexD &res, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
}
#endif
};
/******************************************************************************
* A2AMatrixIo template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename T>
A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname,
const unsigned int nt, const unsigned int ni,
const unsigned int nj)
: filename_(filename), dataname_(dataname)
, nt_(nt), ni_(ni), nj_(nj)
{}
// access //////////////////////////////////////////////////////////////////////
template <typename T>
unsigned int A2AMatrixIo<T>::getNt(void) const
{
return nt_;
}
template <typename T>
unsigned int A2AMatrixIo<T>::getNi(void) const
{
return ni_;
}
template <typename T>
unsigned int A2AMatrixIo<T>::getNj(void) const
{
return nj_;
}
template <typename T>
size_t A2AMatrixIo<T>::getSize(void) const
{
return nt_*ni_*nj_*sizeof(T);
}
// file allocation /////////////////////////////////////////////////////////////
template <typename T>
template <typename MetadataType>
void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
{
#ifdef HAVE_HDF5
std::vector<hsize_t> dim = {static_cast<hsize_t>(nt_),
static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)},
chunk = {static_cast<hsize_t>(nt_),
static_cast<hsize_t>(chunkSize),
static_cast<hsize_t>(chunkSize)};
H5NS::DataSpace dataspace(dim.size(), dim.data());
H5NS::DataSet dataset;
H5NS::DSetCreatPropList plist;
// create empty file just with metadata
{
Hdf5Writer writer(filename_);
write(writer, dataname_, d);
}
// create the dataset
Hdf5Reader reader(filename_, false);
push(reader, dataname_);
auto &group = reader.getGroup();
plist.setChunk(chunk.size(), chunk.data());
plist.setFletcher32();
dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
#else
HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
#endif
}
// block I/O ///////////////////////////////////////////////////////////////////
template <typename T>
void A2AMatrixIo<T>::saveBlock(const T *data,
const unsigned int i,
const unsigned int j,
const unsigned int blockSizei,
const unsigned int blockSizej)
{
#ifdef HAVE_HDF5
Hdf5Reader reader(filename_, false);
std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
offset = {0, static_cast<hsize_t>(i),
static_cast<hsize_t>(j)},
stride = {1, 1, 1},
block = {1, 1, 1};
H5NS::DataSpace memspace(count.size(), count.data()), dataspace;
H5NS::DataSet dataset;
// size_t shift;
push(reader, dataname_);
auto &group = reader.getGroup();
dataset = group.openDataSet(HADRONS_A2AM_NAME);
dataspace = dataset.getSpace();
dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
stride.data(), block.data());
dataset.write(data, Hdf5Type<T>::type(), memspace, dataspace);
#else
HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
#endif
}
template <typename T>
void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
const unsigned int ext, const unsigned int str,
const unsigned int i, const unsigned int j)
{
unsigned int blockSizei = m.dimension(3);
unsigned int blockSizej = m.dimension(4);
unsigned int nstr = m.dimension(1);
size_t offset = (ext*nstr + str)*nt_*blockSizei*blockSizej;
saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
}
template <typename T>
template <template <class> class Vec, typename VecT>
void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, GridBase *grid)
{
#ifdef HAVE_HDF5
std::vector<hsize_t> hdim;
H5NS::DataSet dataset;
H5NS::DataSpace dataspace;
H5NS::CompType datatype;
if (!(grid) || grid->IsBoss())
{
Hdf5Reader reader(filename_);
push(reader, dataname_);
auto &group = reader.getGroup();
dataset = group.openDataSet(HADRONS_A2AM_NAME);
datatype = dataset.getCompType();
dataspace = dataset.getSpace();
hdim.resize(dataspace.getSimpleExtentNdims());
dataspace.getSimpleExtentDims(hdim.data());
if ((nt_ * ni_ * nj_ != 0) and
((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
{
HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+ std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+ std::to_string(hdim[2]) + ", expected "
+ std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+ std::to_string(nj_));
}
else if (ni_*nj_ == 0)
{
if (hdim[0] != nt_)
{
HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+ std::to_string(hdim[0]) + ", expected "
+ std::to_string(nt_) + ")");
}
ni_ = hdim[1];
nj_ = hdim[2];
}
}
if (grid)
{
grid->Broadcast(grid->BossRank(), &ni_, sizeof(unsigned int));
grid->Broadcast(grid->BossRank(), &nj_, sizeof(unsigned int));
}
A2AMatrix<T> buf(ni_, nj_);
int broadcastSize = sizeof(T) * buf.size();
std::vector<hsize_t> count = {1, static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)},
stride = {1, 1, 1},
block = {1, 1, 1},
memCount = {static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)};
H5NS::DataSpace memspace(memCount.size(), memCount.data());
std::cout << "Loading timeslice";
std::cout.flush();
*tRead = 0.;
for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
{
unsigned int t = tp1 - 1;
std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
if (t % 10 == 0)
{
std::cout << " " << t;
std::cout.flush();
}
if (!(grid) || grid->IsBoss())
{
dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
stride.data(), block.data());
}
if (tRead) *tRead -= usecond();
if (!(grid) || grid->IsBoss())
{
dataset.read(buf.data(), datatype, memspace, dataspace);
}
if (grid)
{
grid->Broadcast(grid->BossRank(), buf.data(), broadcastSize);
}
if (tRead) *tRead += usecond();
v[t] = buf.template cast<VecT>();
}
std::cout << std::endl;
#else
HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
#endif
}
/******************************************************************************
* A2AMatrixBlockComputation template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename T, typename Field, typename MetadataType, typename TIo>
A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
::A2AMatrixBlockComputation(GridBase *grid,
const unsigned int orthogDim,
const unsigned int next,
const unsigned int nstr,
const unsigned int blockSize,
const unsigned int cacheBlockSize,
TimerArray *tArray)
: grid_(grid), nt_(grid->GlobalDimensions()[orthogDim]), orthogDim_(orthogDim)
, next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
, tArray_(tArray)
{
mCache_.resize(nt_*next_*nstr_*cacheBlockSize_*cacheBlockSize_);
mBuf_.resize(nt_*next_*nstr_*blockSize_*blockSize_);
}
#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
#define STOP_TIMER(name) if (tArray_) tArray_->stopTimer(name)
#define GET_TIMER(name) ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
// execution ///////////////////////////////////////////////////////////////////
template <typename T, typename Field, typename MetadataType, typename TIo>
void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
::execute(const std::vector<Field> &left, const std::vector<Field> &right,
A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
const FilenameFn &filenameFn, const MetadataFn &metadataFn)
{
//////////////////////////////////////////////////////////////////////////
// i,j is first loop over blockSize_ factors
// ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
// iii,jjj are loops within cacheBlock
// Total index is sum of these i+ii+iii etc...
//////////////////////////////////////////////////////////////////////////
int N_i = left.size();
int N_j = right.size();
double flops, bytes, t_kernel;
double nodes = grid_->NodeCount();
int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
for(int i=0;i<N_i;i+=blockSize_)
for(int j=0;j<N_j;j+=blockSize_)
{
// Get the W and V vectors for this block^2 set of terms
int N_ii = MIN(N_i-i,blockSize_);
int N_jj = MIN(N_j-j,blockSize_);
A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
LOG(Message) << "All-to-all matrix block "
<< j/blockSize_ + NBlock_j*i/blockSize_ + 1
<< "/" << NBlock_i*NBlock_j << " [" << i <<" .. "
<< i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]"
<< std::endl;
// Series of cache blocked chunks of the contractions within this block
flops = 0.0;
bytes = 0.0;
t_kernel = 0.0;
for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
{
double t;
int N_iii = MIN(N_ii-ii,cacheBlockSize_);
int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
START_TIMER("kernel");
kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
STOP_TIMER("kernel");
t_kernel += t;
flops += kernel.flops(N_iii, N_jjj);
bytes += kernel.bytes(N_iii, N_jjj);
START_TIMER("cache copy");
thread_for_collapse( 5,e,next_,{
for(int s =0;s< nstr_;s++)
for(int t =0;t< nt_;t++)
for(int iii=0;iii< N_iii;iii++)
for(int jjj=0;jjj< N_jjj;jjj++)
{
mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
}
});
STOP_TIMER("cache copy");
}
// perf
LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes
<< " Gflop/s/node " << std::endl;
LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes
<< " GB/s/node " << std::endl;
// IO
double blockSize, ioTime;
unsigned int myRank = grid_->ThisRank(), nRank = grid_->RankCount();
LOG(Message) << "Writing block to disk" << std::endl;
ioTime = -GET_TIMER("IO: write block");
START_TIMER("IO: total");
makeFileDir(filenameFn(0, 0), grid_);
#ifdef HADRONS_A2AM_PARALLEL_IO
grid_->Barrier();
// make task list for current node
nodeIo_.clear();
for(int f = myRank; f < next_*nstr_; f += nRank)
{
IoHelper h;
h.i = i;
h.j = j;
h.e = f/nstr_;
h.s = f % nstr_;
h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s),
ionameFn(h.e, h.s), nt_, N_i, N_j);
h.md = metadataFn(h.e, h.s);
nodeIo_.push_back(h);
}
// parallel IO
for (auto &h: nodeIo_)
{
saveBlock(mBlock, h);
}
grid_->Barrier();
#else
// serial IO, for testing purposes only
for(int e = 0; e < next_; e++)
for(int s = 0; s < nstr_; s++)
{
IoHelper h;
h.i = i;
h.j = j;
h.e = e;
h.s = s;
h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s),
ionameFn(h.e, h.s), nt_, N_i, N_j);
h.md = metadataFn(h.e, h.s);
saveBlock(mBlock, h);
}
#endif
STOP_TIMER("IO: total");
blockSize = static_cast<double>(next_*nstr_*nt_*N_ii*N_jj*sizeof(TIo));
ioTime += GET_TIMER("IO: write block");
LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
<< ioTime << " us ("
<< blockSize/ioTime*1.0e6/1024/1024
<< " MB/s)" << std::endl;
}
}
// I/O handler /////////////////////////////////////////////////////////////////
template <typename T, typename Field, typename MetadataType, typename TIo>
void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
{
if ((h.i == 0) and (h.j == 0))
{
START_TIMER("IO: file creation");
h.io.initFile(h.md, blockSize_);
STOP_TIMER("IO: file creation");
}
START_TIMER("IO: write block");
h.io.saveBlock(m, h.e, h.s, h.i, h.j);
STOP_TIMER("IO: write block");
}
#undef START_TIMER
#undef STOP_TIMER
#undef GET_TIMER
END_HADRONS_NAMESPACE
#endif // A2A_Matrix_hpp_

View File

@ -1,342 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/A2AVectors.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: fionnoh <fionnoh@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef A2A_Vectors_hpp_
#define A2A_Vectors_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Environment.hpp>
#include <Hadrons/Solver.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Class to generate V & W all-to-all vectors *
******************************************************************************/
template <typename FImpl>
class A2AVectorsSchurDiagTwo
{
public:
FERM_TYPE_ALIASES(FImpl,);
SOLVER_TYPE_ALIASES(FImpl,);
public:
A2AVectorsSchurDiagTwo(FMat &action, Solver &solver);
virtual ~A2AVectorsSchurDiagTwo(void) = default;
void makeLowModeV(FermionField &vout,
const FermionField &evec, const Real &eval);
void makeLowModeV5D(FermionField &vout_4d, FermionField &vout_5d,
const FermionField &evec, const Real &eval);
void makeLowModeW(FermionField &wout,
const FermionField &evec, const Real &eval);
void makeLowModeW5D(FermionField &wout_4d, FermionField &wout_5d,
const FermionField &evec, const Real &eval);
void makeHighModeV(FermionField &vout, const FermionField &noise);
void makeHighModeV5D(FermionField &vout_4d, FermionField &vout_5d,
const FermionField &noise_5d);
void makeHighModeW(FermionField &wout, const FermionField &noise);
void makeHighModeW5D(FermionField &vout_5d, FermionField &wout_5d,
const FermionField &noise_5d);
private:
FMat &action_;
Solver &solver_;
GridBase *fGrid_, *frbGrid_, *gGrid_;
bool is5d_;
FermionField src_o_, sol_e_, sol_o_, tmp_, tmp5_;
SchurDiagTwoOperator<FMat, FermionField> op_;
};
/******************************************************************************
* Methods for V & W all-to-all vectors I/O *
******************************************************************************/
class A2AVectorsIo
{
public:
struct Record: Serializable
{
GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
unsigned int, index);
Record(void): index(0) {}
};
public:
template <typename Field>
static void write(const std::string fileStem, std::vector<Field> &vec,
const bool multiFile, const int trajectory = -1);
template <typename Field>
static void read(std::vector<Field> &vec, const std::string fileStem,
const bool multiFile, const int trajectory = -1);
private:
static inline std::string vecFilename(const std::string stem, const int traj,
const bool multiFile)
{
std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
if (multiFile)
{
return stem + t;
}
else
{
return stem + t + ".bin";
}
}
};
/******************************************************************************
* A2AVectorsSchurDiagTwo template implementation *
******************************************************************************/
template <typename FImpl>
A2AVectorsSchurDiagTwo<FImpl>::A2AVectorsSchurDiagTwo(FMat &action, Solver &solver)
: action_(action)
, solver_(solver)
, fGrid_(action_.FermionGrid())
, frbGrid_(action_.FermionRedBlackGrid())
, gGrid_(action_.GaugeGrid())
, src_o_(frbGrid_)
, sol_e_(frbGrid_)
, sol_o_(frbGrid_)
, tmp_(frbGrid_)
, tmp5_(fGrid_)
, op_(action_)
{}
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV(FermionField &vout, const FermionField &evec, const Real &eval)
{
src_o_ = evec;
src_o_.Checkerboard() = Odd;
pickCheckerboard(Even, sol_e_, vout);
pickCheckerboard(Odd, sol_o_, vout);
/////////////////////////////////////////////////////
// v_ie = -(1/eval_i) * MeeInv Meo MooInv evec_i
/////////////////////////////////////////////////////
action_.MooeeInv(src_o_, tmp_);
assert(tmp_.Checkerboard() == Odd);
action_.Meooe(tmp_, sol_e_);
assert(sol_e_.Checkerboard() == Even);
action_.MooeeInv(sol_e_, tmp_);
assert(tmp_.Checkerboard() == Even);
sol_e_ = (-1.0 / eval) * tmp_;
assert(sol_e_.Checkerboard() == Even);
/////////////////////////////////////////////////////
// v_io = (1/eval_i) * MooInv evec_i
/////////////////////////////////////////////////////
action_.MooeeInv(src_o_, tmp_);
assert(tmp_.Checkerboard() == Odd);
sol_o_ = (1.0 / eval) * tmp_;
assert(sol_o_.Checkerboard() == Odd);
setCheckerboard(vout, sol_e_);
assert(sol_e_.Checkerboard() == Even);
setCheckerboard(vout, sol_o_);
assert(sol_o_.Checkerboard() == Odd);
}
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV5D(FermionField &vout_4d, FermionField &vout_5d, const FermionField &evec, const Real &eval)
{
makeLowModeV(vout_5d, evec, eval);
action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
}
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW(FermionField &wout, const FermionField &evec, const Real &eval)
{
src_o_ = evec;
src_o_.Checkerboard() = Odd;
pickCheckerboard(Even, sol_e_, wout);
pickCheckerboard(Odd, sol_o_, wout);
/////////////////////////////////////////////////////
// w_ie = - MeeInvDag MoeDag Doo evec_i
/////////////////////////////////////////////////////
op_.Mpc(src_o_, tmp_);
assert(tmp_.Checkerboard() == Odd);
action_.MeooeDag(tmp_, sol_e_);
assert(sol_e_.Checkerboard() == Even);
action_.MooeeInvDag(sol_e_, tmp_);
assert(tmp_.Checkerboard() == Even);
sol_e_ = (-1.0) * tmp_;
/////////////////////////////////////////////////////
// w_io = Doo evec_i
/////////////////////////////////////////////////////
op_.Mpc(src_o_, sol_o_);
assert(sol_o_.Checkerboard() == Odd);
setCheckerboard(wout, sol_e_);
assert(sol_e_.Checkerboard() == Even);
setCheckerboard(wout, sol_o_);
assert(sol_o_.Checkerboard() == Odd);
}
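// Collecting the constructions in makeLowModeV/makeLowModeW above into equations
// (a summary of the in-line comments; evec_i, eval_i are the odd-checkerboard
// eigenvector/eigenvalue pair and Doo is the Schur operator op_.Mpc):
//   v_ie = -(1/eval_i) MeeInv Meo MooInv evec_i ,   v_io = (1/eval_i) MooInv evec_i
//   w_ie = - MeeInvDag MoeDag Doo evec_i        ,   w_io = Doo evec_i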
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW5D(FermionField &wout_4d,
FermionField &wout_5d,
const FermionField &evec,
const Real &eval)
{
makeLowModeW(tmp5_, evec, eval);
action_.DminusDag(tmp5_, wout_5d);
action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
}
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV(FermionField &vout,
const FermionField &noise)
{
solver_(vout, noise);
}
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV5D(FermionField &vout_4d,
FermionField &vout_5d,
const FermionField &noise)
{
if (noise.Grid()->Dimensions() == fGrid_->Dimensions() - 1)
{
action_.ImportPhysicalFermionSource(noise, tmp5_);
}
else
{
tmp5_ = noise;
}
makeHighModeV(vout_5d, tmp5_);
action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
}
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW(FermionField &wout,
const FermionField &noise)
{
wout = noise;
}
template <typename FImpl>
void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d,
FermionField &wout_5d,
const FermionField &noise)
{
if (noise.Grid()->Dimensions() == fGrid_->Dimensions() - 1)
{
action_.ImportUnphysicalFermion(noise, wout_5d);
wout_4d = noise;
}
else
{
wout_5d = noise;
action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
}
}
/******************************************************************************
* all-to-all vectors I/O template implementation *
******************************************************************************/
template <typename Field>
void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec,
const bool multiFile, const int trajectory)
{
Record record;
GridBase *grid = vec[0].Grid();
ScidacWriter binWriter(grid->IsBoss());
std::string filename = vecFilename(fileStem, trajectory, multiFile);
if (multiFile)
{
std::string fullFilename;
for (unsigned int i = 0; i < vec.size(); ++i)
{
fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
LOG(Message) << "Writing vector " << i << std::endl;
makeFileDir(fullFilename, grid);
binWriter.open(fullFilename);
record.index = i;
binWriter.writeScidacFieldRecord(vec[i], record);
binWriter.close();
}
}
else
{
makeFileDir(filename, grid);
binWriter.open(filename);
for (unsigned int i = 0; i < vec.size(); ++i)
{
LOG(Message) << "Writing vector " << i << std::endl;
record.index = i;
binWriter.writeScidacFieldRecord(vec[i], record);
}
binWriter.close();
}
}
template <typename Field>
void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem,
const bool multiFile, const int trajectory)
{
Record record;
ScidacReader binReader;
std::string filename = vecFilename(fileStem, trajectory, multiFile);
if (multiFile)
{
std::string fullFilename;
for (unsigned int i = 0; i < vec.size(); ++i)
{
fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
LOG(Message) << "Reading vector " << i << std::endl;
binReader.open(fullFilename);
binReader.readScidacFieldRecord(vec[i], record);
binReader.close();
if (record.index != i)
{
HADRONS_ERROR(Io, "vector index mismatch");
}
}
}
else
{
binReader.open(filename);
for (unsigned int i = 0; i < vec.size(); ++i)
{
LOG(Message) << "Reading vector " << i << std::endl;
binReader.readScidacFieldRecord(vec[i], record);
if (record.index != i)
{
HADRONS_ERROR(Io, "vector index mismatch");
}
}
binReader.close();
}
}
END_HADRONS_NAMESPACE
#endif // A2A_Vectors_hpp_
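For orientation, a minimal sketch of how the I/O helpers above are typically invoked; the field type, grid pointer, vector size and file stem are illustrative assumptions, not taken from this file. In multi-file mode the data end up in <stem>.<traj>/elem<i>.bin, otherwise in the single file <stem>.<traj>.bin.
std::vector<LatticeFermion> v(64, LatticeFermion(grid));  // grid: pointer to the 4d grid (assumed)
// ... fill v with all-to-all vectors ...
A2AVectorsIo::write("a2a_v", v, true, 1200);   // writes a2a_v.1200/elem<i>.bin
A2AVectorsIo::read(v, "a2a_v", true, 1200);    // re-reads and checks the stored index records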

View File

@ -1,287 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Application.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Application.hpp>
#include <Hadrons/GeneticScheduler.hpp>
#include <Hadrons/Modules.hpp>
using namespace Grid;
using namespace Hadrons;
#define BIG_SEP "================"
#define SEP "----------------"
/******************************************************************************
* Application implementation *
******************************************************************************/
// constructors ////////////////////////////////////////////////////////////////
#define MACOUT(macro) macro << " (" << #macro << ")"
#define MACOUTS(macro) HADRONS_STR(macro) << " (" << #macro << ")"
Application::Application(void)
{
initLogger();
auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim);
if (dim.size())
{
locVol_ = 1;
for (unsigned int d = 0; d < dim.size(); ++d)
{
loc[d] /= mpi[d];
locVol_ *= loc[d];
}
LOG(Message) << "====== HADRONS APPLICATION INITIALISATION ======" << std::endl;
LOG(Message) << "** Dimensions" << std::endl;
LOG(Message) << "Global lattice: " << dim << std::endl;
LOG(Message) << "MPI partition : " << mpi << std::endl;
LOG(Message) << "Local lattice : " << loc << std::endl;
LOG(Message) << std::endl;
LOG(Message) << "** Default parameters (and associated C macros)" << std::endl;
LOG(Message) << "ASCII output precision : " << MACOUT(DEFAULT_ASCII_PREC) << std::endl;
LOG(Message) << "Fermion implementation : " << MACOUTS(FIMPLBASE) << std::endl;
LOG(Message) << "z-Fermion implementation: " << MACOUTS(ZFIMPLBASE) << std::endl;
LOG(Message) << "Scalar implementation : " << MACOUTS(SIMPLBASE) << std::endl;
LOG(Message) << "Gauge implementation : " << MACOUTS(GIMPLBASE) << std::endl;
LOG(Message) << "Eigenvector base size : "
<< MACOUT(HADRONS_DEFAULT_LANCZOS_NBASIS) << std::endl;
LOG(Message) << "Schur decomposition : " << MACOUTS(HADRONS_DEFAULT_SCHUR) << std::endl;
LOG(Message) << std::endl;
}
}
Application::Application(const Application::GlobalPar &par)
: Application()
{
setPar(par);
}
Application::Application(const std::string parameterFileName)
: Application()
{
parameterFileName_ = parameterFileName;
}
// access //////////////////////////////////////////////////////////////////////
void Application::setPar(const Application::GlobalPar &par)
{
par_ = par;
}
const Application::GlobalPar & Application::getPar(void)
{
return par_;
}
// execute /////////////////////////////////////////////////////////////////////
void Application::run(void)
{
LOG(Message) << "====== HADRONS APPLICATION START ======" << std::endl;
if (!parameterFileName_.empty() and (vm().getNModule() == 0))
{
parseParameterFile(parameterFileName_);
}
if (getPar().runId.empty())
{
HADRONS_ERROR(Definition, "run id is empty");
}
LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
LOG(Message) << "Attempt(s) for resilient parallel I/O: "
<< BinaryIO::latticeWriteMaxRetry << std::endl;
vm().setRunId(getPar().runId);
vm().printContent();
env().printContent();
if (getPar().saveSchedule or getPar().scheduleFile.empty())
{
schedule();
if (getPar().saveSchedule)
{
std::string filename;
filename = (getPar().scheduleFile.empty()) ?
"hadrons.sched" : getPar().scheduleFile;
saveSchedule(filename);
}
}
else
{
loadSchedule(getPar().scheduleFile);
}
printSchedule();
if (!getPar().graphFile.empty())
{
makeFileDir(getPar().graphFile, env().getGrid());
vm().dumpModuleGraph(getPar().graphFile);
}
configLoop();
}
// parse parameter file ////////////////////////////////////////////////////////
class ObjectId: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ObjectId,
std::string, name,
std::string, type);
};
void Application::parseParameterFile(const std::string parameterFileName)
{
XmlReader reader(parameterFileName);
GlobalPar par;
ObjectId id;
LOG(Message) << "Building application from '" << parameterFileName << "'..." << std::endl;
read(reader, "parameters", par);
setPar(par);
if (!push(reader, "modules"))
{
HADRONS_ERROR(Parsing, "Cannot open node 'modules' in parameter file '"
+ parameterFileName + "'");
}
if (!push(reader, "module"))
{
HADRONS_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '"
+ parameterFileName + "'");
}
do
{
read(reader, "id", id);
vm().createModule(id.name, id.type, reader);
} while (reader.nextElement("module"));
pop(reader);
pop(reader);
}
void Application::saveParameterFile(const std::string parameterFileName, unsigned int prec)
{
LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
if (env().getGrid()->IsBoss())
{
XmlWriter writer(parameterFileName);
writer.setPrecision(prec);
ObjectId id;
const unsigned int nMod = vm().getNModule();
write(writer, "parameters", getPar());
push(writer, "modules");
for (unsigned int i = 0; i < nMod; ++i)
{
push(writer, "module");
id.name = vm().getModuleName(i);
id.type = vm().getModule(i)->getRegisteredName();
write(writer, "id", id);
vm().getModule(i)->saveParameters(writer, "options");
pop(writer);
}
pop(writer);
pop(writer);
}
}
// schedule computation ////////////////////////////////////////////////////////
void Application::schedule(void)
{
if (!scheduled_ and !loadedSchedule_)
{
program_ = vm().schedule(par_.genetic);
scheduled_ = true;
}
}
void Application::saveSchedule(const std::string filename)
{
LOG(Message) << "Saving current schedule to '" << filename << "'..."
<< std::endl;
if (env().getGrid()->IsBoss())
{
TextWriter writer(filename);
std::vector<std::string> program;
if (!scheduled_)
{
HADRONS_ERROR(Definition, "Computation not scheduled");
}
for (auto address: program_)
{
program.push_back(vm().getModuleName(address));
}
write(writer, "schedule", program);
}
}
void Application::loadSchedule(const std::string filename)
{
TextReader reader(filename);
std::vector<std::string> program;
LOG(Message) << "Loading schedule from '" << filename << "'..."
<< std::endl;
read(reader, "schedule", program);
program_.clear();
for (auto &name: program)
{
program_.push_back(vm().getModuleAddress(name));
}
loadedSchedule_ = true;
scheduled_ = true;
}
void Application::printSchedule(void)
{
if (!scheduled_ and !loadedSchedule_)
{
HADRONS_ERROR(Definition, "Computation not scheduled");
}
auto peak = vm().memoryNeeded(program_);
LOG(Message) << "Schedule (memory needed: " << sizeString(peak) << "):"
<< std::endl;
for (unsigned int i = 0; i < program_.size(); ++i)
{
LOG(Message) << std::setw(4) << i + 1 << ": "
<< vm().getModuleName(program_[i]) << std::endl;
}
}
// loop on configurations //////////////////////////////////////////////////////
void Application::configLoop(void)
{
auto range = par_.trajCounter;
for (unsigned int t = range.start; t < range.end; t += range.step)
{
LOG(Message) << BIG_SEP << " Starting measurement for trajectory " << t
<< " " << BIG_SEP << std::endl;
vm().setTrajectory(t);
vm().executeProgram(program_);
}
LOG(Message) << BIG_SEP << " End of measurement " << BIG_SEP << std::endl;
env().freeAll();
}

View File

@ -1,126 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Application.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Application_hpp_
#define Hadrons_Application_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/VirtualMachine.hpp>
#include <Hadrons/Module.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Main program manager *
******************************************************************************/
class Application
{
public:
class TrajRange: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
unsigned int, start,
unsigned int, end,
unsigned int, step);
};
class GlobalPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
TrajRange, trajCounter,
VirtualMachine::GeneticPar, genetic,
std::string, runId,
std::string, graphFile,
std::string, scheduleFile,
bool, saveSchedule,
int, parallelWriteMaxRetry);
GlobalPar(void): parallelWriteMaxRetry{-1} {}
};
public:
// constructors
Application(void);
Application(const GlobalPar &par);
Application(const std::string parameterFileName);
// destructor
virtual ~Application(void) = default;
// access
void setPar(const GlobalPar &par);
const GlobalPar & getPar(void);
// module creation
template <typename M>
void createModule(const std::string name);
template <typename M>
void createModule(const std::string name, const typename M::Par &par);
// execute
void run(void);
// XML parameter file I/O
void parseParameterFile(const std::string parameterFileName);
void saveParameterFile(const std::string parameterFileName, unsigned int prec=15);
// schedule computation
void schedule(void);
void saveSchedule(const std::string filename);
void loadSchedule(const std::string filename);
void printSchedule(void);
// loop on configurations
void configLoop(void);
private:
// environment shortcut
DEFINE_ENV_ALIAS;
// virtual machine shortcut
DEFINE_VM_ALIAS;
private:
long unsigned int locVol_;
std::string parameterFileName_{""};
GlobalPar par_;
VirtualMachine::Program program_;
bool scheduled_{false}, loadedSchedule_{false};
};
/******************************************************************************
* Application template implementation *
******************************************************************************/
// module creation /////////////////////////////////////////////////////////////
template <typename M>
void Application::createModule(const std::string name)
{
vm().createModule<M>(name);
scheduled_ = false;
}
template <typename M>
void Application::createModule(const std::string name,
const typename M::Par &par)
{
vm().createModule<M>(name, par);
scheduled_ = false;
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Application_hpp_
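To make the workflow above concrete, a minimal hedged sketch of driving an Application programmatically (run id, trajectory range and module names are hypothetical; measurement modules would be registered with createModule before run()):
// (after Grid/Hadrons initialisation in main())
Application            app;
Application::GlobalPar par;
par.runId             = "test_run";
par.trajCounter.start = 1500;
par.trajCounter.end   = 1520;   // exclusive, see configLoop()
par.trajCounter.step  = 20;
app.setPar(par);
// app.createModule<SomeNamespace::SomeModule>("moduleName", modulePar);
app.run();   // schedules the modules and loops over the trajectories
// Alternatively, build everything from an XML parameter file:
// Application app("parameters.xml");  app.run();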

View File

@ -1,564 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/ScalarVP.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: James Harrison <jch1g10@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Hadrons/Modules/MScalar/ScalarVP.hpp>
#include <Hadrons/Modules/MScalar/Scalar.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MScalar;
/*
* Scalar QED vacuum polarisation up to O(alpha)
*
* Conserved vector 2-point function diagram notation:
* _______
* / \
* U_nu * * U_mu
* \_______/
*
* ( adj(S(a\hat{nu}|x)) U_mu(x) S(0|x+a\hat{mu}) U_nu(0) )
* = 2 Re( - )
* ( adj(S(a\hat{nu}|x+a\hat{mu})) adj(U_mu(x)) S(0|x) U_nu(0) )
*
*
* _______
* / \
* free = 1 * * 1
* \_______/
*
*
*
* _______
* / \
* S = iA_nu * * iA_mu
* \_______/
*
*
* Delta_1
* ___*___
* / \
* X = 1 * * 1
* \___*___/
* Delta_1
*
* Delta_1 Delta_1
* ___*___ ___*___
* / \ / \
* 1 * * iA_mu + iA_nu * * 1
* \_______/ \_______/
* 4C = _______ _______
* / \ / \
* + 1 * * iA_mu + iA_nu * * 1
* \___*___/ \___*___/
* Delta_1 Delta_1
*
* Delta_1 Delta_1
* _*___*_ _______
* / \ / \
* 2E = 1 * * 1 + 1 * * 1
* \_______/ \_*___*_/
* Delta_1 Delta_1
*
* Delta_2
* ___*___ _______
* / \ / \
* 2T = 1 * * 1 + 1 * * 1
* \_______/ \___*___/
* Delta_2
*
*
* _______
* / \
* srcT = -A_nu^2/2 * * 1
* \_______/
*
*
*
* _______
* / \
* snkT = 1 * * -A_mu^2/2
* \_______/
*
* Full VP to O(alpha) = free + q^2*(S+X+4C+2E+2T+srcT+snkT)
*/
/******************************************************************************
* TScalarVP implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TScalarVP::TScalarVP(const std::string name)
: Module<ScalarVPPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TScalarVP::getInput(void)
{
prop0Name_ = par().scalarProp + "_0";
propQName_ = par().scalarProp + "_Q";
propSunName_ = par().scalarProp + "_Sun";
propTadName_ = par().scalarProp + "_Tad";
std::vector<std::string> in = {par().emField, prop0Name_, propQName_,
propSunName_, propTadName_};
return in;
}
std::vector<std::string> TScalarVP::getOutput(void)
{
std::vector<std::string> out;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
// out.push_back(getName() + "_propQ_" + std::to_string(mu));
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
out.push_back(getName() + "_" + std::to_string(mu)
+ "_" + std::to_string(nu));
}
}
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TScalarVP::setup(void)
{
freeMomPropName_ = FREEMOMPROP(static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass);
GFSrcName_ = par().scalarProp + "_DinvSrc";
fftName_ = par().scalarProp + "_fft";
phaseName_.clear();
muPropQName_.clear();
vpTensorName_.clear();
momPhaseName_.clear();
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
phaseName_.push_back("_shiftphase_" + std::to_string(mu));
muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu));
std::vector<std::string> vpTensorName_mu;
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
vpTensorName_mu.push_back(getName() + "_" + std::to_string(mu)
+ "_" + std::to_string(nu));
}
vpTensorName_.push_back(vpTensorName_mu);
}
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
}
}
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
envCreateLat(ScalarField, muPropQName_[mu]);
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
envCreateLat(ScalarField, vpTensorName_[mu][nu]);
}
}
if (!par().output.empty())
{
momPhasesDone_ = env().hasCreatedObject(momPhaseName_[0]);
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
envCacheLat(ScalarField, momPhaseName_[i_p]);
}
}
envTmpLat(ScalarField, "buf");
envTmpLat(ScalarField, "result");
envTmpLat(ScalarField, "Amu");
envTmpLat(ScalarField, "Usnk");
envTmpLat(ScalarField, "tmpProp");
}
// execution ///////////////////////////////////////////////////////////////////
void TScalarVP::execute(void)
{
// CACHING ANALYTIC EXPRESSIONS
makeCaches();
Complex ci(0.0,1.0);
Real q = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().charge;
auto &prop0 = envGet(ScalarField, prop0Name_);
auto &propQ = envGet(ScalarField, propQName_);
auto &propSun = envGet(ScalarField, propSunName_);
auto &propTad = envGet(ScalarField, propTadName_);
auto &GFSrc = envGet(ScalarField, GFSrcName_);
auto &G = envGet(ScalarField, freeMomPropName_);
auto &fft = envGet(FFT, fftName_);
phase_.clear();
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
auto &phmu = envGet(ScalarField, phaseName_[mu]);
phase_.push_back(&phmu);
}
// PROPAGATORS FROM SHIFTED SOURCES
LOG(Message) << "Computing O(q) charged scalar propagators..."
<< std::endl;
std::vector<ScalarField *> muPropQ;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
auto &propmu = envGet(ScalarField, muPropQName_[mu]);
// -G*momD1*G*F*tau_mu*Src (momD1 = F*D1*Finv)
propmu = adj(*phase_[mu])*GFSrc;
momD1(propmu, fft);
propmu = -G*propmu;
fft.FFT_all_dim(propmu, propmu, FFT::backward);
muPropQ.push_back(&propmu);
}
// CONTRACTIONS
auto &A = envGet(EmField, par().emField);
envGetTmp(ScalarField, buf);
envGetTmp(ScalarField, result);
envGetTmp(ScalarField, Amu);
envGetTmp(ScalarField, Usnk);
envGetTmp(ScalarField, tmpProp);
TComplex Anu0, Usrc;
std::vector<int> coor0 = {0, 0, 0, 0};
std::vector<std::vector<ScalarField *> > vpTensor;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
std::vector<ScalarField *> vpTensor_mu;
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
auto &vpmunu = envGet(ScalarField, vpTensorName_[mu][nu]);
vpTensor_mu.push_back(&vpmunu);
}
vpTensor.push_back(vpTensor_mu);
}
// Prepare output data structure if necessary
Result outputData;
if (!par().output.empty())
{
outputData.projection.resize(par().outputMom.size());
outputData.lattice_size = env().getGrid()->FullDimensions().toVector();
outputData.mass = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass;
outputData.charge = q;
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
outputData.projection[i_p].pi.resize(env().getNd());
outputData.projection[i_p].pi_free.resize(env().getNd());
outputData.projection[i_p].pi_2E.resize(env().getNd());
outputData.projection[i_p].pi_2T.resize(env().getNd());
outputData.projection[i_p].pi_S.resize(env().getNd());
outputData.projection[i_p].pi_4C.resize(env().getNd());
outputData.projection[i_p].pi_X.resize(env().getNd());
outputData.projection[i_p].pi_srcT.resize(env().getNd());
outputData.projection[i_p].pi_snkT.resize(env().getNd());
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
outputData.projection[i_p].pi[nu].resize(env().getNd());
outputData.projection[i_p].pi_free[nu].resize(env().getNd());
outputData.projection[i_p].pi_2E[nu].resize(env().getNd());
outputData.projection[i_p].pi_2T[nu].resize(env().getNd());
outputData.projection[i_p].pi_S[nu].resize(env().getNd());
outputData.projection[i_p].pi_4C[nu].resize(env().getNd());
outputData.projection[i_p].pi_X[nu].resize(env().getNd());
outputData.projection[i_p].pi_srcT[nu].resize(env().getNd());
outputData.projection[i_p].pi_snkT[nu].resize(env().getNd());
}
}
}
// Do contractions
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
peekSite(Anu0, peekLorentz(A, nu), coor0);
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..."
<< std::endl;
Amu = peekLorentz(A, mu);
// free
tmpProp = Cshift(prop0, nu, -1); // S_0(0|x-a\hat{\nu})
// = S_0(a\hat{\nu}|x)
Usrc = Complex(1.0,0.0);
vpContraction(result, prop0, tmpProp, Usrc, mu);
*vpTensor[mu][nu] = result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_free[mu][nu], result,
i_p);
}
}
tmpProp = result; // Just using tmpProp as a temporary ScalarField
// here (buf is modified by calls to project())
// srcT
result = tmpProp * (-0.5)*Anu0*Anu0;
*vpTensor[mu][nu] += q*q*result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_srcT[mu][nu], result,
i_p);
}
}
// snkT
result = tmpProp * (-0.5)*Amu*Amu;
*vpTensor[mu][nu] += q*q*result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_snkT[mu][nu], result,
i_p);
}
}
// S
tmpProp = Cshift(prop0, nu, -1); // S_0(a\hat{\nu}|x)
Usrc = ci*Anu0;
Usnk = ci*Amu;
vpContraction(result, prop0, tmpProp, Usrc, Usnk, mu);
*vpTensor[mu][nu] += q*q*result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_S[mu][nu], result,
i_p);
}
}
// 4C
tmpProp = Cshift(prop0, nu, -1); // S_0(a\hat{\nu}|x)
Usrc = Complex(1.0,0.0);
Usnk = ci*Amu;
vpContraction(result, propQ, tmpProp, Usrc, Usnk, mu);
Usrc = ci*Anu0;
vpContraction(buf, propQ, tmpProp, Usrc, mu);
result += buf;
vpContraction(buf, prop0, *muPropQ[nu], Usrc, mu);
result += buf;
Usrc = Complex(1.0,0.0);
Usnk = ci*Amu;
vpContraction(buf, prop0, *muPropQ[nu], Usrc, Usnk, mu);
result += buf;
*vpTensor[mu][nu] += q*q*result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_4C[mu][nu], result,
i_p);
}
}
// X
Usrc = Complex(1.0,0.0);
vpContraction(result, propQ, *muPropQ[nu], Usrc, mu);
*vpTensor[mu][nu] += q*q*result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_X[mu][nu], result,
i_p);
}
}
// 2E
tmpProp = Cshift(prop0, nu, -1); // S_0(a\hat{\nu}|x)
Usrc = Complex(1.0,0.0);
vpContraction(result, propSun, tmpProp, Usrc, mu);
tmpProp = Cshift(propSun, nu, -1); // S_\Sigma(0|x-a\hat{\nu})
//(Note: <S(0|x-a\hat{\nu})> = <S(a\hat{\nu}|x)>)
vpContraction(buf, prop0, tmpProp, Usrc, mu);
result += buf;
*vpTensor[mu][nu] += q*q*result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_2E[mu][nu], result,
i_p);
}
}
// 2T
tmpProp = Cshift(prop0, nu, -1); // S_0(a\hat{\nu}|x)
Usrc = Complex(1.0,0.0);
vpContraction(result, propTad, tmpProp, Usrc, mu);
tmpProp = Cshift(propTad, nu, -1); // S_T(0|x-a\hat{\nu})
vpContraction(buf, prop0, tmpProp, Usrc, mu);
result += buf;
*vpTensor[mu][nu] += q*q*result;
// Do momentum projections if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi_2T[mu][nu], result,
i_p);
}
}
// Do momentum projections of full VP if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pi[mu][nu],
*vpTensor[mu][nu], i_p);
}
}
}
}
// OUTPUT IF NECESSARY
if (!par().output.empty())
{
LOG(Message) << "Saving momentum-projected HVP to '"
<< RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
<< std::endl;
saveResult(par().output, "HVP", outputData);
}
}
void TScalarVP::makeCaches(void)
{
envGetTmp(ScalarField, buf);
if ( (!par().output.empty()) && (!momPhasesDone_) )
{
LOG(Message) << "Caching phases for momentum projections..."
<< std::endl;
auto l = env().getGrid()->FullDimensions();
Complex ci(0.0,1.0);
// Calculate phase factors
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
std::vector<int> mom = strToVec<int>(par().outputMom[i_p]);
auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
momph_ip = Zero();
for (unsigned int j = 0; j < env().getNd()-1; ++j)
{
Real twoPiL = M_PI*2./l[j];
LatticeCoordinate(buf, j);
buf = mom[j]*twoPiL*buf;
momph_ip = momph_ip + buf;
}
momph_ip = exp(-ci*momph_ip);
momPhase_.push_back(&momph_ip);
}
}
}
void TScalarVP::vpContraction(ScalarField &vp,
ScalarField &prop_0_x, ScalarField &prop_nu_x,
TComplex u_src, ScalarField &u_snk, int mu)
{
// Note: this function assumes a point source is used.
vp = adj(prop_nu_x) * u_snk * Cshift(prop_0_x, mu, 1) * u_src;
vp -= Cshift(adj(prop_nu_x), mu, 1) * adj(u_snk) * prop_0_x * u_src;
vp = 2.0*real(vp);
}
void TScalarVP::vpContraction(ScalarField &vp,
ScalarField &prop_0_x, ScalarField &prop_nu_x,
TComplex u_src, int mu)
{
// Note: this function assumes a point source is used.
vp = adj(prop_nu_x) * Cshift(prop_0_x, mu, 1) * u_src;
vp -= Cshift(adj(prop_nu_x), mu, 1) * prop_0_x * u_src;
vp = 2.0*real(vp);
}
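// In equations, the two overloads above compute (point source at the origin,
// S_nu the propagator from the nu-shifted source, U_src/U_snk the attached links):
//   vp_mu(x) = 2 Re[ adj(S_nu(x)) U_snk(x) S_0(x+mu^) U_src
//                    - adj(S_nu(x+mu^)) adj(U_snk(x)) S_0(x) U_src ]
// with U_snk = 1 in the second overload.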
void TScalarVP::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
{
std::vector<TComplex> vecBuf;
envGetTmp(ScalarField, buf);
buf = vp*(*momPhase_[i_p]);
sliceSum(buf, vecBuf, Tp);
projection.resize(vecBuf.size());
for (unsigned int t = 0; t < vecBuf.size(); ++t)
{
projection[t] = TensorRemove(vecBuf[t]);
}
}
void TScalarVP::momD1(ScalarField &s, FFT &fft)
{
auto &A = envGet(EmField, par().emField);
Complex ci(0.0,1.0);
envGetTmp(ScalarField, buf);
envGetTmp(ScalarField, result);
envGetTmp(ScalarField, Amu);
result = Zero();
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Amu = peekLorentz(A, mu);
buf = (*phase_[mu])*s;
fft.FFT_all_dim(buf, buf, FFT::backward);
buf = Amu*buf;
fft.FFT_all_dim(buf, buf, FFT::forward);
result = result - ci*buf;
}
fft.FFT_all_dim(s, s, FFT::backward);
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Amu = peekLorentz(A, mu);
buf = Amu*s;
fft.FFT_all_dim(buf, buf, FFT::forward);
result = result + ci*adj(*phase_[mu])*buf;
}
s = result;
}

View File

@ -1,129 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/ScalarVP.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: James Harrison <jch1g10@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalar_ScalarVP_hpp_
#define Hadrons_MScalar_ScalarVP_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Scalar vacuum polarisation *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalar)
class ScalarVPPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar,
std::string, emField,
std::string, scalarProp,
std::string, output,
std::vector<std::string>, outputMom);
};
class TScalarVP: public Module<ScalarVPPar>
{
public:
BASIC_TYPE_ALIASES(SIMPL,);
typedef PhotonR::GaugeField EmField;
typedef PhotonR::GaugeLinkField EmComp;
class Result: Serializable
{
public:
class Projection: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
std::vector<int>, momentum,
std::vector<std::vector<std::vector<Complex>>>, pi,
std::vector<std::vector<std::vector<Complex>>>, pi_free,
std::vector<std::vector<std::vector<Complex>>>, pi_2E,
std::vector<std::vector<std::vector<Complex>>>, pi_2T,
std::vector<std::vector<std::vector<Complex>>>, pi_S,
std::vector<std::vector<std::vector<Complex>>>, pi_4C,
std::vector<std::vector<std::vector<Complex>>>, pi_X,
std::vector<std::vector<std::vector<Complex>>>, pi_srcT,
std::vector<std::vector<std::vector<Complex>>>, pi_snkT);
};
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::vector<int>, lattice_size,
double, mass,
double, charge,
std::vector<Projection>, projection);
};
public:
// constructor
TScalarVP(const std::string name);
// destructor
virtual ~TScalarVP(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
void makeCaches(void);
// conserved vector two-point contraction
void vpContraction(ScalarField &vp,
ScalarField &prop_0_x, ScalarField &prop_nu_x,
TComplex u_src, ScalarField &u_snk, int mu);
// conserved vector two-point contraction with unit gauge link at sink
void vpContraction(ScalarField &vp,
ScalarField &prop_0_x, ScalarField &prop_nu_x,
TComplex u_src, int mu);
// write momentum-projected vacuum polarisation to file(s)
void project(std::vector<Complex> &projection, const ScalarField &vp,
int i_p);
// momentum-space Delta_1 insertion
void momD1(ScalarField &s, FFT &fft);
private:
bool momPhasesDone_;
std::string freeMomPropName_, GFSrcName_,
prop0Name_, propQName_,
propSunName_, propTadName_,
fftName_;
std::vector<std::string> phaseName_, muPropQName_,
momPhaseName_;
std::vector<std::vector<std::string> > vpTensorName_;
std::vector<ScalarField *> phase_, momPhase_;
};
MODULE_REGISTER(ScalarVP, TScalarVP, MScalar);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalar_ScalarVP_hpp_

View File

@ -1,35 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/TestSeqConserved.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MUtilities;
template class Grid::Hadrons::MUtilities::TTestSeqConserved<FIMPL>;

View File

@ -1,186 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/TestSeqConserved.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MUtilities_TestSeqConserved_hpp_
#define Hadrons_MUtilities_TestSeqConserved_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Ward Identity contractions using sequential propagators.
-----------------------------
* options:
- q: point source propagator, 5D if available (string)
- qSeq: result of sequential insertion of conserved current using q (string)
- action: action used for computation of q (string)
- origin: string giving point source origin of q (string)
- t_J: time at which sequential current is inserted (int)
- mu: Lorentz index of current inserted (int)
- curr: current type, e.g. vector/axial (Current)
*/
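/* A hedged usage sketch (the object and module names below are hypothetical,
 * the parameter fields are the ones listed above):
 *
 *   MUtilities::TestSeqConserved::Par p;
 *   p.q      = "pt_prop_5d";      // point-source propagator
 *   p.qSeq   = "pt_prop_seqV";    // q with a sequential conserved-current insertion
 *   p.action = "DWF";             // action that produced q
 *   p.origin = "0 0 0 0";
 *   p.t_J    = 8;
 *   p.mu     = 3;
 *   p.curr   = Current::Vector;
 *   application.createModule<MUtilities::TestSeqConserved>("WI_test", p);
 */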
/******************************************************************************
* TestSeqConserved *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MUtilities)
class TestSeqConservedPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqConservedPar,
std::string, q,
std::string, qSeq,
std::string, action,
std::string, origin,
unsigned int, t_J,
unsigned int, mu,
Current, curr);
};
template <typename FImpl>
class TTestSeqConserved: public Module<TestSeqConservedPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TTestSeqConserved(const std::string name);
// destructor
virtual ~TTestSeqConserved(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(TestSeqConserved, TTestSeqConserved<FIMPL>, MUtilities);
/******************************************************************************
* TTestSeqConserved implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TTestSeqConserved<FImpl>::TTestSeqConserved(const std::string name)
: Module<TestSeqConservedPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TTestSeqConserved<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().qSeq, par().action};
return in;
}
template <typename FImpl>
std::vector<std::string> TTestSeqConserved<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqConserved<FImpl>::setup(void)
{
auto Ls = env().getObjectLs(par().q);
if (Ls != env().getObjectLs(par().action))
{
HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
}
envTmpLat(PropagatorField, "tmp");
envTmpLat(LatticeComplex, "c");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqConserved<FImpl>::execute(void)
{
// Check sequential insertion of current gives same result as conserved
// current sink upon contraction. Assume q uses a point source.
auto &q = envGet(PropagatorField, par().q);
auto &qSeq = envGet(PropagatorField, par().qSeq);
auto &act = envGet(FMat, par().action);
Gamma g5(Gamma::Algebra::Gamma5);
Gamma::Algebra gA = (par().curr == Current::Axial) ?
Gamma::Algebra::Gamma5 :
Gamma::Algebra::Identity;
Gamma g(gA);
SitePropagator qSite;
Complex test_S, test_V, check_S, check_V;
std::vector<TComplex> check_buf;
std::vector<int> siteCoord;
envGetTmp(PropagatorField, tmp);
envGetTmp(LatticeComplex, c);
siteCoord = strToVec<int>(par().origin);
peekSite(qSite, qSeq, siteCoord);
test_S = trace(qSite*g);
test_V = trace(qSite*g*Gamma::gmu[par().mu]);
act.ContractConservedCurrent(q, q, tmp, par().curr, par().mu);
c = trace(tmp*g);
sliceSum(c, check_buf, Tp);
check_S = TensorRemove(check_buf[par().t_J]);
c = trace(tmp*g*Gamma::gmu[par().mu]);
sliceSum(c, check_buf, Tp);
check_V = TensorRemove(check_buf[par().t_J]);
LOG(Message) << "Test S = " << abs(test_S) << std::endl;
LOG(Message) << "Test V = " << abs(test_V) << std::endl;
LOG(Message) << "Check S = " << abs(check_S) << std::endl;
LOG(Message) << "Check V = " << abs(check_V) << std::endl;
// Check difference = 0
check_S -= test_S;
check_V -= test_V;
LOG(Message) << "Consistency check for sequential conserved "
<< par().curr << " current insertion: " << std::endl;
LOG(Message) << "Diff S = " << abs(check_S) << std::endl;
LOG(Message) << "Diff V = " << abs(check_V) << std::endl;
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MUtilities_TestSeqConserved_hpp_

View File

@ -1,35 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/TestSeqGamma.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MUtilities;
template class Grid::Hadrons::MUtilities::TTestSeqGamma<FIMPL>;

View File

@ -1,150 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/TestSeqGamma.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MUtilities_TestSeqGamma_hpp_
#define Hadrons_MUtilities_TestSeqGamma_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* TestSeqGamma *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MUtilities)
class TestSeqGammaPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqGammaPar,
std::string, q,
std::string, qSeq,
std::string, origin,
Gamma::Algebra, gamma,
unsigned int, t_g);
};
template <typename FImpl>
class TTestSeqGamma: public Module<TestSeqGammaPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TTestSeqGamma(const std::string name);
// destructor
virtual ~TTestSeqGamma(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(TestSeqGamma, TTestSeqGamma<FIMPL>, MUtilities);
/******************************************************************************
* TTestSeqGamma implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TTestSeqGamma<FImpl>::TTestSeqGamma(const std::string name)
: Module<TestSeqGammaPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TTestSeqGamma<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().qSeq};
return in;
}
template <typename FImpl>
std::vector<std::string> TTestSeqGamma<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqGamma<FImpl>::setup(void)
{
envTmpLat(LatticeComplex, "c");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqGamma<FImpl>::execute(void)
{
auto &q = envGet(PropagatorField, par().q);
auto &qSeq = envGet(PropagatorField, par().qSeq);
Gamma g5(Gamma::Algebra::Gamma5);
Gamma g(par().gamma);
SitePropagator qSite;
Complex test, check;
std::vector<TComplex> check_buf;
std::vector<int> siteCoord;
// Check sequential insertion of gamma matrix gives same result as
// insertion of gamma at sink upon contraction. Assume q uses a point
// source.
envGetTmp(LatticeComplex, c);
siteCoord = strToVec<int>(par().origin);
peekSite(qSite, qSeq, siteCoord);
test = trace(g*qSite);
c = trace(adj(g)*g5*adj(q)*g5*g*q);
sliceSum(c, check_buf, Tp);
check = TensorRemove(check_buf[par().t_g]);
LOG(Message) << "Seq Result = " << abs(test) << std::endl;
LOG(Message) << "Reference = " << abs(check) << std::endl;
// Check difference = 0
check -= test;
LOG(Message) << "Consistency check for sequential " << par().gamma
<< " insertion = " << abs(check) << std::endl;
}
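// In equations, the two numbers compared in execute() are (point source at x_0,
// Gamma the inserted matrix, notation hedged):
//   test  = tr[ Gamma S_seq(x_0|x_0) ]
//   check = sum_{vec x} tr[ Gamma^dag gamma_5 adj(S(x|x_0)) gamma_5 Gamma S(x|x_0) ] at t = t_g
// and they should agree when the sequential Gamma insertion at t_g is correct.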
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MUtilities_TestSeqGamma_hpp_

View File

@ -1,260 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/VPCounterTerms.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: James Harrison <jch1g10@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MScalar/VPCounterTerms.hpp>
#include <Hadrons/Modules/MScalar/Scalar.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MScalar;
/******************************************************************************
* TVPCounterTerms implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TVPCounterTerms::TVPCounterTerms(const std::string name)
: Module<VPCounterTermsPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TVPCounterTerms::getInput(void)
{
std::vector<std::string> in = {par().source};
return in;
}
std::vector<std::string> TVPCounterTerms::getOutput(void)
{
std::vector<std::string> out;
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TVPCounterTerms::setup(void)
{
freeMomPropName_ = FREEMOMPROP(par().mass);
phaseName_.clear();
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
phaseName_.push_back("_shiftphase_" + std::to_string(mu));
}
GFSrcName_ = getName() + "_DinvSrc";
phatsqName_ = getName() + "_pHatSquared";
prop0Name_ = getName() + "_freeProp";
twoscalarName_ = getName() + "_2scalarProp";
psquaredName_ = getName() + "_psquaredProp";
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
}
}
envCreateLat(ScalarField, freeMomPropName_);
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
envCreateLat(ScalarField, phaseName_[mu]);
}
envCreateLat(ScalarField, phatsqName_);
envCreateLat(ScalarField, GFSrcName_);
envCreateLat(ScalarField, prop0Name_);
envCreateLat(ScalarField, twoscalarName_);
envCreateLat(ScalarField, psquaredName_);
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
envCacheLat(ScalarField, momPhaseName_[i_p]);
}
}
envTmpLat(ScalarField, "buf");
envTmpLat(ScalarField, "tmp_vp");
envTmpLat(ScalarField, "vpPhase");
}
// execution ///////////////////////////////////////////////////////////////////
void TVPCounterTerms::execute(void)
{
auto &source = envGet(ScalarField, par().source);
Complex ci(0.0,1.0);
FFT fft(env().getGrid());
envGetTmp(ScalarField, buf);
envGetTmp(ScalarField, tmp_vp);
// Momentum-space free scalar propagator
auto &G = envGet(ScalarField, freeMomPropName_);
SIMPL::MomentumSpacePropagator(G, par().mass);
// Phases and hat{p}^2
auto &phatsq = envGet(ScalarField, phatsqName_);
Coordinate l = env().getGrid()->FullDimensions();
LOG(Message) << "Calculating shift phases..." << std::endl;
phatsq = Zero();
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Real twoPiL = M_PI*2./l[mu];
auto &phmu = envGet(ScalarField, phaseName_[mu]);
LatticeCoordinate(buf, mu);
phmu = exp(ci*twoPiL*buf);
phase_.push_back(&phmu);
buf = 2.*sin(.5*twoPiL*buf);
phatsq = phatsq + buf*buf;
}
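// i.e. phatsq(p) = sum_mu 4 sin^2(p_mu/2), with p_mu = 2*pi*n_mu/L_mu in lattice units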
// G*F*src
auto &GFSrc = envGet(ScalarField, GFSrcName_);
fft.FFT_all_dim(GFSrc, source, FFT::forward);
GFSrc = G*GFSrc;
// Position-space free scalar propagator
auto &prop0 = envGet(ScalarField, prop0Name_);
prop0 = GFSrc;
fft.FFT_all_dim(prop0, prop0, FFT::backward);
// Propagators for counter-terms
auto &twoscalarProp = envGet(ScalarField, twoscalarName_);
auto &psquaredProp = envGet(ScalarField, psquaredName_);
twoscalarProp = G*GFSrc;
fft.FFT_all_dim(twoscalarProp, twoscalarProp, FFT::backward);
psquaredProp = G*phatsq*GFSrc;
fft.FFT_all_dim(psquaredProp, psquaredProp, FFT::backward);
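// In momentum space these are G^2 F src (two free propagators in a row) and
// G phatsq G F src (a hat{p}^2 insertion between two propagators), respectively.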
// Prepare output data structure if necessary
Result outputData;
if (!par().output.empty())
{
outputData.projection.resize(par().outputMom.size());
outputData.lattice_size = env().getGrid()->FullDimensions().toVector();
outputData.mass = par().mass;
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
outputData.projection[i_p].twoScalar.resize(env().getNd());
outputData.projection[i_p].threeScalar.resize(env().getNd());
outputData.projection[i_p].pSquaredInsertion.resize(env().getNd());
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
outputData.projection[i_p].twoScalar[nu].resize(env().getNd());
outputData.projection[i_p].threeScalar[nu].resize(env().getNd());
outputData.projection[i_p].pSquaredInsertion[nu].resize(env().getNd());
}
// Calculate phase factors
auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
momph_ip = Zero();
for (unsigned int j = 0; j < env().getNd()-1; ++j)
{
Real twoPiL = M_PI*2./l[j];
LatticeCoordinate(buf, j);
buf = outputData.projection[i_p].momentum[j]*twoPiL*buf;
momph_ip = momph_ip + buf;
}
momph_ip = exp(-ci*momph_ip);
momPhase_.push_back(&momph_ip);
}
}
// Contractions
for (unsigned int nu = 0; nu < env().getNd(); ++nu)
{
buf = adj(Cshift(prop0, nu, -1));
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
// Two-scalar loop
tmp_vp = buf * Cshift(prop0, mu, 1);
tmp_vp -= Cshift(buf, mu, 1) * prop0;
tmp_vp = 2.0*real(tmp_vp);
// Output if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].twoScalar[mu][nu],
tmp_vp, i_p);
}
}
// Three-scalar loop (no vertex)
tmp_vp = buf * Cshift(twoscalarProp, mu, 1);
tmp_vp -= Cshift(buf, mu, 1) * twoscalarProp;
tmp_vp = 2.0*real(tmp_vp);
// Output if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].threeScalar[mu][nu],
tmp_vp, i_p);
}
}
// Three-scalar loop (hat{p}^2 insertion)
tmp_vp = buf * Cshift(psquaredProp, mu, 1);
tmp_vp -= Cshift(buf, mu, 1) * psquaredProp;
tmp_vp = 2.0*real(tmp_vp);
// Output if necessary
if (!par().output.empty())
{
for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
{
project(outputData.projection[i_p].pSquaredInsertion[mu][nu],
tmp_vp, i_p);
}
}
}
}
// OUTPUT IF NECESSARY
if (!par().output.empty())
{
LOG(Message) << "Saving momentum-projected correlators to '"
<< RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
<< std::endl;
saveResult(par().output, "scalar_loops", outputData);
}
}
void TVPCounterTerms::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
{
std::vector<TComplex> vecBuf;
envGetTmp(ScalarField, vpPhase);
vpPhase = vp*(*momPhase_[i_p]);
sliceSum(vpPhase, vecBuf, Tp);
projection.resize(vecBuf.size());
for (unsigned int t = 0; t < vecBuf.size(); ++t)
{
projection[t] = TensorRemove(vecBuf[t]);
}
}
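For orientation, the lattice momentum conventions implied by setup() and execute() above are (assuming the standard free-scalar normalisation for SIMPL::MomentumSpacePropagator):
\[
p_\mu = \frac{2\pi n_\mu}{L_\mu}, \qquad
\hat p_\mu = 2\sin\frac{p_\mu}{2}, \qquad
G(p) = \frac{1}{\hat p^2 + m^2},
\]
so phatsq accumulates \(\hat p^2 = \sum_\mu \hat p_\mu^2\), GFSrc holds \(G(p)\,\tilde\rho(p)\) with \(\tilde\rho\) the Fourier-transformed source, twoscalarProp is the back-transform of \(G^2\tilde\rho\), and psquaredProp the back-transform of \(G\,\hat p^2\,G\,\tilde\rho\).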


@ -1,103 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/VPCounterTerms.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: James Harrison <jch1g10@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalar_VPCounterTerms_hpp_
#define Hadrons_MScalar_VPCounterTerms_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* VPCounterTerms *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalar)
class VPCounterTermsPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(VPCounterTermsPar,
std::string, source,
double, mass,
std::string, output,
std::vector<std::string>, outputMom);
};
class TVPCounterTerms: public Module<VPCounterTermsPar>
{
public:
BASIC_TYPE_ALIASES(SIMPL,);
class Result: Serializable
{
public:
class Projection: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
std::vector<int>, momentum,
std::vector<std::vector<std::vector<Complex>>>, twoScalar,
std::vector<std::vector<std::vector<Complex>>>, threeScalar,
std::vector<std::vector<std::vector<Complex>>>, pSquaredInsertion);
};
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::vector<int>, lattice_size,
double, mass,
std::vector<Projection>, projection);
};
public:
// constructor
TVPCounterTerms(const std::string name);
// destructor
virtual ~TVPCounterTerms(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
void project(std::vector<Complex> &projection, const ScalarField &vp, int i_p);
private:
std::string freeMomPropName_, GFSrcName_, phatsqName_, prop0Name_,
twoscalarName_, twoscalarVertexName_,
psquaredName_, psquaredVertexName_;
std::vector<std::string> phaseName_, momPhaseName_;
std::vector<ScalarField *> phase_, momPhase_;
};
MODULE_REGISTER(VPCounterTerms, TVPCounterTerms, MScalar);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalar_VPCounterTerms_hpp_


@ -1,35 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WardIdentity.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MContraction/WardIdentity.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
template class Grid::Hadrons::MContraction::TWardIdentity<FIMPL>;


@ -1,224 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WardIdentity.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MContraction_WardIdentity_hpp_
#define Hadrons_MContraction_WardIdentity_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Ward Identity contractions
-----------------------------
* options:
- q: propagator, 5D if available (string)
- action: action module used for propagator solution (string)
- mass: mass of quark (double)
- test_axial: whether or not to test PCAC relation.
*/
/******************************************************************************
* WardIdentity *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
class WardIdentityPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WardIdentityPar,
std::string, q,
std::string, action,
double, mass,
bool, test_axial);
};
template <typename FImpl>
class TWardIdentity: public Module<WardIdentityPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TWardIdentity(const std::string name);
// destructor
virtual ~TWardIdentity(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
unsigned int Ls_;
};
MODULE_REGISTER_TMP(WardIdentity, TWardIdentity<FIMPL>, MContraction);
/******************************************************************************
* TWardIdentity implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TWardIdentity<FImpl>::TWardIdentity(const std::string name)
: Module<WardIdentityPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWardIdentity<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().action};
return in;
}
template <typename FImpl>
std::vector<std::string> TWardIdentity<FImpl>::getOutput(void)
{
std::vector<std::string> out = {};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWardIdentity<FImpl>::setup(void)
{
Ls_ = env().getObjectLs(par().q);
if (Ls_ != env().getObjectLs(par().action))
{
HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
}
envTmpLat(PropagatorField, "tmp");
envTmpLat(PropagatorField, "vector_WI");
if (par().test_axial)
{
envTmpLat(PropagatorField, "psi");
envTmpLat(LatticeComplex, "PP");
envTmpLat(LatticeComplex, "axial_defect");
envTmpLat(LatticeComplex, "PJ5q");
}
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWardIdentity<FImpl>::execute(void)
{
LOG(Message) << "Performing Ward Identity checks for quark '" << par().q
<< "'." << std::endl;
auto &q = envGet(PropagatorField, par().q);
auto &act = envGet(FMat, par().action);
Gamma g5(Gamma::Algebra::Gamma5);
// Compute D_mu V_mu, D here is backward derivative.
envGetTmp(PropagatorField, tmp);
envGetTmp(PropagatorField, vector_WI);
vector_WI = Zero();
for (unsigned int mu = 0; mu < Nd; ++mu)
{
act.ContractConservedCurrent(q, q, tmp, Current::Vector, mu);
tmp -= Cshift(tmp, mu, -1);
vector_WI += tmp;
}
// Test ward identity D_mu V_mu = 0;
LOG(Message) << "Vector Ward Identity check Delta_mu V_mu = "
<< norm2(vector_WI) << std::endl;
if (par().test_axial)
{
envGetTmp(PropagatorField, psi);
envGetTmp(LatticeComplex, PP);
envGetTmp(LatticeComplex, axial_defect);
envGetTmp(LatticeComplex, PJ5q);
std::vector<TComplex> axial_buf;
// Compute <P|D_mu A_mu>, D is backwards derivative.
axial_defect = Zero();
for (unsigned int mu = 0; mu < Nd; ++mu)
{
act.ContractConservedCurrent(q, q, tmp, Current::Axial, mu);
tmp -= Cshift(tmp, mu, -1);
axial_defect += trace(g5*tmp);
}
// Get <P|J5q> for 5D (Zero(); for 4D) and <P|P>.
PJ5q = Zero();
if (Ls_ > 1)
{
// <P|P>
ExtractSlice(tmp, q, 0, 0);
psi = 0.5 * (tmp - g5*tmp);
ExtractSlice(tmp, q, Ls_ - 1, 0);
psi += 0.5 * (tmp + g5*tmp);
PP = trace(adj(psi)*psi);
// <P|J5q>
ExtractSlice(tmp, q, Ls_/2 - 1, 0);
psi = 0.5 * (tmp + g5*tmp);
ExtractSlice(tmp, q, Ls_/2, 0);
psi += 0.5 * (tmp - g5*tmp);
PJ5q = trace(adj(psi)*psi);
}
else
{
PP = trace(adj(q)*q);
}
// Test ward identity <P|D_mu A_mu> = 2m<P|P> + 2<P|J5q>
LOG(Message) << "|D_mu A_mu|^2 = " << norm2(axial_defect) << std::endl;
LOG(Message) << "|PP|^2 = " << norm2(PP) << std::endl;
LOG(Message) << "|PJ5q|^2 = " << norm2(PJ5q) << std::endl;
LOG(Message) << "Axial Ward Identity defect Delta_mu A_mu = "
<< norm2(axial_defect) << std::endl;
// Axial defect by timeslice.
axial_defect -= 2.*(par().mass*PP + PJ5q);
LOG(Message) << "Check Axial defect by timeslice" << std::endl;
sliceSum(axial_defect, axial_buf, Tp);
for (int t = 0; t < axial_buf.size(); ++t)
{
LOG(Message) << "t = " << t << ": "
<< TensorRemove(axial_buf[t]) << std::endl;
}
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WardIdentity_hpp_
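Written out, with \(\Delta^-_\mu f(x) = f(x) - f(x-\hat\mu)\) the backward lattice derivative, the two checks performed in execute() above are
\[
\sum_\mu \Delta^-_\mu V_\mu(x) = 0, \qquad
\sum_\mu \Delta^-_\mu \langle A_\mu(x)\, P(0) \rangle
= 2m\, \langle P(x)\, P(0) \rangle + 2\, \langle J_{5q}(x)\, P(0) \rangle,
\]
the second being the lattice PCAC relation. The midpoint term \(J_{5q}\) is only non-zero for 5D (domain-wall type) actions, which is why PJ5q is set to zero when \(L_s = 1\).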


@ -1,118 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WeakHamiltonian.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
#define Hadrons_MContraction_WeakHamiltonian_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonian *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
/*******************************************************************************
* Utilities for contractions involving the Weak Hamiltonian.
******************************************************************************/
//// Sum and store correlator.
#define MAKE_DIAG(exp, buf, res, n)\
sliceSum(exp, buf, Tp);\
res.name = (n);\
res.corr.resize(buf.size());\
for (unsigned int t = 0; t < buf.size(); ++t)\
{\
res.corr[t] = TensorRemove(buf[t]);\
}
//// Contraction of mu index: use 'mu' variable in exp.
#define SUM_MU(buf,exp)\
buf = Zero(); \
for (unsigned int mu = 0; mu < ndim; ++mu)\
{\
buf += exp;\
}
enum
{
i_V = 0,
i_A = 1,
n_i = 2
};
class WeakHamiltonianPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WeakHamiltonianPar,
std::string, q1,
std::string, q2,
std::string, q3,
std::string, q4,
unsigned int, tSnk,
std::string, output);
};
#define MAKE_WEAK_MODULE(modname)\
class T##modname: public Module<WeakHamiltonianPar>\
{\
public:\
FERM_TYPE_ALIASES(FIMPL,)\
class Result: Serializable\
{\
public:\
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,\
std::string, name,\
std::vector<Complex>, corr);\
};\
public:\
/* constructor */ \
T##modname(const std::string name);\
/* destructor */ \
virtual ~T##modname(void) {};\
/* dependency relation */ \
virtual std::vector<std::string> getInput(void);\
virtual std::vector<std::string> getOutput(void);\
public:\
std::vector<std::string> VA_label = {"V", "A"};\
protected:\
/* setup */ \
virtual void setup(void);\
/* execution */ \
virtual void execute(void);\
};\
MODULE_REGISTER(modname, T##modname, MContraction);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MContraction_WeakHamiltonian_hpp_
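To make the macro plumbing concrete, this is what one SUM_MU / MAKE_DIAG pair expands to inside an execute() body (a schematic expansion, not part of the original source; S_body, S_loop, S_diag, expbuf, corrbuf, result and ndim are the names used in the Eye-type module below):

// SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu])) becomes:
expbuf = Zero();
for (unsigned int mu = 0; mu < ndim; ++mu)
{
    expbuf += trace(S_body[mu]*S_loop[mu]);
}
// MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S") becomes:
sliceSum(expbuf, corrbuf, Tp);
result[S_diag].name = "HW_S";
result[S_diag].corr.resize(corrbuf.size());
for (unsigned int t = 0; t < corrbuf.size(); ++t)
{
    result[S_diag].corr[t] = TensorRemove(corrbuf[t]);
}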


@ -1,151 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WeakHamiltonianEye.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian current-current contractions, Eye-type.
*
* These contractions are generated by the Q1 and Q2 operators in the physical
* basis (see e.g. Fig 3 of arXiv:1507.03094).
*
* Schematics: q4 |
* /-<-¬ |
* / \ | q2 q3
* \ / | /----<------*------<----¬
* q2 \ / q3 | / /-*-¬ \
* /-----<-----* *-----<----¬ | / / \ \
* i * H_W * f | i * \ / q4 * f
* \ / | \ \->-/ /
* \ / | \ /
* \---------->---------/ | \----------->----------/
* q1 | q1
* |
* Saucer (S) | Eye (E)
*
* S: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1]*q4*gL[mu][p_2])
* E: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1])*trace(q4*gL[mu][p_2])
*
* Note q1 must be sink smeared.
*/
/******************************************************************************
* TWeakHamiltonianEye implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakHamiltonianEye::TWeakHamiltonianEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianEye::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakHamiltonianEye::getOutput(void)
{
std::vector<std::string> out = {};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::setup(void)
{
unsigned int ndim = env().getNd();
envTmpLat(LatticeComplex, "expbuf");
envTmpLat(PropagatorField, "tmp1");
envTmpLat(LatticeComplex, "tmp2");
envTmp(std::vector<PropagatorField>, "S_body", 1, ndim, PropagatorField(env().getGrid()));
envTmp(std::vector<PropagatorField>, "S_loop", 1, ndim, PropagatorField(env().getGrid()));
envTmp(std::vector<LatticeComplex>, "E_body", 1, ndim, LatticeComplex(env().getGrid()));
envTmp(std::vector<LatticeComplex>, "E_loop", 1, ndim, LatticeComplex(env().getGrid()));
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian (Eye type) contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << "', '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
auto &q1 = envGet(SlicedPropagator, par().q1);
auto &q2 = envGet(PropagatorField, par().q2);
auto &q3 = envGet(PropagatorField, par().q3);
auto &q4 = envGet(PropagatorField, par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_eye_diag);
unsigned int ndim = env().getNd();
envGetTmp(LatticeComplex, expbuf);
envGetTmp(PropagatorField, tmp1);
envGetTmp(LatticeComplex, tmp2);
envGetTmp(std::vector<PropagatorField>, S_body);
envGetTmp(std::vector<PropagatorField>, S_loop);
envGetTmp(std::vector<LatticeComplex>, E_body);
envGetTmp(std::vector<LatticeComplex>, E_loop);
// Get sink timeslice of q1.
SitePropagator q1Snk = q1[par().tSnk];
// Setup for S-type contractions.
for (int mu = 0; mu < ndim; ++mu)
{
S_body[mu] = MAKE_SE_BODY(q1Snk, q2, q3, GammaL(Gamma::gmu[mu]));
S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu]));
}
// Perform S-type contractions.
SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu]))
MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S")
// Recycle sub-expressions for E-type contractions.
for (unsigned int mu = 0; mu < ndim; ++mu)
{
E_body[mu] = trace(S_body[mu]);
E_loop[mu] = trace(S_loop[mu]);
}
// Perform E-type contractions.
SUM_MU(expbuf, E_body[mu]*E_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E")
// IO
saveResult(par().output, "HW_Eye", result);
}
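In equations, writing \(\Gamma^L_\mu\) for the left-handed insertion produced by GammaL(Gamma::gmu[mu]) (conventionally \(\gamma_\mu(1-\gamma_5)\), up to the normalisation chosen by GammaL), the two diagrams computed above are
\[
S = \sum_\mu \mathrm{tr}\!\left[ q_3\, \gamma_5\, q_1(t_{\mathrm{snk}})\, q_2^\dagger\, \gamma_5\, \Gamma^L_\mu\, q_4\, \Gamma^L_\mu \right], \qquad
E = \sum_\mu \mathrm{tr}\!\left[ q_3\, \gamma_5\, q_1(t_{\mathrm{snk}})\, q_2^\dagger\, \gamma_5\, \Gamma^L_\mu \right]
\mathrm{tr}\!\left[ q_4\, \Gamma^L_\mu \right],
\]
each summed over timeslices by MAKE_DIAG.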


@ -1,59 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WeakHamiltonianEye.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianEye_hpp_
#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonianEye *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
enum
{
S_diag = 0,
E_diag = 1,
n_eye_diag = 2
};
// Saucer and Eye subdiagram contractions.
#define MAKE_SE_BODY(Q_1, Q_2, Q_3, gamma) (Q_3*g5*Q_1*adj(Q_2)*g5*gamma)
#define MAKE_SE_LOOP(Q_loop, gamma) (Q_loop*gamma)
MAKE_WEAK_MODULE(WeakHamiltonianEye)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_


@ -1,148 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WeakHamiltonianNonEye.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian current-current contractions, Non-Eye-type.
*
* These contractions are generated by the Q1 and Q2 operators in the physical
* basis (see e.g. Fig 3 of arXiv:1507.03094).
*
* Schematic:
* q2 q3 | q2 q3
* /--<--¬ /--<--¬ | /--<--¬ /--<--¬
* / \ / \ | / \ / \
* / \ / \ | / \ / \
* / \ / \ | / \ / \
* i * * H_W * f | i * * * H_W * f
* \ * | | \ / \ /
* \ / \ / | \ / \ /
* \ / \ / | \ / \ /
* \ / \ / | \-->--/ \-->--/
* \-->--/ \-->--/ | q1 q4
* q1 q4 |
* Connected (C) | Wing (W)
*
* C: trace(q1*adj(q2)*g5*gL[mu]*q3*adj(q4)*g5*gL[mu])
* W: trace(q1*adj(q2)*g5*gL[mu])*trace(q3*adj(q4)*g5*gL[mu])
*
*/
/******************************************************************************
* TWeakHamiltonianNonEye implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakHamiltonianNonEye::TWeakHamiltonianNonEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianNonEye::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void)
{
std::vector<std::string> out = {};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::setup(void)
{
unsigned int ndim = env().getNd();
envTmpLat(LatticeComplex, "expbuf");
envTmpLat(PropagatorField, "tmp1");
envTmpLat(LatticeComplex, "tmp2");
envTmp(std::vector<PropagatorField>, "C_i_side_loop", 1, ndim, PropagatorField(env().getGrid()));
envTmp(std::vector<PropagatorField>, "C_f_side_loop", 1, ndim, PropagatorField(env().getGrid()));
envTmp(std::vector<LatticeComplex>, "W_i_side_loop", 1, ndim, LatticeComplex(env().getGrid()));
envTmp(std::vector<LatticeComplex>, "W_f_side_loop", 1, ndim, LatticeComplex(env().getGrid()));
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian (Non-Eye type) contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << "', '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
auto &q1 = envGet(PropagatorField, par().q1);
auto &q2 = envGet(PropagatorField, par().q2);
auto &q3 = envGet(PropagatorField, par().q3);
auto &q4 = envGet(PropagatorField, par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_noneye_diag);
unsigned int ndim = env().getNd();
envGetTmp(LatticeComplex, expbuf);
envGetTmp(PropagatorField, tmp1);
envGetTmp(LatticeComplex, tmp2);
envGetTmp(std::vector<PropagatorField>, C_i_side_loop);
envGetTmp(std::vector<PropagatorField>, C_f_side_loop);
envGetTmp(std::vector<LatticeComplex>, W_i_side_loop);
envGetTmp(std::vector<LatticeComplex>, W_f_side_loop);
// Setup for C-type contractions.
for (int mu = 0; mu < ndim; ++mu)
{
C_i_side_loop[mu] = MAKE_CW_SUBDIAG(q1, q2, GammaL(Gamma::gmu[mu]));
C_f_side_loop[mu] = MAKE_CW_SUBDIAG(q3, q4, GammaL(Gamma::gmu[mu]));
}
// Perform C-type contractions.
SUM_MU(expbuf, trace(C_i_side_loop[mu]*C_f_side_loop[mu]))
MAKE_DIAG(expbuf, corrbuf, result[C_diag], "HW_C")
// Recycle sub-expressions for W-type contractions.
for (unsigned int mu = 0; mu < ndim; ++mu)
{
W_i_side_loop[mu] = trace(C_i_side_loop[mu]);
W_f_side_loop[mu] = trace(C_f_side_loop[mu]);
}
// Perform W-type contractions.
SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W")
// IO
saveResult(par().output, "HW_NonEye", result);
}
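In the same notation as for the Eye-type module above, the two Non-Eye diagrams are
\[
C = \sum_\mu \mathrm{tr}\!\left[ q_1\, q_2^\dagger\, \gamma_5\, \Gamma^L_\mu\, q_3\, q_4^\dagger\, \gamma_5\, \Gamma^L_\mu \right], \qquad
W = \sum_\mu \mathrm{tr}\!\left[ q_1\, q_2^\dagger\, \gamma_5\, \Gamma^L_\mu \right]
\mathrm{tr}\!\left[ q_3\, q_4^\dagger\, \gamma_5\, \Gamma^L_\mu \right].
\]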


@ -1,58 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WeakHamiltonianNonEye.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonianNonEye *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
enum
{
W_diag = 0,
C_diag = 1,
n_noneye_diag = 2
};
// Wing and Connected subdiagram contractions
#define MAKE_CW_SUBDIAG(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)
MAKE_WEAK_MODULE(WeakHamiltonianNonEye)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_


@ -1,142 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WeakNeutral4ptDisc.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian + current contractions, disconnected topology for neutral
* mesons.
*
* These contractions are generated by operators Q_1,...,10 of the dS=1 Weak
* Hamiltonian in the physical basis and an additional current J (see e.g.
* Fig 11 of arXiv:1507.03094).
*
* Schematic:
*
* q2 q4 q3
* /--<--¬ /---<--¬ /---<--¬
* / \ / \ / \
* i * * H_W | J * * f
* \ / \ / \ /
* \--->---/ \-------/ \------/
* q1
*
* options
* - q1: input propagator 1 (string)
* - q2: input propagator 2 (string)
* - q3: input propagator 3 (string), assumed to be sequential propagator
* - q4: input propagator 4 (string), assumed to be a loop
*
* type 1: trace(q1*adj(q2)*g5*gL[mu])*trace(loop*gL[mu])*trace(q3*g5)
* type 2: trace(q1*adj(q2)*g5*gL[mu]*loop*gL[mu])*trace(q3*g5)
*/
/*******************************************************************************
* TWeakNeutral4ptDisc implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakNeutral4ptDisc::TWeakNeutral4ptDisc(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakNeutral4ptDisc::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void)
{
std::vector<std::string> out = {};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::setup(void)
{
unsigned int ndim = env().getNd();
envTmpLat(LatticeComplex, "expbuf");
envTmpLat(PropagatorField, "tmp");
envTmpLat(LatticeComplex, "curr");
envTmp(std::vector<PropagatorField>, "meson", 1, ndim, PropagatorField(env().getGrid()));
envTmp(std::vector<PropagatorField>, "loop", 1, ndim, PropagatorField(env().getGrid()));
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian neutral disconnected contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << "', '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
auto &q1 = envGet(PropagatorField, par().q1);
auto &q2 = envGet(PropagatorField, par().q2);
auto &q3 = envGet(PropagatorField, par().q3);
auto &q4 = envGet(PropagatorField, par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_neut_disc_diag);
unsigned int ndim = env().getNd();
envGetTmp(LatticeComplex, expbuf);
envGetTmp(PropagatorField, tmp);
envGetTmp(LatticeComplex, curr);
envGetTmp(std::vector<PropagatorField>, meson);
envGetTmp(std::vector<PropagatorField>, loop);
// Setup for type 1 contractions.
for (int mu = 0; mu < ndim; ++mu)
{
meson[mu] = MAKE_DISC_MESON(q1, q2, GammaL(Gamma::gmu[mu]));
loop[mu] = MAKE_DISC_LOOP(q4, GammaL(Gamma::gmu[mu]));
}
curr = MAKE_DISC_CURR(q3, GammaL(Gamma::Algebra::Gamma5));
// Perform type 1 contractions.
SUM_MU(expbuf, trace(meson[mu]*loop[mu]))
expbuf *= curr;
MAKE_DIAG(expbuf, corrbuf, result[neut_disc_1_diag], "HW_disc0_1")
// Perform type 2 contractions.
SUM_MU(expbuf, trace(meson[mu])*trace(loop[mu]))
expbuf *= curr;
MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2")
// IO
saveResult(par().output, "HW_disc0", result);
}
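As listed in the comment at the top of this file, the two disconnected topologies are (summed over \(\mu\), with \(\ell\) the loop propagator q4 and \(\Gamma^L_\mu\) as above)
\[
\text{type 1:}\;\;
\mathrm{tr}\!\left[ q_1\, q_2^\dagger\, \gamma_5\, \Gamma^L_\mu \right]
\mathrm{tr}\!\left[ \ell\, \Gamma^L_\mu \right]
\mathrm{tr}\!\left[ q_3\, \gamma_5 \right],
\qquad
\text{type 2:}\;\;
\mathrm{tr}\!\left[ q_1\, q_2^\dagger\, \gamma_5\, \Gamma^L_\mu\, \ell\, \Gamma^L_\mu \right]
\mathrm{tr}\!\left[ q_3\, \gamma_5 \right].
\]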


@ -1,60 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Archive/Modules/WeakNeutral4ptDisc.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
#define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakNeutral4ptDisc *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
enum
{
neut_disc_1_diag = 0,
neut_disc_2_diag = 1,
n_neut_disc_diag = 2
};
// Neutral 4pt disconnected subdiagram contractions.
#define MAKE_DISC_MESON(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)
#define MAKE_DISC_LOOP(Q_LOOP, gamma) (Q_LOOP*gamma)
#define MAKE_DISC_CURR(Q_c, gamma) (trace(Q_c*gamma))
MAKE_WEAK_MODULE(WeakNeutral4ptDisc)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_


@ -1,356 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/DilutedNoise.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
Author: Vera Guelpers <vmg1n14@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_DilutedNoise_hpp_
#define Hadrons_DilutedNoise_hpp_
#include <Hadrons/Global.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Abstract container for diluted noise *
******************************************************************************/
template <typename FImpl>
class DilutedNoise
{
public:
typedef typename FImpl::FermionField FermionField;
public:
// constructor/destructor
DilutedNoise(GridCartesian *g);
DilutedNoise(GridCartesian *g, const unsigned int nNoise);
virtual ~DilutedNoise(void) = default;
// access
std::vector<FermionField> & getNoise(void);
const std::vector<FermionField> & getNoise(void) const;
const FermionField & operator[](const unsigned int i) const;
FermionField & operator[](const unsigned int i);
void normalise(Real norm);
void resize(const unsigned int nNoise);
unsigned int size(void) const;
GridCartesian *getGrid(void) const;
// generate noise (pure virtual)
virtual void generateNoise(GridParallelRNG &rng) = 0;
private:
std::vector<FermionField> noise_;
GridCartesian *grid_;
unsigned int nNoise_;
};
template <typename FImpl>
class TimeDilutedSpinColorDiagonalNoise: public DilutedNoise<FImpl>
{
public:
typedef typename FImpl::FermionField FermionField;
public:
// constructor/destructor
TimeDilutedSpinColorDiagonalNoise(GridCartesian *g);
virtual ~TimeDilutedSpinColorDiagonalNoise(void) = default;
// generate noise
virtual void generateNoise(GridParallelRNG &rng);
private:
unsigned int nt_;
};
template <typename FImpl>
class FullVolumeSpinColorDiagonalNoise: public DilutedNoise<FImpl>
{
public:
typedef typename FImpl::FermionField FermionField;
public:
// constructor/destructor
FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src);
virtual ~FullVolumeSpinColorDiagonalNoise(void) = default;
// generate noise
virtual void generateNoise(GridParallelRNG &rng);
private:
unsigned int nSrc_;
};
template <typename FImpl>
class SparseSpinColorDiagonalNoise: public DilutedNoise<FImpl>
{
public:
typedef typename FImpl::FermionField FermionField;
public:
// constructor/destructor
SparseSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src, unsigned int n_sparse);
virtual ~SparseSpinColorDiagonalNoise(void) = default;
// generate noise
virtual void generateNoise(GridParallelRNG &rng);
private:
unsigned int nSrc_;
unsigned int nSparse_;
};
/******************************************************************************
* DilutedNoise template implementation *
******************************************************************************/
template <typename FImpl>
DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g)
: grid_(g)
{}
template <typename FImpl>
DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g,
const unsigned int nNoise)
: DilutedNoise(g)
{
resize(nNoise);
}
template <typename FImpl>
std::vector<typename DilutedNoise<FImpl>::FermionField> & DilutedNoise<FImpl>::
getNoise(void)
{
return noise_;
}
template <typename FImpl>
const std::vector<typename DilutedNoise<FImpl>::FermionField> & DilutedNoise<FImpl>::
getNoise(void) const
{
return noise_;
}
template <typename FImpl>
const typename DilutedNoise<FImpl>::FermionField &
DilutedNoise<FImpl>::operator[](const unsigned int i) const
{
return noise_[i];
}
template <typename FImpl>
typename DilutedNoise<FImpl>::FermionField &
DilutedNoise<FImpl>::operator[](const unsigned int i)
{
return noise_[i];
}
template <typename FImpl>
void DilutedNoise<FImpl>::normalise(Real norm)
{
for(int i=0;i<noise_.size();i++)
{
noise_[i] = norm*noise_[i];
}
}
template <typename FImpl>
void DilutedNoise<FImpl>::resize(const unsigned int nNoise)
{
nNoise_ = nNoise;
noise_.resize(nNoise, grid_);
}
template <typename FImpl>
unsigned int DilutedNoise<FImpl>::size(void) const
{
return noise_.size();
}
template <typename FImpl>
GridCartesian * DilutedNoise<FImpl>::getGrid(void) const
{
return grid_;
}
/******************************************************************************
* TimeDilutedSpinColorDiagonalNoise template implementation *
******************************************************************************/
template <typename FImpl>
TimeDilutedSpinColorDiagonalNoise<FImpl>::
TimeDilutedSpinColorDiagonalNoise(GridCartesian *g)
: DilutedNoise<FImpl>(g)
{
nt_ = this->getGrid()->GlobalDimensions()[Tp]; // number of timeslices, not the number of dimensions
this->resize(nt_*Ns*FImpl::Dimension);
}
template <typename FImpl>
void TimeDilutedSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
{
typedef decltype(peekColour((*this)[0], 0)) SpinField;
auto &noise = *this;
auto g = this->getGrid();
auto nd = g->GlobalDimensions().size();
auto nc = FImpl::Dimension;
Complex shift(1., 1.);
Lattice<iScalar<vInteger>> tLat(g);
LatticeComplex eta(g), etaCut(g);
SpinField etas(g);
unsigned int i = 0;
LatticeCoordinate(tLat, nd - 1);
bernoulli(rng, eta);
eta = (2.*eta - shift)*(1./::sqrt(2.));
for (unsigned int t = 0; t < nt_; ++t)
{
etaCut = where((tLat == t), eta, 0.*eta);
for (unsigned int s = 0; s < Ns; ++s)
{
etas = Zero();
pokeSpin(etas, etaCut, s);
for (unsigned int c = 0; c < nc; ++c)
{
noise[i] = Zero();
pokeColour(noise[i], etas, c);
i++;
}
}
}
}
/******************************************************************************
* FullVolumeSpinColorDiagonalNoise template implementation *
******************************************************************************/
template <typename FImpl>
FullVolumeSpinColorDiagonalNoise<FImpl>::
FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc)
: DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc)
{}
template <typename FImpl>
void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
{
typedef decltype(peekColour((*this)[0], 0)) SpinField;
auto &noise = *this;
auto g = this->getGrid();
auto nd = g->GlobalDimensions().size();
auto nc = FImpl::Dimension;
Complex shift(1., 1.);
LatticeComplex eta(g);
SpinField etas(g);
unsigned int i = 0;
bernoulli(rng, eta);
eta = (2.*eta - shift)*(1./::sqrt(2.));
for (unsigned int n = 0; n < nSrc_; ++n)
{
for (unsigned int s = 0; s < Ns; ++s)
{
etas = Zero();
pokeSpin(etas, eta, s);
for (unsigned int c = 0; c < nc; ++c)
{
noise[i] = Zero();
pokeColour(noise[i], etas, c);
i++;
}
}
}
}
/******************************************************************************
* SparseSpinColorDiagonalNoise template implementation *
******************************************************************************/
template <typename FImpl>
SparseSpinColorDiagonalNoise<FImpl>::
SparseSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc, unsigned int nSparse)
: DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc), nSparse_(nSparse)
{}
template <typename FImpl>
void SparseSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
{
typedef decltype(peekColour((*this)[0], 0)) SpinField;
auto &noise = *this;
auto g = this->getGrid();
auto nd = g->GlobalDimensions().size();
auto nc = FImpl::Dimension;
LatticeInteger coor(g), coorTot(g); coorTot = 0.;
Complex shift(1., 1.);
LatticeComplex eta(g), etaSparse(g);
SpinField etas(g);
unsigned int i = 0;
unsigned int j = 0;
unsigned int nSrc_ec;
if(nSrc_%nSparse_==0)
{
nSrc_ec = nSrc_/nSparse_;
}
else
{
nSrc_ec = (nSrc_ - nSrc_%nSparse_)/nSparse_;
}
for (unsigned int n = 0; n < nSrc_; ++n)
{
bernoulli(rng, eta);
eta = (2.*eta - shift)*(1./::sqrt(2.));
if(nSparse_ != 1)
{
assert(g->GlobalDimensions()[1]%nSparse_ == 0);
// # 0 # 0
// 0 # 0 #
// # 0 # 0
// 0 # 0 #
coorTot = 0;
for(unsigned int d = 0; d < nd; ++d)
{
LatticeCoordinate(coor, d);
coorTot = coorTot + coor;
}
coorTot = coorTot + j;
eta = where(mod(coorTot,nSparse_), 0.*eta, eta);
}
for (unsigned int s = 0; s < Ns; ++s)
{
etas = Zero();
pokeSpin(etas, eta, s);
for (unsigned int c = 0; c < nc; ++c)
{
noise[i] = Zero();
pokeColour(noise[i], etas, c);
i++;
/**/
}
}
((n+1)%nSrc_ec == 0) ? j++: 0;
}
Real norm = sqrt(1./nSrc_ec);
this->normalise(norm);
}
END_HADRONS_NAMESPACE
#endif // Hadrons_DilutedNoise_hpp_
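All three generators above draw the same scalar noise: bernoulli() fills every real degree of freedom of eta with 0 or 1, so the affine map in the code sends each complex component to
\[
\eta \;\longrightarrow\; \frac{2\eta - (1+i)}{\sqrt 2} \;\in\; \left\{ \frac{\pm 1 \pm i}{\sqrt 2} \right\},
\]
a unit-modulus \(Z_2 \otimes Z_2\) noise with \(\langle \eta\, \eta^{*} \rangle = 1\) and \(\langle \eta\, \eta \rangle = 0\). The dilution schemes differ only in how this scalar noise is distributed: one (spin, colour) component at a time, additionally one timeslice at a time for the time-diluted variant, and on a sparse subset of sites for the sparse variant.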


@ -1,511 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/DiskVector.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_DiskVector_hpp_
#define Hadrons_DiskVector_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/A2AMatrix.hpp>
#include <deque>
#include <sys/stat.h>
#include <ftw.h>
#include <unistd.h>
#ifdef DV_DEBUG
#define DV_DEBUG_MSG(dv, stream) LOG(Debug) << "diskvector " << (dv) << ": " << stream << std::endl
#else
#define DV_DEBUG_MSG(dv, stream)
#endif
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Abstract base class *
******************************************************************************/
template <typename T>
class DiskVectorBase
{
public:
typedef T ObjectType;
// helper for read/write vector access
class RwAccessHelper
{
public:
RwAccessHelper(DiskVectorBase<T> &master, const unsigned int i)
: master_(master), cmaster_(master), i_(i) {}
// operator=: somebody is trying to store a vector element
// write to cache and tag as modified
T &operator=(const T &obj) const
{
auto &cache = *master_.cachePtr_;
auto &modified = *master_.modifiedPtr_;
auto &index = *master_.indexPtr_;
DV_DEBUG_MSG(&master_, "writing to " << i_);
master_.cacheInsert(i_, obj);
modified[index.at(i_)] = true;
return cache[index.at(i_)];
}
// implicit cast to const object reference and redirection
// to the const operator[] for read-only operations
operator const T&() const
{
return cmaster_[i_];
}
private:
DiskVectorBase<T> &master_;
const DiskVectorBase<T> &cmaster_;
const unsigned int i_;
};
public:
DiskVectorBase(const std::string dirname, const unsigned int size = 0,
const unsigned int cacheSize = 1, const bool clean = true,
GridBase *grid = nullptr);
DiskVectorBase(DiskVectorBase<T> &&v) = default;
virtual ~DiskVectorBase(void);
const T & operator[](const unsigned int i) const;
RwAccessHelper operator[](const unsigned int i);
double hitRatio(void) const;
void resetStat(void);
void setSize(unsigned int size_);
unsigned int getSize() const;
unsigned int dvSize;
void setGrid(GridBase *grid_);
GridBase *getGrid() const;
GridBase *dvGrid;
private:
virtual void load(T &obj, const std::string filename) const = 0;
virtual void save(const std::string filename, const T &obj) const = 0;
virtual std::string filename(const unsigned int i) const;
void evict(void) const;
void fetch(const unsigned int i) const;
void cacheInsert(const unsigned int i, const T &obj) const;
void clean(void);
private:
std::string dirname_;
unsigned int size_, cacheSize_;
double access_{0.}, hit_{0.};
bool clean_;
GridBase *grid_;
// using pointers to allow modifications when class is const
// semantic: const means data unmodified, but cache modification allowed
std::unique_ptr<std::vector<T>> cachePtr_;
std::unique_ptr<std::vector<bool>> modifiedPtr_;
std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
std::unique_ptr<std::stack<unsigned int>> freePtr_;
std::unique_ptr<std::deque<unsigned int>> loadsPtr_;
};
/******************************************************************************
* Specialisation for serialisable classes *
******************************************************************************/
template <typename T, typename Reader, typename Writer>
class SerializableDiskVector: public DiskVectorBase<T>
{
public:
using DiskVectorBase<T>::DiskVectorBase;
private:
virtual void load(T &obj, const std::string filename) const
{
Reader reader(filename);
read(reader, basename(filename), obj);
}
virtual void save(const std::string filename, const T &obj) const
{
Writer writer(filename);
write(writer, basename(filename), obj);
}
};
/******************************************************************************
* Specialisation for Eigen matrices *
******************************************************************************/
template <typename T>
using EigenDiskVectorMat = A2AMatrix<T>;
template <typename T>
class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
{
public:
using DiskVectorBase<EigenDiskVectorMat<T>>::DiskVectorBase;
typedef EigenDiskVectorMat<T> Matrix;
public:
T operator()(const unsigned int i, const Eigen::Index j,
const Eigen::Index k) const
{
return (*this)[i](j, k);
}
std::vector<int> dimensions() const
{
std::vector<int> dims(3);
dims[0] = (*this).getSize();
dims[1] = (*this)[0].rows();
dims[2] = (*this)[0].cols();
return dims;
}
private:
virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
{
GridBase *loadGrid;
loadGrid = (*this).getGrid();
if (!(loadGrid) || loadGrid->IsBoss())
{
std::ifstream f(filename, std::ios::binary);
uint32_t crc, check;
Eigen::Index nRow, nCol;
size_t matSize;
double tRead, tHash;
f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
obj.resize(nRow, nCol);
matSize = nRow*nCol*sizeof(T);
tRead = -usecond();
f.read(reinterpret_cast<char *>(obj.data()), matSize);
tRead += usecond();
tHash = -usecond();
#ifdef USE_IPP
check = GridChecksum::crc32c(obj.data(), matSize);
#else
check = GridChecksum::crc32(obj.data(), matSize);
#endif
tHash += usecond();
DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec
<< " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
if (crc != check)
{
HADRONS_ERROR(Io, "checksum failed")
}
}
int broadcastSize;
broadcastSize = sizeof(T)*obj.size();
if (loadGrid)
{
loadGrid->Broadcast(loadGrid->BossRank(), obj.data(), broadcastSize);
loadGrid->Barrier();
}
}
virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
{
GridBase *saveGrid;
saveGrid = (*this).getGrid();
if (!(saveGrid) || saveGrid->IsBoss())
{
std::ofstream f(filename, std::ios::binary);
uint32_t crc;
Eigen::Index nRow, nCol;
size_t matSize;
double tWrite, tHash;
nRow = obj.rows();
nCol = obj.cols();
matSize = nRow*nCol*sizeof(T);
tHash = -usecond();
#ifdef USE_IPP
crc = GridChecksum::crc32c(obj.data(), matSize);
#else
crc = GridChecksum::crc32(obj.data(), matSize);
#endif
tHash += usecond();
f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
tWrite = -usecond();
f.write(reinterpret_cast<const char *>(obj.data()), matSize);
tWrite += usecond();
DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
<< " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
}
if (saveGrid) saveGrid->Barrier();
}
};
/******************************************************************************
* DiskVectorBase implementation *
******************************************************************************/
template <typename T>
DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
const unsigned int size,
const unsigned int cacheSize,
const bool clean,
GridBase *grid)
: dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean), grid_(grid)
, cachePtr_(new std::vector<T>(size))
, modifiedPtr_(new std::vector<bool>(size, false))
, indexPtr_(new std::map<unsigned int, unsigned int>())
, freePtr_(new std::stack<unsigned int>)
, loadsPtr_(new std::deque<unsigned int>())
{
struct stat s;
if (!(grid_) || grid_->IsBoss())
{
if(stat(dirname.c_str(), &s) == 0)
{
HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
}
mkdir(dirname);
}
if (grid_) grid_->Barrier();
for (unsigned int i = 0; i < cacheSize_; ++i)
{
freePtr_->push(i);
}
setSize(size_);
setGrid(grid_);
}
template <typename T>
DiskVectorBase<T>::~DiskVectorBase(void)
{
if (clean_)
{
clean();
}
}
template <typename T>
void DiskVectorBase<T>::setSize(unsigned int size_)
{
dvSize = size_;
}
template <typename T>
unsigned int DiskVectorBase<T>::getSize() const
{
return dvSize;
}
template <typename T>
void DiskVectorBase<T>::setGrid(GridBase *grid_)
{
dvGrid = grid_;
}
template <typename T>
GridBase *DiskVectorBase<T>::getGrid() const
{
return dvGrid;
}
template <typename T>
const T & DiskVectorBase<T>::operator[](const unsigned int i) const
{
auto &cache = *cachePtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
DV_DEBUG_MSG(this, "accessing " << i << " (RO)");
if (i >= size_)
{
HADRONS_ERROR(Size, "index out of range");
}
const_cast<double &>(access_)++;
if (index.find(i) == index.end())
{
// cache miss
DV_DEBUG_MSG(this, "cache miss");
fetch(i);
}
else
{
DV_DEBUG_MSG(this, "cache hit");
auto pos = std::find(loads.begin(), loads.end(), i);
const_cast<double &>(hit_)++;
loads.erase(pos);
loads.push_back(i);
}
#ifdef DV_DEBUG
std::string msg;
for (auto &p: loads)
{
msg += std::to_string(p) + " ";
}
DV_DEBUG_MSG(this, "in cache: " << msg);
#endif
if (grid_) grid_->Barrier();
return cache[index.at(i)];
}
template <typename T>
typename DiskVectorBase<T>::RwAccessHelper DiskVectorBase<T>::operator[](const unsigned int i)
{
DV_DEBUG_MSG(this, "accessing " << i << " (RW)");
if (i >= size_)
{
HADRONS_ERROR(Size, "index out of range");
}
return RwAccessHelper(*this, i);
}
template <typename T>
double DiskVectorBase<T>::hitRatio(void) const
{
return hit_/access_;
}
template <typename T>
void DiskVectorBase<T>::resetStat(void)
{
access_ = 0.;
hit_ = 0.;
}
template <typename T>
std::string DiskVectorBase<T>::filename(const unsigned int i) const
{
return dirname_ + "/elem_" + std::to_string(i);
}
template <typename T>
void DiskVectorBase<T>::evict(void) const
{
auto &cache = *cachePtr_;
auto &modified = *modifiedPtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
if (index.size() >= cacheSize_)
{
unsigned int i = loads.front();
DV_DEBUG_MSG(this, "evicting " << i);
if (modified[index.at(i)])
{
DV_DEBUG_MSG(this, "element " << i << " modified, saving to disk");
save(filename(i), cache[index.at(i)]);
}
freeInd.push(index.at(i));
index.erase(i);
loads.pop_front();
}
if (grid_) grid_->Barrier();
}
template <typename T>
void DiskVectorBase<T>::fetch(const unsigned int i) const
{
auto &cache = *cachePtr_;
auto &modified = *modifiedPtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
struct stat s;
DV_DEBUG_MSG(this, "loading " << i << " from disk");
evict();
if(stat(filename(i).c_str(), &s) != 0)
{
HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
}
index[i] = freeInd.top();
freeInd.pop();
load(cache[index.at(i)], filename(i));
loads.push_back(i);
modified[index.at(i)] = false;
}
template <typename T>
void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
{
auto &cache = *cachePtr_;
auto &modified = *modifiedPtr_;
auto &index = *indexPtr_;
auto &freeInd = *freePtr_;
auto &loads = *loadsPtr_;
evict();
index[i] = freeInd.top();
freeInd.pop();
cache[index.at(i)] = obj;
loads.push_back(i);
modified[index.at(i)] = false;
if (grid_) grid_->Barrier();
#ifdef DV_DEBUG
std::string msg;
for (auto &p: loads)
{
msg += std::to_string(p) + " ";
}
DV_DEBUG_MSG(this, "in cache: " << msg);
#endif
}
#ifdef DV_DEBUG
#undef DV_DEBUG_MSG
#endif
template <typename T>
void DiskVectorBase<T>::clean(void)
{
if (!(grid_) || grid_->IsBoss())
{
auto unlink = [](const char *fpath, const struct stat *sb,
int typeflag, struct FTW *ftwbuf) {
int rv = remove(fpath);
if (rv)
{
HADRONS_ERROR(Io, "cannot remove '" + std::string(fpath) + "': " + std::string(std::strerror(errno)));
}
return rv;
};
nftw(dirname_.c_str(), unlink, 64, FTW_DEPTH | FTW_PHYS);
}
if (grid_) grid_->Barrier();
}
END_HADRONS_NAMESPACE
#endif // Hadrons_DiskVector_hpp_
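The class above implements a RAM cache backed by disk storage: a fixed pool of cacheSize_ slots, a stack of free slots, an element-to-slot index and a deque ordered from least- to most-recently used. The following minimal standalone sketch reproduces that eviction policy with the disk I/O replaced by a user-supplied loader; the class and member names are illustrative and not part of Hadrons.

#include <algorithm>
#include <deque>
#include <functional>
#include <map>
#include <stack>
#include <string>
#include <vector>

class LruSlotCache
{
public:
    LruSlotCache(const unsigned int nSlot,
                 std::function<std::string(unsigned int)> loader)
    : cache_(nSlot), loader_(loader)
    {
        for (unsigned int s = 0; s < nSlot; ++s) free_.push(s);
    }

    const std::string & operator[](const unsigned int i)
    {
        if (index_.find(i) == index_.end())
        {
            // cache miss: evict the least-recently-used element if no slot is free
            if (index_.size() >= cache_.size())
            {
                unsigned int lru = loads_.front();
                free_.push(index_.at(lru));
                index_.erase(lru);
                loads_.pop_front();
            }
            index_[i] = free_.top();
            free_.pop();
            cache_[index_.at(i)] = loader_(i);
        }
        else
        {
            // cache hit: move the element to the most-recently-used position
            loads_.erase(std::find(loads_.begin(), loads_.end(), i));
        }
        loads_.push_back(i);

        return cache_[index_.at(i)];
    }
private:
    std::vector<std::string>                 cache_;
    std::function<std::string(unsigned int)> loader_;
    std::map<unsigned int, unsigned int>     index_;
    std::stack<unsigned int>                 free_;
    std::deque<unsigned int>                 loads_;
};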

View File

@ -1,416 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/EigenPack.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_EigenPack_hpp_
#define Hadrons_EigenPack_hpp_
#include <Hadrons/Global.hpp>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
BEGIN_HADRONS_NAMESPACE
// Lanczos type
#ifndef HADRONS_DEFAULT_LANCZOS_NBASIS
#define HADRONS_DEFAULT_LANCZOS_NBASIS 60
#endif
#define HADRONS_DUMP_EP_METADATA(record) \
LOG(Message) << "Eigenpack metadata:" << std::endl;\
LOG(Message) << "* operator" << std::endl;\
LOG(Message) << (record).operatorXml << std::endl;\
LOG(Message) << "* solver" << std::endl;\
LOG(Message) << (record).solverXml << std::endl;
struct PackRecord
{
std::string operatorXml, solverXml;
};
struct VecRecord: Serializable
{
GRID_SERIALIZABLE_CLASS_MEMBERS(VecRecord,
unsigned int, index,
double, eval);
VecRecord(void): index(0), eval(0.) {}
};
namespace EigenPackIo
{
inline void readHeader(PackRecord &record, ScidacReader &binReader)
{
std::string recordXml;
binReader.readLimeObject(recordXml, SCIDAC_FILE_XML);
XmlReader xmlReader(recordXml, true, "eigenPackPar");
xmlReader.push();
xmlReader.readCurrentSubtree(record.operatorXml);
xmlReader.nextElement();
xmlReader.readCurrentSubtree(record.solverXml);
}
template <typename T, typename TIo = T>
void readElement(T &evec, RealD &eval, const unsigned int index,
ScidacReader &binReader, TIo *ioBuf = nullptr)
{
VecRecord vecRecord;
LOG(Message) << "Reading eigenvector " << index << std::endl;
if (ioBuf == nullptr)
{
binReader.readScidacFieldRecord(evec, vecRecord);
}
else
{
binReader.readScidacFieldRecord(*ioBuf, vecRecord);
precisionChange(evec, *ioBuf);
}
if (vecRecord.index != index)
{
HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
+ " wrong index (expected " + std::to_string(vecRecord.index)
+ ")");
}
eval = vecRecord.eval;
}
template <typename T, typename TIo = T>
static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
PackRecord &record, const std::string filename,
const unsigned int size, bool multiFile,
GridBase *gridIo = nullptr)
{
std::unique_ptr<TIo> ioBuf{nullptr};
ScidacReader binReader;
if (typeHash<T>() != typeHash<TIo>())
{
if (gridIo == nullptr)
{
HADRONS_ERROR(Definition,
"I/O type different from vector type but null I/O grid passed");
}
ioBuf.reset(new TIo(gridIo));
}
if (multiFile)
{
std::string fullFilename;
for(int k = 0; k < size; ++k)
{
fullFilename = filename + "/v" + std::to_string(k) + ".bin";
binReader.open(fullFilename);
readHeader(record, binReader);
readElement(evec[k], eval[k], k, binReader, ioBuf.get());
binReader.close();
}
}
else
{
binReader.open(filename);
readHeader(record, binReader);
for(int k = 0; k < size; ++k)
{
readElement(evec[k], eval[k], k, binReader, ioBuf.get());
}
binReader.close();
}
}
inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
{
XmlWriter xmlWriter("", "eigenPackPar");
xmlWriter.pushXmlString(record.operatorXml);
xmlWriter.pushXmlString(record.solverXml);
binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
}
template <typename T, typename TIo = T>
void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval,
const unsigned int index, TIo *ioBuf,
T *testBuf = nullptr)
{
VecRecord vecRecord;
LOG(Message) << "Writing eigenvector " << index << std::endl;
vecRecord.eval = eval;
vecRecord.index = index;
if ((ioBuf == nullptr) || (testBuf == nullptr))
{
binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
}
else
{
precisionChange(*ioBuf, evec);
precisionChange(*testBuf, *ioBuf);
*testBuf -= evec;
LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
binWriter.writeScidacFieldRecord(*ioBuf, vecRecord, DEFAULT_ASCII_PREC);
}
}
template <typename T, typename TIo = T>
static void writePack(const std::string filename, std::vector<T> &evec,
std::vector<RealD> &eval, PackRecord &record,
const unsigned int size, bool multiFile,
GridBase *gridIo = nullptr)
{
GridBase *grid = evec[0].Grid();
std::unique_ptr<TIo> ioBuf{nullptr};
std::unique_ptr<T> testBuf{nullptr};
ScidacWriter binWriter(grid->IsBoss());
if (typeHash<T>() != typeHash<TIo>())
{
if (gridIo == nullptr)
{
HADRONS_ERROR(Definition,
"I/O type different from vector type but null I/O grid passed");
}
ioBuf.reset(new TIo(gridIo));
testBuf.reset(new T(grid));
}
if (multiFile)
{
std::string fullFilename;
for(int k = 0; k < size; ++k)
{
fullFilename = filename + "/v" + std::to_string(k) + ".bin";
makeFileDir(fullFilename, grid);
binWriter.open(fullFilename);
writeHeader(binWriter, record);
writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
binWriter.close();
}
}
else
{
makeFileDir(filename, grid);
binWriter.open(filename);
writeHeader(binWriter, record);
for(int k = 0; k < size; ++k)
{
writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
}
binWriter.close();
}
}
}
template <typename F>
class BaseEigenPack
{
public:
typedef F Field;
public:
std::vector<RealD> eval;
std::vector<F> evec;
PackRecord record;
public:
BaseEigenPack(void) = default;
BaseEigenPack(const size_t size, GridBase *grid)
{
resize(size, grid);
}
virtual ~BaseEigenPack(void) = default;
void resize(const size_t size, GridBase *grid)
{
eval.resize(size);
evec.resize(size, grid);
}
};
template <typename F, typename FIo = F>
class EigenPack: public BaseEigenPack<F>
{
public:
typedef F Field;
typedef FIo FieldIo;
public:
EigenPack(void) = default;
virtual ~EigenPack(void) = default;
EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
: BaseEigenPack<F>(size, grid)
{
if (typeHash<F>() != typeHash<FIo>())
{
if (gridIo == nullptr)
{
HADRONS_ERROR(Definition,
"I/O type different from vector type but null I/O grid passed");
}
}
gridIo_ = gridIo;
}
virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
{
EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record,
evecFilename(fileStem, traj, multiFile),
this->evec.size(), multiFile, gridIo_);
HADRONS_DUMP_EP_METADATA(this->record);
}
virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
{
EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile),
this->evec, this->eval, this->record,
this->evec.size(), multiFile, gridIo_);
}
protected:
std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
{
std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
if (multiFile)
{
return stem + t;
}
else
{
return stem + t + ".bin";
}
}
protected:
GridBase *gridIo_;
};
template <typename FineF, typename CoarseF,
typename FineFIo = FineF, typename CoarseFIo = CoarseF>
class CoarseEigenPack: public EigenPack<FineF, FineFIo>
{
public:
typedef CoarseF CoarseField;
typedef CoarseFIo CoarseFieldIo;
public:
std::vector<CoarseF> evecCoarse;
std::vector<RealD> evalCoarse;
public:
CoarseEigenPack(void) = default;
virtual ~CoarseEigenPack(void) = default;
CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse,
GridBase *gridFine, GridBase *gridCoarse,
GridBase *gridFineIo = nullptr,
GridBase *gridCoarseIo = nullptr)
{
if (typeHash<FineF>() != typeHash<FineFIo>())
{
if (gridFineIo == nullptr)
{
HADRONS_ERROR(Definition,
"Fine I/O type different from vector type but null fine I/O grid passed");
}
}
if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
{
if (gridCoarseIo == nullptr)
{
HADRONS_ERROR(Definition,
"Coarse I/O type different from vector type but null coarse I/O grid passed");
}
}
this->gridIo_ = gridFineIo;
gridCoarseIo_ = gridCoarseIo;
resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
}
void resize(const size_t sizeFine, const size_t sizeCoarse,
GridBase *gridFine, GridBase *gridCoarse)
{
EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
evalCoarse.resize(sizeCoarse);
evecCoarse.resize(sizeCoarse, gridCoarse);
}
void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
{
EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
}
void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
{
PackRecord dummy;
EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy,
this->evecFilename(fileStem + "_coarse", traj, multiFile),
evecCoarse.size(), multiFile, gridCoarseIo_);
}
virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
{
readFine(fileStem, multiFile, traj);
readCoarse(fileStem, multiFile, traj);
}
void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
{
EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
}
void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
{
EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile),
evecCoarse, evalCoarse, this->record,
evecCoarse.size(), multiFile, gridCoarseIo_);
}
virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
{
writeFine(fileStem, multiFile, traj);
writeCoarse(fileStem, multiFile, traj);
}
private:
GridBase *gridCoarseIo_;
};
template <typename FImpl>
using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>;
template <typename FImpl, typename FImplIo = FImpl>
using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
template <typename FImpl, int nBasis, typename FImplIo = FImpl>
using CoarseFermionEigenPack = CoarseEigenPack<
typename FImpl::FermionField,
typename LocalCoherenceLanczos<typename FImpl::SiteSpinor,
typename FImpl::SiteComplex,
nBasis>::CoarseField,
typename FImplIo::FermionField,
typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor,
typename FImplIo::SiteComplex,
nBasis>::CoarseField>;
#undef HADRONS_DUMP_EP_METADATA
END_HADRONS_NAMESPACE
#endif // Hadrons_EigenPack_hpp_
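A small standalone sketch of the file naming convention used by the pack I/O above: an optional trajectory suffix, a directory of v<k>.bin files (one per eigenvector) in multi-file mode, and a single .bin file otherwise. The helper names are illustrative and not part of the Hadrons API.

#include <string>

inline std::string packFilename(const std::string &stem, const int traj,
                                const bool multiFile)
{
    std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));

    return multiFile ? stem + t : stem + t + ".bin";
}

inline std::string packElementFilename(const std::string &stem, const int traj,
                                       const unsigned int k)
{
    // multi-file element k, e.g. "evec.1500/v42.bin" for stem "evec" and traj 1500
    return packFilename(stem, traj, true) + "/v" + std::to_string(k) + ".bin";
}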

View File

@ -1,347 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Environment.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Environment.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
using namespace Grid;
using namespace Hadrons;
#define ERROR_NO_ADDRESS(address)\
HADRONS_ERROR_REF(ObjectDefinition, "no object with address " + std::to_string(address), address);
/******************************************************************************
* Environment implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
Environment::Environment(void)
{
dim_ = GridDefaultLatt().toVector();
nd_ = dim_.size();
vol_ = 1.;
for (auto d: dim_)
{
vol_ *= d;
}
}
// grids ///////////////////////////////////////////////////////////////////////
unsigned int Environment::getNd(void) const
{
return nd_;
}
std::vector<int> Environment::getDim(void) const
{
return dim_;
}
int Environment::getDim(const unsigned int mu) const
{
return dim_[mu];
}
double Environment::getVolume(void) const
{
return vol_;
}
// random number generator /////////////////////////////////////////////////////
GridParallelRNG * Environment::get4dRng(void)
{
if (rng4d_ == nullptr)
{
rng4d_.reset(new GridParallelRNG(getGrid()));
}
return rng4d_.get();
}
GridSerialRNG * Environment::getSerialRng(void)
{
if (rngSerial_ == nullptr)
{
rngSerial_.reset(new GridSerialRNG());
}
return rngSerial_.get();
}
// general memory management ///////////////////////////////////////////////////
void Environment::addObject(const std::string name, const int moduleAddress)
{
if (!hasObject(name))
{
ObjInfo info;
info.name = name;
info.module = moduleAddress;
info.data = nullptr;
object_.push_back(std::move(info));
objectAddress_[name] = static_cast<unsigned int>(object_.size() - 1);
}
else
{
HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already exists",
getObjectAddress(name));
}
}
void Environment::setObjectModule(const unsigned int objAddress,
const int modAddress)
{
object_[objAddress].module = modAddress;
}
unsigned int Environment::getMaxAddress(void) const
{
return object_.size();
}
unsigned int Environment::getObjectAddress(const std::string name) const
{
if (hasObject(name))
{
return objectAddress_.at(name);
}
else
{
HADRONS_ERROR(Definition, "no object with name '" + name + "'");
}
}
std::string Environment::getObjectName(const unsigned int address) const
{
if (hasObject(address))
{
return object_[address].name;
}
else
{
ERROR_NO_ADDRESS(address);
}
}
std::string Environment::getObjectType(const unsigned int address) const
{
if (hasObject(address))
{
if (object_[address].type)
{
return typeName(object_[address].type);
}
else
{
return "<no type>";
}
}
else
{
ERROR_NO_ADDRESS(address);
}
}
std::string Environment::getObjectType(const std::string name) const
{
return getObjectType(getObjectAddress(name));
}
Environment::Size Environment::getObjectSize(const unsigned int address) const
{
if (hasObject(address))
{
return object_[address].size;
}
else
{
ERROR_NO_ADDRESS(address);
}
}
Environment::Size Environment::getObjectSize(const std::string name) const
{
return getObjectSize(getObjectAddress(name));
}
Environment::Storage Environment::getObjectStorage(const unsigned int address) const
{
if (hasObject(address))
{
return object_[address].storage;
}
else
{
ERROR_NO_ADDRESS(address);
}
}
Environment::Storage Environment::getObjectStorage(const std::string name) const
{
return getObjectStorage(getObjectAddress(name));
}
int Environment::getObjectModule(const unsigned int address) const
{
if (hasObject(address))
{
return object_[address].module;
}
else
{
ERROR_NO_ADDRESS(address);
}
}
int Environment::getObjectModule(const std::string name) const
{
return getObjectModule(getObjectAddress(name));
}
unsigned int Environment::getObjectLs(const unsigned int address) const
{
if (hasCreatedObject(address))
{
return object_[address].Ls;
}
else
{
ERROR_NO_ADDRESS(address);
}
}
unsigned int Environment::getObjectLs(const std::string name) const
{
return getObjectLs(getObjectAddress(name));
}
bool Environment::hasObject(const unsigned int address) const
{
return (address < object_.size());
}
bool Environment::hasObject(const std::string name) const
{
auto it = objectAddress_.find(name);
return ((it != objectAddress_.end()) and hasObject(it->second));
}
bool Environment::hasCreatedObject(const unsigned int address) const
{
if (hasObject(address))
{
return (object_[address].data != nullptr);
}
else
{
return false;
}
}
bool Environment::hasCreatedObject(const std::string name) const
{
if (hasObject(name))
{
return hasCreatedObject(getObjectAddress(name));
}
else
{
return false;
}
}
bool Environment::isObject5d(const unsigned int address) const
{
return (getObjectLs(address) > 1);
}
bool Environment::isObject5d(const std::string name) const
{
return (getObjectLs(name) > 1);
}
Environment::Size Environment::getTotalSize(void) const
{
Environment::Size size = 0;
for (auto &o: object_)
{
size += o.size;
}
return size;
}
void Environment::freeObject(const unsigned int address)
{
if (hasCreatedObject(address))
{
LOG(Message) << "Destroying object '" << object_[address].name
<< "'" << std::endl;
}
object_[address].size = 0;
object_[address].type = nullptr;
object_[address].data.reset(nullptr);
}
void Environment::freeObject(const std::string name)
{
freeObject(getObjectAddress(name));
}
void Environment::freeAll(void)
{
for (unsigned int i = 0; i < object_.size(); ++i)
{
freeObject(i);
}
}
void Environment::protectObjects(const bool protect)
{
protect_ = protect;
}
bool Environment::objectsProtected(void) const
{
return protect_;
}
// print environment content ///////////////////////////////////////////////////
void Environment::printContent(void) const
{
LOG(Debug) << "Objects: " << std::endl;
for (unsigned int i = 0; i < object_.size(); ++i)
{
LOG(Debug) << std::setw(4) << i << ": "
<< getObjectName(i) << " ("
<< sizeString(getObjectSize(i)) << ")" << std::endl;
}
}
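The object bookkeeping above reduces to a vector of records indexed by address plus a name-to-address map. The sketch below reproduces that pattern in isolation, with the record stripped down to a name and an owning module; the type and member names are illustrative.

#include <map>
#include <stdexcept>
#include <string>
#include <vector>

struct ObjectRegistry
{
    struct Info
    {
        std::string name;
        int         module;
    };

    std::vector<Info>                   object;
    std::map<std::string, unsigned int> address;

    // mirrors Environment::addObject: refuse duplicates, append, index by name
    unsigned int add(const std::string &name, const int module = -1)
    {
        if (address.find(name) != address.end())
        {
            throw std::logic_error("object '" + name + "' already exists");
        }
        object.push_back({name, module});
        address[name] = static_cast<unsigned int>(object.size() - 1);

        return address[name];
    }

    // mirrors Environment::getObjectAddress
    unsigned int get(const std::string &name) const
    {
        auto it = address.find(name);

        if (it == address.end())
        {
            throw std::logic_error("no object with name '" + name + "'");
        }

        return it->second;
    }
};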

View File

@ -1,588 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Environment.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Environment_hpp_
#define Hadrons_Environment_hpp_
#include <Hadrons/Global.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Global environment *
******************************************************************************/
class Object
{
public:
Object(void) = default;
virtual ~Object(void) = default;
};
template <typename T>
class Holder: public Object
{
public:
Holder(void) = default;
Holder(T *pt);
virtual ~Holder(void) = default;
T & get(void) const;
T * getPt(void) const;
void reset(T *pt);
private:
std::unique_ptr<T> objPt_{nullptr};
};
#define DEFINE_ENV_ALIAS \
inline Environment & env(void) const\
{\
return Environment::getInstance();\
}
#define DEFINE_ENV_LAMBDA \
auto env = [](void)->Environment &{return Environment::getInstance();}
class Environment
{
SINGLETON(Environment);
public:
typedef SITE_SIZE_TYPE Size;
typedef std::unique_ptr<GridCartesian> GridPt;
typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt;
typedef std::unique_ptr<GridParallelRNG> RngPt;
typedef std::unique_ptr<GridSerialRNG> SerialRngPt;
enum class Storage {object, cache, temporary};
private:
struct ObjInfo
{
Size size{0};
Storage storage{Storage::object};
unsigned int Ls{0};
const std::type_info *type{nullptr}, *derivedType{nullptr};
std::string name;
int module{-1};
std::unique_ptr<Object> data{nullptr};
};
typedef std::pair<size_t, unsigned int> FineGridKey;
typedef std::pair<size_t, std::vector<int>> CoarseGridKey;
public:
// grids
template <typename VType = vComplex>
void createGrid(const unsigned int Ls);
template <typename VType = vComplex>
void createCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls);
template <typename VType = vComplex>
GridCartesian * getGrid(void);
template <typename VType = vComplex>
GridRedBlackCartesian * getRbGrid(void);
template <typename VType = vComplex>
GridCartesian * getCoarseGrid(const std::vector<int> &blockSize);
template <typename VType = vComplex>
GridCartesian * getGrid(const unsigned int Ls);
template <typename VType = vComplex>
GridRedBlackCartesian * getRbGrid(const unsigned int Ls);
template <typename VType = vComplex>
GridCartesian * getCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls);
std::vector<int> getDim(void) const;
int getDim(const unsigned int mu) const;
unsigned int getNd(void) const;
double getVolume(void) const;
// random number generator
GridParallelRNG * get4dRng(void);
GridSerialRNG * getSerialRng(void);
// general memory management
void addObject(const std::string name,
const int moduleAddress = -1);
template <typename B, typename T, typename ... Ts>
void createDerivedObject(const std::string name,
const Environment::Storage storage,
const unsigned int Ls,
Ts && ... args);
template <typename T, typename ... Ts>
void createObject(const std::string name,
const Environment::Storage storage,
const unsigned int Ls,
Ts && ... args);
void setObjectModule(const unsigned int objAddress,
const int modAddress);
template <typename B, typename T>
T * getDerivedObject(const unsigned int address) const;
template <typename B, typename T>
T * getDerivedObject(const std::string name) const;
template <typename T>
T * getObject(const unsigned int address) const;
template <typename T>
T * getObject(const std::string name) const;
unsigned int getMaxAddress(void) const;
unsigned int getObjectAddress(const std::string name) const;
std::string getObjectName(const unsigned int address) const;
std::string getObjectType(const unsigned int address) const;
std::string getObjectType(const std::string name) const;
Size getObjectSize(const unsigned int address) const;
Size getObjectSize(const std::string name) const;
Storage getObjectStorage(const unsigned int address) const;
Storage getObjectStorage(const std::string name) const;
int getObjectModule(const unsigned int address) const;
int getObjectModule(const std::string name) const;
unsigned int getObjectLs(const unsigned int address) const;
unsigned int getObjectLs(const std::string name) const;
bool hasObject(const unsigned int address) const;
bool hasObject(const std::string name) const;
bool hasCreatedObject(const unsigned int address) const;
bool hasCreatedObject(const std::string name) const;
bool isObject5d(const unsigned int address) const;
bool isObject5d(const std::string name) const;
template <typename T>
bool isObjectOfType(const unsigned int address) const;
template <typename T>
bool isObjectOfType(const std::string name) const;
Environment::Size getTotalSize(void) const;
void freeObject(const unsigned int address);
void freeObject(const std::string name);
void freeAll(void);
void protectObjects(const bool protect);
bool objectsProtected(void) const;
// print environment content
void printContent(void) const;
private:
// general
double vol_;
bool protect_{true};
// grids
std::vector<int> dim_;
std::map<FineGridKey, GridPt> grid4d_;
std::map<FineGridKey, GridPt> grid5d_;
std::map<FineGridKey, GridRbPt> gridRb4d_;
std::map<FineGridKey, GridRbPt> gridRb5d_;
std::map<CoarseGridKey, GridPt> gridCoarse4d_;
std::map<CoarseGridKey, GridPt> gridCoarse5d_;
unsigned int nd_;
// random number generator
RngPt rng4d_{nullptr};
SerialRngPt rngSerial_{nullptr};
// object store
std::vector<ObjInfo> object_;
std::map<std::string, unsigned int> objectAddress_;
};
/******************************************************************************
* Holder template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename T>
Holder<T>::Holder(T *pt)
: objPt_(pt)
{}
// access //////////////////////////////////////////////////////////////////////
template <typename T>
T & Holder<T>::get(void) const
{
return *objPt_.get();
}
template <typename T>
T * Holder<T>::getPt(void) const
{
return objPt_.get();
}
template <typename T>
void Holder<T>::reset(T *pt)
{
objPt_.reset(pt);
}
/******************************************************************************
* Environment template implementation *
******************************************************************************/
// grids ///////////////////////////////////////////////////////////////////////
#define HADRONS_DUMP_GRID(...)\
LOG(Debug) << "New grid " << (__VA_ARGS__) << std::endl;\
LOG(Debug) << " - cb : " << (__VA_ARGS__)->_isCheckerBoarded << std::endl;\
LOG(Debug) << " - fdim: " << (__VA_ARGS__)->_fdimensions << std::endl;\
LOG(Debug) << " - gdim: " << (__VA_ARGS__)->_gdimensions << std::endl;\
LOG(Debug) << " - ldim: " << (__VA_ARGS__)->_ldimensions << std::endl;\
LOG(Debug) << " - rdim: " << (__VA_ARGS__)->_rdimensions << std::endl;
template <typename VType>
void Environment::createGrid(const unsigned int Ls)
{
size_t hash = typeHash<VType>();
if (grid4d_.find({hash, 1}) == grid4d_.end())
{
grid4d_[{hash, 1}].reset(
SpaceTimeGrid::makeFourDimGrid(getDim(),
GridDefaultSimd(getNd(), VType::Nsimd()),
GridDefaultMpi()));
HADRONS_DUMP_GRID(grid4d_[{hash, 1}].get());
gridRb4d_[{hash, 1}].reset(
SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_[{hash, 1}].get()));
HADRONS_DUMP_GRID(gridRb4d_[{hash, 1}].get());
}
if (grid5d_.find({hash, Ls}) == grid5d_.end())
{
auto g = grid4d_[{hash, 1}].get();
grid5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimGrid(Ls, g));
HADRONS_DUMP_GRID(grid5d_[{hash, Ls}].get());
gridRb5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, g));
HADRONS_DUMP_GRID(gridRb5d_[{hash, Ls}].get());
}
}
template <typename VType>
void Environment::createCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls)
{
int nd = getNd();
std::vector<int> fineDim = getDim(), coarseDim(nd);
unsigned int cLs;
auto key4d = blockSize, key5d = blockSize;
size_t hash = typeHash<VType>();
createGrid(Ls);
for (int d = 0; d < coarseDim.size(); d++)
{
coarseDim[d] = fineDim[d]/blockSize[d];
if (coarseDim[d]*blockSize[d] != fineDim[d])
{
HADRONS_ERROR(Size, "Fine dimension " + std::to_string(d)
+ " (" + std::to_string(fineDim[d])
+ ") not divisible by coarse dimension ("
+ std::to_string(coarseDim[d]) + ")");
}
}
if (blockSize.size() > nd)
{
cLs = Ls/blockSize[nd];
if (cLs*blockSize[nd] != Ls)
{
HADRONS_ERROR(Size, "Fine Ls (" + std::to_string(Ls)
+ ") not divisible by coarse Ls ("
+ std::to_string(cLs) + ")");
}
}
else
{
cLs = Ls;
}
key4d.resize(nd);
key5d.push_back(Ls);
CoarseGridKey hkey4d = {hash, key4d}, hkey5d = {hash, key5d};
if (gridCoarse4d_.find(hkey4d) == gridCoarse4d_.end())
{
gridCoarse4d_[hkey4d].reset(
SpaceTimeGrid::makeFourDimGrid(coarseDim,
GridDefaultSimd(nd, VType::Nsimd()), GridDefaultMpi()));
HADRONS_DUMP_GRID(gridCoarse4d_[hkey4d].get());
}
if (gridCoarse5d_.find(hkey5d) == gridCoarse5d_.end())
{
gridCoarse5d_[hkey5d].reset(
SpaceTimeGrid::makeFiveDimGrid(cLs, gridCoarse4d_[hkey4d].get()));
HADRONS_DUMP_GRID(gridCoarse5d_[hkey5d].get());
}
}
#undef HADRONS_DUMP_GRID
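A small self-contained helper illustrating the arithmetic behind the divisibility checks above; the lattice and block sizes in the comments are illustrative.

#include <cassert>
#include <cstddef>
#include <vector>

inline std::vector<int> coarsen(const std::vector<int> &fineDim,
                                const std::vector<int> &blockSize)
{
    std::vector<int> coarseDim(fineDim.size());

    for (std::size_t d = 0; d < fineDim.size(); ++d)
    {
        // same divisibility condition as createCoarseGrid above
        assert(fineDim[d] % blockSize[d] == 0);
        coarseDim[d] = fineDim[d]/blockSize[d];
    }

    return coarseDim; // e.g. {16,16,16,32} blocked by {4,4,4,4} -> {4,4,4,8}
}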
template <typename VType>
GridCartesian * Environment::getGrid(void)
{
FineGridKey key = {typeHash<VType>(), 1};
auto it = grid4d_.find(key);
if (it != grid4d_.end())
{
return it->second.get();
}
else
{
createGrid<VType>(1);
return grid4d_.at(key).get();
}
}
template <typename VType>
GridRedBlackCartesian * Environment::getRbGrid(void)
{
FineGridKey key = {typeHash<VType>(), 1};
auto it = gridRb4d_.find(key);
if (it != gridRb4d_.end())
{
return it->second.get();
}
else
{
createGrid<VType>(1);
return gridRb4d_.at(key).get();
}
}
template <typename VType>
GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize)
{
std::vector<int> s = blockSize;
s.resize(getNd());
CoarseGridKey key = {typeHash<VType>(), s};
auto it = gridCoarse4d_.find(key);
if (it != gridCoarse4d_.end())
{
return it->second.get();
}
else
{
createCoarseGrid<VType>(blockSize, 1);
return gridCoarse4d_.at(key).get();
}
}
template <typename VType>
GridCartesian * Environment::getGrid(const unsigned int Ls)
{
FineGridKey key = {typeHash<VType>(), Ls};
auto it = grid5d_.find(key);
if (it != grid5d_.end())
{
return it->second.get();
}
else
{
createGrid<VType>(Ls);
return grid5d_.at(key).get();
}
}
template <typename VType>
GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls)
{
FineGridKey key = {typeHash<VType>(), Ls};
auto it = gridRb5d_.find(key);
if (it != gridRb5d_.end())
{
return it->second.get();
}
else
{
createGrid<VType>(Ls);
return gridRb5d_.at(key).get();
}
}
template <typename VType>
GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls)
{
std::vector<int> s = blockSize;
s.push_back(Ls);
CoarseGridKey key = {typeHash<VType>(), s};
auto it = gridCoarse5d_.find(key);
if (it != gridCoarse5d_.end())
{
return it->second.get();
}
else
{
createCoarseGrid<VType>(blockSize, Ls);
return gridCoarse5d_.at(key).get();
}
}
// general memory management ///////////////////////////////////////////////////
template <typename B, typename T, typename ... Ts>
void Environment::createDerivedObject(const std::string name,
const Environment::Storage storage,
const unsigned int Ls,
Ts && ... args)
{
if (!hasObject(name))
{
addObject(name);
}
unsigned int address = getObjectAddress(name);
if (!object_[address].data or !objectsProtected())
{
MemoryStats memStats;
if (!MemoryProfiler::stats)
{
MemoryProfiler::stats = &memStats;
}
size_t initMem = MemoryProfiler::stats->currentlyAllocated;
object_[address].storage = storage;
object_[address].Ls = Ls;
object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...)));
object_[address].size = MemoryProfiler::stats->maxAllocated - initMem;
object_[address].type = typeIdPt<B>();
object_[address].derivedType = typeIdPt<T>();
if (MemoryProfiler::stats == &memStats)
{
MemoryProfiler::stats = nullptr;
}
}
// object already exists: reuse it silently only if it is a cache object with matching storage, name and types, error otherwise
else if ((object_[address].storage != Storage::cache) or
(object_[address].storage != storage) or
(object_[address].name != name) or
(typeHash(object_[address].type) != typeHash<B>()) or
(typeHash(object_[address].derivedType) != typeHash<T>()))
{
HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already allocated", address);
}
}
template <typename T, typename ... Ts>
void Environment::createObject(const std::string name,
const Environment::Storage storage,
const unsigned int Ls,
Ts && ... args)
{
createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...);
}
template <typename B, typename T>
T * Environment::getDerivedObject(const unsigned int address) const
{
if (hasObject(address))
{
if (hasCreatedObject(address))
{
if (auto h = dynamic_cast<Holder<B> *>(object_[address].data.get()))
{
if (typeid(T) == typeid(B))
{
return dynamic_cast<T *>(h->getPt());
}
else
{
if (auto hder = dynamic_cast<T *>(h->getPt()))
{
return hder;
}
else
{
HADRONS_ERROR_REF(ObjectType, "object with address " +
std::to_string(address) +
" cannot be casted to '" + typeName(&typeid(T)) +
"' (has type '" + typeName(&typeid(h->get())) + "')", address);
}
}
}
else
{
HADRONS_ERROR_REF(ObjectType, "object with address " +
std::to_string(address) +
" does not have type '" + typeName(&typeid(B)) +
"' (has type '" + getObjectType(address) + "')", address);
}
}
else
{
HADRONS_ERROR_REF(ObjectDefinition, "object with address " +
std::to_string(address) + " is empty", address);
}
}
else
{
HADRONS_ERROR_REF(ObjectDefinition, "no object with address " +
std::to_string(address), address);
}
}
template <typename B, typename T>
T * Environment::getDerivedObject(const std::string name) const
{
return getDerivedObject<B, T>(getObjectAddress(name));
}
template <typename T>
T * Environment::getObject(const unsigned int address) const
{
return getDerivedObject<T, T>(address);
}
template <typename T>
T * Environment::getObject(const std::string name) const
{
return getObject<T>(getObjectAddress(name));
}
template <typename T>
bool Environment::isObjectOfType(const unsigned int address) const
{
if (hasObject(address))
{
if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
{
return true;
}
else
{
return false;
}
}
else
{
HADRONS_ERROR_REF(ObjectDefinition, "no object with address "
+ std::to_string(address), address);
}
}
template <typename T>
bool Environment::isObjectOfType(const std::string name) const
{
return isObjectOfType<T>(getObjectAddress(name));
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Environment_hpp_
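All the grid getters above follow the same get-or-create pattern over a map keyed by the SIMD type hash and Ls. The standalone sketch below reproduces that pattern with the Grid factory call replaced by a plain string so it compiles on its own; the names are illustrative.

#include <cstddef>
#include <map>
#include <memory>
#include <string>
#include <utility>

class LazyGridCache
{
public:
    using Key = std::pair<std::size_t, unsigned int>; // (type hash, Ls)

    const std::string & get(const std::size_t typeHash, const unsigned int Ls)
    {
        Key  key{typeHash, Ls};
        auto it = cache_.find(key);

        if (it == cache_.end())
        {
            // created on first request, reused afterwards
            it = cache_.emplace(key, std::make_unique<std::string>(
                "grid<" + std::to_string(typeHash) + ","
                        + std::to_string(Ls) + ">")).first;
        }

        return *(it->second);
    }
private:
    std::map<Key, std::unique_ptr<std::string>> cache_;
};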

View File

@ -1,102 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Exceptions.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Exceptions.hpp>
#include <Hadrons/VirtualMachine.hpp>
#include <Hadrons/Module.hpp>
#ifndef ERR_SUFF
#define ERR_SUFF " (" + loc + ")"
#endif
#define CTOR_EXC(name, init) \
name::name(std::string msg, std::string loc)\
:init\
{}
#define CTOR_EXC_REF(name, init) \
name::name(std::string msg, std::string loc, const unsigned int address)\
:init\
{}
using namespace Grid;
using namespace Hadrons;
using namespace Exceptions;
// backtrace cache
std::vector<std::string> Grid::Hadrons::Exceptions::backtraceStr;
// logic errors
CTOR_EXC(Logic, logic_error(msg + ERR_SUFF))
CTOR_EXC(Definition, Logic("definition error: " + msg, loc))
CTOR_EXC(Implementation, Logic("implementation error: " + msg, loc))
CTOR_EXC(Range, Logic("range error: " + msg, loc))
CTOR_EXC(Size, Logic("size error: " + msg, loc))
// runtime errors
CTOR_EXC(Runtime, runtime_error(msg + ERR_SUFF))
CTOR_EXC(Argument, Runtime("argument error: " + msg, loc))
CTOR_EXC(Io, Runtime("IO error: " + msg, loc))
CTOR_EXC(Memory, Runtime("memory error: " + msg, loc))
CTOR_EXC(Parsing, Runtime("parsing error: " + msg, loc))
CTOR_EXC(Program, Runtime("program error: " + msg, loc))
CTOR_EXC(System, Runtime("system error: " + msg, loc))
// virtual machine errors
CTOR_EXC_REF(ObjectDefinition, RuntimeRef("object definition error: " + msg, loc, address));
CTOR_EXC_REF(ObjectType, RuntimeRef("object type error: " + msg, loc, address));
// abort functions
void Grid::Hadrons::Exceptions::abort(const std::exception& e)
{
auto &vm = VirtualMachine::getInstance();
int mod = vm.getCurrentModule();
LOG(Error) << "FATAL ERROR -- Exception " << typeName(&typeid(e))
<< std::endl;
if (mod >= 0)
{
LOG(Error) << "During execution of module '"
<< vm.getModuleName(mod) << "' (address " << mod << ")"
<< std::endl;
}
LOG(Error) << e.what() << std::endl;
if (!backtraceStr.empty())
{
LOG(Error) << "-- BACKTRACE --------------" << std::endl;
for (auto &s: backtraceStr)
{
LOG(Error) << s << std::endl;
}
LOG(Error) << "---------------------------" << std::endl;
}
LOG(Error) << "Aborting program" << std::endl;
Grid_finalize();
exit(EXIT_FAILURE);
}
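For reference, after substituting ERR_SUFF the first two CTOR_EXC invocations above expand to the following constructor definitions:

Logic::Logic(std::string msg, std::string loc)
:logic_error(msg + " (" + loc + ")")
{}

Definition::Definition(std::string msg, std::string loc)
:Logic("definition error: " + msg, loc)
{}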

View File

@ -1,129 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Exceptions.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Exceptions_hpp_
#define Hadrons_Exceptions_hpp_
#include <stdexcept>
#include <execinfo.h>
#ifndef Hadrons_Global_hpp_
#include <Hadrons/Global.hpp>
#endif
#define HADRONS_SRC_LOC std::string(__FUNCTION__) + " at " \
+ std::string(__FILE__) + ":" + std::to_string(__LINE__)
#define HADRONS_BACKTRACE_MAX 128
#ifdef HAVE_EXECINFO_H
#define HADRONS_CACHE_BACKTRACE \
{\
void* _callstack[HADRONS_BACKTRACE_MAX];\
int _i, _frames = backtrace(_callstack, HADRONS_BACKTRACE_MAX);\
char** _strs = backtrace_symbols(_callstack, _frames);\
Grid::Hadrons::Exceptions::backtraceStr.clear();\
for (_i = 0; _i < _frames; ++_i)\
{\
Hadrons::Exceptions::backtraceStr.push_back(std::string(_strs[_i]));\
}\
free(_strs);\
}
#else
#define HADRONS_CACHE_BACKTRACE \
Grid::Hadrons::Exceptions::backtraceStr.clear();\
Grid::Hadrons::Exceptions::backtraceStr.push_back("<backtrace not supported>");
#endif
#define HADRONS_ERROR(exc, msg)\
HADRONS_CACHE_BACKTRACE \
throw(Exceptions::exc(msg, HADRONS_SRC_LOC));
#define HADRONS_ERROR_REF(exc, msg, address)\
HADRONS_CACHE_BACKTRACE \
throw(Exceptions::exc(msg, HADRONS_SRC_LOC, address));
#define DECL_EXC(name, base) \
class name: public base\
{\
public:\
name(std::string msg, std::string loc);\
}
#define DECL_EXC_REF(name, base) \
class name: public base\
{\
public:\
name(std::string msg, std::string loc, const unsigned int address);\
}
BEGIN_HADRONS_NAMESPACE
namespace Exceptions
{
// backtrace cache
extern std::vector<std::string> backtraceStr;
// logic errors
DECL_EXC(Logic, std::logic_error);
DECL_EXC(Definition, Logic);
DECL_EXC(Implementation, Logic);
DECL_EXC(Range, Logic);
DECL_EXC(Size, Logic);
// runtime errors
DECL_EXC(Runtime, std::runtime_error);
DECL_EXC(Argument, Runtime);
DECL_EXC(Io, Runtime);
DECL_EXC(Memory, Runtime);
DECL_EXC(Parsing, Runtime);
DECL_EXC(Program, Runtime);
DECL_EXC(System, Runtime);
// virtual machine errors
class RuntimeRef: public Runtime
{
public:
RuntimeRef(std::string msg, std::string loc, const unsigned int address)
: Runtime(msg, loc), address_(address)
{}
unsigned int getAddress(void) const
{
return address_;
}
private:
unsigned int address_;
};
DECL_EXC_REF(ObjectDefinition, RuntimeRef);
DECL_EXC_REF(ObjectType, RuntimeRef);
// abort functions
void abort(const std::exception& e);
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Exceptions_hpp_
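As a usage illustration, DECL_EXC(Size, Logic) above declares the class shown below, and a call such as HADRONS_ERROR(Size, "index out of range") first caches the backtrace and then throws with the source location appended (the backtrace-caching block is elided here):

// expansion of DECL_EXC(Size, Logic);
class Size: public Logic
{
public:
    Size(std::string msg, std::string loc);
};

// expansion of HADRONS_ERROR(Size, "index out of range"), backtrace caching elided
throw(Exceptions::Size("index out of range",
                       std::string(__FUNCTION__) + " at "
                       + std::string(__FILE__) + ":"
                       + std::to_string(__LINE__)));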

View File

@ -1,105 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Factory.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Factory_hpp_
#define Hadrons_Factory_hpp_
#include <Hadrons/Global.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* abstract factory class *
******************************************************************************/
template <typename T>
class Factory
{
public:
typedef std::function<std::unique_ptr<T>(const std::string)> Func;
public:
// constructor
Factory(void) = default;
// destructor
virtual ~Factory(void) = default;
// registration
void registerBuilder(const std::string type, const Func &f);
// get builder list
std::vector<std::string> getBuilderList(void) const;
// factory
std::unique_ptr<T> create(const std::string type,
const std::string name) const;
private:
std::map<std::string, Func> builder_;
};
/******************************************************************************
* template implementation *
******************************************************************************/
// registration ////////////////////////////////////////////////////////////////
template <typename T>
void Factory<T>::registerBuilder(const std::string type, const Func &f)
{
builder_[type] = f;
}
// get module list /////////////////////////////////////////////////////////////
template <typename T>
std::vector<std::string> Factory<T>::getBuilderList(void) const
{
std::vector<std::string> list;
for (auto &b: builder_)
{
list.push_back(b.first);
}
return list;
}
// factory /////////////////////////////////////////////////////////////////////
template <typename T>
std::unique_ptr<T> Factory<T>::create(const std::string type,
const std::string name) const
{
Func func;
try
{
func = builder_.at(type);
}
catch (std::out_of_range &)
{
HADRONS_ERROR(Argument, "object of type '" + type + "' unknown");
}
return func(name);
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Factory_hpp_
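A usage sketch of the registration and creation pattern implemented by Factory<T> above, written against a minimal local base class so it compiles on its own; MyModule and the builder lambda are illustrative stand-ins for concrete Hadrons modules.

#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct ModuleBase
{
    explicit ModuleBase(const std::string name): name_(name) {}
    virtual ~ModuleBase(void) = default;
    std::string name_;
};

struct MyModule: ModuleBase
{
    using ModuleBase::ModuleBase;
};

int main(void)
{
    // builder mirrors Factory<T>::builder_: type name -> construction function
    std::map<std::string,
             std::function<std::unique_ptr<ModuleBase>(const std::string)>> builder;

    builder["MyModule"] = [](const std::string name)
    {
        return std::unique_ptr<ModuleBase>(new MyModule(name));
    };

    auto obj = builder.at("MyModule")("instance0"); // .at throws if the type is unknown
    std::cout << obj->name_ << std::endl;

    return 0;
}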

View File

@ -1,321 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/GeneticScheduler.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_GeneticScheduler_hpp_
#define Hadrons_GeneticScheduler_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Graph.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Scheduler based on a genetic algorithm *
******************************************************************************/
template <typename V, typename T>
class GeneticScheduler
{
public:
typedef std::vector<T> Gene;
typedef std::pair<Gene *, Gene *> GenePair;
typedef std::function<V(const Gene &)> ObjFunc;
struct Parameters
{
double mutationRate;
unsigned int popSize, seed;
};
public:
// constructor
GeneticScheduler(Graph<T> &graph, const ObjFunc &func,
const Parameters &par);
// destructor
virtual ~GeneticScheduler(void) = default;
// access
const Gene & getMinSchedule(void);
V getMinValue(void);
// reset population
void initPopulation(void);
// breed a new generation
void nextGeneration(void);
// heuristic benchmarks
void benchmarkCrossover(const unsigned int nIt);
// print population
friend std::ostream & operator<<(std::ostream &out,
const GeneticScheduler<V, T> &s)
{
out << "[";
for (auto &p: s.population_)
{
out << p.first << ", ";
}
out << "\b\b]";
return out;
}
private:
void doCrossover(void);
void doMutation(void);
// genetic operators
GenePair selectPair(void);
void crossover(Gene &c1, Gene &c2, const Gene &p1, const Gene &p2);
void mutation(Gene &m, const Gene &c);
private:
Graph<T> &graph_;
const ObjFunc &func_;
const Parameters par_;
std::multimap<V, Gene> population_;
std::mt19937 gen_;
};
/******************************************************************************
* template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename V, typename T>
GeneticScheduler<V, T>::GeneticScheduler(Graph<T> &graph, const ObjFunc &func,
const Parameters &par)
: graph_(graph)
, func_(func)
, par_(par)
{
gen_.seed(par_.seed);
}
// access //////////////////////////////////////////////////////////////////////
template <typename V, typename T>
const typename GeneticScheduler<V, T>::Gene &
GeneticScheduler<V, T>::getMinSchedule(void)
{
return population_.begin()->second;
}
template <typename V, typename T>
V GeneticScheduler<V, T>::getMinValue(void)
{
return population_.begin()->first;
}
// breed a new generation //////////////////////////////////////////////////////
template <typename V, typename T>
void GeneticScheduler<V, T>::nextGeneration(void)
{
// random initialization of the population if necessary
if (population_.size() != par_.popSize)
{
initPopulation();
}
//LOG(Debug) << "Starting population:\n" << *this << std::endl;
// random mutations
for (unsigned int i = 0; i < par_.popSize; ++i)
{
doMutation();
}
//LOG(Debug) << "After mutations:\n" << *this << std::endl;
// mating
for (unsigned int i = 0; i < par_.popSize/2; ++i)
{
doCrossover();
}
//LOG(Debug) << "After mating:\n" << *this << std::endl;
// grim reaper
auto it = population_.begin();
std::advance(it, par_.popSize);
population_.erase(it, population_.end());
//LOG(Debug) << "After grim reaper:\n" << *this << std::endl;
}
// evolution steps /////////////////////////////////////////////////////////////
template <typename V, typename T>
void GeneticScheduler<V, T>::initPopulation(void)
{
population_.clear();
for (unsigned int i = 0; i < par_.popSize; ++i)
{
auto p = graph_.topoSort(gen_);
population_.insert(std::make_pair(func_(p), p));
}
}
template <typename V, typename T>
void GeneticScheduler<V, T>::doCrossover(void)
{
auto p = selectPair();
Gene &p1 = *(p.first), &p2 = *(p.second);
Gene c1, c2;
crossover(c1, c2, p1, p2);
thread_critical
{
population_.insert(std::make_pair(func_(c1), c1));
population_.insert(std::make_pair(func_(c2), c2));
}
}
template <typename V, typename T>
void GeneticScheduler<V, T>::doMutation(void)
{
std::uniform_real_distribution<double> mdis(0., 1.);
std::uniform_int_distribution<unsigned int> pdis(0, population_.size() - 1);
if (mdis(gen_) < par_.mutationRate)
{
Gene m;
auto it = population_.begin();
std::advance(it, pdis(gen_));
mutation(m, it->second);
thread_critical
{
population_.insert(std::make_pair(func_(m), m));
}
}
}
// genetic operators ///////////////////////////////////////////////////////////
template <typename V, typename T>
typename GeneticScheduler<V, T>::GenePair GeneticScheduler<V, T>::selectPair(void)
{
std::vector<double> prob;
unsigned int ind;
Gene *p1, *p2;
const double max = population_.rbegin()->first;
for (auto &c: population_)
{
prob.push_back(std::exp((c.first-1.)/max));
}
std::discrete_distribution<unsigned int> dis1(prob.begin(), prob.end());
auto rIt = population_.begin();
ind = dis1(gen_);
std::advance(rIt, ind);
p1 = &(rIt->second);
prob[ind] = 0.;
std::discrete_distribution<unsigned int> dis2(prob.begin(), prob.end());
rIt = population_.begin();
std::advance(rIt, dis2(gen_));
p2 = &(rIt->second);
return std::make_pair(p1, p2);
}
template <typename V, typename T>
void GeneticScheduler<V, T>::crossover(Gene &c1, Gene &c2, const Gene &p1,
const Gene &p2)
{
Gene buf;
std::uniform_int_distribution<unsigned int> dis(0, p1.size() - 1);
unsigned int cut = dis(gen_);
c1.clear();
buf = p2;
for (unsigned int i = 0; i < cut; ++i)
{
c1.push_back(p1[i]);
buf.erase(std::find(buf.begin(), buf.end(), p1[i]));
}
for (unsigned int i = 0; i < buf.size(); ++i)
{
c1.push_back(buf[i]);
}
c2.clear();
buf = p2;
for (unsigned int i = cut; i < p1.size(); ++i)
{
buf.erase(std::find(buf.begin(), buf.end(), p1[i]));
}
for (unsigned int i = 0; i < buf.size(); ++i)
{
c2.push_back(buf[i]);
}
for (unsigned int i = cut; i < p1.size(); ++i)
{
c2.push_back(p1[i]);
}
}
template <typename V, typename T>
void GeneticScheduler<V, T>::mutation(Gene &m, const Gene &c)
{
Gene buf;
std::uniform_int_distribution<unsigned int> dis(0, c.size() - 1);
unsigned int cut = dis(gen_);
Graph<T> g1 = graph_, g2 = graph_;
for (unsigned int i = 0; i < cut; ++i)
{
g1.removeVertex(c[i]);
}
for (unsigned int i = cut; i < c.size(); ++i)
{
g2.removeVertex(c[i]);
}
if (g1.size() > 0)
{
buf = g1.topoSort(gen_);
}
if (g2.size() > 0)
{
m = g2.topoSort(gen_);
}
for (unsigned int i = cut; i < c.size(); ++i)
{
m.push_back(buf[i - cut]);
}
}
template <typename V, typename T>
void GeneticScheduler<V, T>::benchmarkCrossover(const unsigned int nIt)
{
Gene p1, p2, c1, c2;
double neg = 0., eq = 0., pos = 0., total;
int improvement;
LOG(Message) << "Benchmarking crossover..." << std::endl;
for (unsigned int i = 0; i < nIt; ++i)
{
p1 = graph_.topoSort(gen_);
p2 = graph_.topoSort(gen_);
crossover(c1, c2, p1, p2);
improvement = (func_(c1) + func_(c2) - func_(p1) - func_(p2))/2;
if (improvement < 0) neg++; else if (improvement == 0) eq++; else pos++;
}
total = neg + eq + pos;
LOG(Message) << " -: " << neg/total << " =: " << eq/total
<< " +: " << pos/total << std::endl;
}
END_HADRONS_NAMESPACE
#endif // Hadrons_GeneticScheduler_hpp_
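A standalone illustration of the one-point crossover implemented above, acting on integer permutations; the cut position is passed explicitly here instead of being drawn at random, and the input genes are illustrative.

#include <algorithm>
#include <iostream>
#include <vector>

using Gene = std::vector<int>;

void crossover(Gene &c1, Gene &c2, const Gene &p1, const Gene &p2,
               const unsigned int cut)
{
    Gene buf;

    // child 1: head of p1, then the remaining genes in p2's order
    c1.clear();
    buf = p2;
    for (unsigned int i = 0; i < cut; ++i)
    {
        c1.push_back(p1[i]);
        buf.erase(std::find(buf.begin(), buf.end(), p1[i]));
    }
    c1.insert(c1.end(), buf.begin(), buf.end());
    // child 2: genes of p2 not in the tail of p1, then the tail of p1
    buf = p2;
    for (unsigned int i = cut; i < p1.size(); ++i)
    {
        buf.erase(std::find(buf.begin(), buf.end(), p1[i]));
    }
    c2 = buf;
    c2.insert(c2.end(), p1.begin() + cut, p1.end());
}

int main(void)
{
    Gene p1 = {0, 1, 2, 3, 4}, p2 = {4, 3, 2, 1, 0}, c1, c2;

    crossover(c1, c2, p1, p2, 2);
    for (auto g: c1) std::cout << g << " "; // 0 1 4 3 2
    std::cout << std::endl;
    for (auto g: c2) std::cout << g << " "; // 1 0 2 3 4
    std::cout << std::endl;

    return 0;
}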

View File

@ -1,213 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Global.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Global.hpp>
using namespace Grid;
using namespace Hadrons;
HadronsLogger Hadrons::HadronsLogError(1,"Error");
HadronsLogger Hadrons::HadronsLogWarning(1,"Warning");
HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
HadronsLogger Hadrons::HadronsLogIRL(1,"IRL");
void Hadrons::initLogger(void)
{
auto w = std::string("Hadrons").length();
int cw = 8;
GridLogError.setTopWidth(w);
GridLogWarning.setTopWidth(w);
GridLogMessage.setTopWidth(w);
GridLogIterative.setTopWidth(w);
GridLogDebug.setTopWidth(w);
GridLogIRL.setTopWidth(w);
GridLogError.setChanWidth(cw);
GridLogWarning.setChanWidth(cw);
GridLogMessage.setChanWidth(cw);
GridLogIterative.setChanWidth(cw);
GridLogDebug.setChanWidth(cw);
GridLogIRL.setChanWidth(cw);
HadronsLogError.Active(true);
HadronsLogWarning.Active(true);
HadronsLogMessage.Active(GridLogMessage.isActive());
HadronsLogIterative.Active(GridLogIterative.isActive());
HadronsLogDebug.Active(GridLogDebug.isActive());
HadronsLogIRL.Active(GridLogIRL.isActive());
HadronsLogError.setChanWidth(cw);
HadronsLogWarning.setChanWidth(cw);
HadronsLogMessage.setChanWidth(cw);
HadronsLogIterative.setChanWidth(cw);
HadronsLogDebug.setChanWidth(cw);
HadronsLogIRL.setChanWidth(cw);
}
// type utilities //////////////////////////////////////////////////////////////
size_t Hadrons::typeHash(const std::type_info *info)
{
return info->hash_code();
}
//constexpr unsigned int maxNameSize = 1024u;
std::string Hadrons::typeName(const std::type_info *info)
{
char *buf;
std::string name;
buf = abi::__cxa_demangle(info->name(), nullptr, nullptr, nullptr);
name = buf;
free(buf);
return name;
}
// default writers/readers /////////////////////////////////////////////////////
#ifdef HAVE_HDF5
const std::string Hadrons::resultFileExt = "h5";
#else
const std::string Hadrons::resultFileExt = "xml";
#endif
// recursive mkdir /////////////////////////////////////////////////////////////
int Hadrons::mkdir(const std::string dirName)
{
if (!dirName.empty() and access(dirName.c_str(), R_OK|W_OK|X_OK))
{
mode_t mode755;
char tmp[MAX_PATH_LENGTH];
char *p = NULL;
size_t len;
mode755 = S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
snprintf(tmp, sizeof(tmp), "%s", dirName.c_str());
len = strlen(tmp);
if(tmp[len - 1] == '/')
{
tmp[len - 1] = 0;
}
for(p = tmp + 1; *p; p++)
{
if(*p == '/')
{
*p = 0;
::mkdir(tmp, mode755);
*p = '/';
}
}
return ::mkdir(tmp, mode755);
}
else
{
return 0;
}
}
std::string Hadrons::basename(const std::string &s)
{
constexpr char sep = '/';
size_t i = s.rfind(sep, s.length());
if (i != std::string::npos)
{
return s.substr(i+1, s.length() - i);
}
else
{
return s;
}
}
std::string Hadrons::dirname(const std::string &s)
{
constexpr char sep = '/';
size_t i = s.rfind(sep, s.length());
if (i != std::string::npos)
{
return s.substr(0, i);
}
else
{
return "";
}
}
void Hadrons::makeFileDir(const std::string filename, GridBase *g)
{
bool doIt = true;
if (g)
{
doIt = g->IsBoss();
}
if (doIt)
{
std::string dir = dirname(filename);
int status = mkdir(dir);
if (status)
{
HADRONS_ERROR(Io, "cannot create directory '" + dir
+ "' ( " + std::strerror(errno) + ")");
}
}
}
void Hadrons::printTimeProfile(const std::map<std::string, GridTime> &timing,
GridTime total)
{
typedef decltype(total.count()) Count;
std::map<Count, std::string, std::greater<Count>> rtiming;
const double dtotal = static_cast<double>(total.count());
auto cf = std::cout.flags();
auto p = std::cout.precision();
unsigned int width = 0;
for (auto &t: timing)
{
width = std::max(width, static_cast<unsigned int>(t.first.length()));
rtiming[t.second.count()] = t.first;
}
for (auto &rt: rtiming)
{
LOG(Message) << std::setw(width) << rt.second << ": "
<< rt.first << " us (" << std::fixed
<< std::setprecision(1)
<< static_cast<double>(rt.first)/dtotal*100 << "%)"
<< std::endl;
}
std::cout.flags(cf);
std::cout.precision(p);
}
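printTimeProfile above sorts the timers by re-keying the name-to-time map into a time-to-name map with a descending comparator. A minimal sketch of the same idea with plain integers (hypothetical timings, standard library only):
#include <functional>
#include <iostream>
#include <map>
#include <string>
int main(void)
{
    // hypothetical timings in microseconds
    std::map<std::string, long> timing = {{"setup", 1200}, {"solve", 84000}, {"io", 300}};
    std::map<long, std::string, std::greater<long>> rtiming;
    double total = 0.;
    for (auto &t: timing)
    {
        rtiming[t.second] = t.first; // re-key by duration so iteration is descending
        total += t.second;
    }
    for (auto &rt: rtiming)
    {
        std::cout << rt.second << ": " << rt.first << " us ("
                  << 100.*rt.first/total << "%)" << std::endl;
    }
    return 0;
}
As in printTimeProfile above, two timers with identical durations would collapse into a single entry, which is usually acceptable for a profile summary.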

View File

@ -1,282 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Global.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Global_hpp_
#define Hadrons_Global_hpp_
#include <set>
#include <stack>
#include <regex>
#include <Grid/Grid.h>
#include <cxxabi.h>
#ifndef SITE_SIZE_TYPE
#define SITE_SIZE_TYPE size_t
#endif
#ifndef DEFAULT_ASCII_PREC
#define DEFAULT_ASCII_PREC 16
#endif
#define ARG(...) __VA_ARGS__
/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
* error with GCC 5 (clang & GCC 6 compile fine without it).
*/
#define BEGIN_HADRONS_NAMESPACE \
namespace Grid {\
namespace Hadrons {\
using Grid::operator<<;\
using Grid::operator>>;
#define END_HADRONS_NAMESPACE }}
#define BEGIN_MODULE_NAMESPACE(name)\
namespace name {\
using Grid::operator<<;\
using Grid::operator>>;
#define END_MODULE_NAMESPACE }
#define _HADRONS_IMPL(impl, sub) impl##sub
#define HADRONS_IMPL(impl, sub) _HADRONS_IMPL(impl, sub)
#ifndef FIMPLBASE
#define FIMPLBASE WilsonImpl
#endif
#define FIMPL HADRONS_IMPL(FIMPLBASE, R)
#define FIMPLF HADRONS_IMPL(FIMPLBASE, F)
#define FIMPLD HADRONS_IMPL(FIMPLBASE, D)
#ifndef ZFIMPLBASE
#define ZFIMPLBASE ZWilsonImpl
#endif
#define ZFIMPL HADRONS_IMPL(ZFIMPLBASE, R)
#define ZFIMPLF HADRONS_IMPL(ZFIMPLBASE, F)
#define ZFIMPLD HADRONS_IMPL(ZFIMPLBASE, D)
#ifndef SIMPLBASE
#define SIMPLBASE ScalarImplC
#endif
#define SIMPL HADRONS_IMPL(SIMPLBASE, R)
#define SIMPLF HADRONS_IMPL(SIMPLBASE, F)
#define SIMPLD HADRONS_IMPL(SIMPLBASE, D)
#ifndef GIMPLBASE
#define GIMPLBASE PeriodicGimpl
#endif
#define GIMPL HADRONS_IMPL(GIMPLBASE, R)
#define GIMPLF HADRONS_IMPL(GIMPLBASE, F)
#define GIMPLD HADRONS_IMPL(GIMPLBASE, D)
BEGIN_HADRONS_NAMESPACE
// type aliases
#define BASIC_TYPE_ALIASES(Impl, suffix)\
typedef typename Impl::Field ScalarField##suffix;\
typedef typename Impl::PropagatorField PropagatorField##suffix;\
typedef typename Impl::SitePropagator::scalar_object SitePropagator##suffix;\
typedef typename Impl::ComplexField ComplexField##suffix;\
typedef std::vector<SitePropagator##suffix> SlicedPropagator##suffix;\
typedef std::vector<typename ComplexField##suffix::vector_object::scalar_object> SlicedComplex##suffix;
#define FERM_TYPE_ALIASES(FImpl, suffix)\
BASIC_TYPE_ALIASES(FImpl, suffix);\
typedef FermionOperator<FImpl> FMat##suffix;\
typedef typename FImpl::FermionField FermionField##suffix;\
typedef typename FImpl::GaugeField GaugeField##suffix;\
typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
typedef Lattice<iSpinMatrix<typename FImpl::Simd>> SpinMatrixField##suffix;
#define GAUGE_TYPE_ALIASES(GImpl, suffix)\
typedef typename GImpl::GaugeField GaugeField##suffix;
#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
typedef Solver<FImpl> Solver##suffix;
#define SINK_TYPE_ALIASES(suffix)\
typedef std::function<SlicedPropagator##suffix\
(const PropagatorField##suffix &)> SinkFn##suffix;
// logger
class HadronsLogger: public Logger
{
public:
HadronsLogger(int on, std::string nm): Logger("Hadrons", on, nm,
GridLogColours, "BLACK"){};
};
#define LOG(channel) std::cout << HadronsLog##channel
#define HADRONS_DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
extern HadronsLogger HadronsLogError;
extern HadronsLogger HadronsLogWarning;
extern HadronsLogger HadronsLogMessage;
extern HadronsLogger HadronsLogIterative;
extern HadronsLogger HadronsLogDebug;
extern HadronsLogger HadronsLogIRL;
void initLogger(void);
// singleton pattern
#define SINGLETON(name)\
public:\
name(const name &e) = delete;\
void operator=(const name &e) = delete;\
static name & getInstance(void)\
{\
static name e;\
return e;\
}\
private:\
name(void);
#define SINGLETON_DEFCTOR(name)\
public:\
name(const name &e) = delete;\
void operator=(const name &e) = delete;\
static name & getInstance(void)\
{\
static name e;\
return e;\
}\
private:\
name(void) = default;
// type utilities
template <typename T>
const std::type_info * typeIdPt(const T &x)
{
return &typeid(x);
}
template <typename T>
const std::type_info * typeIdPt(void)
{
return &typeid(T);
}
size_t typeHash(const std::type_info *info);
template <typename T>
size_t typeHash(const T &x)
{
return typeHash(typeIdPt(x));
}
template <typename T>
size_t typeHash(void)
{
return typeHash(typeIdPt<T>());
}
std::string typeName(const std::type_info *info);
template <typename T>
std::string typeName(const T &x)
{
return typeName(typeIdPt(x));
}
template <typename T>
std::string typeName(void)
{
return typeName(typeIdPt<T>());
}
// default writers/readers
extern const std::string resultFileExt;
#ifdef HAVE_HDF5
typedef Hdf5Reader ResultReader;
typedef Hdf5Writer ResultWriter;
#else
typedef XmlReader ResultReader;
typedef XmlWriter ResultWriter;
#endif
#define RESULT_FILE_NAME(name, traj) \
name + "." + std::to_string(traj) + "." + resultFileExt
// recursive mkdir
#define MAX_PATH_LENGTH 512u
int mkdir(const std::string dirName);
std::string basename(const std::string &s);
std::string dirname(const std::string &s);
void makeFileDir(const std::string filename, GridBase *g = nullptr);
// default Schur convention
#ifndef HADRONS_DEFAULT_SCHUR
#define HADRONS_DEFAULT_SCHUR DiagTwo
#endif
#define _HADRONS_SCHUR_OP_(conv) Schur##conv##Operator
#define HADRONS_SCHUR_OP(conv) _HADRONS_SCHUR_OP_(conv)
#define HADRONS_DEFAULT_SCHUR_OP HADRONS_SCHUR_OP(HADRONS_DEFAULT_SCHUR)
#define _HADRONS_SCHUR_SOLVE_(conv) SchurRedBlack##conv##Solve
#define HADRONS_SCHUR_SOLVE(conv) _HADRONS_SCHUR_SOLVE_(conv)
#define HADRONS_DEFAULT_SCHUR_SOLVE HADRONS_SCHUR_SOLVE(HADRONS_DEFAULT_SCHUR)
#define _HADRONS_SCHUR_A2A_(conv) A2AVectorsSchur##conv
#define HADRONS_SCHUR_A2A(conv) _HADRONS_SCHUR_A2A_(conv)
#define HADRONS_DEFAULT_SCHUR_A2A HADRONS_SCHUR_A2A(HADRONS_DEFAULT_SCHUR)
// stringify macro
#define _HADRONS_STR(x) #x
#define HADRONS_STR(x) _HADRONS_STR(x)
// pretty print time profile
void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
// token replacement utility
template <typename T>
void tokenReplace(std::string &str, const std::string token,
const T &x, const std::string mark = "@")
{
std::string fullToken = mark + token + mark;
auto pos = str.find(fullToken);
if (pos != std::string::npos)
{
str.replace(pos, fullToken.size(), std::to_string(x));
}
}
// generic correlator class
template <typename Metadata, typename Scalar = Complex>
struct Correlator: Serializable
{
GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
Metadata, info,
std::vector<Scalar>, corr);
};
END_HADRONS_NAMESPACE
#include <Hadrons/Exceptions.hpp>
#endif // Hadrons_Global_hpp_
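Among the helpers above, tokenReplace substitutes the first '@token@' occurrence with a stringified value. A standalone sketch of the same logic applied to a hypothetical output stem (the function body mirrors the template above; the file name is made up):
#include <iostream>
#include <string>
// same idea as the tokenReplace template above: swap the first "@token@" for a value
template <typename T>
void tokenReplace(std::string &str, const std::string token,
                  const T &x, const std::string mark = "@")
{
    std::string fullToken = mark + token + mark;
    auto pos = str.find(fullToken);
    if (pos != std::string::npos)
    {
        str.replace(pos, fullToken.size(), std::to_string(x));
    }
}
int main(void)
{
    std::string stem = "correlator_@traj@"; // hypothetical output stem
    tokenReplace(stem, "traj", 1500);
    std::cout << stem << std::endl;         // prints "correlator_1500"
    return 0;
}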

View File

@ -1,759 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Graph.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Graph_hpp_
#define Hadrons_Graph_hpp_
#include <Hadrons/Global.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Oriented graph class *
******************************************************************************/
// I/O for edges
template <typename T>
std::ostream & operator<<(std::ostream &out, const std::pair<T, T> &e)
{
out << "\"" << e.first << "\" -> \"" << e.second << "\"";
return out;
}
// main class
template <typename T>
class Graph
{
public:
typedef std::pair<T, T> Edge;
public:
// constructor
Graph(void);
// destructor
virtual ~Graph(void) = default;
// access
void addVertex(const T &value);
void addEdge(const Edge &e);
void addEdge(const T &start, const T &end);
std::vector<T> getVertices(void) const;
void removeVertex(const T &value);
void removeEdge(const Edge &e);
void removeEdge(const T &start, const T &end);
unsigned int size(void) const;
// tests
bool gotValue(const T &value) const;
// graph topological manipulations
std::vector<T> getAdjacentVertices(const T &value) const;
std::vector<T> getChildren(const T &value) const;
std::vector<T> getParents(const T &value) const;
std::vector<T> getRoots(void) const;
std::vector<Graph<T>> getConnectedComponents(void) const;
std::vector<T> topoSort(void);
template <typename Gen>
std::vector<T> topoSort(Gen &gen);
std::vector<std::vector<T>> allTopoSort(void);
// I/O
friend std::ostream & operator<<(std::ostream &out, const Graph<T> &g)
{
out << "{";
for (auto &e: g.edgeSet_)
{
out << e << ", ";
}
if (g.edgeSet_.size() != 0)
{
out << "\b\b";
}
out << "}";
return out;
}
private:
// vertex marking
void mark(const T &value, const bool doMark = true);
void markAll(const bool doMark = true);
void unmark(const T &value);
void unmarkAll(void);
bool isMarked(const T &value) const;
const T * getFirstMarked(const bool isMarked = true) const;
template <typename Gen>
const T * getRandomMarked(const bool isMarked, Gen &gen);
const T * getFirstUnmarked(void) const;
template <typename Gen>
const T * getRandomUnmarked(Gen &gen);
// prune marked/unmarked vertices
void removeMarked(const bool isMarked = true);
void removeUnmarked(void);
// depth-first search marking
void depthFirstSearch(void);
void depthFirstSearch(const T &root);
private:
std::map<T, bool> isMarked_;
std::set<Edge> edgeSet_;
};
// build dependency matrix from topological sorts
template <typename T>
std::map<T, std::map<T, bool>>
makeDependencyMatrix(const std::vector<std::vector<T>> &topSort);
/******************************************************************************
* template implementation *
******************************************************************************
* in all the following V is the number of vertices and E is the number of edges
* in the worst case E = V^2
*/
// constructor /////////////////////////////////////////////////////////////////
template <typename T>
Graph<T>::Graph(void)
{}
// access //////////////////////////////////////////////////////////////////////
// complexity: O(log(V))
template <typename T>
void Graph<T>::addVertex(const T &value)
{
isMarked_[value] = false;
}
// complexity: O(log(V))
template <typename T>
void Graph<T>::addEdge(const Edge &e)
{
addVertex(e.first);
addVertex(e.second);
edgeSet_.insert(e);
}
// complexity: O(log(V))
template <typename T>
void Graph<T>::addEdge(const T &start, const T &end)
{
addEdge(Edge(start, end));
}
template <typename T>
std::vector<T> Graph<T>::getVertices(void) const
{
std::vector<T> vertex;
for (auto &v: isMarked_)
{
vertex.push_back(v.first);
}
return vertex;
}
// complexity: O(V*log(V))
template <typename T>
void Graph<T>::removeVertex(const T &value)
{
// remove vertex from the mark table
auto vIt = isMarked_.find(value);
if (vIt != isMarked_.end())
{
isMarked_.erase(vIt);
}
else
{
HADRONS_ERROR(Range, "vertex does not exists");
}
// remove all edges containing the vertex
auto pred = [&value](const Edge &e)
{
return ((e.first == value) or (e.second == value));
};
auto eIt = find_if(edgeSet_.begin(), edgeSet_.end(), pred);
while (eIt != edgeSet_.end())
{
edgeSet_.erase(eIt);
eIt = find_if(edgeSet_.begin(), edgeSet_.end(), pred);
}
}
// complexity: O(log(V))
template <typename T>
void Graph<T>::removeEdge(const Edge &e)
{
auto eIt = edgeSet_.find(e);
if (eIt != edgeSet_.end())
{
edgeSet_.erase(eIt);
}
else
{
HADRONS_ERROR(Range, "edge does not exists");
}
}
// complexity: O(log(V))
template <typename T>
void Graph<T>::removeEdge(const T &start, const T &end)
{
removeEdge(Edge(start, end));
}
// complexity: O(1)
template <typename T>
unsigned int Graph<T>::size(void) const
{
return isMarked_.size();
}
// tests ///////////////////////////////////////////////////////////////////////
// complexity: O(log(V))
template <typename T>
bool Graph<T>::gotValue(const T &value) const
{
auto it = isMarked_.find(value);
if (it == isMarked_.end())
{
return false;
}
else
{
return true;
}
}
// vertex marking //////////////////////////////////////////////////////////////
// complexity: O(log(V))
template <typename T>
void Graph<T>::mark(const T &value, const bool doMark)
{
if (gotValue(value))
{
isMarked_[value] = doMark;
}
else
{
HADRONS_ERROR(Range, "vertex does not exists");
}
}
// complexity: O(V*log(V))
template <typename T>
void Graph<T>::markAll(const bool doMark)
{
for (auto &v: isMarked_)
{
mark(v.first, doMark);
}
}
// complexity: O(log(V))
template <typename T>
void Graph<T>::unmark(const T &value)
{
mark(value, false);
}
// complexity: O(V*log(V))
template <typename T>
void Graph<T>::unmarkAll(void)
{
markAll(false);
}
// complexity: O(log(V))
template <typename T>
bool Graph<T>::isMarked(const T &value) const
{
if (gotValue(value))
{
return isMarked_.at(value);
}
else
{
HADRONS_ERROR(Range, "vertex does not exists");
return false;
}
}
// complexity: O(log(V))
template <typename T>
const T * Graph<T>::getFirstMarked(const bool isMarked) const
{
auto pred = [&isMarked](const std::pair<T, bool> &v)
{
return (v.second == isMarked);
};
auto vIt = std::find_if(isMarked_.begin(), isMarked_.end(), pred);
if (vIt != isMarked_.end())
{
return &(vIt->first);
}
else
{
return nullptr;
}
}
// complexity: O(log(V))
template <typename T>
template <typename Gen>
const T * Graph<T>::getRandomMarked(const bool isMarked, Gen &gen)
{
auto pred = [&isMarked](const std::pair<T, bool> &v)
{
return (v.second == isMarked);
};
std::uniform_int_distribution<unsigned int> dis(0, size() - 1);
auto rIt = isMarked_.begin();
std::advance(rIt, dis(gen));
auto vIt = std::find_if(rIt, isMarked_.end(), pred);
if (vIt != isMarked_.end())
{
return &(vIt->first);
}
else
{
vIt = std::find_if(isMarked_.begin(), rIt, pred);
if (vIt != rIt)
{
return &(vIt->first);
}
else
{
return nullptr;
}
}
}
// complexity: O(log(V))
template <typename T>
const T * Graph<T>::getFirstUnmarked(void) const
{
return getFirstMarked(false);
}
// complexity: O(log(V))
template <typename T>
template <typename Gen>
const T * Graph<T>::getRandomUnmarked(Gen &gen)
{
return getRandomMarked(false, gen);
}
// prune marked/unmarked vertices //////////////////////////////////////////////
// complexity: O(V^2*log(V))
template <typename T>
void Graph<T>::removeMarked(const bool isMarked)
{
auto isMarkedCopy = isMarked_;
for (auto &v: isMarkedCopy)
{
if (v.second == isMarked)
{
removeVertex(v.first);
}
}
}
// complexity: O(V^2*log(V))
template <typename T>
void Graph<T>::removeUnmarked(void)
{
removeMarked(false);
}
// depth-first search marking //////////////////////////////////////////////////
// complexity: O(V*log(V))
template <typename T>
void Graph<T>::depthFirstSearch(void)
{
depthFirstSearch(isMarked_.begin()->first);
}
// complexity: O(V*log(V))
template <typename T>
void Graph<T>::depthFirstSearch(const T &root)
{
std::vector<T> adjacentVertex;
mark(root);
adjacentVertex = getAdjacentVertices(root);
for (auto &v: adjacentVertex)
{
if (!isMarked(v))
{
depthFirstSearch(v);
}
}
}
// graph topological manipulations /////////////////////////////////////////////
// complexity: O(V*log(V))
template <typename T>
std::vector<T> Graph<T>::getAdjacentVertices(const T &value) const
{
std::vector<T> adjacentVertex;
auto pred = [&value](const Edge &e)
{
return ((e.first == value) or (e.second == value));
};
auto eIt = std::find_if(edgeSet_.begin(), edgeSet_.end(), pred);
while (eIt != edgeSet_.end())
{
if (eIt->first == value)
{
adjacentVertex.push_back((*eIt).second);
}
else if (eIt->second == value)
{
adjacentVertex.push_back((*eIt).first);
}
eIt = std::find_if(++eIt, edgeSet_.end(), pred);
}
return adjacentVertex;
}
// complexity: O(V*log(V))
template <typename T>
std::vector<T> Graph<T>::getChildren(const T &value) const
{
std::vector<T> child;
auto pred = [&value](const Edge &e)
{
return (e.first == value);
};
auto eIt = std::find_if(edgeSet_.begin(), edgeSet_.end(), pred);
while (eIt != edgeSet_.end())
{
child.push_back((*eIt).second);
eIt = std::find_if(++eIt, edgeSet_.end(), pred);
}
return child;
}
// complexity: O(V*log(V))
template <typename T>
std::vector<T> Graph<T>::getParents(const T &value) const
{
std::vector<T> parent;
auto pred = [&value](const Edge &e)
{
return (e.second == value);
};
auto eIt = std::find_if(edgeSet_.begin(), edgeSet_.end(), pred);
while (eIt != edgeSet_.end())
{
parent.push_back((*eIt).first);
eIt = std::find_if(++eIt, edgeSet_.end(), pred);
}
return parent;
}
// complexity: O(V^2*log(V))
template <typename T>
std::vector<T> Graph<T>::getRoots(void) const
{
std::vector<T> root;
for (auto &v: isMarked_)
{
auto parent = getParents(v.first);
if (parent.size() == 0)
{
root.push_back(v.first);
}
}
return root;
}
// complexity: O(V^2*log(V))
template <typename T>
std::vector<Graph<T>> Graph<T>::getConnectedComponents(void) const
{
std::vector<Graph<T>> res;
Graph<T> copy(*this);
while (copy.size() > 0)
{
copy.depthFirstSearch();
res.push_back(copy);
res.back().removeUnmarked();
res.back().unmarkAll();
copy.removeMarked();
copy.unmarkAll();
}
return res;
}
// topological sort using a directed DFS algorithm
// complexity: O(V*log(V))
template <typename T>
std::vector<T> Graph<T>::topoSort(void)
{
std::stack<T> buf;
std::vector<T> res;
const T *vPt;
std::map<T, bool> tmpMarked(isMarked_);
// visit function
std::function<void(const T &)> visit = [&](const T &v)
{
if (tmpMarked.at(v))
{
HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
}
if (!isMarked(v))
{
std::vector<T> child = getChildren(v);
tmpMarked[v] = true;
for (auto &c: child)
{
visit(c);
}
mark(v);
tmpMarked[v] = false;
buf.push(v);
}
};
// reset temporary marks
for (auto &v: tmpMarked)
{
tmpMarked.at(v.first) = false;
}
// loop on unmarked vertices
unmarkAll();
vPt = getFirstUnmarked();
while (vPt)
{
visit(*vPt);
vPt = getFirstUnmarked();
}
unmarkAll();
// create result vector
while (!buf.empty())
{
res.push_back(buf.top());
buf.pop();
}
return res;
}
// random version of the topological sort
// complexity: O(V*log(V))
template <typename T>
template <typename Gen>
std::vector<T> Graph<T>::topoSort(Gen &gen)
{
std::stack<T> buf;
std::vector<T> res;
const T *vPt;
std::map<T, bool> tmpMarked(isMarked_);
// visit function
std::function<void(const T &)> visit = [&](const T &v)
{
if (tmpMarked.at(v))
{
HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
}
if (!isMarked(v))
{
std::vector<T> child = getChildren(v);
tmpMarked[v] = true;
std::shuffle(child.begin(), child.end(), gen);
for (auto &c: child)
{
visit(c);
}
mark(v);
tmpMarked[v] = false;
buf.push(v);
}
};
// reset temporary marks
for (auto &v: tmpMarked)
{
tmpMarked.at(v.first) = false;
}
// loop on unmarked vertices
unmarkAll();
vPt = getRandomUnmarked(gen);
while (vPt)
{
visit(*vPt);
vPt = getRandomUnmarked(gen);
}
unmarkAll();
// create result vector
while (!buf.empty())
{
res.push_back(buf.top());
buf.pop();
}
return res;
}
// generate all possible topological sorts
// Y. L. Varol & D. Rotem, Comput. J. 24(1), pp. 83-84, 1981
// http://comjnl.oupjournals.org/cgi/doi/10.1093/comjnl/24.1.83
// complexity: O(V*log(V)) (from the paper, but really?)
template <typename T>
std::vector<std::vector<T>> Graph<T>::allTopoSort(void)
{
std::vector<std::vector<T>> res;
std::map<T, std::map<T, bool>> iMat;
// create incidence matrix
for (auto &v1: isMarked_)
for (auto &v2: isMarked_)
{
iMat[v1.first][v2.first] = false;
}
for (auto &v: isMarked_)
{
auto cVec = getChildren(v.first);
for (auto &c: cVec)
{
iMat[v.first][c] = true;
}
}
// generate initial topological sort
res.push_back(topoSort());
// generate all other topological sorts by permutation
std::vector<T> p = res[0];
const unsigned int n = size();
std::vector<unsigned int> loc(n);
unsigned int i, k, k1;
T obj_k, obj_k1;
bool isFinal;
for (unsigned int j = 0; j < n; ++j)
{
loc[j] = j;
}
i = 0;
while (i < n-1)
{
k = loc[i];
k1 = k + 1;
obj_k = p[k];
if (k1 >= n)
{
isFinal = true;
obj_k1 = obj_k;
}
else
{
isFinal = false;
obj_k1 = p[k1];
}
if (iMat[res[0][i]][obj_k1] or isFinal)
{
for (unsigned int l = k; l >= i + 1; --l)
{
p[l] = p[l-1];
}
p[i] = obj_k;
loc[i] = i;
i++;
}
else
{
p[k] = obj_k1;
p[k1] = obj_k;
loc[i] = k1;
i = 0;
res.push_back(p);
}
}
return res;
}
// build dependency matrix from topological sorts ///////////////////////////////
// complexity: something like O(V^2*log(V!))
template <typename T>
std::map<T, std::map<T, bool>>
makeDependencyMatrix(const std::vector<std::vector<T>> &topSort)
{
std::map<T, std::map<T, bool>> m;
const std::vector<T> &vList = topSort[0];
for (auto &v1: vList)
for (auto &v2: vList)
{
bool dep = true;
for (auto &t: topSort)
{
auto i1 = std::find(t.begin(), t.end(), v1);
auto i2 = std::find(t.begin(), t.end(), v2);
dep = dep and (i1 - i2 > 0);
if (!dep) break;
}
m[v1][v2] = dep;
}
return m;
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Graph_hpp_
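Graph<T>::topoSort above is a depth-first search that temporarily marks the current path (to detect cycles) and records each vertex only after all of its children. A compact standalone sketch of the same technique on a plain adjacency map, with hypothetical module-like names:
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>
int main(void)
{
    // hypothetical dependency edges: vertex -> children (consumers)
    std::map<std::string, std::vector<std::string>> children = {
        {"gauge", {"action"}}, {"action", {"solver"}},
        {"solver", {"propagator"}}, {"propagator", {}}};
    std::map<std::string, bool> marked, onStack;
    std::vector<std::string> order;
    std::function<void(const std::string &)> visit = [&](const std::string &v)
    {
        if (onStack[v]) { std::cerr << "cycle!" << std::endl; return; }
        if (marked[v]) return;
        onStack[v] = true;
        for (auto &c: children.at(v)) visit(c);
        onStack[v] = false;
        marked[v]  = true;
        order.push_back(v); // recorded after all children, like buf.push(v) above
    };
    for (auto &kv: children) visit(kv.first);
    // order is children-first: reverse it to list parents before children
    for (auto it = order.rbegin(); it != order.rend(); ++it)
    {
        std::cout << *it << std::endl;
    }
    return 0;
}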

View File

@ -1,38 +0,0 @@
SUBDIRS = . Utilities
lib_LIBRARIES = libHadrons.a
include modules.inc
libHadrons_a_SOURCES = \
Application.cc \
Environment.cc \
Exceptions.cc \
Global.cc \
Module.cc \
TimerArray.cc \
VirtualMachine.cc \
$(modules_cc)
libHadrons_adir = $(includedir)/Hadrons
nobase_libHadrons_a_HEADERS = \
A2AVectors.hpp \
A2AMatrix.hpp \
Application.hpp \
DilutedNoise.hpp \
DiskVector.hpp \
EigenPack.hpp \
Environment.hpp \
Exceptions.hpp \
Factory.hpp \
GeneticScheduler.hpp \
Global.hpp \
Graph.hpp \
Module.hpp \
Modules.hpp \
ModuleFactory.hpp \
NamedTensor.hpp \
Solver.hpp \
TimerArray.hpp \
VirtualMachine.hpp \
$(modules_hpp)

View File

@ -1,110 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Module.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Module.hpp>
using namespace Grid;
using namespace Hadrons;
/******************************************************************************
* ModuleBase implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
ModuleBase::ModuleBase(const std::string name)
: name_(name)
{}
// access //////////////////////////////////////////////////////////////////////
std::string ModuleBase::getName(void) const
{
return name_;
}
// get factory registration name if available
std::string ModuleBase::getRegisteredName(void)
{
HADRONS_ERROR(Definition, "module '" + getName() + "' has no registered type"
+ " in the factory");
}
// execution ///////////////////////////////////////////////////////////////////
void ModuleBase::operator()(void)
{
resetTimers();
startTimer("_total");
startTimer("_setup");
setup();
stopTimer("_setup");
startTimer("_execute");
execute();
stopAllTimers();
}
std::string ModuleBase::makeSeedString(void)
{
std::string seed;
if (!vm().getRunId().empty())
{
seed += vm().getRunId() + "-";
}
seed += getName() + "-" + std::to_string(vm().getTrajectory());
return seed;
}
GridParallelRNG & ModuleBase::rng4d(void)
{
auto &r = *env().get4dRng();
if (makeSeedString() != seed_)
{
seed_ = makeSeedString();
LOG(Message) << "Seeding 4D RNG " << &r << " with string '"
<< seed_ << "'" << std::endl;
r.SeedUniqueString(seed_);
}
return r;
}
GridSerialRNG & ModuleBase::rngSerial(void)
{
auto &r = *env().getSerialRng();
if (makeSeedString() != seed_)
{
seed_ = makeSeedString();
LOG(Message) << "Seeding Serial RNG " << &r << " with string '"
<< seed_ << "'" << std::endl;
r.SeedUniqueString(seed_);
}
return r;
}

View File

@ -1,295 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Module.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Module_hpp_
#define Hadrons_Module_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/TimerArray.hpp>
#include <Hadrons/VirtualMachine.hpp>
BEGIN_HADRONS_NAMESPACE
// module registration macros
#define MODULE_REGISTER(mod, base, ns)\
class mod: public base\
{\
public:\
typedef base Base;\
using Base::Base;\
virtual std::string getRegisteredName(void)\
{\
return std::string(#ns "::" #mod);\
}\
};\
class ns##mod##ModuleRegistrar\
{\
public:\
ns##mod##ModuleRegistrar(void)\
{\
ModuleFactory &modFac = ModuleFactory::getInstance();\
modFac.registerBuilder(#ns "::" #mod, [&](const std::string name)\
{\
return std::unique_ptr<ns::mod>(new ns::mod(name));\
});\
}\
};\
static ns##mod##ModuleRegistrar ns##mod##ModuleRegistrarInstance;
#define MODULE_REGISTER_TMP(mod, base, ns)\
extern template class base;\
MODULE_REGISTER(mod, ARG(base), ns);
#define HADRONS_MACRO_REDIRECT_12(arg1, arg2, macro, ...) macro
#define HADRONS_MACRO_REDIRECT_23(arg1, arg2, arg3, macro, ...) macro
#define envGetGrid4(latticeType)\
env().template getGrid<typename latticeType::vector_type>()
#define envGetGrid5(latticeType, Ls)\
env().template getGrid<typename latticeType::vector_type>(Ls)
#define envGetGrid(...)\
HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetGrid5, envGetGrid4)(__VA_ARGS__)
#define envGetCoarseGrid4(latticeType, blockSize)\
env().template getCoarseGrid<typename latticeType::vector_type>(blockSize)
#define envGetCoarseGrid5(latticeType, blockSize, Ls)\
env().template getCoarseGrid<typename latticeType::vector_type>(blockSize, Ls)
#define envGetCoarseGrid(...)\
HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envGetCoarseGrid5, envGetCoarseGrid4)(__VA_ARGS__)
#define envGetRbGrid4(latticeType)\
env().template getRbGrid<typename latticeType::vector_type>()
#define envGetRbGrid5(latticeType, Ls)\
env().template getRbGrid<typename latticeType::vector_type>(Ls)
#define envGetRbGrid(...)\
HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetRbGrid5, envGetRbGrid4)(__VA_ARGS__)
#define envGet(type, name)\
*env().template getObject<type>(name)
#define envGetDerived(base, type, name)\
*env().template getDerivedObject<base, type>(name)
#define envGetTmp(type, var)\
type &var = *env().template getObject<type>(getName() + "_tmp_" + #var)
#define envHasType(type, name)\
env().template isObjectOfType<type>(name)
#define envCreate(type, name, Ls, ...)\
env().template createObject<type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
#define envCreateDerived(base, type, name, Ls, ...)\
env().template createDerivedObject<base, type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
#define envCreateLat4(type, name)\
envCreate(type, name, 1, envGetGrid(type))
#define envCreateLat5(type, name, Ls)\
envCreate(type, name, Ls, envGetGrid(type, Ls))
#define envCreateLat(...)\
HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
#define envCache(type, name, Ls, ...)\
env().template createObject<type>(name, Environment::Storage::cache, Ls, __VA_ARGS__)
#define envCacheLat4(type, name)\
envCache(type, name, 1, envGetGrid(type))
#define envCacheLat5(type, name, Ls)\
envCache(type, name, Ls, envGetGrid(type, Ls))
#define envCacheLat(...)\
HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
#define envTmp(type, name, Ls, ...)\
env().template createObject<type>(getName() + "_tmp_" + name, \
Environment::Storage::temporary, Ls, __VA_ARGS__)
#define envTmpLat4(type, name)\
envTmp(type, name, 1, envGetGrid(type))
#define envTmpLat5(type, name, Ls)\
envTmp(type, name, Ls, envGetGrid(type, Ls))
#define envTmpLat(...)\
HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
#define saveResult(ioStem, name, result)\
if (env().getGrid()->IsBoss() and !ioStem.empty())\
{\
makeFileDir(ioStem, env().getGrid());\
{\
ResultWriter _writer(RESULT_FILE_NAME(ioStem, vm().getTrajectory()));\
write(_writer, name, result);\
}\
}
/******************************************************************************
* Module class *
******************************************************************************/
// base class
class ModuleBase: public TimerArray
{
public:
// constructor
ModuleBase(const std::string name);
// destructor
virtual ~ModuleBase(void) = default;
// access
std::string getName(void) const;
// get factory registration name if available
virtual std::string getRegisteredName(void);
// dependencies/products
virtual std::vector<std::string> getInput(void) = 0;
virtual std::vector<std::string> getReference(void)
{
return std::vector<std::string>(0);
};
virtual std::vector<std::string> getOutput(void) = 0;
// parse parameters
virtual void parseParameters(XmlReader &reader, const std::string name) = 0;
virtual void saveParameters(XmlWriter &writer, const std::string name) = 0;
// parameter string
virtual std::string parString(void) const = 0;
// setup
virtual void setup(void) {};
virtual void execute(void) = 0;
// execution
void operator()(void);
protected:
// environment shortcut
DEFINE_ENV_ALIAS;
// virtual machine shortcut
DEFINE_VM_ALIAS;
// RNG seeded from module string
GridParallelRNG &rng4d(void);
GridSerialRNG &rngSerial(void);
private:
std::string makeSeedString(void);
private:
std::string name_, currentTimer_, seed_;
std::map<std::string, GridStopWatch> timer_;
};
// derived class, templating the parameter class
template <typename P>
class Module: public ModuleBase
{
public:
typedef P Par;
public:
// constructor
Module(const std::string name);
// destructor
virtual ~Module(void) = default;
// parse parameters
virtual void parseParameters(XmlReader &reader, const std::string name);
virtual void saveParameters(XmlWriter &writer, const std::string name);
// parameter string
virtual std::string parString(void) const;
// parameter access
const P & par(void) const;
void setPar(const P &par);
private:
P par_;
};
// no parameter type
class NoPar {};
template <>
class Module<NoPar>: public ModuleBase
{
public:
// constructor
Module(const std::string name): ModuleBase(name) {};
// destructor
virtual ~Module(void) = default;
// parse parameters (do nothing)
virtual void parseParameters(XmlReader &reader, const std::string name) {};
virtual void saveParameters(XmlWriter &writer, const std::string name)
{
push(writer, "options");
pop(writer);
};
// parameter string (empty)
virtual std::string parString(void) const {return "";};
};
/******************************************************************************
* Template implementation *
******************************************************************************/
template <typename P>
Module<P>::Module(const std::string name)
: ModuleBase(name)
{}
template <typename P>
void Module<P>::parseParameters(XmlReader &reader, const std::string name)
{
read(reader, name, par_);
}
template <typename P>
void Module<P>::saveParameters(XmlWriter &writer, const std::string name)
{
write(writer, name, par_);
}
template <typename P>
std::string Module<P>::parString(void) const
{
XmlWriter writer("", "");
write(writer, par_.SerialisableClassName(), par_);
return writer.string();
}
template <typename P>
const P & Module<P>::par(void) const
{
return par_;
}
template <typename P>
void Module<P>::setPar(const P &par)
{
par_ = par;
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Module_hpp_
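MODULE_REGISTER above generates, for each module, a registrar class whose single static instance adds a builder lambda to the ModuleFactory singleton at program start-up. A minimal standalone sketch of that self-registration pattern, using toy factory and module types rather than the Hadrons ones:
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>
// toy stand-ins for ModuleBase/ModuleFactory: name -> builder function
struct ModBase
{
    virtual ~ModBase(void) = default;
    virtual void execute(void) = 0;
};
class Factory
{
public:
    static Factory & getInstance(void) { static Factory f; return f; }
    void registerBuilder(const std::string &type,
                         std::function<std::unique_ptr<ModBase>(const std::string &)> b)
    {
        builder_[type] = b;
    }
    std::unique_ptr<ModBase> create(const std::string &type, const std::string &name)
    {
        return builder_.at(type)(name);
    }
private:
    std::map<std::string,
             std::function<std::unique_ptr<ModBase>(const std::string &)>> builder_;
};
// what MODULE_REGISTER expands to, in spirit: a module plus a static registrar
struct MyMod: ModBase
{
    MyMod(const std::string &n): name(n) {}
    virtual void execute(void) { std::cout << "executing " << name << std::endl; }
    std::string name;
};
struct MyModRegistrar
{
    MyModRegistrar(void)
    {
        Factory::getInstance().registerBuilder("Demo::MyMod",
            [](const std::string &name)
            {
                return std::unique_ptr<ModBase>(new MyMod(name));
            });
    }
} myModRegistrarInstance;
int main(void)
{
    auto m = Factory::getInstance().create("Demo::MyMod", "mod_1");
    m->execute();
    return 0;
}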

View File

@ -1,48 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/ModuleFactory.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_ModuleFactory_hpp_
#define Hadrons_ModuleFactory_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Factory.hpp>
#include <Hadrons/Module.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* ModuleFactory *
******************************************************************************/
class ModuleFactory: public Factory<ModuleBase>
{
SINGLETON_DEFCTOR(ModuleFactory)
};
END_HADRONS_NAMESPACE
#endif // Hadrons_ModuleFactory_hpp_

View File

@ -1,87 +0,0 @@
#include <Hadrons/Modules/MAction/DWF.hpp>
#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
#include <Hadrons/Modules/MAction/WilsonClover.hpp>
#include <Hadrons/Modules/MAction/Wilson.hpp>
#include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
#include <Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp>
#include <Hadrons/Modules/MContraction/A2ALoop.hpp>
#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
#include <Hadrons/Modules/MContraction/Baryon.hpp>
#include <Hadrons/Modules/MContraction/DiscLoop.hpp>
#include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
#include <Hadrons/Modules/MContraction/Meson.hpp>
#include <Hadrons/Modules/MContraction/SigmaToNucleonEye.hpp>
#include <Hadrons/Modules/MContraction/SigmaToNucleonNonEye.hpp>
#include <Hadrons/Modules/MContraction/WeakEye3pt.hpp>
#include <Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp>
#include <Hadrons/Modules/MContraction/WeakNonEye3pt.hpp>
#include <Hadrons/Modules/MDistil/Distil.hpp>
#include <Hadrons/Modules/MDistil/DistilPar.hpp>
#include <Hadrons/Modules/MDistil/DistilVectors.hpp>
#include <Hadrons/Modules/MDistil/LapEvec.hpp>
#include <Hadrons/Modules/MDistil/Noises.hpp>
#include <Hadrons/Modules/MDistil/PerambFromSolve.hpp>
#include <Hadrons/Modules/MDistil/Perambulator.hpp>
#include <Hadrons/Modules/MFermion/EMLepton.hpp>
#include <Hadrons/Modules/MFermion/FreeProp.hpp>
#include <Hadrons/Modules/MFermion/GaugeProp.hpp>
#include <Hadrons/Modules/MGauge/Electrify.hpp>
#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
#include <Hadrons/Modules/MGauge/Random.hpp>
#include <Hadrons/Modules/MGauge/StochEm.hpp>
#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
#include <Hadrons/Modules/MGauge/UnitEm.hpp>
#include <Hadrons/Modules/MGauge/Unit.hpp>
#include <Hadrons/Modules/MIO/LoadA2AMatrixDiskVector.hpp>
#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
#include <Hadrons/Modules/MIO/LoadBinary.hpp>
#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
#include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
#include <Hadrons/Modules/MIO/LoadDistilNoise.hpp>
#include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
#include <Hadrons/Modules/MIO/LoadNersc.hpp>
#include <Hadrons/Modules/MIO/LoadPerambulator.hpp>
#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
#include <Hadrons/Modules/MNoise/SparseSpinColorDiagonal.hpp>
#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
#include <Hadrons/Modules/MNPR/Amputate.hpp>
#include <Hadrons/Modules/MNPR/Bilinear.hpp>
#include <Hadrons/Modules/MNPR/FourQuark.hpp>
#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Hadrons/Modules/MScalar/FreeProp.hpp>
#include <Hadrons/Modules/MScalar/Scalar.hpp>
#include <Hadrons/Modules/MScalarSUN/Div.hpp>
#include <Hadrons/Modules/MScalarSUN/EMT.hpp>
#include <Hadrons/Modules/MScalarSUN/Grad.hpp>
#include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
#include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
#include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
#include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
#include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
#include <Hadrons/Modules/MSink/Point.hpp>
#include <Hadrons/Modules/MSink/Smear.hpp>
#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
#include <Hadrons/Modules/MSolver/Guesser.hpp>
#include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
#include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Hadrons/Modules/MSource/Convolution.hpp>
#include <Hadrons/Modules/MSource/Gauss.hpp>
#include <Hadrons/Modules/MSource/JacobiSmear.hpp>
#include <Hadrons/Modules/MSource/Momentum.hpp>
#include <Hadrons/Modules/MSource/MomentumPhase.hpp>
#include <Hadrons/Modules/MSource/Point.hpp>
#include <Hadrons/Modules/MSource/SeqAslash.hpp>
#include <Hadrons/Modules/MSource/SeqConserved.hpp>
#include <Hadrons/Modules/MSource/SeqGamma.hpp>
#include <Hadrons/Modules/MSource/Wall.hpp>
#include <Hadrons/Modules/MSource/Z2.hpp>
#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>

View File

@ -1,37 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/DWF.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MAction/DWF.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TDWF<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
#endif

View File

@ -1,155 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/DWF.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MAction_DWF_hpp_
#define Hadrons_MAction_DWF_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Domain wall quark action *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MAction)
class DWFPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(DWFPar,
std::string, gauge,
unsigned int, Ls,
double , mass,
double , M5,
std::string , boundary,
std::string , twist);
};
template <typename FImpl>
class TDWF: public Module<DWFPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TDWF(const std::string name);
// destructor
virtual ~TDWF(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);
#endif
/******************************************************************************
* DWF template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TDWF<FImpl>::TDWF(const std::string name)
: Module<DWFPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TDWF<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge};
return in;
}
template <typename FImpl>
std::vector<std::string> TDWF<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TDWF<FImpl>::setup(void)
{
LOG(Message) << "Setting up domain wall fermion matrix with m= "
<< par().mass << ", M5= " << par().M5 << " and Ls= "
<< par().Ls << " using gauge field '" << par().gauge << "'"
<< std::endl;
auto &U = envGet(GaugeField, par().gauge);
auto &g4 = *envGetGrid(FermionField);
auto &grb4 = *envGetRbGrid(FermionField);
auto &g5 = *envGetGrid(FermionField, par().Ls);
auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
typename DomainWallFermion<FImpl>::ImplParams implParams;
if (!par().boundary.empty())
{
implParams.boundary_phases = strToVec<Complex>(par().boundary);
}
if (!par().twist.empty())
{
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
}
LOG(Message) << "Fermion boundary conditions: " << implParams.boundary_phases
<< std::endl;
LOG(Message) << "Twists: " << implParams.twist_n_2pi_L
<< std::endl;
if (implParams.boundary_phases.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of boundary phase");
}
if (implParams.twist_n_2pi_L.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of twist");
}
envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, implParams);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TDWF<FImpl>::execute(void)
{}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MAction_DWF_hpp_
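TDWF::setup above (like the Mobius and scaled variants that follow) reads boundary phases and twists from whitespace-separated strings and checks that their length matches the number of dimensions. A standalone sketch of that parse-and-check step, with plain doubles standing in for Grid's Complex/Real and a made-up boundary string; treating strToVec as a whitespace-separated parse is an assumption here:
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
// whitespace-separated parse, in the spirit of strToVec (sketch only)
std::vector<double> strToVecD(const std::string &s)
{
    std::vector<double> v;
    std::istringstream  is(s);
    double              x;
    while (is >> x) v.push_back(x);
    return v;
}
int main(void)
{
    const unsigned int nd = 4;            // number of dimensions
    std::string boundary  = "1 1 1 -1";   // hypothetical: antiperiodic in time
    auto phases = strToVecD(boundary);
    if (phases.size() != nd)
    {
        std::cerr << "Wrong number of boundary phases" << std::endl;
        return 1;
    }
    for (auto p: phases) std::cout << p << " ";
    std::cout << std::endl;
    return 0;
}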

View File

@ -1,37 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/MobiusDWF.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
#endif

View File

@ -1,156 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/MobiusDWF.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MAction_MobiusDWF_hpp_
#define Hadrons_MAction_MobiusDWF_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Mobius domain-wall fermion action *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MAction)
class MobiusDWFPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MobiusDWFPar,
std::string , gauge,
unsigned int, Ls,
double , mass,
double , M5,
double , b,
double , c,
std::string , boundary,
std::string , twist);
};
template <typename FImpl>
class TMobiusDWF: public Module<MobiusDWFPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TMobiusDWF(const std::string name);
// destructor
virtual ~TMobiusDWF(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
#endif
/******************************************************************************
* TMobiusDWF implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TMobiusDWF<FImpl>::TMobiusDWF(const std::string name)
: Module<MobiusDWFPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TMobiusDWF<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge};
return in;
}
template <typename FImpl>
std::vector<std::string> TMobiusDWF<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TMobiusDWF<FImpl>::setup(void)
{
LOG(Message) << "Setting up Mobius domain wall fermion matrix with m= "
<< par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls
<< ", b= " << par().b << ", c= " << par().c
<< " using gauge field '" << par().gauge << "'"
<< std::endl;
auto &U = envGet(GaugeField, par().gauge);
auto &g4 = *envGetGrid(FermionField);
auto &grb4 = *envGetRbGrid(FermionField);
auto &g5 = *envGetGrid(FermionField, par().Ls);
auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
typename MobiusFermion<FImpl>::ImplParams implParams;
if (!par().boundary.empty())
{
implParams.boundary_phases = strToVec<Complex>(par().boundary);
}
if (!par().twist.empty())
{
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
}
LOG(Message) << "Fermion boundary conditions: " << implParams.boundary_phases
<< std::endl;
LOG(Message) << "Twists: " << implParams.twist_n_2pi_L
<< std::endl;
if (implParams.boundary_phases.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of boundary phase");
}
if (implParams.twist_n_2pi_L.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of twist");
}
envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, par().b, par().c,
implParams);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TMobiusDWF<FImpl>::execute(void)
{}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MAction_MobiusDWF_hpp_

View File

@ -1,37 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/ScaledDWF.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
#endif

View File

@ -1,155 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/ScaledDWF.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MAction_ScaledDWF_hpp_
#define Hadrons_MAction_ScaledDWF_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Scaled domain wall fermion *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MAction)
class ScaledDWFPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ScaledDWFPar,
std::string , gauge,
unsigned int, Ls,
double , mass,
double , M5,
double , scale,
std::string , boundary,
std::string , twist);
};
template <typename FImpl>
class TScaledDWF: public Module<ScaledDWFPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TScaledDWF(const std::string name);
// destructor
virtual ~TScaledDWF(void) {};
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
#endif
/******************************************************************************
* TScaledDWF implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TScaledDWF<FImpl>::TScaledDWF(const std::string name)
: Module<ScaledDWFPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TScaledDWF<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge};
return in;
}
template <typename FImpl>
std::vector<std::string> TScaledDWF<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TScaledDWF<FImpl>::setup(void)
{
LOG(Message) << "Setting up scaled domain wall fermion matrix with m= "
<< par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls
<< ", scale= " << par().scale
<< " using gauge field '" << par().gauge << "'"
<< std::endl;
auto &U = envGet(GaugeField, par().gauge);
auto &g4 = *envGetGrid(FermionField);
auto &grb4 = *envGetRbGrid(FermionField);
auto &g5 = *envGetGrid(FermionField, par().Ls);
auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
typename ScaledShamirFermion<FImpl>::ImplParams implParams;
if (!par().boundary.empty())
{
implParams.boundary_phases = strToVec<Complex>(par().boundary);
}
if (!par().twist.empty())
{
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
}
LOG(Message) << "Fermion boundary conditions: " << implParams.boundary_phases
<< std::endl;
LOG(Message) << "Twists: " << implParams.twist_n_2pi_L
<< std::endl;
if (implParams.boundary_phases.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of boundary phase");
}
if (implParams.twist_n_2pi_L.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of twist");
}
envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, par().scale,
implParams);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TScaledDWF<FImpl>::execute(void)
{}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MAction_ScaledDWF_hpp_
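The scaled Shamir module is driven the same way; the sketch below mirrors the Mobius example with the same caveats (placeholder names and values, assumed Application::createModule pattern), the only new parameter being the overall kernel scale.
// Usage sketch (assumed placeholder names and values, not part of this diff).
#include <Hadrons/Application.hpp>
using namespace Grid;
using namespace Hadrons;
void makeScaledDWFAction(Application &application)
{
    MAction::ScaledDWF::Par actionPar;
    actionPar.gauge    = "gauge";        // gauge field module created elsewhere
    actionPar.Ls       = 16;             // fifth-dimension extent
    actionPar.mass     = 0.01;           // quark mass
    actionPar.M5       = 1.8;            // domain wall height
    actionPar.scale    = 2.0;            // overall Shamir kernel scale
    actionPar.boundary = "1 1 1 -1";     // one phase per dimension
    actionPar.twist    = "0. 0. 0. 0.";  // one twist per dimension
    application.createModule<MAction::ScaledDWF>("ScaledDWF_l", actionPar);
}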

View File

@ -1,37 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/Wilson.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MAction/Wilson.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TWilson<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
#endif

View File

@ -1,148 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/Wilson.hpp
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MAction_Wilson_hpp_
#define Hadrons_MAction_Wilson_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* TWilson quark action *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MAction)
class WilsonPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
std::string, gauge,
double , mass,
std::string, boundary,
std::string, string,
std::string, twist);
};
template <typename FImpl>
class TWilson: public Module<WilsonPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TWilson(const std::string name);
// destructor
virtual ~TWilson(void) {};
// dependencies/products
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);
#endif
/******************************************************************************
* TWilson template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TWilson<FImpl>::TWilson(const std::string name)
: Module<WilsonPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWilson<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge};
return in;
}
template <typename FImpl>
std::vector<std::string> TWilson<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWilson<FImpl>::setup(void)
{
LOG(Message) << "Setting up Wilson fermion matrix with m= " << par().mass
<< " using gauge field '" << par().gauge << "'" << std::endl;
auto &U = envGet(GaugeField, par().gauge);
auto &grid = *envGetGrid(FermionField);
auto &gridRb = *envGetRbGrid(FermionField);
typename WilsonFermion<FImpl>::ImplParams implParams;
if (!par().boundary.empty())
{
implParams.boundary_phases = strToVec<Complex>(par().boundary);
}
if (!par().twist.empty())
{
implParams.twist_n_2pi_L = strToVec<Real>(par().twist);
}
LOG(Message) << "Fermion boundary conditions: " << implParams.boundary_phases << std::endl;
LOG(Message) << "Twists: " << implParams.twist_n_2pi_L << std::endl;
if (implParams.boundary_phases.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of boundary phase");
}
if (implParams.twist_n_2pi_L.size() != env().getNd())
{
HADRONS_ERROR(Size, "Wrong number of twist");
}
envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
par().mass, implParams);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWilson<FImpl>::execute(void)
{}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MAction_Wilson_hpp_
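A corresponding sketch for the Wilson module, again with placeholder names and values and the same assumed Application::createModule pattern; only the four-dimensional grids are needed, and the unused "string" parameter can be left at its default.
// Usage sketch (assumed placeholder names and values, not part of this diff).
#include <Hadrons/Application.hpp>
using namespace Grid;
using namespace Hadrons;
void makeWilsonAction(Application &application)
{
    MAction::Wilson::Par actionPar;
    actionPar.gauge    = "gauge";        // gauge field module created elsewhere
    actionPar.mass     = 0.1;            // bare Wilson quark mass
    actionPar.boundary = "1 1 1 -1";     // must have Nd entries or setup() throws
    actionPar.twist    = "0. 0. 0. 0.";  // one twist per dimension
    application.createModule<MAction::Wilson>("Wilson_l", actionPar);
}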

View File

@ -1,37 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Modules/MAction/WilsonClover.cc
Copyright (C) 2015-2019
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Modules/MAction/WilsonClover.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MAction;
template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
#endif

Some files were not shown because too many files have changed in this diff.